Define the record_stream method in native_functions.yaml (pytorch#44301)
Summary:
The record_stream method was hard-coded for the CUDA device. Define record_stream in native_functions.yaml so that it can be dispatched dynamically to the appropriate backend device.
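For context, a minimal usage sketch of what the method does on the CUDA backend (the tensor and stream names below are illustrative): record_stream tells the caching allocator that a tensor is in use on another stream, so its memory is not reused until the work queued on that stream has finished. The user-facing behavior is unchanged by this PR; only the binding moves from a hand-written CUDA-only Python method to a dispatched native function.

import torch

t = torch.empty(1024, device='cuda')   # allocated on the current (default) stream
side = torch.cuda.Stream()
with torch.cuda.stream(side):
    out = t * 2                         # t is read on the side stream
t.record_stream(side)                   # don't reuse t's memory until work on `side` completes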

Fixes pytorch#36556

Pull Request resolved: pytorch#44301

Reviewed By: glaringlee

Differential Revision: D23763954

Pulled By: ezyang

fbshipit-source-id: e6d24f5e7892b56101fa858a6cad2abc5cdc4293
chengjunlu authored and facebook-github-bot committed Oct 13, 2020
1 parent d705083 commit 5741de8
Showing 35 changed files with 286 additions and 34 deletions.
6 changes: 6 additions & 0 deletions aten/src/ATen/core/ivalue.cpp
@@ -97,6 +97,8 @@ TypePtr IValue::type() const {
return RRefType::create(toRRef()->type());
case Tag::Device:
return DeviceObjType::get();
case Tag::Stream:
return StreamObjType::get();
case Tag::Object:
return toObjectRef().type();
case Tag::PyObject:
@@ -269,6 +271,8 @@ IValue IValue::equals(const IValue& rhs) const {
return rhs.isGenericDict() && lhs.toGenericDict() == rhs.toGenericDict();
case Tag::Tuple:
return rhs.isTuple() && *lhs.toTuple() == *rhs.toTuple();
case Tag::Stream:
return rhs.isStream() && lhs.toStream() == rhs.toStream();
case Tag::Device:
return rhs.isDevice() && lhs.toDevice() == rhs.toDevice();
case Tag::GenericList:
@@ -634,6 +638,8 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
return out << "Uninitialized";
case IValue::Tag::Device:
return out << v.toDevice();
case IValue::Tag::Stream:
return out << v.toStream();
case IValue::Tag::GenericDict:
return printDict(out, v.toGenericDict(), formatter);
case IValue::Tag::PyObject: {
10 changes: 10 additions & 0 deletions aten/src/ATen/core/ivalue.h
@@ -98,6 +98,7 @@ struct OptionalArray {
_(GenericDict) \
_(Future) \
_(Device) \
_(Stream) \
_(Object) \
_(PyObject) \
_(Uninitialized) \
@@ -551,6 +552,15 @@ struct CAFFE2_API IValue final {
return c10::Device(payload.as_device.type, payload.as_device.index);
}

// Stream
IValue(c10::Stream stream)
: tag(Tag::Stream), is_intrusive_ptr(false) {
payload.as_int = stream.pack();
}
c10::Stream toStream() &&;
c10::Stream toStream() const &;
bool isStream() const { return Tag::Stream == tag; }

// ScalarType
IValue(ScalarType t)
: IValue(static_cast<std::underlying_type<ScalarType>::type>(t)) {}
7 changes: 7 additions & 0 deletions aten/src/ATen/core/ivalue_inl.h
@@ -130,6 +130,12 @@ inline at::Tensor IValue::toTensor() const & {
AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind());
return at::Tensor(toIntrusivePtr<at::TensorImpl, at::UndefinedTensorImpl>());
}
inline c10::Stream IValue::toStream() && {
return c10::Stream::unpack(payload.as_int);
}
inline c10::Stream IValue::toStream() const & {
return c10::Stream::unpack(payload.as_int);
}
inline c10::intrusive_ptr<caffe2::Blob> IValue::toBlob() && {
AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind());
return moveToIntrusivePtr<caffe2::Blob>();
@@ -645,6 +651,7 @@ inline type IValue::to<type>() const & { \
return this->method_name(); \
}
DEFINE_TO(at::Tensor, toTensor)
DEFINE_TO(c10::Stream, toStream)
DEFINE_TO(float, toDouble)
DEFINE_TO(double, toDouble)
DEFINE_TO(unsigned char, toInt)
29 changes: 29 additions & 0 deletions aten/src/ATen/core/jit_type.h
@@ -47,6 +47,7 @@ using OptNameList = c10::optional<std::vector<std::string>>;
_(OptionalType) \
_(VarType) \
_(DeviceObjType) \
_(StreamObjType) \
_(FunctionType) \
_(ClassType) \
_(PyObjectType) \
@@ -1550,6 +1551,28 @@ struct CAFFE2_API DeviceObjType : public Type {
DeviceObjType() : Type(TypeKind::DeviceObjType) {}
};

struct StreamObjType;
using StreamObjTypePtr = std::shared_ptr<StreamObjType>;
// This type represents a Stream
struct CAFFE2_API StreamObjType : public Type {
static StreamObjTypePtr create() {
return StreamObjTypePtr(
new StreamObjType()); // NOLINT(modernize-make-shared)
}
bool operator==(const Type& rhs) const override {
return rhs.kind() == kind();
}
std::string str() const override {
return "Stream";
}
static const TypeKind Kind = TypeKind::StreamObjType;
// global singleton
static StreamObjTypePtr get();

private:
StreamObjType() : Type(TypeKind::StreamObjType) {}
};

struct VarType;
using VarTypePtr = std::shared_ptr<VarType>;
// This type represents a type variable, used in FunctionSchema
@@ -1727,6 +1750,12 @@ struct getTypePtr_<at::Tensor> final {
}
};
template <>
struct getTypePtr_<c10::Stream> final {
static TypePtr call() {
return StreamObjType::get();
}
};
template <>
struct getTypePtr_<double> final {
static TypePtr call() {
return FloatType::get();
4 changes: 4 additions & 0 deletions aten/src/ATen/core/type.cpp
@@ -158,6 +158,10 @@ DeviceObjTypePtr DeviceObjType::get() {
static auto value = DeviceObjType::create();
return value;
}
StreamObjTypePtr StreamObjType::get() {
static auto value = StreamObjType::create();
return value;
}
ScalarTypeTypePtr ScalarTypeType::get() {
static auto value = ScalarTypeType::create();
return value;
Empty file.
7 changes: 7 additions & 0 deletions aten/src/ATen/native/cuda/RecordStream.cu
@@ -0,0 +1,7 @@
#include <ATen/ATen.h>
#include <c10/cuda/CUDACachingAllocator.h>
namespace at { namespace native {
void record_stream_cuda(Tensor& self, c10::Stream stream) {
c10::cuda::CUDACachingAllocator::recordStream(self.storage().data_ptr(), at::cuda::CUDAStream::unpack(stream.pack()));
}
}} // namespace at::native
6 changes: 6 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -8161,6 +8161,12 @@
variants: function, method
device_guard: False

- func: record_stream(Tensor(a!) self, Stream s) -> ()
use_c10_dispatcher: full
variants: method
dispatch:
CUDA: record_stream_cuda

- func: isposinf(Tensor self) -> Tensor
use_c10_dispatcher: full
variants: function, method
3 changes: 3 additions & 0 deletions aten/src/ATen/templates/TensorBody.h
@@ -4,6 +4,7 @@
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/QScheme.h>
#include <c10/core/Stream.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/core/Storage.h>
@@ -49,6 +50,8 @@ namespace at {
class Tensor;
using TensorList = ArrayRef<Tensor>;

using Stream = c10::Stream;

namespace impl {
inline bool variable_excluded_from_dispatch() {
#ifdef C10_MOBILE
3 changes: 3 additions & 0 deletions aten/src/ATen/templates/TensorMethods.cpp
@@ -1,6 +1,7 @@
#include <c10/core/Scalar.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/QScheme.h>
#include <c10/core/Stream.h>
#include <c10/macros/Macros.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/intrusive_ptr.h>
@@ -14,6 +15,8 @@

namespace at {

using Stream = c10::Stream;

Tensor Tensor::cpu() const {
return to(options().device(DeviceType::CPU), /*non_blocking*/ false, /*copy*/ false);
}
2 changes: 2 additions & 0 deletions test/test_overrides.py
@@ -575,6 +575,8 @@ def instance_gen():
func_args.append(False)
elif t.startswith('int') or t in {'Dimname', 'DimnameList'}:
func_args.append(0)
elif t in {'Stream'}:
func_args.append(torch.Stream())
elif t.startswith('float') or t == 'double':
func_args.append(1.0)
elif t in {'Generator', 'MemoryFormat', 'TensorOptions'}:
1 change: 1 addition & 0 deletions tools/autograd/gen_python_functions.py
@@ -265,6 +265,7 @@ def create_python_bindings(python_functions, is_python_method, module):
UNPACK_METHODS = {
'const Tensor &': 'tensor',
'Tensor &': 'tensor',
'Stream': 'stream',
'c10::optional<Tensor>': 'optionalTensor',
'const c10::optional<Tensor>&': 'optionalTensor',
'c10::optional<Generator>': 'generator',
2 changes: 2 additions & 0 deletions tools/autograd/gen_variable_type.py
@@ -148,6 +148,8 @@
'bucketize',
# Functions that return booleans are not differentiable
'isnan', 'isposinf', 'isneginf', 'isinf',
# Functions that return None are not differentiable
'record_stream',
}

# The C -> R functions at the time of adding this are still being audited and tested
25 changes: 2 additions & 23 deletions tools/autograd/templates/python_variable_methods.cpp
@@ -12,7 +12,6 @@
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/jit/frontend/tracer.h"
#ifdef USE_CUDA
#include "torch/csrc/cuda/Stream.h"
#include "torch/csrc/cuda/Event.h"
#endif
#include "torch/csrc/utils/cuda_lazy_init.h"
@@ -30,6 +29,7 @@

#include <ATen/ATen.h>
#include "c10/util/Optional.h"
#include "c10/core/Stream.h"

#include <stdexcept>

@@ -40,6 +40,7 @@ using at::Backend;
using at::Scalar;
using at::ScalarType;
using at::Tensor;
using c10::Stream;
using namespace torch::autograd::utils;

namespace torch { namespace autograd {
@@ -704,27 +705,6 @@ static PyObject * THPVariable_numpy(PyObject* self, PyObject* arg)
END_HANDLE_TH_ERRORS
}

// TODO: move this to ATen. We would need to expose Stream objects in ATen.
static PyObject * THPVariable_record_stream(PyObject* self, PyObject* arg)
{
HANDLE_TH_ERRORS
if (check_has_torch_function(self)) {
auto args = py::make_tuple(py::handle(arg));
return handle_torch_function(self, "record_stream", args.ptr());
}
#ifdef USE_CUDA
auto& self_ = reinterpret_cast<THPVariable*>(self)->cdata;
if (!THCPStream_Check(arg)) {
return PyErr_Format(PyExc_TypeError, "expected Stream object");
}
c10::cuda::CUDACachingAllocator::recordStream(self_.storage().data_ptr(), at::cuda::CUDAStream::unpack(((THCPStream*)arg)->cdata));
Py_RETURN_NONE;
#else
throw std::runtime_error("PyTorch compiled without CUDA support");
#endif
END_HANDLE_TH_ERRORS
}

static PyObject * THPVariable_requires_grad_(PyObject* self, PyObject* args, PyObject* kwargs)
{
HANDLE_TH_ERRORS
@@ -1181,7 +1161,6 @@ PyMethodDef variable_methods[] = {
{"nonzero", (PyCFunction)(void(*)(void))THPVariable_nonzero, METH_VARARGS | METH_KEYWORDS, NULL},
{"numel", (PyCFunction)THPVariable_numel, METH_NOARGS, NULL},
{"numpy", (PyCFunction)THPVariable_numpy, METH_NOARGS, NULL},
{"record_stream", (PyCFunction)THPVariable_record_stream, METH_O, NULL},
{"requires_grad_", (PyCFunction)(void(*)(void))THPVariable_requires_grad_, METH_VARARGS | METH_KEYWORDS, NULL},
{"set_", (PyCFunction)(void (*)(void))THPVariable_set_, METH_VARARGS | METH_KEYWORDS, NULL},
{"short", (PyCFunction)(void(*)(void))THPVariable_short, METH_VARARGS | METH_KEYWORDS, NULL},
1 change: 1 addition & 0 deletions tools/build_variables.bzl
@@ -474,6 +474,7 @@ libtorch_python_core_sources = [
"torch/csrc/python_dimname.cpp",
"torch/csrc/Size.cpp",
"torch/csrc/Storage.cpp",
"torch/csrc/Stream.cpp",
"torch/csrc/TypeInfo.cpp",
"torch/csrc/api/src/python/init.cpp",
"torch/csrc/autograd/functions/init.cpp",
2 changes: 1 addition & 1 deletion tools/codegen/api/cpp.py
@@ -45,7 +45,7 @@ def valuetype_type(t: Type) -> Optional[str]:
elif t.name in [BaseTy.bool, BaseTy.QScheme, BaseTy.Scalar,
BaseTy.ScalarType, BaseTy.Generator, BaseTy.Storage,
BaseTy.Layout, BaseTy.Device, BaseTy.MemoryFormat,
BaseTy.Dimname, BaseTy.ConstQuantizerPtr]:
BaseTy.Dimname, BaseTy.Stream, BaseTy.ConstQuantizerPtr]:
# These C++ names line up with their schema names
return t.name.name
else:
1 change: 1 addition & 0 deletions tools/codegen/model.py
@@ -582,6 +582,7 @@ def is_list_like(self) -> Optional['ListType']:
'MemoryFormat',
'QScheme',
'Storage',
'Stream',
'ConstQuantizerPtr', # TODO: rename
))

3 changes: 2 additions & 1 deletion tools/pyi/gen_pyi.py
@@ -146,7 +146,8 @@ def type_to_python(typename, size=None):
'Dimname': 'Union[str, ellipsis, None]',
'DimnameList': 'Sequence[Union[str, ellipsis, None]]',
'QScheme': '_qscheme',
'ArrayRef<double>' : 'Sequence[float]'
'ArrayRef<double>' : 'Sequence[float]',
'Stream': 'Stream',
}[typename]

return typename
11 changes: 11 additions & 0 deletions torch/_C/__init__.pyi.in
@@ -35,6 +35,13 @@ class device:

def __reduce__(self) -> Tuple[Any, ...]: ... # THPDevice_reduce

# Defined in torch/csrc/Stream.cpp
class Stream:
_cdata: _int # Stream handle
device: device # The device of the stream

...

# Defined in torch/csrc/Size.cpp
class Size(Tuple[_int, ...]):
# TODO: __reduce__
@@ -676,6 +683,10 @@ class DeviceObjType(JitType):
@staticmethod
def get() -> DeviceObjType: ...

class StreamObjType(JitType):
@staticmethod
def get() -> StreamObjType: ...

class ListType(JitType):
def __init__(self, a: JitType) -> None: ...
def getElementType(self) -> JitType: ...
2 changes: 2 additions & 0 deletions torch/csrc/Module.cpp
@@ -23,6 +23,7 @@
#include <torch/csrc/THP.h>
#include <torch/csrc/DynamicTypes.h>
#include <torch/csrc/Device.h>
#include <torch/csrc/Stream.h>
#include <torch/csrc/Dtype.h>
#include <torch/csrc/DataLoader.h>
#include <torch/csrc/Generator.h>
@@ -723,6 +724,7 @@ PyObject* initModule() {
THPMemoryFormat_init(module);
THPQScheme_init(module);
THPDevice_init(module);
THPStream_init(module);
ASSERT_TRUE(THPVariable_initModule(module));
ASSERT_TRUE(THPFunction_initModule(module));
ASSERT_TRUE(THPEngine_initModule(module));