2 files changed, 351 insertions, 0 deletions
diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h
new file mode 100644
index 000000000..84afca248
--- /dev/null
+++ b/numpy/core/src/common/dlpack/dlpack.h
@@ -0,0 +1,188 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"  /* C++: request C linkage */
+#else
+#define DLPACK_EXTERN_C  /* plain C: expands to nothing */
+#endif  /* __cplusplus */
+
+/*! \brief DLPack version. NOTE(review): 050 is an octal literal (== 40). */
+#define DLPACK_VERSION 050
+
+/*! \brief DLPACK_DLL symbol prefix; expands to nothing outside Windows. */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif  /* DLPACK_EXPORTS */
+#else
+#define DLPACK_DLL
+#endif  /* _WIN32 */
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type stored in DLDevice::device_type.
+ */
+typedef enum {
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory allocated by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm, for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Reserved extension device type,
+   * used to quickly test an extension device.
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+} DLDeviceType;
+
+/*!
+ * \brief A device (type plus index) for a tensor or operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device (a DLDeviceType value). */
+  DLDeviceType device_type;
+  /*! \brief The device index */
+  int device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options for DLDataType::code.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold.
+ *
+ *  Examples
+ *   - float: code = 2, bits = 32, lanes = 1
+ *   - float4 (vectorized 4 float): code = 2, bits = 32, lanes = 4
+ *   - int8: code = 0, bits = 8, lanes = 1
+ *   - std::complex<float>: code = 5, bits = 64, lanes = 1
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The opaque data pointer points to the allocated data. This will be
+   * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
+   * aligned to 256 bytes as in CUDA.
+   *
+   * For a given DLTensor, the size of memory required to store the contents
+   * of data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (int i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int ndim;
+  /*! \brief The data type of the tensor elements */
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating tensor is compact and row-major.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor, void (*)(struct DLManagedTensor*) - this should be
+   *   called to destruct manager_ctx which holds the DLManagedTensor. It can
+   *   be NULL if there is no way for the caller to provide a reasonable
+   *   destructor. The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 2d66c77dc..2251d4b69 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -31,6 +31,7 @@
 #include "alloc.h"
 
 #include <stdarg.h>
+#include "common/dlpack/dlpack.h"
 
 
 /* NpyArg_ParseKeywords
@@ -2762,6 +2763,158 @@ array_class_getitem(PyObject *cls, PyObject *args)
     generic_alias = NULL;
 #endif
     return generic_alias;
+}
+
+#define NPY_DLPACK_CAPSULE_NAME "dltensor"            /* fresh capsule */
+#define NPY_DLPACK_USED_CAPSULE_NAME "used_dltensor"  /* consumed capsule */
+static void array_dlpack_capsule_deleter(PyObject *self)
+{
+    if (PyCapsule_IsValid(self, NPY_DLPACK_USED_CAPSULE_NAME)) {
+        return;  /* renamed by the consumer, which now owns the tensor */
+    }
+    /* Cannot raise here: save any live error, report ours as unraisable. */
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+    DLManagedTensor *mt = PyCapsule_GetPointer(self, NPY_DLPACK_CAPSULE_NAME);
+    if (mt == NULL) { PyErr_WriteUnraisable(self); }
+    else if (mt->deleter != NULL) { mt->deleter(mt); }  /* may be NULL */
+    PyErr_Restore(type, value, traceback);
+}
+
+/* DLManagedTensor deleter: frees the tensor and drops the array reference. */
+static void array_dlpack_deleter(DLManagedTensor *self)
+{
+    PyArrayObject *array = (PyArrayObject *)self->manager_ctx;
+    /* shape and strides share one allocation, so this frees both. */
+    PyMem_Free(self->dl_tensor.shape);
+    PyMem_Free(self);
+    /* Py_XDECREF releases the array itself (PyArray_XDECREF would not). */
+    Py_XDECREF(array);
+}
+
+/*
+ * ndarray.__dlpack__(*, stream=None): export the array as a "dltensor"
+ * capsule that owns a reference to it. On failure returns NULL, error set.
+ */
+static PyObject *
+array_dlpack(PyArrayObject *self,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    PyObject *stream = Py_None;
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("__dlpack__", args, len_args, kwnames,
+            "$stream", NULL, &stream, NULL, NULL, NULL)) {
+        return NULL;
+    }
+
+    if (stream != Py_None) {
+        PyErr_SetString(PyExc_RuntimeError, "NumPy only supports "
+                "stream=None.");
+        return NULL;
+    }
+
+    npy_intp itemsize = PyArray_ITEMSIZE(self);
+    int ndim = PyArray_NDIM(self);
+    npy_intp *strides = PyArray_STRIDES(self);
+    npy_intp *shape = PyArray_SHAPE(self);
+    PyArray_Descr *dtype = PyArray_DESCR(self);
+    DLDataType managed_dtype;
+
+    if (PyDataType_ISBYTESWAPPED(dtype)) {
+        PyErr_SetString(PyExc_TypeError, "DLPack only supports native "
+                "byte order.");
+        return NULL;
+    }
+
+    managed_dtype.bits = 8 * itemsize;
+    managed_dtype.lanes = 1;
+    if (PyDataType_ISSIGNED(dtype)) {
+        managed_dtype.code = kDLInt;
+    } else if (PyDataType_ISUNSIGNED(dtype)) {
+        managed_dtype.code = kDLUInt;
+    } else if (PyDataType_ISFLOAT(dtype)) {
+        /* We can't be sure that wider floats are IEEE or unpadded. */
+        if (itemsize > 8) {
+            PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE "
+                    "floating point types without padding.");
+            return NULL;
+        }
+        managed_dtype.code = kDLFloat;
+    } else if (PyDataType_ISCOMPLEX(dtype)) {
+        /* Likewise for complex types wider than a pair of doubles. */
+        if (itemsize > 16) {
+            PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE "
+                    "complex point types without padding.");
+            return NULL;
+        }
+        managed_dtype.code = kDLComplex;
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+                        "DLPack only supports signed/unsigned integers, float "
+                        "and complex dtypes.");
+        return NULL;
+    }
+
+    /* dtypes accepted above have itemsize > 0: no division by zero below. */
+    for (int i = 0; i < ndim; ++i) {
+        if (strides[i] % itemsize != 0) {
+            PyErr_SetString(PyExc_RuntimeError,
+                    "DLPack only supports strides which are a multiple of "
+                    "itemsize.");
+            return NULL;
+        }
+    }
+
+    DLManagedTensor *managed = PyMem_Malloc(sizeof(DLManagedTensor));
+    if (managed == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    managed->dl_tensor.data = PyArray_DATA(self);
+    managed->dl_tensor.device.device_type = kDLCPU;
+    managed->dl_tensor.device.device_id = 0;
+    managed->dl_tensor.dtype = managed_dtype;
+
+    int64_t *managed_shape_strides = PyMem_Malloc(sizeof(int64_t) * ndim * 2);
+    if (managed_shape_strides == NULL) {
+        PyErr_NoMemory();
+        PyMem_Free(managed);
+        return NULL;
+    }
+
+    int64_t *managed_shape = managed_shape_strides;
+    int64_t *managed_strides = managed_shape_strides + ndim;
+    for (int i = 0; i < ndim; ++i) {
+        managed_shape[i] = shape[i];
+        /* Strides in DLPack are items; in NumPy are bytes. */
+        managed_strides[i] = strides[i] / itemsize;
+    }
+
+    managed->dl_tensor.ndim = ndim;
+    managed->dl_tensor.shape = managed_shape;
+    managed->dl_tensor.strides = managed_strides;
+    managed->dl_tensor.byte_offset = 0;
+    managed->manager_ctx = self;
+    managed->deleter = array_dlpack_deleter;
+
+    PyObject *capsule = PyCapsule_New(managed, NPY_DLPACK_CAPSULE_NAME,
+            array_dlpack_capsule_deleter);
+    if (capsule == NULL) {
+        PyMem_Free(managed);
+        PyMem_Free(managed_shape_strides);
+        return NULL;
+    }
+    /* The capsule keeps the array alive until the deleter runs. */
+    Py_INCREF(self);
+    return capsule;
+}
+
+/* ndarray.__dlpack_device__(): returns the tuple (kDLCPU, 0). */
+static PyObject *
+array_dlpack_device(PyArrayObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    return Py_BuildValue("ii", kDLCPU, 0);
 }
 
 NPY_NO_EXPORT PyMethodDef array_methods[] = {
@@ -2989,5 +3142,15 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
     {"view",
         (PyCFunction)array_view,
         METH_FASTCALL | METH_KEYWORDS, NULL},
+
+    /* DLPack protocol methods, for data interchange between libraries */
+    {"__dlpack__",
+        (PyCFunction)array_dlpack,
+        METH_FASTCALL | METH_KEYWORDS, NULL},
+
+    {"__dlpack_device__",
+        (PyCFunction)array_dlpack_device,
+        METH_NOARGS, NULL},
+
     {NULL, NULL, 0, NULL}           /* sentinel */
 };