30 files changed, 917 insertions, 284 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 52ab9c994..3e9d05674 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -2614,7 +2614,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('argmin',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('argsort',
     """
-    a.argsort(axis=-1, kind='quicksort', order=None)
+    a.argsort(axis=-1, kind=None, order=None)
 
     Returns the indices that would sort this array.
 
@@ -3800,7 +3800,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('setflags',
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('sort',
     """
-    a.sort(axis=-1, kind='quicksort', order=None)
+    a.sort(axis=-1, kind=None, order=None)
 
     Sort an array in-place. Refer to `numpy.sort` for full documentation.
 
diff --git a/numpy/core/_exceptions.py b/numpy/core/_exceptions.py
index a1997ef6b..1dcea6255 100644
--- a/numpy/core/_exceptions.py
+++ b/numpy/core/_exceptions.py
@@ -121,3 +121,15 @@ class AxisError(ValueError, IndexError):
                 msg = "{}: {}".format(msg_prefix, msg)
 
         super(AxisError, self).__init__(msg)
+
+
+@_display_as_base
+class _ArrayMemoryError(MemoryError):
+    """ Thrown when an array cannot be allocated"""
+    def __init__(self, shape, dtype):
+        self.shape = shape
+        self.dtype = dtype
+
+    def __str__(self):
+        return "Unable to allocate array with shape {} and data type {}".format(self.shape, self.dtype)
+
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index e17523451..76360eb5a 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -697,6 +697,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.exp'),
           None,
+          TD('e', f='exp', astype={'e':'f'}),
           TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='exp', astype={'e':'f'}),
           TD(P, f='exp'),
@@ -719,6 +720,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log'),
           None,
+          TD('e', f='log', astype={'e':'f'}),
           TD('f', simd=[('avx2', 'f'), ('avx512f', 'f')]),
           TD(inexact, f='log', astype={'e':'f'}),
           TD(P, f='log'),
@@ -1154,7 +1156,6 @@ def make_code(funcdict, filename):
 
 if __name__ == "__main__":
     filename = __file__
-    fid = open('__umath_generated.c', 'w')
     code = make_code(defdict, filename)
-    fid.write(code)
-    fid.close()
+    with open('__umath_generated.c', 'w') as fid:
+        fid.write(code)
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 3fd7d14c4..d7ecce1b4 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -2124,7 +2124,7 @@ class chararray(ndarray):
     def __rmod__(self, other):
         return NotImplemented
 
-    def argsort(self, axis=-1, kind='quicksort', order=None):
+    def argsort(self, axis=-1, kind=None, order=None):
         """
         Return the indices that sort the array lexicographically.
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index b4d721940..7024ac237 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -824,7 +824,7 @@ def _sort_dispatcher(a, axis=None, kind=None, order=None):
 
 
 @array_function_dispatch(_sort_dispatcher)
-def sort(a, axis=-1, kind='quicksort', order=None):
+def sort(a, axis=-1, kind=None, order=None):
     """
     Return a sorted copy of an array.
 
@@ -837,8 +837,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
         sorting. The default is -1, which sorts along the last axis.
     kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
         Sorting algorithm. The default is 'quicksort'. Note that both 'stable'
-        and 'mergesort' use timsort under the covers and, in general, the
-        actual implementation will vary with data type. The 'mergesort' option
+        and 'mergesort' use timsort or radix sort under the covers and, in general,
+        the actual implementation will vary with data type. The 'mergesort' option
         is retained for backwards compatibility.
 
         .. versionchanged:: 1.15.0.
@@ -914,7 +914,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
 
     'stable' automatically choses the best stable sorting algorithm
     for the data type being sorted. It, along with 'mergesort' is
-    currently mapped to timsort. API forward compatibility currently limits the
+    currently mapped to timsort or radix sort depending on the
+    data type. API forward compatibility currently limits the
     ability to select the implementation and it is hardwired for the different
     data types.
 
@@ -925,7 +926,8 @@ def sort(a, axis=-1, kind='quicksort', order=None):
     mergesort. It is now used for stable sort while quicksort is still the
     default sort if none is chosen. For details of timsort, refer to
     `CPython listsort.txt <https://github.com/python/cpython/blob/3.7/Objects/listsort.txt>`_.
-
+    'mergesort' and 'stable' are mapped to radix sort for integer data types. Radix sort is an
+    O(n) sort instead of O(n log n).
 
     Examples
     --------
@@ -974,7 +976,7 @@ def _argsort_dispatcher(a, axis=None, kind=None, order=None):
 
 
 @array_function_dispatch(_argsort_dispatcher)
-def argsort(a, axis=-1, kind='quicksort', order=None):
+def argsort(a, axis=-1, kind=None, order=None):
     """
     Returns the indices that would sort an array.
 
@@ -997,8 +999,6 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
 
         .. versionchanged:: 1.15.0.
            The 'stable' option was added.
-
-
     order : str or list of str, optional
         When `a` is an array with fields defined, this argument specifies
         which fields to compare first, second, etc.  A single field can
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 54b3a3e5e..78fec1aab 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -8,6 +8,7 @@ by importing from the extension module.
 
 import functools
 import warnings
+import sys
 
 from . import overrides
 from . import _multiarray_umath
@@ -71,9 +72,9 @@ array_function_from_c_func_and_dispatcher = functools.partial(
 
 
 @array_function_from_c_func_and_dispatcher(_multiarray_umath.empty_like)
-def empty_like(prototype, dtype=None, order=None, subok=None):
+def empty_like(prototype, dtype=None, order=None, subok=None, shape=None):
     """
-    empty_like(prototype, dtype=None, order='K', subok=True)
+    empty_like(prototype, dtype=None, order='K', subok=True, shape=None)
 
     Return a new array with the same shape and type as a given array.
 
@@ -97,6 +98,12 @@ def empty_like(prototype, dtype=None, order=None, subok=None):
         If True, then the newly created array will use the sub-class
         type of 'a', otherwise it will be a base-class array. Defaults
         to True.
+    shape : int or sequence of ints, optional.
+        Overrides the shape of the result. If order='K' and the number of
+        dimensions is unchanged, will try to keep order, otherwise,
+        order='C' is implied.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -1107,7 +1114,7 @@ def putmask(a, mask, values):
 
 
 @array_function_from_c_func_and_dispatcher(_multiarray_umath.packbits)
-def packbits(a, axis=None):
+def packbits(a, axis=None, bitorder='big'):
     """
     packbits(a, axis=None)
 
@@ -1123,6 +1130,13 @@ def packbits(a, axis=None):
     axis : int, optional
         The dimension over which bit-packing is done.
         ``None`` implies packing the flattened array.
+    bitorder : {'big', 'little'}, optional
+        The order of the input bits. 'big' will mimic bin(val),
+        ``[0, 0, 0, 0, 0, 0, 1, 1] => 3 = 0b00000011 => ``, 'little' will
+        reverse the order so ``[1, 1, 0, 0, 0, 0, 0, 0] => 3``.
+        Defaults to 'big'.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -1158,7 +1172,7 @@ def packbits(a, axis=None):
 
 
 @array_function_from_c_func_and_dispatcher(_multiarray_umath.unpackbits)
-def unpackbits(a, axis=None, count=None):
+def unpackbits(a, axis=None, count=None, bitorder='big'):
     """
     unpackbits(a, axis=None, count=None)
 
@@ -1188,6 +1202,14 @@ def unpackbits(a, axis=None, count=None):
 
         .. versionadded:: 1.17.0
 
+    bitorder : {'big', 'little'}, optional
+        The order of the returned bits. 'big' will mimic bin(val),
+        ``3 = 0b00000011 => [0, 0, 0, 0, 0, 0, 1, 1]``, 'little' will reverse
+        the order to ``[1, 1, 0, 0, 0, 0, 0, 0]``.
+        Defaults to 'big'.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     unpacked : ndarray, uint8 type
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 34705efc7..e72ab3012 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -90,12 +90,12 @@ class ComplexWarning(RuntimeWarning):
     pass
 
 
-def _zeros_like_dispatcher(a, dtype=None, order=None, subok=None):
+def _zeros_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
     return (a,)
 
 
 @array_function_dispatch(_zeros_like_dispatcher)
-def zeros_like(a, dtype=None, order='K', subok=True):
+def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
     """
     Return an array of zeros with the same shape and type as a given array.
 
@@ -119,6 +119,12 @@ def zeros_like(a, dtype=None, order='K', subok=True):
         If True, then the newly created array will use the sub-class
         type of 'a', otherwise it will be a base-class array. Defaults
         to True.
+    shape : int or sequence of ints, optional.
+        Overrides the shape of the result. If order='K' and the number of
+        dimensions is unchanged, will try to keep order, otherwise,
+        order='C' is implied.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -150,7 +156,7 @@ def zeros_like(a, dtype=None, order='K', subok=True):
     array([0.,  0.,  0.])
 
     """
-    res = empty_like(a, dtype=dtype, order=order, subok=subok)
+    res = empty_like(a, dtype=dtype, order=order, subok=subok, shape=shape)
     # needed instead of a 0 to get same result as zeros for for string dtypes
     z = zeros(1, dtype=res.dtype)
     multiarray.copyto(res, z, casting='unsafe')
@@ -210,12 +216,12 @@ def ones(shape, dtype=None, order='C'):
     return a
 
 
-def _ones_like_dispatcher(a, dtype=None, order=None, subok=None):
+def _ones_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
     return (a,)
 
 
 @array_function_dispatch(_ones_like_dispatcher)
-def ones_like(a, dtype=None, order='K', subok=True):
+def ones_like(a, dtype=None, order='K', subok=True, shape=None):
     """
     Return an array of ones with the same shape and type as a given array.
 
@@ -239,6 +245,12 @@ def ones_like(a, dtype=None, order='K', subok=True):
         If True, then the newly created array will use the sub-class
         type of 'a', otherwise it will be a base-class array. Defaults
         to True.
+    shape : int or sequence of ints, optional.
+        Overrides the shape of the result. If order='K' and the number of
+        dimensions is unchanged, will try to keep order, otherwise,
+        order='C' is implied.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -270,7 +282,7 @@ def ones_like(a, dtype=None, order='K', subok=True):
     array([1.,  1.,  1.])
 
     """
-    res = empty_like(a, dtype=dtype, order=order, subok=subok)
+    res = empty_like(a, dtype=dtype, order=order, subok=subok, shape=shape)
     multiarray.copyto(res, 1, casting='unsafe')
     return res
 
@@ -322,12 +334,12 @@ def full(shape, fill_value, dtype=None, order='C'):
     return a
 
 
-def _full_like_dispatcher(a, fill_value, dtype=None, order=None, subok=None):
+def _full_like_dispatcher(a, fill_value, dtype=None, order=None, subok=None, shape=None):
     return (a,)
 
 
 @array_function_dispatch(_full_like_dispatcher)
-def full_like(a, fill_value, dtype=None, order='K', subok=True):
+def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
     """
     Return a full array with the same shape and type as a given array.
 
@@ -349,6 +361,12 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True):
         If True, then the newly created array will use the sub-class
         type of 'a', otherwise it will be a base-class array. Defaults
         to True.
+    shape : int or sequence of ints, optional.
+        Overrides the shape of the result. If order='K' and the number of
+        dimensions is unchanged, will try to keep order, otherwise,
+        order='C' is implied.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -379,7 +397,7 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True):
     array([0.1,  0.1,  0.1,  0.1,  0.1,  0.1])
 
     """
-    res = empty_like(a, dtype=dtype, order=order, subok=subok)
+    res = empty_like(a, dtype=dtype, order=order, subok=subok, shape=shape)
     multiarray.copyto(res, fill_value, casting='unsafe')
     return res
 
@@ -1984,7 +2002,8 @@ def load(file):
         "np.core.numeric.load is deprecated, use pickle.load instead",
         DeprecationWarning, stacklevel=2)
     if isinstance(file, type("")):
-        file = open(file, "rb")
+        with open(file, "rb") as file_pointer:
+            return pickle.load(file_pointer)
     return pickle.load(file)
 
 
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index d4697b8a6..ad4d1c721 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -1,6 +1,7 @@
 """Implementation of __array_function__ overrides from NEP-18."""
 import collections
 import functools
+import textwrap
 
 from numpy.core._multiarray_umath import (
     add_docstring, implement_array_function, _get_implementing_args)
@@ -143,11 +144,30 @@ def array_function_dispatch(dispatcher, module=None, verify=True,
         if docs_from_dispatcher:
             add_docstring(implementation, dispatcher.__doc__)
 
+        # Equivalently, we could define this function directly instead of using
+        # exec. This version has the advantage of giving the helper function a
+        # more interpettable name. Otherwise, the original function does not
+        # show up at all in many cases, e.g., if it's written in C or if the
+        # dispatcher gets an invalid keyword argument.
+        source = textwrap.dedent("""
         @functools.wraps(implementation)
-        def public_api(*args, **kwargs):
+        def {name}(*args, **kwargs):
             relevant_args = dispatcher(*args, **kwargs)
             return implement_array_function(
-                implementation, public_api, relevant_args, args, kwargs)
+                implementation, {name}, relevant_args, args, kwargs)
+        """).format(name=implementation.__name__)
+
+        source_object = compile(
+            source, filename='<__array_function__ internals>', mode='exec')
+        scope = {
+            'implementation': implementation,
+            'dispatcher': dispatcher,
+            'functools': functools,
+            'implement_array_function': implement_array_function,
+        }
+        exec(source_object, scope)
+
+        public_api = scope[implementation.__name__]
 
         if module is not None:
             public_api.__module__ = module
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 45b4fb3c7..81d140d5e 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -710,6 +710,7 @@ def configuration(parent_package='',top_path=None):
                        join('src', 'npysort', 'mergesort.c.src'),
                        join('src', 'npysort', 'timsort.c.src'),
                        join('src', 'npysort', 'heapsort.c.src'),
+                       join('src', 'npysort', 'radixsort.c.src'),
                        join('src', 'common', 'npy_partition.h.src'),
                        join('src', 'npysort', 'selection.c.src'),
                        join('src', 'common', 'npy_binsearch.h.src'),
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 885aec443..32d52d93e 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -81,7 +81,7 @@ def get_api_versions(apiversion, codegen_dir):
     return curapi_hash, apis_hash[apiversion]
 
 def check_api_version(apiversion, codegen_dir):
-    """Emits a MismacthCAPIWarning if the C API version needs updating."""
+    """Emits a MismatchCAPIWarning if the C API version needs updating."""
     curapi_hash, api_hash = get_api_versions(apiversion, codegen_dir)
 
     # If different hash, it means that the api .txt files in
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 0eac772e8..3a037e7d2 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -832,7 +832,7 @@ def block(arrays):
 
 # Theses helper functions are mostly used for testing.
 # They allow us to write tests that directly call `_block_slicing`
-# or `_block_concatenate` wtihout blocking large arrays to forse the wisdom
+# or `_block_concatenate` without blocking large arrays to forse the wisdom
 # to trigger the desired path.
 def _block_setup(arrays):
     """
diff --git a/numpy/core/src/common/npy_longdouble.c b/numpy/core/src/common/npy_longdouble.c
index 561f4b825..c580e0cce 100644
--- a/numpy/core/src/common/npy_longdouble.c
+++ b/numpy/core/src/common/npy_longdouble.c
@@ -6,6 +6,7 @@
 #include "numpy/ndarraytypes.h"
 #include "numpy/npy_math.h"
 #include "npy_pycompat.h"
+#include "numpyos.h"
 
 /*
  * Heavily derived from PyLong_FromDouble
@@ -94,3 +95,84 @@ done:
     Py_DECREF(l_chunk_size);
     return v;
 }
+
+/* Helper function to get unicode(PyLong).encode('utf8') */
+static PyObject *
+_PyLong_Bytes(PyObject *long_obj) {
+    PyObject *bytes;
+#if defined(NPY_PY3K)
+    PyObject *unicode = PyObject_Str(long_obj);
+    if (unicode == NULL) {
+        return NULL;
+    }
+    bytes = PyUnicode_AsUTF8String(unicode);
+    Py_DECREF(unicode);
+#else
+    bytes = PyObject_Str(long_obj);
+#endif
+    return bytes;
+}
+
+
+/**
+ * TODO: currently a hack that converts the long through a string. This is
+ * correct, but slow.
+ *
+ * Another approach would be to do this numerically, in a similar way to
+ * PyLong_AsDouble.
+ * However, in order to respect rounding modes correctly, this needs to know
+ * the size of the mantissa, which is platform-dependent.
+ */
+NPY_VISIBILITY_HIDDEN npy_longdouble
+npy_longdouble_from_PyLong(PyObject *long_obj) {
+    npy_longdouble result = 1234;
+    char *end;
+    char *cstr;
+    PyObject *bytes;
+
+    /* convert the long to a string */
+    bytes = _PyLong_Bytes(long_obj);
+    if (bytes == NULL) {
+        return -1;
+    }
+
+    cstr = PyBytes_AsString(bytes);
+    if (cstr == NULL) {
+        goto fail;
+    }
+    end = NULL;
+
+    /* convert the string to a long double and capture errors */
+    errno = 0;
+    result = NumPyOS_ascii_strtold(cstr, &end);
+    if (errno == ERANGE) {
+        /* strtold returns INFINITY of the correct sign. */
+        if (PyErr_Warn(PyExc_RuntimeWarning,
+                "overflow encountered in conversion from python long") < 0) {
+            goto fail;
+        }
+    }
+    else if (errno) {
+        PyErr_Format(PyExc_RuntimeError,
+                     "Could not parse python long as longdouble: %s (%s)",
+                     cstr,
+                     strerror(errno));
+        goto fail;
+    }
+
+    /* Extra characters at the end of the string, or nothing parsed */
+    if (end == cstr || *end != '\0') {
+        PyErr_Format(PyExc_RuntimeError,
+                     "Could not parse long as longdouble: %s",
+                     cstr);
+        goto fail;
+    }
+
+    /* finally safe to decref now that we're done with `end` */
+    Py_DECREF(bytes);
+    return result;
+
+fail:
+    Py_DECREF(bytes);
+    return -1;
+}
diff --git a/numpy/core/src/common/npy_longdouble.h b/numpy/core/src/common/npy_longdouble.h
index 036b53070..01db06de7 100644
--- a/numpy/core/src/common/npy_longdouble.h
+++ b/numpy/core/src/common/npy_longdouble.h
@@ -14,4 +14,14 @@
 NPY_VISIBILITY_HIDDEN PyObject *
 npy_longdouble_to_PyLong(npy_longdouble ldval);
 
+/* Convert a python `long` integer to a npy_longdouble
+ *
+ * This performs the same task as PyLong_AsDouble, but for long doubles
+ * which have a greater range.
+ *
+ * Returns -1 if an error occurs.
+ */
+NPY_VISIBILITY_HIDDEN npy_longdouble
+npy_longdouble_from_PyLong(PyObject *long_obj);
+
 #endif
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index 521f0fee5..16a105499 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -44,6 +44,17 @@ int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
 
 /**end repeat**/
 
+/**begin repeat
+ *
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ *         longlong, ulonglong#
+ */
+
+int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
+int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+/**end repeat**/
+
 
 
 /*
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 49819ca4a..1463d8990 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -30,6 +30,7 @@
 #include <emmintrin.h>
 #endif
 
+#include "npy_longdouble.h"
 #include "numpyos.h"
 #include <string.h>
 
@@ -328,6 +329,17 @@ string_to_long_double(PyObject*op)
     npy_longdouble temp;
     PyObject* b;
 
+    /* Convert python long objects to a longdouble, without precision or range
+     * loss via a double.
+     */
+    if ((PyLong_Check(op) && !PyBool_Check(op))
+#if !defined(NPY_PY3K)
+        || (PyInt_Check(op) && !PyBool_Check(op))
+#endif
+    ) {
+        return npy_longdouble_from_PyLong(op);
+    }
+
     if (PyUnicode_Check(op)) {
         b = PyUnicode_AsUTF8String(op);
         if (!b) {
@@ -4405,6 +4417,7 @@ static PyArray_Descr @from@_Descr = {
  *             npy_half, npy_float, npy_double, npy_longdouble,
  *             npy_cfloat, npy_cdouble, npy_clongdouble,
  *             PyObject *, npy_datetime, npy_timedelta#
+ * #rsort = 1*5, 0*16#
  * #NAME = Bool,
  *         Byte, UByte, Short, UShort, Int, UInt,
  *         Long, ULong, LongLong, ULongLong,
@@ -4461,12 +4474,20 @@ static PyArray_ArrFuncs _Py@NAME@_ArrFuncs = {
     {
         quicksort_@suff@,
         heapsort_@suff@,
-        timsort_@suff@
+        #if @rsort@
+            radixsort_@suff@
+        #else
+            timsort_@suff@
+        #endif
     },
     {
         aquicksort_@suff@,
         aheapsort_@suff@,
-        atimsort_@suff@
+        #if @rsort@
+            aradixsort_@suff@
+        #else
+            atimsort_@suff@            
+        #endif
     },
 #else
     {
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index d80d16529..25dc6951c 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -921,7 +921,7 @@ fail:
     return -1;
 }
 
-/* Inner loop for unravel_index */
+/* Inner loop for ravel_multi_index */
 static int
 ravel_multi_index_loop(int ravel_ndim, npy_intp *ravel_dims,
                         npy_intp *ravel_strides,
@@ -1135,67 +1135,44 @@ fail:
     return NULL;
 }
 
-/* C-order inner loop for unravel_index */
-static int
-unravel_index_loop_corder(int unravel_ndim, npy_intp *unravel_dims,
-                        npy_intp unravel_size, npy_intp count,
-                        char *indices, npy_intp indices_stride,
-                        npy_intp *coords)
-{
-    int i;
-    char invalid;
-    npy_intp val;
 
-    NPY_BEGIN_ALLOW_THREADS;
-    invalid = 0;
-    while (count--) {
-        val = *(npy_intp *)indices;
-        if (val < 0 || val >= unravel_size) {
-            invalid = 1;
-            break;
-        }
-        for (i = unravel_ndim-1; i >= 0; --i) {
-            coords[i] = val % unravel_dims[i];
-            val /= unravel_dims[i];
-        }
-        coords += unravel_ndim;
-        indices += indices_stride;
-    }
-    NPY_END_ALLOW_THREADS;
-    if (invalid) {
-        PyErr_Format(PyExc_ValueError,
-            "index %" NPY_INTP_FMT " is out of bounds for array with size "
-            "%" NPY_INTP_FMT,
-            val, unravel_size
-        );
-        return NPY_FAIL;
-    }
-    return NPY_SUCCEED;
-}
-
-/* Fortran-order inner loop for unravel_index */
+/* 
+ * Inner loop for unravel_index
+ * order must be NPY_CORDER or NPY_FORTRANORDER
+ */
 static int
-unravel_index_loop_forder(int unravel_ndim, npy_intp *unravel_dims,
-                        npy_intp unravel_size, npy_intp count,
-                        char *indices, npy_intp indices_stride,
-                        npy_intp *coords)
+unravel_index_loop(int unravel_ndim, npy_intp *unravel_dims,
+                   npy_intp unravel_size, npy_intp count,
+                   char *indices, npy_intp indices_stride,
+                   npy_intp *coords, NPY_ORDER order)
 {
-    int i;
-    char invalid;
-    npy_intp val;
+    int i, idx;
+    int idx_start = (order == NPY_CORDER) ? unravel_ndim - 1: 0;
+    int idx_step = (order == NPY_CORDER) ? -1 : 1;
+    char invalid = 0;
+    npy_intp val = 0;
 
     NPY_BEGIN_ALLOW_THREADS;
-    invalid = 0;
+    /* NPY_KEEPORDER or NPY_ANYORDER have no meaning in this setting */
+    assert(order == NPY_CORDER || order == NPY_FORTRANORDER);
     while (count--) {
         val = *(npy_intp *)indices;
         if (val < 0 || val >= unravel_size) {
             invalid = 1;
             break;
         }
+        idx = idx_start;
         for (i = 0; i < unravel_ndim; ++i) {
-            *coords++ = val % unravel_dims[i];
-            val /= unravel_dims[i];
+            /* 
+             * Using a local seems to enable single-divide optimization
+             * but only if the / precedes the %
+             */
+            npy_intp tmp = val / unravel_dims[idx];
+            coords[idx] = val % unravel_dims[idx];
+            val = tmp;
+            idx += idx_step;
         }
+        coords += unravel_ndim;
         indices += indices_stride;
     }
     NPY_END_ALLOW_THREADS;
@@ -1214,11 +1191,12 @@ unravel_index_loop_forder(int unravel_ndim, npy_intp *unravel_dims,
 NPY_NO_EXPORT PyObject *
 arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
 {
-    PyObject *indices0 = NULL, *ret_tuple = NULL;
+    PyObject *indices0 = NULL;
+    PyObject *ret_tuple = NULL;
     PyArrayObject *ret_arr = NULL;
     PyArrayObject *indices = NULL;
     PyArray_Descr *dtype = NULL;
-    PyArray_Dims dimensions={0,0};
+    PyArray_Dims dimensions = {0, 0};
     NPY_ORDER order = NPY_CORDER;
     npy_intp unravel_size;
 
@@ -1261,7 +1239,13 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
         goto fail;
     }
 
-    unravel_size = PyArray_MultiplyList(dimensions.ptr, dimensions.len);
+    unravel_size = PyArray_OverflowMultiplyList(dimensions.ptr, dimensions.len);
+    if (unravel_size == -1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "dimensions are too large; arrays and shapes with "
+                        "a total size greater than 'intp' are not supported.");
+        goto fail;
+    }
 
     indices = astype_anyint(indices0);
     if (indices == NULL) {
@@ -1315,64 +1299,35 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
         goto fail;
     }
 
-    if (order == NPY_CORDER) {
-        if (NpyIter_GetIterSize(iter) != 0) {
-            NpyIter_IterNextFunc *iternext;
-            char **dataptr;
-            npy_intp *strides;
-            npy_intp *countptr, count;
-            npy_intp *coordsptr = (npy_intp *)PyArray_DATA(ret_arr);
+    if (order != NPY_CORDER && order != NPY_FORTRANORDER) {
+        PyErr_SetString(PyExc_ValueError,
+                        "only 'C' or 'F' order is permitted");
+        goto fail;
+    }
+    if (NpyIter_GetIterSize(iter) != 0) {
+        NpyIter_IterNextFunc *iternext;
+        char **dataptr;
+        npy_intp *strides;
+        npy_intp *countptr, count;
+        npy_intp *coordsptr = (npy_intp *)PyArray_DATA(ret_arr);
 
-            iternext = NpyIter_GetIterNext(iter, NULL);
-            if (iternext == NULL) {
-                goto fail;
-            }
-            dataptr = NpyIter_GetDataPtrArray(iter);
-            strides = NpyIter_GetInnerStrideArray(iter);
-            countptr = NpyIter_GetInnerLoopSizePtr(iter);
-
-            do {
-                count = *countptr;
-                if (unravel_index_loop_corder(dimensions.len, dimensions.ptr,
-                            unravel_size, count, *dataptr, *strides,
-                            coordsptr) != NPY_SUCCEED) {
-                    goto fail;
-                }
-                coordsptr += count*dimensions.len;
-            } while(iternext(iter));
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            goto fail;
         }
-    }
-    else if (order == NPY_FORTRANORDER) {
-        if (NpyIter_GetIterSize(iter) != 0) {
-            NpyIter_IterNextFunc *iternext;
-            char **dataptr;
-            npy_intp *strides;
-            npy_intp *countptr, count;
-            npy_intp *coordsptr = (npy_intp *)PyArray_DATA(ret_arr);
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        strides = NpyIter_GetInnerStrideArray(iter);
+        countptr = NpyIter_GetInnerLoopSizePtr(iter);
 
-            iternext = NpyIter_GetIterNext(iter, NULL);
-            if (iternext == NULL) {
+        do {
+            count = *countptr;
+            if (unravel_index_loop(dimensions.len, dimensions.ptr,
+                                   unravel_size, count, *dataptr, *strides,
+                                   coordsptr, order) != NPY_SUCCEED) {
                 goto fail;
             }
-            dataptr = NpyIter_GetDataPtrArray(iter);
-            strides = NpyIter_GetInnerStrideArray(iter);
-            countptr = NpyIter_GetInnerLoopSizePtr(iter);
-
-            do {
-                count = *countptr;
-                if (unravel_index_loop_forder(dimensions.len, dimensions.ptr,
-                            unravel_size, count, *dataptr, *strides,
-                            coordsptr) != NPY_SUCCEED) {
-                    goto fail;
-                }
-                coordsptr += count*dimensions.len;
-            } while(iternext(iter));
-        }
-    }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "only 'C' or 'F' order is permitted");
-        goto fail;
+            coordsptr += count * dimensions.len;
+        } while (iternext(iter));
     }
 
 
@@ -1556,7 +1511,8 @@ pack_inner(const char *inptr,
            npy_intp in_stride,
            char *outptr,
            npy_intp n_out,
-           npy_intp out_stride)
+           npy_intp out_stride,
+           char order)
 {
     /*
      * Loop through the elements of inptr.
@@ -1576,9 +1532,13 @@ pack_inner(const char *inptr,
         vn_out -= (vn_out & 1);
         for (index = 0; index < vn_out; index += 2) {
             unsigned int r;
-            /* swap as packbits is "big endian", note x86 can load unaligned */
-            npy_uint64 a = npy_bswap8(*(npy_uint64*)inptr);
-            npy_uint64 b = npy_bswap8(*(npy_uint64*)(inptr + 8));
+            npy_uint64 a = *(npy_uint64*)inptr;
+            npy_uint64 b = *(npy_uint64*)(inptr + 8);
+            if (order == 'b') {
+                a = npy_bswap8(a);
+                b = npy_bswap8(b);
+            }
+            /* note x86 can load unaligned */
             __m128i v = _mm_set_epi64(_m_from_int64(b), _m_from_int64(a));
             /* false -> 0x00 and true -> 0xFF (there is no cmpneq) */
             v = _mm_cmpeq_epi8(v, zero);
@@ -1598,30 +1558,45 @@ pack_inner(const char *inptr,
     if (remain == 0) {                  /* assumes n_in > 0 */
         remain = 8;
     }
-    /* don't reset index to handle remainder of above block */
+    /* Don't reset index. Just handle remainder of above block */
     for (; index < n_out; index++) {
-        char build = 0;
+        unsigned char build = 0;
         int i, maxi;
         npy_intp j;
 
         maxi = (index == n_out - 1) ? remain : 8;
-        for (i = 0; i < maxi; i++) {
-            build <<= 1;
-            for (j = 0; j < element_size; j++) {
-                build |= (inptr[j] != 0);
+        if (order == 'b') {
+            for (i = 0; i < maxi; i++) {
+                build <<= 1;
+                for (j = 0; j < element_size; j++) {
+                    build |= (inptr[j] != 0);
+                }
+                inptr += in_stride;
+            }
+            if (index == n_out - 1) {
+                build <<= 8 - remain;
             }
-            inptr += in_stride;
         }
-        if (index == n_out - 1) {
-            build <<= 8 - remain;
+        else
+        {
+            for (i = 0; i < maxi; i++) {
+                build >>= 1;
+                for (j = 0; j < element_size; j++) {
+                    build |= (inptr[j] != 0) ? 128 : 0;
+                }
+                inptr += in_stride;
+            }
+            if (index == n_out - 1) {
+                build >>= 8 - remain;
+            }
         }
-        *outptr = build;
+        *outptr = (char)build;
         outptr += out_stride;
     }
 }
 
 static PyObject *
-pack_bits(PyObject *input, int axis)
+pack_bits(PyObject *input, int axis, char order)
 {
     PyArrayObject *inp;
     PyArrayObject *new = NULL;
@@ -1706,7 +1681,7 @@ pack_bits(PyObject *input, int axis)
         pack_inner(PyArray_ITER_DATA(it), PyArray_ITEMSIZE(new),
                    PyArray_DIM(new, axis), PyArray_STRIDE(new, axis),
                    PyArray_ITER_DATA(ot), PyArray_DIM(out, axis),
-                   PyArray_STRIDE(out, axis));
+                   PyArray_STRIDE(out, axis), order);
         PyArray_ITER_NEXT(it);
         PyArray_ITER_NEXT(ot);
     }
@@ -1726,10 +1701,8 @@ fail:
 }
 
 static PyObject *
-unpack_bits(PyObject *input, int axis, PyObject *count_obj)
+unpack_bits(PyObject *input, int axis, PyObject *count_obj, char order)
 {
-    static int unpack_init = 0;
-    static char unpack_lookup[256][8];
     PyArrayObject *inp;
     PyArrayObject *new = NULL;
     PyArrayObject *out = NULL;
@@ -1815,28 +1788,6 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj)
         goto fail;
     }
 
-    /* setup lookup table under GIL, big endian 0..256 as bytes */
-    if (unpack_init == 0) {
-        npy_uint64 j;
-        npy_uint64 * unpack_lookup_64 = (npy_uint64 *)unpack_lookup;
-        for (j=0; j < 256; j++) {
-            npy_uint64 v = 0;
-            v |= (npy_uint64)((j &   1) ==   1);
-            v |= (npy_uint64)((j &   2) ==   2) << 8;
-            v |= (npy_uint64)((j &   4) ==   4) << 16;
-            v |= (npy_uint64)((j &   8) ==   8) << 24;
-            v |= (npy_uint64)((j &  16) ==  16) << 32;
-            v |= (npy_uint64)((j &  32) ==  32) << 40;
-            v |= (npy_uint64)((j &  64) ==  64) << 48;
-            v |= (npy_uint64)((j & 128) == 128) << 56;
-#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
-            v = npy_bswap8(v);
-#endif
-            unpack_lookup_64[j] = v;
-        }
-        unpack_init = 1;
-    }
-
     count = PyArray_DIM(new, axis) * 8;
     if (outdims[axis] > count) {
         in_n = count / 8;
@@ -1859,45 +1810,40 @@ unpack_bits(PyObject *input, int axis, PyObject *count_obj)
         unsigned const char *inptr = PyArray_ITER_DATA(it);
         char *outptr = PyArray_ITER_DATA(ot);
 
-        if (out_stride == 1) {
-            /* for unity stride we can just copy out of the lookup table */
+        if (order == 'b') {
             for (index = 0; index < in_n; index++) {
-                memcpy(outptr, unpack_lookup[*inptr], 8);
-                outptr += 8;
+                for (i = 0; i < 8; i++) {
+                    *outptr = ((*inptr & (128 >> i)) != 0);
+                    outptr += out_stride;
+                }
                 inptr += in_stride;
             }
             /* Clean up the tail portion */
-            if (in_tail) {
-                memcpy(outptr, unpack_lookup[*inptr], in_tail);
-            }
-            /* Add padding */
-            else if (out_pad) {
-                memset(outptr, 0, out_pad);
+            for (i = 0; i < in_tail; i++) {
+                *outptr = ((*inptr & (128 >> i)) != 0);
+                outptr += out_stride;
             }
         }
         else {
-            unsigned char mask;
-
             for (index = 0; index < in_n; index++) {
-                for (mask = 128; mask; mask >>= 1) {
-                    *outptr = ((mask & (*inptr)) != 0);
+                for (i = 0; i < 8; i++) {
+                    *outptr = ((*inptr & (1 << i)) != 0);
                     outptr += out_stride;
                 }
                 inptr += in_stride;
             }
             /* Clean up the tail portion */
-            mask = 128;
             for (i = 0; i < in_tail; i++) {
-                *outptr = ((mask & (*inptr)) != 0);
-                outptr += out_stride;
-                mask >>= 1;
-            }
-            /* Add padding */
-            for (index = 0; index < out_pad; index++) {
-                *outptr = 0;
+                *outptr = ((*inptr & (1 << i)) != 0);
                 outptr += out_stride;
             }
         }
+        /* Add padding */
+        for (index = 0; index < out_pad; index++) {
+            *outptr = 0;
+            outptr += out_stride;
+        }
+
         PyArray_ITER_NEXT(it);
         PyArray_ITER_NEXT(ot);
     }
@@ -1921,13 +1867,26 @@ io_pack(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
 {
     PyObject *obj;
     int axis = NPY_MAXDIMS;
-    static char *kwlist[] = {"a", "axis", NULL};
+    static char *kwlist[] = {"in", "axis", "bitorder", NULL};
+    char c = 'b';
+    const char * order_str = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&:pack" , kwlist,
-                                     &obj, PyArray_AxisConverter, &axis)) {
+    if (!PyArg_ParseTupleAndKeywords( args, kwds, "O|O&s:pack" , kwlist,
+                &obj, PyArray_AxisConverter, &axis, &order_str)) {
         return NULL;
     }
-    return pack_bits(obj, axis);
+    if (order_str != NULL) {
+        if (strncmp(order_str, "little", 6) == 0)
+            c = 'l';
+        else if (strncmp(order_str, "big", 3) == 0)
+            c = 'b';
+        else {
+            PyErr_SetString(PyExc_ValueError,
+                    "'order' must be either 'little' or 'big'");
+            return NULL;
+        }
+    }
+    return pack_bits(obj, axis, c);
 }
 
 
@@ -1937,12 +1896,20 @@ io_unpack(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
     PyObject *obj;
     int axis = NPY_MAXDIMS;
     PyObject *count = Py_None;
-    static char *kwlist[] = {"a", "axis", "count", NULL};
+    static char *kwlist[] = {"in", "axis", "count", "bitorder", NULL};
+    const char * c = NULL;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O:unpack" , kwlist,
-                                     &obj, PyArray_AxisConverter, &axis,
-                                     &count)) {
+    if (!PyArg_ParseTupleAndKeywords( args, kwds, "O|O&Os:unpack" , kwlist,
+                &obj, PyArray_AxisConverter, &axis, &count, &c)) {
+        return NULL;
+    }
+    if (c == NULL) {
+        c = "b";
+    }
+    if (c[0] != 'l' && c[0] != 'b') {
+        PyErr_SetString(PyExc_ValueError,
+                    "'order' must begin with 'l' or 'b'");
         return NULL;
     }
-    return unpack_bits(obj, axis, count);
+    return unpack_bits(obj, axis, count, c[0]);
 }
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index fa8de8b37..a370874a6 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -116,8 +116,8 @@ PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq)
         return NPY_FAIL;
     }
     if (len > NPY_MAXDIMS) {
-        PyErr_Format(PyExc_ValueError, "sequence too large; "
-                     "cannot be greater than %d", NPY_MAXDIMS);
+        PyErr_Format(PyExc_ValueError, "maximum supported dimension for an ndarray is %d"
+                     ", found %d", NPY_MAXDIMS, len);
         return NPY_FAIL;
     }
     if (len > 0) {
@@ -393,6 +393,11 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind)
     char *str;
     PyObject *tmp = NULL;
 
+    if (obj == Py_None) {
+        *sortkind = NPY_QUICKSORT;
+        return NPY_SUCCEED;
+    }
+
     if (PyUnicode_Check(obj)) {
         obj = tmp = PyUnicode_AsASCIIString(obj);
         if (obj == NULL) {
@@ -401,6 +406,8 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind)
     }
 
     *sortkind = NPY_QUICKSORT;
+        
+
     str = PyBytes_AsString(obj);
     if (!str) {
         Py_XDECREF(tmp);
@@ -424,7 +431,7 @@ PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind)
          * That maintains backwards compatibility while
          * allowing other types of stable sorts to be used.
          */
-        *sortkind = NPY_STABLESORT;
+        *sortkind = NPY_MERGESORT;
     }
     else if (str[0] == 's' || str[0] == 'S') {
         /*
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 42cd31069..dc42d1cfe 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -1072,8 +1072,30 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
             data = npy_alloc_cache(nbytes);
         }
         if (data == NULL) {
-            PyErr_NoMemory();
-            goto fail;
+            static PyObject *exc_type = NULL;
+            
+            npy_cache_import(
+                "numpy.core._exceptions", "_ArrayMemoryError",
+                &exc_type);
+            if (exc_type == NULL) {
+                return NULL;
+            }
+            
+            PyObject *shape = PyArray_IntTupleFromIntp(fa->nd,fa->dimensions);
+            if (shape == NULL) {
+                return NULL;
+            }
+            
+            /* produce an error object */
+            PyObject *exc_value = PyTuple_Pack(2, shape, descr);
+            Py_DECREF(shape);
+            if (exc_value == NULL){
+                return NULL;
+            }
+            PyErr_SetObject(exc_type, exc_value);
+            Py_DECREF(exc_value);
+            return NULL;
+
         }
         fa->flags |= NPY_ARRAY_OWNDATA;
 
@@ -1183,9 +1205,9 @@ PyArray_NewFromDescrAndBase(
                                     flags, obj, base, 0, 0);
 }
 
-/*NUMPY_API
+/*
  * Creates a new array with the same shape as the provided one,
- * with possible memory layout order and data type changes.
+ * with possible memory layout order, data type and shape changes.
  *
  * prototype - The array the new one should be like.
  * order     - NPY_CORDER - C-contiguous result.
@@ -1193,6 +1215,8 @@ PyArray_NewFromDescrAndBase(
  *             NPY_ANYORDER - Fortran if prototype is Fortran, C otherwise.
  *             NPY_KEEPORDER - Keeps the axis ordering of prototype.
  * dtype     - If not NULL, overrides the data type of the result.
+ * ndim      - If not 0 and dims not NULL, overrides the shape of the result.
+ * dims      - If not NULL and ndim not 0, overrides the shape of the result.
  * subok     - If 1, use the prototype's array subtype, otherwise
  *             always create a base-class array.
  *
@@ -1200,11 +1224,18 @@ PyArray_NewFromDescrAndBase(
  * dtype->subarray is true, dtype will be decrefed.
  */
 NPY_NO_EXPORT PyObject *
-PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
-                     PyArray_Descr *dtype, int subok)
+PyArray_NewLikeArrayWithShape(PyArrayObject *prototype, NPY_ORDER order,
+                              PyArray_Descr *dtype, int ndim, npy_intp *dims, int subok)
 {
     PyObject *ret = NULL;
-    int ndim = PyArray_NDIM(prototype);
+
+    if (dims == NULL) {
+        ndim = PyArray_NDIM(prototype);
+        dims = PyArray_DIMS(prototype);
+    }
+    else if (order == NPY_KEEPORDER && (ndim != PyArray_NDIM(prototype))) {
+        order = NPY_CORDER;
+    }
 
     /* If no override data type, use the one from the prototype */
     if (dtype == NULL) {
@@ -1237,7 +1268,7 @@ PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
         ret = PyArray_NewFromDescr(subok ? Py_TYPE(prototype) : &PyArray_Type,
                                         dtype,
                                         ndim,
-                                        PyArray_DIMS(prototype),
+                                        dims,
                                         NULL,
                                         NULL,
                                         order,
@@ -1246,11 +1277,10 @@ PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
     /* KEEPORDER needs some analysis of the strides */
     else {
         npy_intp strides[NPY_MAXDIMS], stride;
-        npy_intp *shape = PyArray_DIMS(prototype);
         npy_stride_sort_item strideperm[NPY_MAXDIMS];
         int idim;
 
-        PyArray_CreateSortedStridePerm(PyArray_NDIM(prototype),
+        PyArray_CreateSortedStridePerm(ndim,
                                         PyArray_STRIDES(prototype),
                                         strideperm);
 
@@ -1259,14 +1289,14 @@ PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
         for (idim = ndim-1; idim >= 0; --idim) {
             npy_intp i_perm = strideperm[idim].perm;
             strides[i_perm] = stride;
-            stride *= shape[i_perm];
+            stride *= dims[i_perm];
         }
 
         /* Finally, allocate the array */
         ret = PyArray_NewFromDescr(subok ? Py_TYPE(prototype) : &PyArray_Type,
                                         dtype,
                                         ndim,
-                                        shape,
+                                        dims,
                                         strides,
                                         NULL,
                                         0,
@@ -1277,6 +1307,29 @@ PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
 }
 
 /*NUMPY_API
+ * Creates a new array with the same shape as the provided one,
+ * with possible memory layout order and data type changes.
+ *
+ * prototype - The array the new one should be like.
+ * order     - NPY_CORDER - C-contiguous result.
+ *             NPY_FORTRANORDER - Fortran-contiguous result.
+ *             NPY_ANYORDER - Fortran if prototype is Fortran, C otherwise.
+ *             NPY_KEEPORDER - Keeps the axis ordering of prototype.
+ * dtype     - If not NULL, overrides the data type of the result.
+ * subok     - If 1, use the prototype's array subtype, otherwise
+ *             always create a base-class array.
+ *
+ * NOTE: If dtype is not NULL, steals the dtype reference.  On failure or when
+ * dtype->subarray is true, dtype will be decrefed.
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER order,
+                     PyArray_Descr *dtype, int subok)
+{
+    return PyArray_NewLikeArrayWithShape(prototype, order, dtype, 0, NULL, subok);
+}
+
+/*NUMPY_API
  * Generic new array creation routine.
  */
 NPY_NO_EXPORT PyObject *
diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h
index e9a2532da..4932130a7 100644
--- a/numpy/core/src/multiarray/ctors.h
+++ b/numpy/core/src/multiarray/ctors.h
@@ -18,6 +18,10 @@ PyArray_NewFromDescr_int(PyTypeObject *subtype, PyArray_Descr *descr, int nd,
                          int flags, PyObject *obj, PyObject *base, int zeroed,
                          int allow_emptystring);
 
+NPY_NO_EXPORT PyObject *
+PyArray_NewLikeArrayWithShape(PyArrayObject *prototype, NPY_ORDER order,
+                              PyArray_Descr *dtype, int ndim, npy_intp *dims, int subok);
+
 NPY_NO_EXPORT PyObject *PyArray_New(PyTypeObject *, int nd, npy_intp *,
                              int, npy_intp *, void *, int, int, PyObject *);
 
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 6065c1df4..f4d2513ca 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -1126,7 +1126,7 @@ fail:
 NPY_NO_EXPORT int
 PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
 {
-    PyArray_SortFunc *sort;
+    PyArray_SortFunc *sort = NULL;
     int n = PyArray_NDIM(op);
 
     if (check_and_adjust_axis(&axis, n) < 0) {
@@ -1143,6 +1143,7 @@ PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which)
     }
 
     sort = PyArray_DESCR(op)->f->sort[which];
+
     if (sort == NULL) {
         if (PyArray_DESCR(op)->f->compare) {
             switch (which) {
@@ -1284,16 +1285,11 @@ NPY_NO_EXPORT PyObject *
 PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which)
 {
     PyArrayObject *op2;
-    PyArray_ArgSortFunc *argsort;
+    PyArray_ArgSortFunc *argsort = NULL;
     PyObject *ret;
 
-    if (which < 0 || which >= NPY_NSORTS) {
-        PyErr_SetString(PyExc_ValueError,
-                        "not a valid sort kind");
-        return NULL;
-    }
-
     argsort = PyArray_DESCR(op)->f->argsort[which];
+
     if (argsort == NULL) {
         if (PyArray_DESCR(op)->f->compare) {
             switch (which) {
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index 9fcdc91b2..73c15b099 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -1388,7 +1388,7 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
     PyArrayMultiIterObject *multi;
     PyObject *arr;
 
-    if (kwds != NULL) {
+    if (kwds != NULL && PyDict_Size(kwds) > 0) {
         PyErr_SetString(PyExc_ValueError,
                         "keyword arguments not accepted.");
         return NULL;
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 06e15225d..e15ab5172 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -1758,7 +1758,7 @@ static PyObject *
 array_copyto(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"dst","src","casting","where",NULL};
+    static char *kwlist[] = {"dst", "src", "casting", "where", NULL};
     PyObject *wheremask_in = NULL;
     PyArrayObject *dst = NULL, *src = NULL, *wheremask = NULL;
     NPY_CASTING casting = NPY_SAME_KIND_CASTING;
@@ -1803,7 +1803,7 @@ static PyObject *
 array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"shape","dtype","order",NULL};
+    static char *kwlist[] = {"shape", "dtype", "order", NULL};
     PyArray_Descr *typecode = NULL;
     PyArray_Dims shape = {NULL, 0};
     NPY_ORDER order = NPY_CORDER;
@@ -1846,23 +1846,28 @@ static PyObject *
 array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"prototype","dtype","order","subok",NULL};
+    static char *kwlist[] = {"prototype", "dtype", "order", "subok", "shape", NULL};
     PyArrayObject *prototype = NULL;
     PyArray_Descr *dtype = NULL;
     NPY_ORDER order = NPY_KEEPORDER;
     PyArrayObject *ret = NULL;
     int subok = 1;
+    PyArray_Dims shape = {NULL, 0};
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&i:empty_like", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&iO&:empty_like", kwlist,
                 &PyArray_Converter, &prototype,
                 &PyArray_DescrConverter2, &dtype,
                 &PyArray_OrderConverter, &order,
-                &subok)) {
+                &subok,
+                &PyArray_IntpConverter, &shape)) {
         goto fail;
     }
     /* steals the reference to dtype if it's not NULL */
-    ret = (PyArrayObject *)PyArray_NewLikeArray(prototype,
-                                            order, dtype, subok);
+    ret = (PyArrayObject *)PyArray_NewLikeArrayWithShape(prototype, order, dtype,
+                                                         shape.len, shape.ptr, subok);
+    if (!ret) {
+        goto fail;
+    }
     Py_DECREF(prototype);
 
     return (PyObject *)ret;
@@ -1881,7 +1886,7 @@ static PyObject *
 array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
 
-    static char *kwlist[] = {"dtype","obj", NULL};
+    static char *kwlist[] = {"dtype", "obj", NULL};
     PyArray_Descr *typecode;
     PyObject *obj = NULL, *tmpobj = NULL;
     int alloc = 0;
@@ -1957,7 +1962,7 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 static PyObject *
 array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 {
-    static char *kwlist[] = {"shape","dtype","order",NULL};
+    static char *kwlist[] = {"shape", "dtype", "order", NULL};
     PyArray_Descr *typecode = NULL;
     PyArray_Dims shape = {NULL, 0};
     NPY_ORDER order = NPY_CORDER;
diff --git a/numpy/core/src/multiarray/typeinfo.c b/numpy/core/src/multiarray/typeinfo.c
index bc4147841..14c4f27cb 100644
--- a/numpy/core/src/multiarray/typeinfo.c
+++ b/numpy/core/src/multiarray/typeinfo.c
@@ -104,8 +104,10 @@ PyArray_typeinforanged(
     return entry;
 }
 
-/* Backport, only needed here */
-#if PY_VERSION_HEX < 0x03040000
+/* Python version only needed for backport to 2.7 */
+#if (PY_VERSION_HEX < 0x03040000) \
+    || (defined(PYPY_VERSION_NUM) && (PYPY_VERSION_NUM < 0x07020000))
+
     static int
     PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc) {
         PyStructSequence_InitType(type, desc);
diff --git a/numpy/core/src/npysort/radixsort.c.src b/numpy/core/src/npysort/radixsort.c.src
new file mode 100644
index 000000000..c1435bd96
--- /dev/null
+++ b/numpy/core/src/npysort/radixsort.c.src
@@ -0,0 +1,229 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+#include <stdlib.h>
+
+/*
+ *****************************************************************************
+ **                            INTEGER SORTS                                **
+ *****************************************************************************
+ */
+
+
+/**begin repeat
+ *
+ * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
+ *         LONGLONG, ULONGLONG#
+ * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+ *         longlong, ulonglong#
+ * #type = npy_ubyte, npy_ubyte, npy_ubyte, npy_ushort, npy_ushort, npy_uint,
+ *         npy_uint, npy_ulong, npy_ulong, npy_ulonglong, npy_ulonglong#
+ * #sign = 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
+ * #floating = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
+ */
+
+// Reference: https://github.com/eloj/radix-sorting#-key-derivation
+#if @sign@
+    // Floating-point is currently disabled.
+    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
+    // Basic sorting tests succeed but others relying on sort fail.
+    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
+    #if @floating@
+        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
+        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(@type@) * 8 - 1)) | ((@type@)1 << (sizeof(@type@) * 8 - 1))))
+    #else
+        // For signed ints, we flip the sign bit so the negatives are below the positives.
+        #define KEY_OF(x) ((x) ^ ((@type@)1 << (sizeof(@type@) * 8 - 1)))
+    #endif
+#else
+    // For unsigned ints, the key is as-is
+    #define KEY_OF(x) (x)
+#endif
+
+static inline npy_ubyte
+nth_byte_@suff@(@type@ key, npy_intp l) {
+    return (key >> (l << 3)) & 0xFF;
+}
+
+@type@*
+radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
+{
+    npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
+    npy_intp i, l;
+    @type@ key0 = KEY_OF(arr[0]);
+    npy_intp ncols = 0;
+    npy_ubyte cols[sizeof(@type@)];
+
+    for (i = 0; i < num; i++) {
+        @type@ k = KEY_OF(arr[i]);
+
+        for (l = 0; l < sizeof(@type@); l++) {
+            cnt[l][nth_byte_@suff@(k, l)]++;
+        }
+    }
+
+    for (l = 0; l < sizeof(@type@); l++) {
+	    if (cnt[l][nth_byte_@suff@(key0, l)] != num) {
+	        cols[ncols++] = l;
+        }
+    }
+
+    for (l = 0; l < ncols; l++) {
+        npy_intp a = 0;
+        for (i = 0; i < 256; i++) {
+            npy_intp b = cnt[cols[l]][i];
+            cnt[cols[l]][i] = a;
+            a += b;
+        }
+    }
+
+    for (l = 0; l < ncols; l++) {
+        @type@* temp;
+        for (i = 0; i < num; i++) {
+            @type@ k = KEY_OF(arr[i]);
+            npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++;
+            aux[dst] = arr[i];
+        }
+
+        temp = aux;
+        aux = arr;
+        arr = temp;
+    }
+
+    return arr;
+}
+
+int
+radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
+{
+    void *sorted;
+    @type@ *aux;
+    @type@ *arr = start;
+    @type@ k1, k2;
+    npy_bool all_sorted = 1;
+
+    if (num < 2) {
+        return 0;
+    }
+
+    k1 = KEY_OF(arr[0]);
+    for (npy_intp i = 1; i < num; i++) {
+        k2 = KEY_OF(arr[i]);
+        if (k1 > k2) {
+            all_sorted = 0;
+            break;
+        }
+        k1 = k2;
+    }
+
+    if (all_sorted) {
+        return 0;
+    }
+
+    aux = malloc(num * sizeof(@type@));
+    if (aux == NULL) {
+        return -NPY_ENOMEM;
+    }
+
+    sorted = radixsort0_@suff@(start, aux, num);
+    if (sorted != start) {
+        memcpy(start, sorted, num * sizeof(@type@));
+    }
+
+    free(aux);
+    return 0;
+}
+
+npy_intp*
+aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
+{
+    npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
+    npy_intp i, l;
+    @type@ key0 = KEY_OF(arr[0]);
+    npy_intp ncols = 0;
+    npy_ubyte cols[sizeof(@type@)];
+
+    for (i = 0; i < num; i++) {
+        @type@ k = KEY_OF(arr[i]);
+
+        for (l = 0; l < sizeof(@type@); l++) {
+            cnt[l][nth_byte_@suff@(k, l)]++;
+        }
+    }
+
+    for (l = 0; l < sizeof(@type@); l++) {
+        if (cnt[l][nth_byte_@suff@(key0, l)] != num) {
+            cols[ncols++] = l;
+        }
+    }
+
+    for (l = 0; l < ncols; l++) {
+        npy_intp a = 0;
+        for (i = 0; i < 256; i++) {
+            npy_intp b = cnt[cols[l]][i];
+            cnt[cols[l]][i] = a;
+            a += b;
+        }
+    }
+
+    for (l = 0; l < ncols; l++) {
+        npy_intp* temp;
+        for (i = 0; i < num; i++) {
+            @type@ k = KEY_OF(arr[tosort[i]]);
+            npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++;
+            aux[dst] = tosort[i];
+        }
+
+        temp = aux;
+        aux = tosort;
+        tosort = temp;
+    }
+
+    return tosort;
+}
+
+int
+aradixsort_@suff@(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
+{
+    npy_intp *sorted;
+    npy_intp *aux;
+    @type@ *arr = start;
+    @type@ k1, k2;
+    npy_bool all_sorted = 1;
+
+    if (num < 2) {
+        return 0;
+    }
+
+    k1 = KEY_OF(arr[0]);
+    for (npy_intp i = 1; i < num; i++) {
+        k2 = KEY_OF(arr[i]);
+        if (k1 > k2) {
+            all_sorted = 0;
+            break;
+        }
+        k1 = k2;
+    }
+
+    if (all_sorted) {
+        return 0;
+    }
+
+    aux = malloc(num * sizeof(npy_intp));
+    if (aux == NULL) {
+        return -NPY_ENOMEM;
+    }
+
+    sorted = aradixsort0_@suff@(start, aux, tosort, num);
+    if (sorted != tosort) {
+        memcpy(tosort, sorted, num * sizeof(npy_intp));
+    }
+
+    free(aux);
+    return 0;
+}
+
+#undef KEY_OF
+
+/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 9a96ec3f4..b94a5a0f7 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -40,7 +40,6 @@ abs_ptrdiff(char *a, char *b)
     return (a > b) ? (a - b) : (b - a);
 }
 
-
 /*
  * stride is equal to element size and input and destination are equal or
  * don't overlap within one register. The check of the steps against
@@ -133,7 +132,7 @@ abs_ptrdiff(char *a, char *b)
  */
 
 static void
-@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_int n);
+@ISA@_@func@_FLOAT(npy_float *, npy_float *, const npy_intp n);
 
 /**end repeat1**/
 #endif
@@ -1150,7 +1149,10 @@ avx2_get_exponent(__m256 x)
 
     __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
     __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
-    __m256 temp = _mm256_mul_ps(x, two_power_100);
+    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);
+
+    __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
+    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
     x = _mm256_blendv_ps(x, temp, denormal_mask);
 
     __m256 exp = _mm256_cvtepi32_ps(
@@ -1174,7 +1176,10 @@ avx2_get_mantissa(__m256 x)
 
     __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
     __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
-    __m256 temp = _mm256_mul_ps(x, two_power_100);
+    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);
+
+    __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
+    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
     x = _mm256_blendv_ps(x, temp, denormal_mask);
 
     __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
@@ -1261,7 +1266,9 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
  * #BYTES = 32, 64#
  * #mask = __m256, __mmask16#
  * #vsub = , _mask#
+ * #or_masks =_mm256_or_ps, _mm512_kor#
  * #and_masks =_mm256_and_ps, _mm512_kand#
+ * #xor_masks =_mm256_xor_ps, _mm512_kxor#
  * #fmadd = avx2_fmadd,_mm512_fmadd_ps#
  * #mask_to_int = _mm256_movemask_ps, #
  * #full_mask= 0xFF, 0xFFFF#
@@ -1287,7 +1294,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
 
 #if defined HAVE_ATTRIBUTE_TARGET_@ISA@_WITH_INTRINSICS
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size)
+@ISA@_exp_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
 {
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
     npy_float xmax = 88.72283935546875f;
@@ -1312,9 +1319,10 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ poly, num_poly, denom_poly, quadrant;
     @vtype@i exponent;
 
-    @mask@ xmax_mask, xmin_mask;
+    @mask@ xmax_mask, xmin_mask, nan_mask, inf_mask;
+    @mask@ overflow_mask = @isa@_get_partial_load_mask(0, num_lanes);
     @mask@ load_mask = @isa@_get_full_load_mask();
-    npy_int num_remaining_elements = array_size;
+    npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
@@ -1322,11 +1330,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
             load_mask = @isa@_get_partial_load_mask(num_remaining_elements,
                                                          num_lanes);
         @vtype@ x  = @isa@_masked_load(load_mask, ip);
+
         xmax_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmax), _CMP_GE_OQ);
         xmin_mask = _mm@vsize@_cmp_ps@vsub@(x, _mm@vsize@_set1_ps(xmin), _CMP_LE_OQ);
+        nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
+        inf_mask = _mm@vsize@_cmp_ps@vsub@(x, inf, _CMP_EQ_OQ);
+        overflow_mask = @or_masks@(overflow_mask,
+                                    @xor_masks@(xmax_mask, inf_mask));
 
-        x = @isa@_set_masked_lanes(x, zeros_f,
-                                   @and_masks@(xmax_mask,xmin_mask));
+        x = @isa@_set_masked_lanes(x, zeros_f, @or_masks@(
+                                    @or_masks@(nan_mask, xmin_mask), xmax_mask));
 
         quadrant = _mm@vsize@_mul_ps(x, log2e);
 
@@ -1335,8 +1348,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         quadrant = _mm@vsize@_sub_ps(quadrant, cvt_magic);
 
         /* Cody-Waite's range reduction algorithm */
-        x = @isa@_range_reduction(x, quadrant,
-                                  codyw_c1, codyw_c2, zeros_f);
+        x = @isa@_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);
 
         num_poly = @fmadd@(exp_p5, x, exp_p4);
         num_poly = @fmadd@(num_poly, x, exp_p3);
@@ -1357,7 +1369,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                     _mm@vsize@_add_epi32(
                         _mm@vsize@_castps_si@vsize@(poly), exponent));
 
-        /* elem > xmax; return inf, elem < xmin; return 0.0f */
+        /*
+         * elem > xmax; return inf
+         * elem < xmin; return 0.0f
+         * elem = +/- nan, return nan
+         */
+        poly = @isa@_set_masked_lanes(poly, _mm@vsize@_set1_ps(NPY_NANF), nan_mask);
         poly = @isa@_set_masked_lanes(poly, inf, xmax_mask);
         poly = @isa@_set_masked_lanes(poly, zeros_f, xmin_mask);
 
@@ -1367,6 +1384,9 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         op += num_lanes;
         num_remaining_elements -= num_lanes;
     }
+
+    if (@mask_to_int@(overflow_mask))
+        npy_set_floatstatus_overflow();
 }
 
 /*
@@ -1384,7 +1404,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
  */
 
 static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
-@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_int array_size)
+@ISA@_log_FLOAT(npy_float * op, npy_float * ip, const npy_intp array_size)
 {
     const npy_int num_lanes = @BYTES@/sizeof(npy_float);
 
@@ -1402,15 +1422,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @vtype@ log_q4 = _mm@vsize@_set1_ps(NPY_COEFF_Q4_LOGf);
     @vtype@ log_q5 = _mm@vsize@_set1_ps(NPY_COEFF_Q5_LOGf);
     @vtype@ loge2 = _mm@vsize@_set1_ps(NPY_LOGE2f);
-    @vtype@ neg_nan = _mm@vsize@_set1_ps(-NPY_NANF);
+    @vtype@ nan = _mm@vsize@_set1_ps(NPY_NANF);
     @vtype@ neg_inf = _mm@vsize@_set1_ps(-NPY_INFINITYF);
+    @vtype@ inf = _mm@vsize@_set1_ps(NPY_INFINITYF);
     @vtype@ zeros_f = _mm@vsize@_set1_ps(0.0f);
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
     @vtype@ poly, num_poly, denom_poly, exponent;
 
-    @mask@ inf_nan_mask, sqrt2_mask, zero_mask, negx_mask;
+    @mask@ inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
+    @mask@ invalid_mask = @isa@_get_partial_load_mask(0, num_lanes);
+    @mask@ divide_by_zero_mask = invalid_mask;
     @mask@ load_mask = @isa@_get_full_load_mask();
-    npy_int num_remaining_elements = array_size;
+    npy_intp num_remaining_elements = array_size;
 
     while (num_remaining_elements > 0) {
 
@@ -1421,7 +1444,11 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
 
         negx_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_LT_OQ);
         zero_mask = _mm@vsize@_cmp_ps@vsub@(x_in, zeros_f, _CMP_EQ_OQ);
-        inf_nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, _mm@vsize@_set1_ps(FLT_MAX), _CMP_GT_OQ);
+        inf_mask = _mm@vsize@_cmp_ps@vsub@(x_in, inf, _CMP_EQ_OQ);
+        nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
+        divide_by_zero_mask = @or_masks@(divide_by_zero_mask,
+                                        @and_masks@(zero_mask, load_mask));
+        invalid_mask = @or_masks@(invalid_mask, negx_mask);
 
         @vtype@ x = @isa@_set_masked_lanes(x_in, zeros_f, negx_mask);
 
@@ -1453,13 +1480,13 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         poly = @fmadd@(exponent, loge2, poly);
 
         /*
-         * x < 0.0f; return -NAN
+         * x < 0.0f; return NAN
+         * x = +/- NAN; return NAN
          * x = 0.0f; return -INF
-         * x > FLT_MAX; return x
          */
-        poly = @isa@_set_masked_lanes(poly, neg_nan, negx_mask);
+        poly = @isa@_set_masked_lanes(poly, nan, @or_masks@(negx_mask, nan_mask));
         poly = @isa@_set_masked_lanes(poly, neg_inf, zero_mask);
-        poly = @isa@_set_masked_lanes(poly, x_in, inf_nan_mask);
+        poly = @isa@_set_masked_lanes(poly, inf, inf_mask);
 
         @masked_store@(op, @cvtps_epi32@(load_mask), poly);
 
@@ -1467,6 +1494,11 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
         op += num_lanes;
         num_remaining_elements -= num_lanes;
     }
+
+    if (@mask_to_int@(invalid_mask))
+        npy_set_floatstatus_invalid();
+    if (@mask_to_int@(divide_by_zero_mask))
+        npy_set_floatstatus_divbyzero();
 }
 #endif
 /**end repeat**/
diff --git a/numpy/core/tests/test_longdouble.py b/numpy/core/tests/test_longdouble.py
index cf50d5d5c..ee4197f8f 100644
--- a/numpy/core/tests/test_longdouble.py
+++ b/numpy/core/tests/test_longdouble.py
@@ -1,5 +1,6 @@
 from __future__ import division, absolute_import, print_function
 
+import warnings
 import pytest
 
 import numpy as np
@@ -205,3 +206,28 @@ class TestCommaDecimalPointLocale(CommaDecimalPointLocale):
     def test_fromstring_foreign_value(self):
         b = np.fromstring("1,234", dtype=np.longdouble, sep=" ")
         assert_array_equal(b[0], 1)
+
+@pytest.mark.parametrize("int_val", [
+    # cases discussed in gh-10723
+    # and gh-9968
+    2 ** 1024, 0])
+def test_longdouble_from_int(int_val):
+    # for issue gh-9968
+    str_val = str(int_val)
+    # we'll expect a RuntimeWarning on platforms
+    # with np.longdouble equivalent to np.double
+    # for large integer input
+    with warnings.catch_warnings(record=True) as w:
+        warnings.filterwarnings('always', '', RuntimeWarning)
+        # can be inf==inf on some platforms
+        assert np.longdouble(int_val) == np.longdouble(str_val)
+        # we can't directly compare the int and
+        # max longdouble value on all platforms
+        if np.allclose(np.finfo(np.longdouble).max,
+                       np.finfo(np.double).max) and w:
+            assert w[0].category is RuntimeWarning
+
+@pytest.mark.parametrize("bool_val", [
+    True, False])
+def test_longdouble_from_bool(bool_val):
+    assert np.longdouble(bool_val) == np.longdouble(int(bool_val))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b29daa675..d22f16099 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1616,16 +1616,31 @@ class TestMethods(object):
         # of sorted items must be greater than ~50 to check the actual
         # algorithm because quick and merge sort fall over to insertion
         # sort for small arrays.
-        a = np.arange(101)
-        b = a[::-1].copy()
-        for kind in self.sort_kinds:
-            msg = "scalar sort, kind=%s" % kind
-            c = a.copy()
-            c.sort(kind=kind)
-            assert_equal(c, a, msg)
-            c = b.copy()
-            c.sort(kind=kind)
-            assert_equal(c, a, msg)
+        # Test unsigned dtypes and nonnegative numbers
+        for dtype in [np.uint8, np.uint16, np.uint32, np.uint64, np.float16, np.float32, np.float64, np.longdouble]:
+            a = np.arange(101, dtype=dtype)
+            b = a[::-1].copy()
+            for kind in self.sort_kinds:
+                msg = "scalar sort, kind=%s, dtype=%s" % (kind, dtype)
+                c = a.copy()
+                c.sort(kind=kind)
+                assert_equal(c, a, msg)
+                c = b.copy()
+                c.sort(kind=kind)
+                assert_equal(c, a, msg)
+
+        # Test signed dtypes and negative numbers as well
+        for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64, np.longdouble]:
+            a = np.arange(-50, 51, dtype=dtype)
+            b = a[::-1].copy()
+            for kind in self.sort_kinds:
+                msg = "scalar sort, kind=%s, dtype=%s" % (kind, dtype)
+                c = a.copy()
+                c.sort(kind=kind)
+                assert_equal(c, a, msg)
+                c = b.copy()
+                c.sort(kind=kind)
+                assert_equal(c, a, msg)
 
         # test complex sorts. These use the same code as the scalars
         # but the compare function differs.
@@ -1881,12 +1896,14 @@ class TestMethods(object):
         # of sorted items must be greater than ~50 to check the actual
         # algorithm because quick and merge sort fall over to insertion
         # sort for small arrays.
-        a = np.arange(101)
-        b = a[::-1].copy()
-        for kind in self.sort_kinds:
-            msg = "scalar argsort, kind=%s" % kind
-            assert_equal(a.copy().argsort(kind=kind), a, msg)
-            assert_equal(b.copy().argsort(kind=kind), b, msg)
+
+        for dtype in [np.int32, np.uint32, np.float32]:
+            a = np.arange(101, dtype=dtype)
+            b = a[::-1].copy()
+            for kind in self.sort_kinds:
+                msg = "scalar argsort, kind=%s, dtype=%s" % (kind, dtype)
+                assert_equal(a.copy().argsort(kind=kind), a, msg)
+                assert_equal(b.copy().argsort(kind=kind), b, msg)
 
         # test complex argsorts. These use the same code as the scalars
         # but the compare function differs.
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 1822a7adf..406110fa7 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -2157,6 +2157,7 @@ class TestLikeFuncs(object):
                 (np.arange(24).reshape(2, 3, 4).swapaxes(0, 1), None),
                 (np.arange(24).reshape(4, 3, 2).swapaxes(0, 1), '?'),
                      ]
+        self.shapes = [(5,), (5,6,), (5,6,7,)]
 
     def compare_array_value(self, dz, value, fill_value):
         if value is not None:
@@ -2222,6 +2223,34 @@ class TestLikeFuncs(object):
                 assert_equal(dz.dtype, np.dtype(dtype))
             self.compare_array_value(dz, value, fill_value)
 
+            # Test the 'shape' parameter
+            for s in self.shapes:
+                for o in 'CFA':
+                    sz = like_function(d, dtype=dtype, shape=s, order=o,
+                                       **fill_kwarg)
+                    assert_equal(sz.shape, s)
+                    if dtype is None:
+                        assert_equal(sz.dtype, d.dtype)
+                    else:
+                        assert_equal(sz.dtype, np.dtype(dtype))
+                    if o == 'C' or (o == 'A' and d.flags.c_contiguous):
+                        assert_(sz.flags.c_contiguous)
+                    elif o == 'F' or (o == 'A' and d.flags.f_contiguous):
+                        assert_(sz.flags.f_contiguous)
+                    self.compare_array_value(sz, value, fill_value)
+
+                if (d.ndim != len(s)):
+                    assert_equal(np.argsort(like_function(d, dtype=dtype,
+                                                          shape=s, order='K',
+                                                          **fill_kwarg).strides),
+                                 np.argsort(np.empty(s, dtype=dtype,
+                                                     order='C').strides))
+                else:
+                    assert_equal(np.argsort(like_function(d, dtype=dtype,
+                                                          shape=s, order='K',
+                                                          **fill_kwarg).strides),
+                                 np.argsort(d.strides))
+
         # Test the 'subok' parameter
         class MyNDArray(np.ndarray):
             pass
@@ -2737,6 +2766,18 @@ class TestBroadcast(object):
                 mit = np.broadcast(*arrs)
                 assert_equal(mit.numiter, j)
 
+    def test_broadcast_error_kwargs(self):
+        #gh-13455
+        arrs = [np.empty((5, 6, 7))]
+        mit  = np.broadcast(*arrs)
+        mit2 = np.broadcast(*arrs, **{})
+        assert_equal(mit.shape, mit2.shape)
+        assert_equal(mit.ndim, mit2.ndim)
+        assert_equal(mit.nd, mit2.nd)
+        assert_equal(mit.numiter, mit2.numiter)
+        assert_(mit.iters[0].base is mit2.iters[0].base)
+
+        assert_raises(ValueError, np.broadcast, 1, **{'x': 1})
 
 class TestKeepdims(object):
 
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index b6b68d922..4b26c2208 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1936,3 +1936,31 @@ class TestUfunc(object):
             assert not np.isinf(nat)
         except TypeError:
             pass  # ok, just not implemented
+
+
+@pytest.mark.parametrize('ufunc', [getattr(np, x) for x in dir(np)
+                                if isinstance(getattr(np, x), np.ufunc)])
+def test_ufunc_types(ufunc):
+    '''
+    Check all ufuncs that the correct type is returned. Avoid
+    object and boolean types since many operations are not defined for
+    for them.
+
+    Choose the shape so even dot and matmul will succeed
+    '''
+    for typ in ufunc.types:
+        # types is a list of strings like ii->i
+        if 'O' in typ or '?' in typ:
+            continue
+        inp, out = typ.split('->')
+        args = [np.ones((3, 3), t) for t in inp]
+        with warnings.catch_warnings(record=True):
+            warnings.filterwarnings("always")
+            res = ufunc(*args)
+        if isinstance(res, tuple):
+            outs = tuple(out)
+            assert len(res) == len(outs)
+            for r, t in zip(res, outs):
+                assert r.dtype == np.dtype(t)
+        else:
+            assert res.dtype == np.dtype(out)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index e0b0e11cf..2c963b5eb 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -17,7 +17,6 @@ from numpy.testing import (
     _gen_alignment_data
     )
 
-
 def on_powerpc():
     """ True if we are running on a Power PC platform."""
     return platform.processor() == 'powerpc' or \
@@ -650,6 +649,34 @@ class TestExp(object):
             yf = np.array(y, dtype=dt)*log2_
             assert_almost_equal(np.exp(yf), xf)
 
+class TestSpecialFloats(object):
+    def test_exp_values(self):
+        x = [np.nan,  np.nan, np.inf, 0.]
+        y = [np.nan, -np.nan, np.inf, -np.inf]
+        for dt in ['f', 'd', 'g']:
+            xf = np.array(x, dtype=dt)
+            yf = np.array(y, dtype=dt)
+            assert_equal(np.exp(yf), xf)
+
+        with np.errstate(over='raise'):
+            assert_raises(FloatingPointError, np.exp, np.float32(100.))
+            assert_raises(FloatingPointError, np.exp, np.float32(1E19))
+
+    def test_log_values(self):
+        with np.errstate(all='ignore'):
+            x = [np.nan,  np.nan, np.inf, np.nan, -np.inf, np.nan]
+            y = [np.nan, -np.nan, np.inf, -np.inf, 0., -1.0]
+            for dt in ['f', 'd', 'g']:
+                xf = np.array(x, dtype=dt)
+                yf = np.array(y, dtype=dt)
+                assert_equal(np.log(yf), xf)
+
+        with np.errstate(divide='raise'):
+            assert_raises(FloatingPointError, np.log, np.float32(0.))
+
+        with np.errstate(invalid='raise'):
+            assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
+            assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
 class TestLogAddExp(_FilterInvalids):
     def test_logaddexp_values(self):