Merge pull request #28393 from linux-on-ibm-z/main-vec-fp-operators-clang-s390x

BUG: Fix building on s390x with clang
diff --git a/.github/workflows/compiler_sanitizers.yml b/.github/workflows/compiler_sanitizers.yml
index 9477e0b..9452289 100644
--- a/.github/workflows/compiler_sanitizers.yml
+++ b/.github/workflows/compiler_sanitizers.yml
@@ -53,10 +53,12 @@
         echo CPPFLAGS="-I$LLVM_PREFIX/include" >> $GITHUB_ENV
     - name: Build Python with address sanitizer
       run: |
-        CONFIGURE_OPTS="--with-address-sanitizer" pyenv install 3.13
-        pyenv global 3.13
+        CONFIGURE_OPTS="--with-address-sanitizer" pyenv install 3.13t
+        pyenv global 3.13t
     - name: Install dependencies
       run: |
+        # TODO: remove when a released Cython supports free-threaded Python
+        pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
         pip install -r requirements/build_requirements.txt
         pip install -r requirements/ci_requirements.txt
         pip install -r requirements/test_requirements.txt
@@ -68,7 +70,7 @@
     - name: Test
       run: |
         # pass -s to pytest to see ASAN errors and warnings, otherwise pytest captures them
-        ASAN_OPTIONS=detect_leaks=0:symbolize=1:strict_init_order=true:allocator_may_return_null=1:halt_on_error=1 \
+        ASAN_OPTIONS=detect_leaks=0:symbolize=1:strict_init_order=true:allocator_may_return_null=1 \
         python -m spin test -- -v -s --timeout=600 --durations=10
 
   clang_TSAN:
@@ -121,7 +123,7 @@
     - name: Test
       run: |
         # These tests are slow, so only run tests in files that do "import threading" to make them count
-        TSAN_OPTIONS=allocator_may_return_null=1:halt_on_error=1 \
+        TSAN_OPTIONS="allocator_may_return_null=1:suppressions=$GITHUB_WORKSPACE/tools/ci/tsan_suppressions.txt" \
         python -m spin test \
         `find numpy -name "test*.py" | xargs grep -l "import threading" | tr '\n' ' '` \
         -- -v -s --timeout=600 --durations=10
diff --git a/doc/release/upcoming_changes/26018.change.rst b/doc/release/upcoming_changes/26018.change.rst
new file mode 100644
index 0000000..9d7c139
--- /dev/null
+++ b/doc/release/upcoming_changes/26018.change.rst
@@ -0,0 +1,7 @@
+``unique_values`` may return unsorted data
+------------------------------------------
+The relatively new function ``unique_values`` (added in NumPy 2.0) may now
+return unsorted results.  Like ``unique_counts`` and ``unique_all``, it
+never guaranteed a sorted result, but until now the result happened to be
+sorted.  In cases where these functions do return a sorted result, this
+may change in future releases to improve performance.
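For reference, a minimal sketch of the behavioral change (not part of the
patch; the exact output order is unspecified and may vary):

```python
import numpy as np

# unique_values no longer guarantees sorted output.
vals = np.unique_values([3, 1, 2, 1, 3])
print(vals)           # e.g. array([3, 1, 2]) -- order may vary

# Sort explicitly if a sorted result is required.
print(np.sort(vals))  # array([1, 2, 3])
```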
diff --git a/doc/release/upcoming_changes/26018.performance.rst b/doc/release/upcoming_changes/26018.performance.rst
new file mode 100644
index 0000000..ffeab51
--- /dev/null
+++ b/doc/release/upcoming_changes/26018.performance.rst
@@ -0,0 +1,7 @@
+Performance improvements to ``np.unique``
+-----------------------------------------
+``np.unique`` now tries to use a hash table to find unique values instead of
+sorting the input first. This is limited to certain dtypes for now, and the
+function is faster for those dtypes. The function also exposes a ``sorted``
+parameter to allow returning the unique values in the order in which they
+were found, instead of sorting them afterwards.
\ No newline at end of file
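A short sketch of the new keyword (not part of the patch; the unsorted
output order is unspecified):

```python
import numpy as np

a = np.array([5, 7, 1, 2, 1, 5, 7])

# Default behavior is unchanged: unique values are returned sorted.
print(np.unique(a))                # array([1, 2, 5, 7])

# Opting out of sorting returns the values as the hash table found them.
print(np.unique(a, sorted=False))  # e.g. array([5, 7, 1, 2]) -- may vary
```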
diff --git a/doc/source/building/cross_compilation.rst b/doc/source/building/cross_compilation.rst
index 82b896a..0a2e3a5 100644
--- a/doc/source/building/cross_compilation.rst
+++ b/doc/source/building/cross_compilation.rst
@@ -15,7 +15,7 @@
 distros:
 
 - `Void Linux <https://github.com/void-linux/void-packages/blob/master/srcpkgs/python3-numpy/template>`_
-- `Nix <https://github.com/nixos/nixpkgs/blob/master/pkgs/development/python-modules/numpy/default.nix>`_
+- `Nix <https://github.com/NixOS/nixpkgs/tree/master/pkgs/development/python-modules/numpy>`_
 - `Conda-forge <https://github.com/conda-forge/numpy-feedstock/blob/main/recipe/build.sh>`_
 
 See also `Meson's documentation on cross compilation
diff --git a/doc/source/reference/c-api/array.rst b/doc/source/reference/c-api/array.rst
index aface4e..14ff61a 100644
--- a/doc/source/reference/c-api/array.rst
+++ b/doc/source/reference/c-api/array.rst
@@ -121,7 +121,7 @@
 
     Returns the total size (in number of elements) of the array.
 
-.. c:function:: npy_intp PyArray_Size(PyArrayObject* obj)
+.. c:function:: npy_intp PyArray_Size(PyObject* obj)
 
     Returns 0 if *obj* is not a sub-class of ndarray. Otherwise,
     returns the total number of elements in the array. Safer version
diff --git a/doc/source/reference/random/extending.rst b/doc/source/reference/random/extending.rst
index 7aead60..20c8375 100644
--- a/doc/source/reference/random/extending.rst
+++ b/doc/source/reference/random/extending.rst
@@ -11,10 +11,13 @@
 
 Numba
 -----
-Numba can be used with either CTypes or CFFI.  The current iteration of the
+Numba can be used with either
+`CTypes <https://docs.python.org/3/library/ctypes.html>`_
+or `CFFI <https://cffi.readthedocs.io/en/stable/overview.html>`_.
+The current iteration of the
 `BitGenerator`\ s all export a small set of functions through both interfaces.
 
-This example shows how numba can be used to produce gaussian samples using
+This example shows how Numba can be used to produce Gaussian samples using
 a pure Python implementation which is then compiled.  The random numbers are
 provided by ``ctypes.next_double``.
 
diff --git a/numpy/_core/_add_newdocs.py b/numpy/_core/_add_newdocs.py
index ece371d..c1ab650 100644
--- a/numpy/_core/_add_newdocs.py
+++ b/numpy/_core/_add_newdocs.py
@@ -6945,9 +6945,10 @@ def refer_to_array_attribute(attr, method=True):
     array([False, True, False])
 
     >>> np.array([1.2, object(), "hello world"],
-    ...          dtype=StringDType(coerce=True))
-    ValueError: StringDType only allows string data when string coercion
-    is disabled.
+    ...          dtype=StringDType(coerce=False))
+    Traceback (most recent call last):
+        ...
+    ValueError: StringDType only allows string data when string coercion is disabled.
 
     >>> np.array(["hello", "world"], dtype=StringDType(coerce=True))
     array(["hello", "world"], dtype=StringDType(coerce=True))
diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build
index bc21752..c3b5451 100644
--- a/numpy/_core/meson.build
+++ b/numpy/_core/meson.build
@@ -685,6 +685,16 @@
   cflags_large_file_support,
 ]
 
+# C++ exceptions are handled in the unique_hash code, which therefore needs
+# the `-fexceptions` flag.
+unique_hash_cpp_args = c_args_common
+if cc.get_argument_syntax() != 'msvc'
+  unique_hash_cpp_args += [
+    '-fexceptions',
+    '-fno-rtti',  # no runtime type information
+  ]
+endif
+
 # Same as NPY_CXX_FLAGS (TODO: extend for what ccompiler_opt adds)
 cpp_args_common = c_args_common + [
 ]
@@ -1063,7 +1073,6 @@
   'src/common/npy_hashtable.cpp',
   'src/common/npy_import.c',
   'src/common/npy_longdouble.c',
-  'src/common/ucsnarrow.c',
   'src/common/ufunc_override.c',
   'src/common/numpyos.c',
   'src/common/npy_cpu_features.c',
@@ -1221,6 +1230,21 @@
   endforeach
 endif
 
+unique_hash_so = static_library(
+  'unique_hash',
+  ['src/multiarray/unique.cpp'],
+  c_args: c_args_common,
+  cpp_args: unique_hash_cpp_args,
+  include_directories: [
+    'include',
+    'src/common',
+  ],
+  dependencies: [
+    py_dep,
+    np_core_dep,
+  ],
+)
+
 py.extension_module('_multiarray_umath',
   [
     config_h,
@@ -1245,7 +1269,11 @@
     'src/highway'
   ],
   dependencies: [blas_dep],
-  link_with: [npymath_lib, multiarray_umath_mtargets.static_lib('_multiarray_umath_mtargets')] + highway_lib,
+  link_with: [
+    npymath_lib,
+    unique_hash_so,
+    multiarray_umath_mtargets.static_lib('_multiarray_umath_mtargets')
+  ] + highway_lib,
   install: true,
   subdir: 'numpy/_core',
 )
diff --git a/numpy/_core/src/common/ucsnarrow.c b/numpy/_core/src/common/ucsnarrow.c
deleted file mode 100644
index 203e02f..0000000
--- a/numpy/_core/src/common/ucsnarrow.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-#define _MULTIARRAYMODULE
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-#include "numpy/npy_math.h"
-
-#include "npy_config.h"
-
-
-#include "ctors.h"
-
-/*
- * This file originally contained functions only needed on narrow builds of
- * Python for converting back and forth between the NumPy Unicode data-type
- * (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
- *
- * This "narrow" interface is now deprecated in python and unused in NumPy.
- */
-
-/*
- * Returns a PyUnicodeObject initialized from a buffer containing
- * UCS4 unicode.
- *
- * Parameters
- * ----------
- *  src: char *
- *      Pointer to buffer containing UCS4 unicode.
- *  size: Py_ssize_t
- *      Size of buffer in bytes.
- *  swap: int
- *      If true, the data will be swapped.
- *  align: int
- *      If true, the data will be aligned.
- *
- * Returns
- * -------
- * new_reference: PyUnicodeObject
- */
-NPY_NO_EXPORT PyUnicodeObject *
-PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
-{
-    Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
-    npy_ucs4 const *src = (npy_ucs4 const *)src_char;
-    npy_ucs4 *buf = NULL;
-
-    /* swap and align if needed */
-    if (swap || align) {
-        buf = (npy_ucs4 *)malloc(size);
-        if (buf == NULL) {
-            PyErr_NoMemory();
-            return NULL;
-        }
-        memcpy(buf, src, size);
-        if (swap) {
-            byte_swap_vector(buf, ucs4len, sizeof(npy_ucs4));
-        }
-        src = buf;
-    }
-
-    /* trim trailing zeros */
-    while (ucs4len > 0 && src[ucs4len - 1] == 0) {
-        ucs4len--;
-    }
-    PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
-        PyUnicode_4BYTE_KIND, src, ucs4len);
-    free(buf);
-    return ret;
-}
diff --git a/numpy/_core/src/common/ucsnarrow.h b/numpy/_core/src/common/ucsnarrow.h
deleted file mode 100644
index 4b17a28..0000000
--- a/numpy/_core/src/common/ucsnarrow.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_
-#define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_
-
-NPY_NO_EXPORT PyUnicodeObject *
-PyUnicode_FromUCS4(char const *src, Py_ssize_t size, int swap, int align);
-
-#endif  /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */
diff --git a/numpy/_core/src/multiarray/arraytypes.c.src b/numpy/_core/src/multiarray/arraytypes.c.src
index 931ced5..8de16af 100644
--- a/numpy/_core/src/multiarray/arraytypes.c.src
+++ b/numpy/_core/src/multiarray/arraytypes.c.src
@@ -632,10 +632,33 @@
 {
     PyArrayObject *ap = vap;
     Py_ssize_t size = PyArray_ITEMSIZE(ap);
+    Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
     int swap = PyArray_ISBYTESWAPPED(ap);
     int align = !PyArray_ISALIGNED(ap);
+    npy_ucs4 const *src = (npy_ucs4 const*)ip;
+    npy_ucs4 *buf = NULL;
 
-    return (PyObject *)PyUnicode_FromUCS4(ip, size, swap, align);
+    /* swap and align if needed */
+    if (swap || align) {
+        buf = (npy_ucs4 *)malloc(size);
+        if (buf == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        memcpy(buf, src, size);
+        if (swap) {
+            byte_swap_vector(buf, ucs4len, sizeof(npy_ucs4));
+        }
+        src = buf;
+    }
+
+    /* trim trailing zeros */
+    while (ucs4len > 0 && src[ucs4len - 1] == 0) {
+        ucs4len--;
+    }
+    PyObject *ret = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, ucs4len);
+    free(buf);
+    return ret;
 }
 
 static int
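This hunk inlines the former ``PyUnicode_FromUCS4`` helper (swap/align copy
plus trailing-NUL trimming) directly into ``UNICODE_getitem``. A quick way to
exercise the swap path from Python (illustrative only):

```python
import numpy as np

# Build a unicode array, then a byte-swapped view of the same data.
a = np.array(["abc", "de"], dtype="U4")
swapped = a.byteswap().view(a.dtype.newbyteorder())

# Element access goes through UNICODE_getitem, which now swaps the bytes
# and trims trailing NUL padding inline.
assert swapped[0] == "abc"
assert swapped[1] == "de"
```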
diff --git a/numpy/_core/src/multiarray/common.h b/numpy/_core/src/multiarray/common.h
index 46fe2a6..e356b82 100644
--- a/numpy/_core/src/multiarray/common.h
+++ b/numpy/_core/src/multiarray/common.h
@@ -318,8 +318,6 @@
 check_is_convertible_to_scalar(PyArrayObject *v);
 
 
-#include "ucsnarrow.h"
-
 /*
  * Make a new empty array, of the passed size, of a type that takes the
  * priority of ap1 and ap2 into account.
diff --git a/numpy/_core/src/multiarray/item_selection.c b/numpy/_core/src/multiarray/item_selection.c
index 254f1ea..d02e420 100644
--- a/numpy/_core/src/multiarray/item_selection.c
+++ b/numpy/_core/src/multiarray/item_selection.c
@@ -1028,6 +1028,7 @@
     }
     dtype = PyArray_DESCR(mps[0]);
 
+    int copy_existing_out = 0;
     /* Set-up return array */
     if (out == NULL) {
         Py_INCREF(dtype);
@@ -1039,10 +1040,6 @@
                                                     (PyObject *)ap);
     }
     else {
-        int flags = NPY_ARRAY_CARRAY |
-                    NPY_ARRAY_WRITEBACKIFCOPY |
-                    NPY_ARRAY_FORCECAST;
-
         if ((PyArray_NDIM(out) != multi->nd)
                     || !PyArray_CompareLists(PyArray_DIMS(out),
                                              multi->dimensions,
@@ -1052,9 +1049,13 @@
             goto fail;
         }
 
+        if (PyArray_FailUnlessWriteable(out, "output array") < 0) {
+            goto fail;
+        }
+
         for (i = 0; i < n; i++) {
             if (arrays_overlap(out, mps[i])) {
-                flags |= NPY_ARRAY_ENSURECOPY;
+                copy_existing_out = 1;
             }
         }
 
@@ -1064,10 +1065,25 @@
              * so the input array is not changed
              * before the error is called
              */
-            flags |= NPY_ARRAY_ENSURECOPY;
+            copy_existing_out = 1;
         }
-        Py_INCREF(dtype);
-        obj = (PyArrayObject *)PyArray_FromArray(out, dtype, flags);
+
+        if (!PyArray_EquivTypes(dtype, PyArray_DESCR(out))) {
+            copy_existing_out = 1;
+        }
+
+        if (copy_existing_out) {
+            Py_INCREF(dtype);
+            obj = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type,
+                                                        dtype,
+                                                        multi->nd,
+                                                        multi->dimensions,
+                                                        NULL, NULL, 0,
+                                                        (PyObject *)out);
+        }
+        else {
+            obj = (PyArrayObject *)Py_NewRef(out);
+        }
     }
 
     if (obj == NULL) {
@@ -1080,12 +1096,13 @@
     NPY_ARRAYMETHOD_FLAGS transfer_flags = 0;
     if (PyDataType_REFCHK(dtype)) {
         int is_aligned = IsUintAligned(obj);
+        PyArray_Descr *obj_dtype = PyArray_DESCR(obj);
         PyArray_GetDTypeTransferFunction(
                     is_aligned,
                     dtype->elsize,
-                    dtype->elsize,
+                    obj_dtype->elsize,
                     dtype,
-                    dtype, 0, &cast_info,
+                    obj_dtype, 0, &cast_info,
                     &transfer_flags);
     }
 
@@ -1142,11 +1159,13 @@
     }
     Py_DECREF(ap);
     PyDataMem_FREE(mps);
-    if (out != NULL && out != obj) {
-        Py_INCREF(out);
-        PyArray_ResolveWritebackIfCopy(obj);
+    if (copy_existing_out) {
+        int res = PyArray_CopyInto(out, obj);
         Py_DECREF(obj);
-        obj = out;
+        if (res < 0) {
+            return NULL;
+        }
+        return Py_NewRef(out);
     }
     return (PyObject *)obj;
 
@@ -2893,10 +2912,11 @@
              * the fast bool count is followed by this sparse path is faster
              * than combining the two loops, even for larger arrays
              */
+            npy_intp *multi_index_end = multi_index + nonzero_count;
             if (((double)nonzero_count / count) <= 0.1) {
                 npy_intp subsize;
                 npy_intp j = 0;
-                while (1) {
+                while (multi_index < multi_index_end) {
                     npy_memchr(data + j * stride, 0, stride, count - j,
                                &subsize, 1);
                     j += subsize;
@@ -2911,11 +2931,10 @@
              * stalls that are very expensive on most modern processors.
              */
             else {
-                npy_intp *multi_index_end = multi_index + nonzero_count;
                 npy_intp j = 0;
 
                 /* Manually unroll for GCC and maybe other compilers */
-                while (multi_index + 4 < multi_index_end) {
+                while (multi_index + 4 < multi_index_end && (j < count - 4)) {
                     *multi_index = j;
                     multi_index += data[0] != 0;
                     *multi_index = j + 1;
@@ -2928,7 +2947,7 @@
                     j += 4;
                 }
 
-                while (multi_index < multi_index_end) {
+                while (multi_index < multi_index_end && (j < count)) {
                     *multi_index = j;
                     multi_index += *data != 0;
                     data += stride;
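The ``np.choose`` rewrite replaces the WRITEBACKIFCOPY machinery with an
explicit temporary that is copied back via ``PyArray_CopyInto``, adds an
explicit writeability check, and handles ``out`` arrays whose dtype differs
from the result dtype. A sketch of the visible effects (mirroring the new
tests; not part of the patch):

```python
import numpy as np

a = np.arange(3)
choices = np.ones((3, 3), dtype=np.int64)

# out with a different (castable) dtype: the result is cast on copy-back.
out = np.empty(3, dtype=np.float64)
np.choose(a, choices, out=out)
print(out)  # array([1., 1., 1.])

# A non-writeable out now fails cleanly with ValueError (gh-28206).
ro = np.zeros(3)
ro.setflags(write=False)
try:
    np.choose([0, 1, 2], [a, a, a], out=ro)
except ValueError as exc:
    print(exc)
```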
diff --git a/numpy/_core/src/multiarray/multiarraymodule.c b/numpy/_core/src/multiarray/multiarraymodule.c
index 54165222..d24af97 100644
--- a/numpy/_core/src/multiarray/multiarraymodule.c
+++ b/numpy/_core/src/multiarray/multiarraymodule.c
@@ -83,6 +83,8 @@
 
 #include "umathmodule.h"
 
+#include "unique.h"
+
 /*
  *****************************************************************************
  **                    INCLUDE GENERATED CODE                               **
@@ -4562,6 +4564,8 @@
         "Give a warning on reload and big warning in sub-interpreters."},
     {"from_dlpack", (PyCFunction)from_dlpack,
         METH_FASTCALL | METH_KEYWORDS, NULL},
+    {"_unique_hash",  (PyCFunction)array__unique_hash,
+        METH_O, "Collect unique values via a hash map."},
     {NULL, NULL, 0, NULL}                /* sentinel */
 };
 
diff --git a/numpy/_core/src/multiarray/stringdtype/casts.cpp b/numpy/_core/src/multiarray/stringdtype/casts.cpp
index f74f642..f667275 100644
--- a/numpy/_core/src/multiarray/stringdtype/casts.cpp
+++ b/numpy/_core/src/multiarray/stringdtype/casts.cpp
@@ -1,13 +1,13 @@
-#include <cmath>
-#include <type_traits>
-
-#include "numpy/npy_common.h"
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include "numpy/npy_common.h"
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 #define _UMATHMODULE
 
+#include <cmath>
+#include <type_traits>
+
 #include "numpy/ndarraytypes.h"
 #include "numpy/arrayobject.h"
 #include "numpy/halffloat.h"
diff --git a/numpy/_core/src/multiarray/unique.cpp b/numpy/_core/src/multiarray/unique.cpp
new file mode 100644
index 0000000..f36acfd
--- /dev/null
+++ b/numpy/_core/src/multiarray/unique.cpp
@@ -0,0 +1,183 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <Python.h>
+
+#include <unordered_set>
+#include <functional>
+
+#include <numpy/npy_common.h>
+#include "numpy/arrayobject.h"
+
+// This uses the RAII pattern to handle C++ exceptions while avoiding memory leaks.
+// Adapted from https://stackoverflow.com/a/25510879/2536294
+template <typename F>
+struct FinalAction {
+    FinalAction(F f) : clean_{f} {}
+    ~FinalAction() { clean_(); }
+  private:
+    F clean_;
+};
+
+template <typename F>
+FinalAction<F> finally(F f) {
+    return FinalAction<F>(f);
+}
+
+template<typename T>
+static PyObject*
+unique(PyArrayObject *self)
+{
+    /* This function takes a numpy array and returns a numpy array containing
+    the unique values.
+
+    It assumes the array's data can be viewed as unsigned integers of a
+    certain size (sizeof(T)).
+
+    It doesn't need to know the actual type, since it only compares the
+    binary representations of the input data. This means it won't apply to
+    custom or complicated dtypes or to string values.
+    */
+    NPY_ALLOW_C_API_DEF;
+    std::unordered_set<T> hashset;
+
+    NpyIter *iter = NpyIter_New(self, NPY_ITER_READONLY |
+                                      NPY_ITER_EXTERNAL_LOOP |
+                                      NPY_ITER_REFS_OK |
+                                      NPY_ITER_ZEROSIZE_OK |
+                                      NPY_ITER_GROWINNER,
+                                NPY_KEEPORDER, NPY_NO_CASTING,
+                                NULL);
+    // Make sure the iterator is deallocated when the function returns,
+    // with or without an exception.
+    auto iter_dealloc = finally([&]() { NpyIter_Deallocate(iter); });
+    if (iter == NULL) {
+        return NULL;
+    }
+
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (iternext == NULL) {
+        return NULL;
+    }
+    char **dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
+    npy_intp *innersizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+    // release the GIL
+    PyThreadState *_save;
+    _save = PyEval_SaveThread();
+    // Make sure the GIL is re-acquired when the function returns,
+    // with or without an exception.
+    auto grab_gil = finally([&]() { PyEval_RestoreThread(_save); });
+    // First, insert all values into a hash set.
+
+    if (NpyIter_GetIterSize(iter) > 0) {
+        do {
+            char* data = *dataptr;
+            npy_intp stride = *strideptr;
+            npy_intp count = *innersizeptr;
+
+            while (count--) {
+                hashset.insert(*((T *) data));
+                data += stride;
+            }
+        } while (iternext(iter));
+    }
+
+    npy_intp length = hashset.size();
+
+    NPY_ALLOW_C_API;
+    PyArray_Descr *descr = PyArray_DESCR(self);
+    Py_INCREF(descr);
+    PyObject *res_obj = PyArray_NewFromDescr(
+        &PyArray_Type,
+        descr,
+        1, // ndim
+        &length, // shape
+        NULL, // strides
+        NULL, // data
+        // This flag is needed to be able to call .sort on it.
+        NPY_ARRAY_WRITEABLE, // flags
+        NULL // obj
+    );
+    NPY_DISABLE_C_API;
+
+    if (res_obj == NULL) {
+        return NULL;
+    }
+
+    // Then iterate through the hash set to collect the unique values.
+    T* data = (T *)PyArray_DATA((PyArrayObject *)res_obj);
+    auto it = hashset.begin();
+    size_t i = 0;
+    for (; it != hashset.end(); it++, i++) {
+        data[i] = *it;
+    }
+
+    return res_obj;
+}
+
+
+// This map contains the unique() instantiation to use for each supported dtype.
+typedef std::function<PyObject *(PyArrayObject *)> function_type;
+std::unordered_map<int, function_type> unique_funcs = {
+    {NPY_BYTE, unique<npy_byte>},
+    {NPY_UBYTE, unique<npy_ubyte>},
+    {NPY_SHORT, unique<npy_short>},
+    {NPY_USHORT, unique<npy_ushort>},
+    {NPY_INT, unique<npy_int>},
+    {NPY_UINT, unique<npy_uint>},
+    {NPY_LONG, unique<npy_long>},
+    {NPY_ULONG, unique<npy_ulong>},
+    {NPY_LONGLONG, unique<npy_longlong>},
+    {NPY_ULONGLONG, unique<npy_ulonglong>},
+    {NPY_INT8, unique<npy_int8>},
+    {NPY_INT16, unique<npy_int16>},
+    {NPY_INT32, unique<npy_int32>},
+    {NPY_INT64, unique<npy_int64>},
+    {NPY_UINT8, unique<npy_uint8>},
+    {NPY_UINT16, unique<npy_uint16>},
+    {NPY_UINT32, unique<npy_uint32>},
+    {NPY_UINT64, unique<npy_uint64>},
+    {NPY_DATETIME, unique<npy_uint64>},
+};
+
+
+/**
+ * Python exposed implementation of `_unique_hash`.
+ *
+ * This is a C only function wrapping code that may cause C++ exceptions into
+ * try/catch.
+ *
+ * @param arr NumPy array to find the unique values of.
+ * @return Base-class NumPy array with unique values, `NotImplemented` if the
+ * type is unsupported or `NULL` with an error set.
+ */
+extern "C" NPY_NO_EXPORT PyObject *
+array__unique_hash(PyObject *NPY_UNUSED(module), PyObject *arr_obj)
+{
+    if (!PyArray_Check(arr_obj)) {
+        PyErr_SetString(PyExc_TypeError,
+                "_unique_hash() requires a NumPy array input.");
+        return NULL;
+    }
+    PyArrayObject *arr = (PyArrayObject *)arr_obj;
+
+    try {
+        auto type = PyArray_TYPE(arr);
+        // we only support data types present in our unique_funcs map
+        if (unique_funcs.find(type) == unique_funcs.end()) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+
+        return unique_funcs[type](arr);
+    }
+    catch (const std::bad_alloc &e) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    catch (const std::exception &e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+        return NULL;
+    }
+}
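The wrapper is exposed to Python as a private helper, so the fallback
behavior can be checked directly (private API, subject to change; sketch
only):

```python
import numpy as np
from numpy._core._multiarray_umath import _unique_hash

# Supported dtypes return a (possibly unsorted) array of unique values.
u = _unique_hash(np.array([3, 1, 2, 1, 3]))
print(np.sort(u))  # array([1, 2, 3])

# Unsupported dtypes (e.g. strings) return NotImplemented, so callers can
# fall back to the sort-based path.
print(_unique_hash(np.array(["a", "b", "a"])))  # NotImplemented
```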
diff --git a/numpy/_core/src/multiarray/unique.h b/numpy/_core/src/multiarray/unique.h
new file mode 100644
index 0000000..3e25840
--- /dev/null
+++ b/numpy/_core/src/multiarray/unique.h
@@ -0,0 +1,14 @@
+#ifndef NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_UNIQUE_H_
+#define NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_UNIQUE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyObject* array__unique_hash(PyObject *NPY_UNUSED(dummy), PyObject *args);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // NUMPY_CORE_INCLUDE_NUMPY_MULTIARRAY_UNIQUE_H_
diff --git a/numpy/_core/tests/test_multiarray.py b/numpy/_core/tests/test_multiarray.py
index fba5f47..3de42ba 100644
--- a/numpy/_core/tests/test_multiarray.py
+++ b/numpy/_core/tests/test_multiarray.py
@@ -1980,6 +1980,12 @@ def test_choose(self):
         y = np.choose([0, 0, 0], [x[:3], x[:3], x[:3]], out=x[1:4], mode='wrap')
         assert_equal(y, np.array([0, 1, 2]))
 
+        # gh-28206: check that a non-writeable out raises
+        x = np.arange(3)
+        out = np.zeros(3)
+        out.setflags(write=False)
+        assert_raises(ValueError, np.choose, [0, 1, 2], [x, x, x], out=out)
+
     def test_prod(self):
         ba = [1, 2, 10, 11, 6, 5, 4]
         ba2 = [[1, 2, 3, 4], [5, 6, 7, 9], [10, 3, 4, 5]]
@@ -10287,6 +10293,16 @@ def test_gh_24459():
         np.choose(a, [3, -1])
 
 
+def test_gh_28206():
+    a = np.arange(3)
+    b = np.ones((3, 3), dtype=np.int64)
+    out = np.array([np.nan, np.nan, np.nan])
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", RuntimeWarning)
+        np.choose(a, b, out=out)
+
+
 @pytest.mark.parametrize("N", np.arange(2, 512))
 @pytest.mark.parametrize("dtype", [np.int16, np.uint16,
                         np.int32, np.uint32, np.int64, np.uint64])
diff --git a/numpy/_core/tests/test_multithreading.py b/numpy/_core/tests/test_multithreading.py
index 133268d..03f4b40 100644
--- a/numpy/_core/tests/test_multithreading.py
+++ b/numpy/_core/tests/test_multithreading.py
@@ -271,3 +271,26 @@ def closure(b):
         # Reducing the number of threads means the test doesn't trigger the
         # bug. Better to skip on some platforms than add a useless test.
         pytest.skip("Couldn't spawn enough threads to run the test")
+
+@pytest.mark.parametrize("dtype", [bool, int, float])
+def test_nonzero(dtype):
+    # See: gh-28361
+    #
+    # np.nonzero uses np.count_nonzero to determine the size of the output
+    # array. In a second pass the indices of the non-zero elements are found,
+    # but their number can have changed in the meantime.
+    # This test triggers a data race that is suppressed in the TSAN CI; it
+    # ensures that np.nonzero does not generate a segmentation fault.
+    x = np.random.randint(4, size=100).astype(dtype)
+
+    def func(index):
+        for _ in range(10):
+            if index == 0:
+                x[::2] = np.random.randint(2)
+            else:
+                try:
+                    _ = np.nonzero(x)
+                except RuntimeError as ex:
+                    assert 'number of non-zero array elements changed during function execution' in str(ex)
+
+    run_threaded(func, max_workers=10, pass_count=True, outer_iterations=5)
diff --git a/numpy/lib/_arraysetops_impl.py b/numpy/lib/_arraysetops_impl.py
index 97dae64..5217704 100644
--- a/numpy/lib/_arraysetops_impl.py
+++ b/numpy/lib/_arraysetops_impl.py
@@ -21,6 +21,7 @@
 import numpy as np
 from numpy._core import overrides
 from numpy._core._multiarray_umath import _array_converter
+from numpy._core._multiarray_umath import _unique_hash
 
 
 array_function_dispatch = functools.partial(
@@ -138,13 +139,15 @@ def _unpack_tuple(x):
 
 
 def _unique_dispatcher(ar, return_index=None, return_inverse=None,
-                       return_counts=None, axis=None, *, equal_nan=None):
+                       return_counts=None, axis=None, *, equal_nan=None,
+                       sorted=True):
     return (ar,)
 
 
 @array_function_dispatch(_unique_dispatcher)
 def unique(ar, return_index=False, return_inverse=False,
-           return_counts=False, axis=None, *, equal_nan=True):
+           return_counts=False, axis=None, *, equal_nan=True,
+           sorted=True):
     """
     Find the unique elements of an array.
 
@@ -182,6 +185,11 @@ def unique(ar, return_index=False, return_inverse=False,
 
         .. versionadded:: 1.24
 
+    sorted : bool, optional
+        If True (default), the unique elements are sorted in ascending order.
+
+        .. versionadded:: 2.3
+
     Returns
     -------
     unique : ndarray
@@ -284,7 +292,8 @@ def unique(ar, return_index=False, return_inverse=False,
     ar = np.asanyarray(ar)
     if axis is None:
         ret = _unique1d(ar, return_index, return_inverse, return_counts,
-                        equal_nan=equal_nan, inverse_shape=ar.shape, axis=None)
+                        equal_nan=equal_nan, inverse_shape=ar.shape, axis=None,
+                        sorted=sorted)
         return _unpack_tuple(ret)
 
     # axis was specified and not None
@@ -331,16 +340,18 @@ def reshape_uniq(uniq):
     output = _unique1d(consolidated, return_index,
                        return_inverse, return_counts,
                        equal_nan=equal_nan, inverse_shape=inverse_shape,
-                       axis=axis)
+                       axis=axis, sorted=sorted)
     output = (reshape_uniq(output[0]),) + output[1:]
     return _unpack_tuple(output)
 
 
 def _unique1d(ar, return_index=False, return_inverse=False,
               return_counts=False, *, equal_nan=True, inverse_shape=None,
-              axis=None):
+              axis=None, sorted=True):
     """
     Find the unique elements of an array, ignoring shape.
+
+    Uses a hash table to find the unique elements if possible.
     """
     ar = np.asanyarray(ar).flatten()
     if len(ar.shape) != 1:
@@ -350,6 +361,26 @@ def _unique1d(ar, return_index=False, return_inverse=False,
 
     optional_indices = return_index or return_inverse
 
+    if (optional_indices or return_counts) and not sorted:
+        raise ValueError(
+            "Currently, `sorted` can only be False if `return_index`, "
+            "`return_inverse`, and `return_counts` are all False."
+        )
+
+    # masked arrays are not supported yet.
+    if not optional_indices and not return_counts and not np.ma.is_masked(ar):
+        # First convert to a plain ndarray; the result is wrapped back
+        # later in case the input was an ndarray subclass.
+        conv = _array_converter(ar)
+        ar_, = conv
+
+        if (hash_unique := _unique_hash(ar_)) is not NotImplemented:
+            if sorted:
+                hash_unique.sort()
+            # We wrap the result back in case it was a subclass of numpy.ndarray.
+            return (conv.wrap(hash_unique),)
+
+    # If the hash table isn't used, fall back to the slower sort-based method.
     if optional_indices:
         perm = ar.argsort(kind='mergesort' if return_index else 'quicksort')
         aux = ar[perm]
@@ -460,7 +491,7 @@ def unique_all(x):
         return_index=True,
         return_inverse=True,
         return_counts=True,
-        equal_nan=False
+        equal_nan=False,
     )
     return UniqueAllResult(*result)
 
@@ -512,7 +543,7 @@ def unique_counts(x):
         return_index=False,
         return_inverse=False,
         return_counts=True,
-        equal_nan=False
+        equal_nan=False,
     )
     return UniqueCountsResult(*result)
 
@@ -565,7 +596,7 @@ def unique_inverse(x):
         return_index=False,
         return_inverse=True,
         return_counts=False,
-        equal_nan=False
+        equal_nan=False,
     )
     return UniqueInverseResult(*result)
 
@@ -601,7 +632,7 @@ def unique_values(x):
     --------
     >>> import numpy as np
     >>> np.unique_values([1, 1, 2])
-    array([1, 2])
+    array([1, 2])  # may vary
 
     """
     return unique(
@@ -609,7 +640,8 @@ def unique_values(x):
         return_index=False,
         return_inverse=False,
         return_counts=False,
-        equal_nan=False
+        equal_nan=False,
+        sorted=False,
     )
 
 
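The hash path is only taken when no indices or counts are requested;
combining ``sorted=False`` with any of those options raises. A short sketch
(not part of the patch):

```python
import numpy as np

# sorted=False currently requires that no extra outputs are requested.
try:
    np.unique([1, 1, 2], sorted=False, return_counts=True)
except ValueError as exc:
    print(exc)  # Currently, `sorted` can only be False if ...

# Without extra outputs, the values may come back in any order.
print(np.unique([3, 1, 2, 3], sorted=False))  # e.g. array([3, 1, 2])
```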
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index e89adb8..3de5e68 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -628,56 +628,72 @@ def test_manyways(self):
 
 class TestUnique:
 
+    def check_all(self, a, b, i1, i2, c, dt):
+        base_msg = 'check {0} failed for type {1}'
+
+        msg = base_msg.format('values', dt)
+        v = unique(a)
+        assert_array_equal(v, b, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format('return_index', dt)
+        v, j = unique(a, True, False, False)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j, i1, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format('return_inverse', dt)
+        v, j = unique(a, False, True, False)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j, i2, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format('return_counts', dt)
+        v, j = unique(a, False, False, True)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j, c, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format('return_index and return_inverse', dt)
+        v, j1, j2 = unique(a, True, True, False)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j1, i1, msg)
+        assert_array_equal(j2, i2, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format('return_index and return_counts', dt)
+        v, j1, j2 = unique(a, True, False, True)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j1, i1, msg)
+        assert_array_equal(j2, c, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format('return_inverse and return_counts', dt)
+        v, j1, j2 = unique(a, False, True, True)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j1, i2, msg)
+        assert_array_equal(j2, c, msg)
+        assert type(v) == type(b)
+
+        msg = base_msg.format(('return_index, return_inverse '
+                                'and return_counts'), dt)
+        v, j1, j2, j3 = unique(a, True, True, True)
+        assert_array_equal(v, b, msg)
+        assert_array_equal(j1, i1, msg)
+        assert_array_equal(j2, i2, msg)
+        assert_array_equal(j3, c, msg)
+        assert type(v) == type(b)
+
+    def get_types(self):
+        types = []
+        types.extend(np.typecodes['AllInteger'])
+        types.extend(np.typecodes['AllFloat'])
+        types.append('datetime64[D]')
+        types.append('timedelta64[D]')
+        return types
+
     def test_unique_1d(self):
 
-        def check_all(a, b, i1, i2, c, dt):
-            base_msg = 'check {0} failed for type {1}'
-
-            msg = base_msg.format('values', dt)
-            v = unique(a)
-            assert_array_equal(v, b, msg)
-
-            msg = base_msg.format('return_index', dt)
-            v, j = unique(a, True, False, False)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j, i1, msg)
-
-            msg = base_msg.format('return_inverse', dt)
-            v, j = unique(a, False, True, False)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j, i2, msg)
-
-            msg = base_msg.format('return_counts', dt)
-            v, j = unique(a, False, False, True)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j, c, msg)
-
-            msg = base_msg.format('return_index and return_inverse', dt)
-            v, j1, j2 = unique(a, True, True, False)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j1, i1, msg)
-            assert_array_equal(j2, i2, msg)
-
-            msg = base_msg.format('return_index and return_counts', dt)
-            v, j1, j2 = unique(a, True, False, True)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j1, i1, msg)
-            assert_array_equal(j2, c, msg)
-
-            msg = base_msg.format('return_inverse and return_counts', dt)
-            v, j1, j2 = unique(a, False, True, True)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j1, i2, msg)
-            assert_array_equal(j2, c, msg)
-
-            msg = base_msg.format(('return_index, return_inverse '
-                                   'and return_counts'), dt)
-            v, j1, j2, j3 = unique(a, True, True, True)
-            assert_array_equal(v, b, msg)
-            assert_array_equal(j1, i1, msg)
-            assert_array_equal(j2, i2, msg)
-            assert_array_equal(j3, c, msg)
-
         a = [5, 7, 1, 2, 1, 5, 7] * 10
         b = [1, 2, 5, 7]
         i1 = [2, 3, 0, 1]
@@ -685,15 +701,11 @@ def check_all(a, b, i1, i2, c, dt):
         c = np.multiply([2, 1, 2, 2], 10)
 
         # test for numeric arrays
-        types = []
-        types.extend(np.typecodes['AllInteger'])
-        types.extend(np.typecodes['AllFloat'])
-        types.append('datetime64[D]')
-        types.append('timedelta64[D]')
+        types = self.get_types()
         for dt in types:
             aa = np.array(a, dt)
             bb = np.array(b, dt)
-            check_all(aa, bb, i1, i2, c, dt)
+            self.check_all(aa, bb, i1, i2, c, dt)
 
         # test for object arrays
         dt = 'O'
@@ -701,13 +713,13 @@ def check_all(a, b, i1, i2, c, dt):
         aa[:] = a
         bb = np.empty(len(b), dt)
         bb[:] = b
-        check_all(aa, bb, i1, i2, c, dt)
+        self.check_all(aa, bb, i1, i2, c, dt)
 
         # test for structured arrays
         dt = [('', 'i'), ('', 'i')]
         aa = np.array(list(zip(a, a)), dt)
         bb = np.array(list(zip(b, b)), dt)
-        check_all(aa, bb, i1, i2, c, dt)
+        self.check_all(aa, bb, i1, i2, c, dt)
 
         # test for ticket #2799
         aa = [1. + 0.j, 1 - 1.j, 1]
@@ -797,6 +809,44 @@ def check_all(a, b, i1, i2, c, dt):
         assert_equal(np.unique(all_nans, return_inverse=True), (ua, ua_inv))
         assert_equal(np.unique(all_nans, return_counts=True), (ua, ua_cnt))
 
+    def test_unique_zero_sized(self):
+        # test for zero-sized arrays
+        for dt in self.get_types():
+            a = np.array([], dt)
+            b = np.array([], dt)
+            i1 = np.array([], np.int64)
+            i2 = np.array([], np.int64)
+            c = np.array([], np.int64)
+            self.check_all(a, b, i1, i2, c, dt)
+
+    def test_unique_subclass(self):
+        class Subclass(np.ndarray):
+            pass
+
+        i1 = [2, 3, 0, 1]
+        i2 = [2, 3, 0, 1, 0, 2, 3] * 10
+        c = np.multiply([2, 1, 2, 2], 10)
+
+        # test for numeric arrays
+        types = self.get_types()
+        for dt in types:
+            a = np.array([5, 7, 1, 2, 1, 5, 7] * 10, dtype=dt)
+            b = np.array([1, 2, 5, 7], dtype=dt)
+            aa = Subclass(a.shape, dtype=dt, buffer=a)
+            bb = Subclass(b.shape, dtype=dt, buffer=b)
+            self.check_all(aa, bb, i1, i2, c, dt)
+
+    @pytest.mark.parametrize("arg", ["return_index", "return_inverse", "return_counts"])
+    def test_unsupported_hash_based(self, arg):
+        """Test that hash based unique is not supported when either of
+        return_index, return_inverse, or return_counts is True.
+
+        This is WIP and the above will gradually be supported in the future.
+        """
+        msg = "Currently, `sorted` can only be False"
+        with pytest.raises(ValueError, match=msg):
+            np.unique([1, 1], sorted=False, **{arg: True})
+
     def test_unique_axis_errors(self):
         assert_raises(TypeError, self._run_axis_tests, object)
         assert_raises(TypeError, self._run_axis_tests,
diff --git a/ruff.toml b/ruff.toml
index 39c32e9..9f8bf26 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -1,4 +1,4 @@
-exclude = [
+extend-exclude = [
     "numpy/__config__.py",
     "numpy/distutils",
     "numpy/typing/_char_codes.py",
diff --git a/tools/ci/tsan_suppressions.txt b/tools/ci/tsan_suppressions.txt
new file mode 100644
index 0000000..0745deb
--- /dev/null
+++ b/tools/ci/tsan_suppressions.txt
@@ -0,0 +1,11 @@
+# This file contains suppressions for the TSAN tool
+#
+# Reference: https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions
+
+# For np.nonzero, see gh-28361
+race:PyArray_Nonzero
+race:count_nonzero_int
+race:count_nonzero_bool
+race:count_nonzero_float
+race:DOUBLE_nonzero
+