Merge pull request #12814 from mattip/resolve-writeback

BUG: resolve writeback in arr_insert failure paths
diff --git a/.appveyor.yml b/.appveyor.yml
index 01440c6..079496d 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -30,11 +30,6 @@
       PYTHON_ARCH: 32
       TEST_MODE: fast
 
-    - PYTHON: C:\Python27-x64
-      PYTHON_VERSION: 2.7
-      PYTHON_ARCH: 64
-      TEST_MODE: fast
-
     - PYTHON: C:\Python36-x64
       PYTHON_VERSION: 3.6
       PYTHON_ARCH: 64
@@ -91,7 +86,7 @@
       $clnt.DownloadFile($env:OPENBLAS, $file)
       Get-FileHash $file | Format-List
 
-      Expand-Archive $file $tmpdir      
+      Expand-Archive $file $tmpdir
 
       rm $tmpdir\$env:PYTHON_ARCH\lib\*.dll.a
       $lib = ls $tmpdir\$env:PYTHON_ARCH\lib\*.a | ForEach { ls $_ } | Select-Object -first 1
diff --git a/.codecov.yml b/.codecov.yml
index cb3ee23..35584a1 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -1,29 +1,15 @@
 codecov:
   ci:
-    # we don't require appveyor or
-    # circleCI to pass to report
-    # coverage, which currently only
-    # comes from a single Python 3.6 job
-    # in Travis
     - !appveyor
-    - !circle
   notify:
-    # don't require all travis builds to pass;
-    # as long as the coverage job succeeds it
-    # can report the % coverage, even if another
-    # job needs a restart for whatever reason
-    - require_ci_to_pass: no
-    # we should only require a single build before
-    # reporting the % coverage because there's only
-    # one coverage job in Travis
-    - after_n_builds: 1
+    require_ci_to_pass: no
+    after_n_builds: 1
 coverage:
   status:
     project:
       default:
         # Require 1% coverage, i.e., always succeed
         target: 1
+    patch: false
+    changes: false
 comment: off
-
-ignore:
-  - "**/setup.py"
diff --git a/.ctags.d b/.ctags.d
new file mode 100644
index 0000000..60f7d6c
--- /dev/null
+++ b/.ctags.d
@@ -0,0 +1 @@
+--langmaps=c:+.src
diff --git a/.gitignore b/.gitignore
index 0a1e190..c2eddb8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -124,6 +124,10 @@
 numpy/core/include/numpy/multiarray_api.txt
 numpy/core/include/numpy/ufunc_api.txt
 numpy/core/lib/
+numpy/core/src/common/npy_binsearch.h
+numpy/core/src/common/npy_partition.h
+numpy/core/src/common/npy_sort.h
+numpy/core/src/common/templ_common.h
 numpy/core/src/multiarray/_multiarray_tests.c
 numpy/core/src/multiarray/arraytypes.c
 numpy/core/src/multiarray/einsum.c
@@ -150,6 +154,7 @@
 numpy/core/src/umath/scalarmath.c
 numpy/core/src/umath/funcs.inc
 numpy/core/src/umath/loops.[ch]
+numpy/core/src/umath/matmul.[ch]
 numpy/core/src/umath/operand_flag_tests.c
 numpy/core/src/umath/simd.inc
 numpy/core/src/umath/struct_ufunc_test.c
diff --git a/.lgtm.yml b/.lgtm.yml
index 8507b6d..c1c54ec 100644
--- a/.lgtm.yml
+++ b/.lgtm.yml
@@ -12,3 +12,7 @@
     python_setup:
         requirements:
           - cython>=0.29
+  cpp:
+    index:
+      build_command:
+        - python3 setup.py build
diff --git a/.travis.yml b/.travis.yml
index 491fcef..cbf71e9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,7 +31,6 @@
                ahp7Qnm0rWRmA0z9SomuRUQOJQ6s684vU="
 
 python:
-  - 2.7
   - 3.5
   - 3.6
 matrix:
@@ -40,15 +39,6 @@
       dist: xenial  # Required for Python 3.7
       sudo: true    # travis-ci/travis-ci#9069
       env: INSTALL_PICKLE5=1
-    - python: 3.6
-      env: USE_CHROOT=1 ARCH=i386 DIST=bionic
-      sudo: true
-      addons:
-        apt:
-          update: true
-          packages:
-            - dpkg
-            - debootstrap
     - python: 3.5
       dist: xenial  # Required for python3.5-dbg
       sudo: true    # travis-ci/travis-ci#9069
@@ -63,8 +53,6 @@
             - python3-setuptools
     - python: 3.6
       env: USE_WHEEL=1 RUN_FULL_TESTS=1 RUN_COVERAGE=1 INSTALL_PICKLE5=1
-    - python: 2.7
-      env: USE_WHEEL=1 RUN_FULL_TESTS=1 PYTHON_OPTS="-3 -OO"
     - python: 3.6
       env: USE_SDIST=1
     - python: 3.6
@@ -83,6 +71,11 @@
     - python: 3.6
       env:
        - NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
+    - os: linux-ppc64le
+      python: 3.6
+      env:
+       # for matrix annotation only
+       - PPC64_LE=1
 
 before_install:
   - ./tools/travis-before-install.sh
diff --git a/INSTALL.rst.txt b/INSTALL.rst.txt
index d3ed719..640ddaf 100644
--- a/INSTALL.rst.txt
+++ b/INSTALL.rst.txt
@@ -14,8 +14,7 @@
 
 Building NumPy requires the following software installed:
 
-1) For Python 2, Python__ 2.7.x or newer.
-   For Python 3, Python__ 3.4.x or newer.
+1) For Python 3, Python__ 3.5.x or newer.
 
    On Debian and derivative (Ubuntu): python python-dev
 
@@ -27,8 +26,8 @@
 
    Python must also be compiled with the zlib module enabled.
 
-2) Cython >= 0.19 (for development versions of numpy, not for released
-                   versions)
+2) Cython >= 0.29.2 (for development versions of numpy, not for released
+                     versions)
 3) pytest__ (optional) 1.15 or later
 
    This is required for testing numpy, but not for using it.
@@ -61,8 +60,6 @@
 
     python setup.py build_ext --inplace -j 4
 
-Note that the ``python`` command here is the system default Python, generally
-python 2, the ``python3`` command may be needed to install on python 3.
 See `Requirements for Installing Packages <https://packaging.python.org/tutorials/installing-packages/>`_
 for more details.
 
@@ -79,8 +76,13 @@
 building Scipy a Fortran compiler is needed though, so we include some details
 on Fortran compilers in the rest of this section.
 
-On OS X and Linux, all common compilers will work.  Note that for Fortran,
-``gfortran`` is strongly preferred over ``g77``, but if you happen to have both
+On OS X and Linux, all common compilers will work.  Note that C99 support is
+required.  For compilers that don't support the C99 language standard by
+default (such as ``gcc`` versions < 5.0), it should be enabled.  For ``gcc``::
+
+    export CFLAGS='-std=c99'
+
+For Fortran, ``gfortran`` works, ``g77`` does not.  In case ``g77`` is
 installed then ``g77`` will be detected and used first.  To explicitly select
 ``gfortran`` in that case, do::
 
diff --git a/LICENSE.txt b/LICENSE.txt
index 207a7fd..b9731f7 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2005-2018, NumPy Developers.
+Copyright (c) 2005-2019, NumPy Developers.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 812315b..1c06f54 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -6,7 +6,7 @@
       - master
       - maintenance/*
 jobs:
-- job: Linux_Python_36_32bit_full
+- job: Linux_Python_36_32bit_full_with_asserts
   pool:
     vmIMage: 'ubuntu-16.04'
   steps:
@@ -17,14 +17,22 @@
            apt-get -y install python3.6-dev python3-pip locales && \
            locale-gen fr_FR && update-locale && \
            pip3 install setuptools nose cython==0.29.0 pytest pytz pickle5 && \
-           apt-get -y install libopenblas-dev gfortran && \
+           apt-get -y install gfortran-5 wget && \
+           cd .. && \
+           mkdir openblas && cd openblas && \
+           wget https://3f23b170c54c2533c070-1c8a9b3114517dc5fe17b7c3f8c63a43.ssl.cf2.rackcdn.com/openblas-v0.3.4-manylinux1_i686.tar.gz && \
+           tar zxvf openblas-v0.3.4-manylinux1_i686.tar.gz && \
+           cp -r ./usr/local/lib/* /usr/lib && \
+           cp ./usr/local/include/* /usr/include && \
+           cd ../numpy && \
            NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 \
-           python3 runtests.py --mode=full -- -rsx --junitxml=junit/test-results.xml"
+           F77=gfortran-5 F90=gfortran-5 \
+           CFLAGS='-UNDEBUG -std=c99' python3 runtests.py --mode=full -- -rsx --junitxml=junit/test-results.xml"
     displayName: 'Run 32-bit Ubuntu Docker Build / Tests'
   - task: PublishTestResults@2
     inputs:
       testResultsFiles: '**/test-*.xml'
-      testRunTitle: 'Publish test results for Python 3.6-32 bit'
+      testRunTitle: 'Publish test results for Python 3.6-32 bit full Linux'
 - job: macOS
   pool:
     # NOTE: at time of writing, there is a danger
@@ -52,11 +60,11 @@
   # two C compilers, but with homebrew looks like we're
   # now stuck getting the full gcc toolchain instead of
   # just pulling in gfortran
-  - script: brew install gcc
+  - script: HOMEBREW_NO_AUTO_UPDATE=1 brew install gcc
     displayName: 'make gfortran available on mac os vm'
   - script: python -m pip install --upgrade pip setuptools wheel
     displayName: 'Install tools'
-  - script: python -m pip install cython nose pytz pytest pickle5 vulture
+  - script: python -m pip install cython nose pytz pytest pickle5 vulture docutils sphinx numpydoc matplotlib
     displayName: 'Install dependencies; some are optional to avoid test skips'
   - script: /bin/bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/ | grep 'unreachable'"
     displayName: 'Check for unreachable code paths in Python modules'
@@ -76,12 +84,14 @@
       ATLAS: None
       ACCELERATE: None
       CC: /usr/bin/clang
+  - script: python runtests.py -g --refguide-check
+    displayName: 'Run Refuide Check'
   - script: python runtests.py --mode=full -- -rsx --junitxml=junit/test-results.xml
     displayName: 'Run Full NumPy Test Suite'
   - task: PublishTestResults@2
     inputs:
       testResultsFiles: '**/test-*.xml'
-      testRunTitle: 'Publish test results for Python $(python.version)'
+      testRunTitle: 'Publish test results for Python 3.6 64-bit full Mac OS'
 - job: Windows
   pool:
     vmIMage: 'VS2017-Win2016'
@@ -105,12 +115,6 @@
           TEST_MODE: fast
           OPENBLAS: $(OPENBLAS_32)
           BITS: 32
-        Python27-64bit-fast:
-          PYTHON_VERSION: '2.7'
-          PYTHON_ARCH: 'x64'
-          TEST_MODE: fast
-          OPENBLAS: $(OPENBLAS_64)
-          BITS: 64
         Python35-64bit-full:
           PYTHON_VERSION: '3.5'
           PYTHON_ARCH: 'x64'
@@ -137,14 +141,6 @@
       versionSpec: $(PYTHON_VERSION)
       addToPath: true
       architecture: $(PYTHON_ARCH)
-   # as noted by numba project, currently need
-   # specific VC install for Python 2.7
-  - powershell: |
-      $wc = New-Object net.webclient
-      $wc.Downloadfile("https://download.microsoft.com/download/7/9/6/796EF2E4-801B-4FC4-AB28-B59FBF6D907B/VCForPython27.msi", "VCForPython27.msi")
-      Start-Process "VCForPython27.msi" /qn -Wait
-    displayName: 'Install VC 9.0'
-    condition: eq(variables['PYTHON_VERSION'], '2.7')
   - script: python -m pip install --upgrade pip setuptools wheel
     displayName: 'Install tools'
   - powershell: |
@@ -187,4 +183,4 @@
   - task: PublishTestResults@2
     inputs:
       testResultsFiles: '**/test-*.xml'
-      testRunTitle: 'Publish test results for Python $(python.version)'
+      testRunTitle: 'Publish test results for Python $(PYTHON_VERSION) $(BITS)-bit $(TEST_MODE) Windows'
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index 26cffca..194ce32 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -97,8 +97,8 @@
 
 
 class CorrConv(Benchmark):
-    params = [[50, 1000, 1e5],
-              [10, 100, 1000, 1e4],
+    params = [[50, 1000, int(1e5)],
+              [10, 100, 1000, int(1e4)],
               ['valid', 'same', 'full']]
     param_names = ['size1', 'size2', 'mode']
 
diff --git a/benchmarks/benchmarks/bench_function_base.py b/benchmarks/benchmarks/bench_function_base.py
index 9ef0326..64e5786 100644
--- a/benchmarks/benchmarks/bench_function_base.py
+++ b/benchmarks/benchmarks/bench_function_base.py
@@ -96,35 +96,46 @@
 
 
 class Sort(Benchmark):
-    def setup(self):
-        self.e = np.arange(10000, dtype=np.float32)
-        self.o = np.arange(10001, dtype=np.float32)
+    params = [
+        ['quick', 'merge', 'heap'],
+        ['float32', 'int32', 'uint32']
+    ]
+    param_names = ['kind', 'dtype']
+
+    def setup(self, kind, dtype):
+        self.e = np.arange(10000, dtype=dtype)
+        self.o = np.arange(10001, dtype=dtype)
         np.random.seed(25)
         np.random.shuffle(self.o)
         # quicksort implementations can have issues with equal elements
-        self.equal = np.ones(10000)
-        self.many_equal = np.sort(np.arange(10000) % 10)
+        self.equal = np.ones(10000, dtype=dtype)
+        self.many_equal = np.sort(np.arange(10000) % 10).astype(dtype)
 
-    def time_sort(self):
-        np.sort(self.e)
+        try:
+            np.sort(self.e, kind=kind)
+        except TypeError:
+            raise NotImplementedError()
 
-    def time_sort_random(self):
-        np.sort(self.o)
+    def time_sort(self, kind, dtype):
+        np.sort(self.e, kind=kind)
 
-    def time_sort_inplace(self):
-        self.e.sort()
+    def time_sort_random(self, kind, dtype):
+        np.sort(self.o, kind=kind)
 
-    def time_sort_equal(self):
-        self.equal.sort()
+    def time_sort_inplace(self, kind, dtype):
+        self.e.sort(kind=kind)
 
-    def time_sort_many_equal(self):
-        self.many_equal.sort()
+    def time_sort_equal(self, kind, dtype):
+        self.equal.sort(kind=kind)
 
-    def time_argsort(self):
-        self.e.argsort()
+    def time_sort_many_equal(self, kind, dtype):
+        self.many_equal.sort(kind=kind)
 
-    def time_argsort_random(self):
-        self.o.argsort()
+    def time_argsort(self, kind, dtype):
+        self.e.argsort(kind=kind)
+
+    def time_argsort_random(self, kind, dtype):
+        self.o.argsort(kind=kind)
 
 
 class SortWorst(Benchmark):
diff --git a/benchmarks/benchmarks/bench_lib.py b/benchmarks/benchmarks/bench_lib.py
index e6c91a2..f65a96d 100644
--- a/benchmarks/benchmarks/bench_lib.py
+++ b/benchmarks/benchmarks/bench_lib.py
@@ -9,20 +9,109 @@
 
 
 class Pad(Benchmark):
-    """Benchmarks for `numpy.pad`."""
+    """Benchmarks for `numpy.pad`.
+
+    When benchmarking the pad function it is useful to cover scenarios where
+    the ratio between the size of the input array and the output array differs
+    significantly (original area vs. padded area). This allows to evaluate for
+    which scenario a padding algorithm is optimized. Furthermore involving
+    large range of array sizes ensures that the effects of CPU-bound caching is
+    visible.
+
+    The table below shows the sizes of the arrays involved in this benchmark:
+
+    +-----------------+----------+-----------+-----------+-----------------+
+    | shape           | original | padded: 1 | padded: 8 | padded: (0, 32) |
+    +=================+==========+===========+===========+=================+
+    | (2 ** 22,)      | 32 MiB   | 32.0 MiB  | 32.0 MiB  | 32.0 MiB        |
+    +-----------------+----------+-----------+-----------+-----------------+
+    | (1024, 1024)    | 8 MiB    | 8.03 MiB  | 8.25 MiB  | 8.51 MiB        |
+    +-----------------+----------+-----------+-----------+-----------------+
+    | (256, 256, 1)   | 256 KiB  | 786 KiB   | 5.08 MiB  | 11.6 MiB        |
+    +-----------------+----------+-----------+-----------+-----------------+
+    | (4, 4, 4, 4)    | 2 KiB    | 10.1 KiB  | 1.22 MiB  | 12.8 MiB        |
+    +-----------------+----------+-----------+-----------+-----------------+
+    | (1, 1, 1, 1, 1) | 8 B      | 1.90 MiB  | 10.8 MiB  | 299 MiB         |
+    +-----------------+----------+-----------+-----------+-----------------+
+    """
 
     param_names = ["shape", "pad_width", "mode"]
     params = [
-        [(1000,), (10, 100), (10, 10, 10)],
-        [1, 3, (0, 5)],
+        # Shape of the input arrays
+        [(2 ** 22,), (1024, 1024), (256, 128, 1),
+         (4, 4, 4, 4), (1, 1, 1, 1, 1)],
+        # Tested pad widths
+        [1, 8, (0, 32)],
+        # Tested modes: mean, median, minimum & maximum use the same code path
+        #               reflect & symmetric share a lot of their code path
         ["constant", "edge", "linear_ramp", "mean", "reflect", "wrap"],
     ]
 
     def setup(self, shape, pad_width, mode):
-        # avoid np.zeros or np.empty's lazy allocation.
-        # np.full causes pagefaults to occur during setup
-        # instead of during the benchmark
-        self.array = np.full(shape, 0)
+        # Make sure to fill the array to make the OS page fault
+        # in the setup phase and not the timed phase
+        self.array = np.full(shape, fill_value=1, dtype=np.float64)
 
     def time_pad(self, shape, pad_width, mode):
         np.pad(self.array, pad_width, mode)
+
+class Nan(Benchmark):
+    """Benchmarks for nan functions"""
+
+    param_names = ["array_size", "percent_nans"]
+    params = [
+            # sizes of the 1D arrays
+            [200, int(2e5)],
+            # percent of np.nan in arrays
+            [0, 0.1, 2., 50., 90.],
+            ]
+
+    def setup(self, array_size, percent_nans):
+        np.random.seed(123)
+        # produce a randomly shuffled array with the
+        # approximate desired percentage np.nan content
+        base_array = np.random.uniform(size=array_size)
+        base_array[base_array < percent_nans / 100.] = np.nan
+        self.arr = base_array
+
+    def time_nanmin(self, array_size, percent_nans):
+        np.nanmin(self.arr)
+
+    def time_nanmax(self, array_size, percent_nans):
+        np.nanmax(self.arr)
+
+    def time_nanargmin(self, array_size, percent_nans):
+        np.nanargmin(self.arr)
+
+    def time_nanargmax(self, array_size, percent_nans):
+        np.nanargmax(self.arr)
+
+    def time_nansum(self, array_size, percent_nans):
+        np.nansum(self.arr)
+
+    def time_nanprod(self, array_size, percent_nans):
+        np.nanprod(self.arr)
+
+    def time_nancumsum(self, array_size, percent_nans):
+        np.nancumsum(self.arr)
+
+    def time_nancumprod(self, array_size, percent_nans):
+        np.nancumprod(self.arr)
+
+    def time_nanmean(self, array_size, percent_nans):
+        np.nanmean(self.arr)
+
+    def time_nanvar(self, array_size, percent_nans):
+        np.nanvar(self.arr)
+
+    def time_nanstd(self, array_size, percent_nans):
+        np.nanstd(self.arr)
+
+    def time_nanmedian(self, array_size, percent_nans):
+        np.nanmedian(self.arr)
+
+    def time_nanquantile(self, array_size, percent_nans):
+        np.nanquantile(self.arr, q=0.2)
+
+    def time_nanpercentile(self, array_size, percent_nans):
+        np.nanpercentile(self.arr, q=50)
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index a65d510..5c44162 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -106,4 +106,4 @@
         self.b = get_indexes_rand()[:100].astype(np.float64)
 
     def time_numpy_linalg_lstsq_a__b_float64(self):
-        np.linalg.lstsq(self.a, self.b)
+        np.linalg.lstsq(self.a, self.b, rcond=-1)
diff --git a/benchmarks/benchmarks/bench_overrides.py b/benchmarks/benchmarks/bench_overrides.py
index 2cb94c9..58572d0 100644
--- a/benchmarks/benchmarks/bench_overrides.py
+++ b/benchmarks/benchmarks/bench_overrides.py
@@ -2,7 +2,15 @@
 
 from .common import Benchmark
 
-from numpy.core.overrides import array_function_dispatch
+try:
+    from numpy.core.overrides import array_function_dispatch
+except ImportError:
+    # Don't fail at import time with old Numpy versions
+    def array_function_dispatch(*args, **kwargs):
+        def wrap(*args, **kwargs):
+            return None
+        return wrap
+
 import numpy as np
 
 
@@ -16,10 +24,10 @@
 
 
 def _concatenate_dispatcher(arrays, axis=None, out=None):
-    for array in arrays:
-        yield array
     if out is not None:
-        yield out
+        arrays = list(arrays)
+        arrays.append(out)
+    return arrays
 
 
 @array_function_dispatch(_concatenate_dispatcher)
diff --git a/benchmarks/benchmarks/bench_records.py b/benchmarks/benchmarks/bench_records.py
new file mode 100644
index 0000000..41a6dd7
--- /dev/null
+++ b/benchmarks/benchmarks/bench_records.py
@@ -0,0 +1,43 @@
+from __future__ import absolute_import, division, print_function
+import os
+
+from .common import Benchmark
+
+import numpy as np
+
+
+class Records(Benchmark):
+    def setup(self):
+        self.l50 = np.arange(1000)
+        self.fields_number = 10000
+        self.arrays = [self.l50 for _ in range(self.fields_number)]
+        self.formats = [self.l50.dtype.str for _ in range(self.fields_number)]
+        self.formats_str = ','.join(self.formats)
+        self.dtype_ = np.dtype(
+            [
+                ('field_{}'.format(i), self.l50.dtype.str)
+                for i in range(self.fields_number)
+            ]
+        )
+        self.buffer = self.l50.tostring() * self.fields_number
+
+    def time_fromarrays_w_dtype(self):
+        np.core.records.fromarrays(self.arrays, dtype=self.dtype_)
+
+    def time_fromarrays_wo_dtype(self):
+        np.core.records.fromarrays(self.arrays)
+
+    def time_fromarrays_formats_as_list(self):
+        np.core.records.fromarrays(self.arrays, formats=self.formats)
+
+    def time_fromarrays_formats_as_string(self):
+        np.core.records.fromarrays(self.arrays, formats=self.formats_str)
+
+    def time_fromstring_w_dtype(self):
+        np.core.records.fromstring(self.buffer, dtype=self.dtype_)
+
+    def time_fromstring_formats_as_list(self):
+        np.core.records.fromstring(self.buffer, formats=self.formats)
+
+    def time_fromstring_formats_as_string(self):
+        np.core.records.fromstring(self.buffer, formats=self.formats_str)
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index a7e385f..62e7078 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -15,7 +15,7 @@
           'isinf', 'isnan', 'isnat', 'lcm', 'ldexp', 'left_shift', 'less',
           'less_equal', 'log', 'log10', 'log1p', 'log2', 'logaddexp',
           'logaddexp2', 'logical_and', 'logical_not', 'logical_or',
-          'logical_xor', 'maximum', 'minimum', 'mod', 'modf', 'multiply',
+          'logical_xor', 'matmul', 'maximum', 'minimum', 'mod', 'modf', 'multiply',
           'negative', 'nextafter', 'not_equal', 'positive', 'power',
           'rad2deg', 'radians', 'reciprocal', 'remainder', 'right_shift',
           'rint', 'sign', 'signbit', 'sin', 'sinh', 'spacing', 'sqrt',
diff --git a/doc/Makefile b/doc/Makefile
index 667dbef..d61d115 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -1,7 +1,7 @@
 # Makefile for Sphinx documentation
 #
 
-PYVER = 3.6
+PYVER = 3
 PYTHON = python$(PYVER)
 
 # You can set these variables from the command line.
diff --git a/doc/RELEASE_WALKTHROUGH.rst.txt b/doc/RELEASE_WALKTHROUGH.rst.txt
index 960bb3f..79a296f 100644
--- a/doc/RELEASE_WALKTHROUGH.rst.txt
+++ b/doc/RELEASE_WALKTHROUGH.rst.txt
@@ -6,6 +6,11 @@
 Release  Walkthrough
 ====================
 
+Note that in the code snippets below, ``upstream`` refers to the root repository on
+github and ``origin`` to a fork in your personal account. You may need to make adjustments
+if you have not forked the repository but simply cloned it locally. You can
+also edit ``.git/config`` and add ``upstream`` if it isn't already present.
+
 
 Backport Pull Requests
 ----------------------
@@ -55,7 +60,7 @@
 
 Sanity check::
 
-    $ python runtests.py -m "full"
+    $ python runtests.py -m "full"  # NumPy < 1.17 only
     $ python3 runtests.py -m "full"
 
 Push this release directly onto the end of the maintenance branch. This
@@ -86,7 +91,7 @@
 may have been accessed and changed by someone else and a push will fail::
 
     $ cd ../numpy-wheels
-    $ git pull origin master
+    $ git pull upstream master
     $ git branch <new version>  # only when starting new numpy version
     $ git checkout v1.14.x  # v1.14.x already existed for the 1.14.4 release
 
@@ -96,7 +101,7 @@
 
     $ gvim .travis.yml .appveyor.yml
     $ git commit -a
-    $ git push origin HEAD
+    $ git push upstream HEAD
 
 Now wait. If you get nervous at the amount of time taken -- the builds can take
 several hours-- you can check the build progress by following the links
@@ -121,7 +126,7 @@
 upload later using ``twine``::
 
     $ cd ../terryfy
-    $ git pull origin master
+    $ git pull upstream master
     $ CDN_URL=https://3f23b170c54c2533c070-1c8a9b3114517dc5fe17b7c3f8c63a43.ssl.cf2.rackcdn.com
     $ NPY_WHLS=../numpy/release/installers
     $ ./wheel-uploader -u $CDN_URL -n -v -w $NPY_WHLS -t win numpy 1.14.5
@@ -135,7 +140,7 @@
 -------------------------
 
 This needs to be done after all installers are present, but before the pavement
-file is updated for continued development.
+file is updated for continued development::
 
     $ cd ../numpy
     $ paver write_release
@@ -158,15 +163,15 @@
 
     $ git push upstream v1.14.5
 
-We wait until this point to push the tag because it is very difficult to change
-the tag after it has been pushed.
+We wait until this point to push the tag because it is public and should not
+be changed after it has been pushed.
 
 
 Reset the maintenance branch into a development state
 -----------------------------------------------------
 
 Add another ``REL`` commit to the numpy maintenance branch, which resets the
-``ISREALEASED`` flag to ``False`` and increments the version counter::
+``ISREALEASED`` flag to ``False`` and increments the version counter.::
 
     $ gvim pavement.py setup.py
     $ git commit -a -m"REL: prepare 1.14.x for further development"
@@ -177,7 +182,7 @@
 --------------
 
 Upload to PyPI using ``twine``. A recent version of ``twine`` of is needed
-after recent PyPI changes, version ``1.11.0`` was used here. ::
+after recent PyPI changes, version ``1.11.0`` was used here.::
 
     $ cd ../numpy
     $ twine upload release/installers/*.whl
@@ -251,8 +256,9 @@
 
 The release should be announced on the numpy-discussion, scipy-devel,
 scipy-user, and python-announce-list mailing lists. Look at previous
-announcements for the basic template. The contributor and PR lists
-are the same as generated for the release notes above.
+announcements for the basic template. The contributor and PR lists are the same
+as generated for the release notes above. If you crosspost, make sure that
+python-announce-list is BCC so that replies will not be sent to that list.
 
 
 Post-Release Tasks
diff --git a/doc/TESTS.rst.txt b/doc/TESTS.rst.txt
index 5fe0be1..daf82aa 100644
--- a/doc/TESTS.rst.txt
+++ b/doc/TESTS.rst.txt
@@ -120,15 +120,6 @@
 suite with ``verbose=2`` (or similar verbosity setting).  Use plain comments
 (``#``) if necessary.
 
-Sometimes it is convenient to run ``test_yyy.py`` by itself, so we add
-
-::
-
-  if __name__ == "__main__":
-      run_module_suite()
-
-at the bottom.
-
 Labeling tests 
 --------------
 
diff --git a/doc/changelog/1.16.0-changelog.rst b/doc/changelog/1.16.0-changelog.rst
new file mode 100644
index 0000000..8aca5e6
--- /dev/null
+++ b/doc/changelog/1.16.0-changelog.rst
@@ -0,0 +1,616 @@
+
+Contributors
+============
+
+A total of 113 people contributed to this release.  People with a "+" by their
+names contributed a patch for the first time.
+
+* Alan Fontenot +
+* Allan Haldane
+* Alon Hershenhorn +
+* Alyssa Quek +
+* Andreas Nussbaumer +
+* Anner +
+* Anthony Sottile +
+* Antony Lee
+* Ayappan P +
+* Bas van Schaik +
+* C.A.M. Gerlach +
+* Charles Harris
+* Chris Billington
+* Christian Clauss
+* Christoph Gohlke
+* Christopher Pezley +
+* Daniel B Allan +
+* Daniel Smith
+* Dawid Zych +
+* Derek Kim +
+* Dima Pasechnik +
+* Edgar Giovanni Lepe +
+* Elena Mokeeva +
+* Elliott Sales de Andrade +
+* Emil Hessman +
+* Eric Larson
+* Eric Schles +
+* Eric Wieser
+* Giulio Benetti +
+* Guillaume Gautier +
+* Guo Ci
+* Heath Henley +
+* Isuru Fernando +
+* J. Lewis Muir +
+* Jack Vreeken +
+* Jaime Fernandez
+* James Bourbeau
+* Jeff VanOss
+* Jeffrey Yancey +
+* Jeremy Chen +
+* Jeremy Manning +
+* Jeroen Demeyer
+* John Darbyshire +
+* John Kirkham
+* John Zwinck
+* Jonas Jensen +
+* Joscha Reimer +
+* Juan Azcarreta +
+* Julian Taylor
+* Kevin Sheppard
+* Krzysztof Chomski +
+* Kyle Sunden
+* Lars Grüter
+* Lilian Besson +
+* MSeifert04
+* Mark Harfouche
+* Marten van Kerkwijk
+* Martin Thoma
+* Matt Harrigan +
+* Matthew Bowden +
+* Matthew Brett
+* Matthias Bussonnier
+* Matti Picus
+* Max Aifer +
+* Michael Hirsch, Ph.D +
+* Michael James Jamie  Schnaitter +
+* MichaelSaah +
+* Mike Toews
+* Minkyu Lee +
+* Mircea Akos Bruma +
+* Mircea-Akos Brumă +
+* Moshe Looks +
+* Muhammad Kasim +
+* Nathaniel J. Smith
+* Nikita Titov +
+* Paul Müller +
+* Paul van Mulbregt
+* Pauli Virtanen
+* Pierre Glaser +
+* Pim de Haan
+* Ralf Gommers
+* Robert Kern
+* Robin Aggleton +
+* Rohit Pandey +
+* Roman Yurchak +
+* Ryan Soklaski
+* Sebastian Berg
+* Sho Nakamura +
+* Simon Gibbons
+* Stan Seibert +
+* Stefan Otte
+* Stefan van der Walt
+* Stephan Hoyer
+* Stuart Archibald
+* Taylor Smith +
+* Tim Felgentreff +
+* Tim Swast +
+* Tim Teichmann +
+* Toshiki Kataoka
+* Travis Oliphant
+* Tyler Reddy
+* Uddeshya Singh +
+* Warren Weckesser
+* Weitang Li +
+* Wenjamin Petrenko +
+* William D. Irons
+* Yannick Jadoul +
+* Yaroslav Halchenko
+* Yug Khanna +
+* Yuji Kanagawa +
+* Yukun Guo +
+* @ankokumoyashi +
+* @lerbuke +
+
+Pull requests merged
+====================
+
+A total of 490 pull requests were merged for this release.
+
+* `#6256 <https://github.com/numpy/numpy/pull/6256>`__: NEP: Add proposal for oindex and vindex.
+* `#6377 <https://github.com/numpy/numpy/pull/6377>`__: BUG: define "uint-alignment", fixes complex64 alignment
+* `#8206 <https://github.com/numpy/numpy/pull/8206>`__: ENH: add padding options to diff
+* `#8923 <https://github.com/numpy/numpy/pull/8923>`__: ENH: Add 'stone' estimator to np.histogram
+* `#8955 <https://github.com/numpy/numpy/pull/8955>`__: ENH: Allow ufunc.identity to be any python object
+* `#9022 <https://github.com/numpy/numpy/pull/9022>`__: BUG: don't silence `__array_wrap__` errors in `ufunc.reduce`
+* `#10551 <https://github.com/numpy/numpy/pull/10551>`__: BUG: memmap close files when it shouldn't, load leaves them open...
+* `#10602 <https://github.com/numpy/numpy/pull/10602>`__: MAINT: Move dtype string functions to python
+* `#10704 <https://github.com/numpy/numpy/pull/10704>`__: NEP 15: Merging multiarray and umath
+* `#10797 <https://github.com/numpy/numpy/pull/10797>`__: DEP: Updated `unravel_index()` to support `shape` kwarg
+* `#10915 <https://github.com/numpy/numpy/pull/10915>`__: ENH: implement nep 0015: merge multiarray and umath
+* `#10998 <https://github.com/numpy/numpy/pull/10998>`__: DOC: removed spurious FIXME comment in number.c
+* `#11002 <https://github.com/numpy/numpy/pull/11002>`__: MAINT: add clearer message to assist users with failed builds.
+* `#11016 <https://github.com/numpy/numpy/pull/11016>`__: ENH: Add AARCH32 support.
+* `#11084 <https://github.com/numpy/numpy/pull/11084>`__: DOC: link to TESTS.rst.txt testing guidelines, tweak testing...
+* `#11119 <https://github.com/numpy/numpy/pull/11119>`__: ENH: Chain exceptions to give better error messages for invalid...
+* `#11175 <https://github.com/numpy/numpy/pull/11175>`__: ENH: Generalized ufunc signature expansion for frozen and flexible...
+* `#11197 <https://github.com/numpy/numpy/pull/11197>`__: BUG/ENH: Removed non-standard scaling of the covariance matrix...
+* `#11234 <https://github.com/numpy/numpy/pull/11234>`__: DOC: Update einsum docs
+* `#11282 <https://github.com/numpy/numpy/pull/11282>`__: MAINT: move comparison operator special-handling out of ufunc...
+* `#11297 <https://github.com/numpy/numpy/pull/11297>`__: NEP: Expansion of gufunc signatures.
+* `#11299 <https://github.com/numpy/numpy/pull/11299>`__: BUG: Prevent crashes on 0-length structured void scalars
+* `#11303 <https://github.com/numpy/numpy/pull/11303>`__: DOC: revision of NEP-18 (`__array_function__`)
+* `#11312 <https://github.com/numpy/numpy/pull/11312>`__: WIP: DOC: slightly tweak the directions to create a release
+* `#11318 <https://github.com/numpy/numpy/pull/11318>`__: REL: Setup master for 1.16 development.
+* `#11323 <https://github.com/numpy/numpy/pull/11323>`__: DEP: Actually deprecate the normed argument to histogram
+* `#11324 <https://github.com/numpy/numpy/pull/11324>`__: MAINT: Don't use dtype strings when the dtypes themselves can...
+* `#11326 <https://github.com/numpy/numpy/pull/11326>`__: DOC: Update master after NumPy 1.14.5 release.
+* `#11328 <https://github.com/numpy/numpy/pull/11328>`__: MAINT: Misc numeric cleanup
+* `#11335 <https://github.com/numpy/numpy/pull/11335>`__: DOC: Change array lengths/entries in `broadcast_arrays` example...
+* `#11336 <https://github.com/numpy/numpy/pull/11336>`__: BUG: decref in failure path; replace `PyObject_Type` by `Py_TYPE`
+* `#11338 <https://github.com/numpy/numpy/pull/11338>`__: MAINT: Ensure ufunc override call each class only once, plus...
+* `#11340 <https://github.com/numpy/numpy/pull/11340>`__: BUG: sctypeDict['f8'] randomly points to double or longdouble...
+* `#11345 <https://github.com/numpy/numpy/pull/11345>`__: BUG/ENH: Einsum optimization path updates and bug fixes.
+* `#11347 <https://github.com/numpy/numpy/pull/11347>`__: DOC: Silence many sphinx warnings
+* `#11348 <https://github.com/numpy/numpy/pull/11348>`__: ENH: Improve support for pathlib.Path objects in load functions
+* `#11349 <https://github.com/numpy/numpy/pull/11349>`__: DOC: document new functions
+* `#11351 <https://github.com/numpy/numpy/pull/11351>`__: MAINT: Improve speed of ufunc kwargs parsing
+* `#11353 <https://github.com/numpy/numpy/pull/11353>`__: DOC, MAINT: HTTP -> HTTPS, and other linkrot fixes
+* `#11356 <https://github.com/numpy/numpy/pull/11356>`__: NEP: Update NEP 19: RNG Policy
+* `#11357 <https://github.com/numpy/numpy/pull/11357>`__: MAINT: Add new `_test.c` files and `benchmarks/html` to `gitignore`
+* `#11365 <https://github.com/numpy/numpy/pull/11365>`__: BUG: add missing NpyIter_Close in einsum
+* `#11366 <https://github.com/numpy/numpy/pull/11366>`__: BUG/TST: String indexing should just fail, not emit a futurewarning
+* `#11371 <https://github.com/numpy/numpy/pull/11371>`__: DOC: Clarify requirement that histogram bins are monotonic.
+* `#11373 <https://github.com/numpy/numpy/pull/11373>`__: TST: Show that histogramdd's normed argument is histogram's density
+* `#11374 <https://github.com/numpy/numpy/pull/11374>`__: WIP: additional revision for NEP-18 (`__array_function__`)
+* `#11376 <https://github.com/numpy/numpy/pull/11376>`__: ENH: Remove NpyIter_Close
+* `#11379 <https://github.com/numpy/numpy/pull/11379>`__: BUG: changed hardcoded axis to 0 for checking indices
+* `#11382 <https://github.com/numpy/numpy/pull/11382>`__: DEP: deprecate undocumented, unused dtype type dicts
+* `#11383 <https://github.com/numpy/numpy/pull/11383>`__: ENH: Allow size=0 in numpy.random.choice
+* `#11385 <https://github.com/numpy/numpy/pull/11385>`__: BUG: Make scalar.squeeze accept axis arg
+* `#11390 <https://github.com/numpy/numpy/pull/11390>`__: REL,MAINT: Update numpyconfig.h for 1.15.
+* `#11391 <https://github.com/numpy/numpy/pull/11391>`__: MAINT: Update mailmap
+* `#11396 <https://github.com/numpy/numpy/pull/11396>`__: TST: Added regression test for #11395
+* `#11405 <https://github.com/numpy/numpy/pull/11405>`__: BUG: Ensure comparisons on scalar strings pass without warning.
+* `#11406 <https://github.com/numpy/numpy/pull/11406>`__: BUG: Ensure out is returned in einsum.
+* `#11409 <https://github.com/numpy/numpy/pull/11409>`__: DOC: Update testing section of README.
+* `#11414 <https://github.com/numpy/numpy/pull/11414>`__: DOC: major revision of NEP 21, advanced indexing
+* `#11422 <https://github.com/numpy/numpy/pull/11422>`__: BENCH: Add benchmarks for np.loadtxt reading from CSV format...
+* `#11424 <https://github.com/numpy/numpy/pull/11424>`__: ENH: Allow use of svd on empty arrays
+* `#11425 <https://github.com/numpy/numpy/pull/11425>`__: DOC: Clear up confusion between np.where(cond) and np.where(cond,...
+* `#11428 <https://github.com/numpy/numpy/pull/11428>`__: BUG: Fix incorrect deprecation logic for histogram(normed=...)...
+* `#11429 <https://github.com/numpy/numpy/pull/11429>`__: NEP: accept NEP 20 partially (frozen, flexible, but not broadcastable...
+* `#11432 <https://github.com/numpy/numpy/pull/11432>`__: MAINT: Refactor differences between cblas_matrixproduct and PyArray_MatrixProduct2
+* `#11434 <https://github.com/numpy/numpy/pull/11434>`__: MAINT: add PyPI classifier for Python 3.7
+* `#11436 <https://github.com/numpy/numpy/pull/11436>`__: DOC: Document average return type
+* `#11440 <https://github.com/numpy/numpy/pull/11440>`__: BUG: fix interpolation with inf and NaN present
+* `#11444 <https://github.com/numpy/numpy/pull/11444>`__: DOC: Fix documentation for fromfunction
+* `#11449 <https://github.com/numpy/numpy/pull/11449>`__: BUG: Revert #10229 to fix DLL loads on Windows.
+* `#11450 <https://github.com/numpy/numpy/pull/11450>`__: MAINT/DEP: properly implement `ndarray.__pos__`
+* `#11453 <https://github.com/numpy/numpy/pull/11453>`__: BENCH: add ufunc argument parsing benchmarks.
+* `#11455 <https://github.com/numpy/numpy/pull/11455>`__: BENCH: belated addition of lcm, gcd to ufunc benchmark.
+* `#11459 <https://github.com/numpy/numpy/pull/11459>`__: NEP: Add some text to NEP 0 to clarify how a NEP is accepted
+* `#11461 <https://github.com/numpy/numpy/pull/11461>`__: MAINT: Add discussion link to NEP 15
+* `#11462 <https://github.com/numpy/numpy/pull/11462>`__: Add NEP 22, a high level overview for the duck array work
+* `#11463 <https://github.com/numpy/numpy/pull/11463>`__: MAINT: Produce a more readable repr of argument packs in benchmark
+* `#11464 <https://github.com/numpy/numpy/pull/11464>`__: BUG: Don't convert inputs to `np.float64` in digitize
+* `#11468 <https://github.com/numpy/numpy/pull/11468>`__: BUG: Advanced indexing assignment incorrectly took 1-D fastpath
+* `#11470 <https://github.com/numpy/numpy/pull/11470>`__: BLD: Don't leave the build task running if runtests.py is interrupted
+* `#11471 <https://github.com/numpy/numpy/pull/11471>`__: MAINT: Remove python-side docstrings from add_newdocs.
+* `#11472 <https://github.com/numpy/numpy/pull/11472>`__: DOC: include NEP number on each NEP page
+* `#11473 <https://github.com/numpy/numpy/pull/11473>`__: MAINT: Move pytesttester outside of np.testing, to avoid creating...
+* `#11474 <https://github.com/numpy/numpy/pull/11474>`__: MAINT: Move add_newdocs into core, since it only adds docs to...
+* `#11479 <https://github.com/numpy/numpy/pull/11479>`__: BUG: Fix #define for ppc64 and ppc64le
+* `#11480 <https://github.com/numpy/numpy/pull/11480>`__: MAINT: move ufunc override code to umath and multiarray as much...
+* `#11482 <https://github.com/numpy/numpy/pull/11482>`__: DOC: Include warning in np.resize() docs
+* `#11484 <https://github.com/numpy/numpy/pull/11484>`__: BUG: Increase required cython version on python 3.7
+* `#11487 <https://github.com/numpy/numpy/pull/11487>`__: DOC: extend sanity check message
+* `#11488 <https://github.com/numpy/numpy/pull/11488>`__: NEP: clarify bugfix policy for legacy RandomState.
+* `#11501 <https://github.com/numpy/numpy/pull/11501>`__: MAINT: Tidy cython invocation
+* `#11503 <https://github.com/numpy/numpy/pull/11503>`__: MAINT: improve error message for isposinf and isneginf on complex...
+* `#11512 <https://github.com/numpy/numpy/pull/11512>`__: DOC: Add templates for issues and PRs
+* `#11514 <https://github.com/numpy/numpy/pull/11514>`__: Prefer the same-python cython to the on-PATH cython
+* `#11515 <https://github.com/numpy/numpy/pull/11515>`__: BUG: decref of field title caused segfault
+* `#11518 <https://github.com/numpy/numpy/pull/11518>`__: MAINT: Speed up normalize_axis_tuple by about 30%
+* `#11522 <https://github.com/numpy/numpy/pull/11522>`__: BUG: fix np.load() of empty .npz file
+* `#11525 <https://github.com/numpy/numpy/pull/11525>`__: MAINT: Append `*FLAGS` instead of overriding
+* `#11526 <https://github.com/numpy/numpy/pull/11526>`__: ENH: add multi-field assignment helpers in np.lib.recfunctions
+* `#11527 <https://github.com/numpy/numpy/pull/11527>`__: DOC: Note that method is the polar form of Box-Muller.
+* `#11528 <https://github.com/numpy/numpy/pull/11528>`__: ENH: Add support for ipython latex printing to polynomial
+* `#11531 <https://github.com/numpy/numpy/pull/11531>`__: ENH: Add density argument to histogramdd.
+* `#11533 <https://github.com/numpy/numpy/pull/11533>`__: DOC: Fixed example code for cheb2poly and poly2cheb (see #11519)
+* `#11534 <https://github.com/numpy/numpy/pull/11534>`__: DOC: Minor improvements to np.concatenate docstring
+* `#11535 <https://github.com/numpy/numpy/pull/11535>`__: MAINT: Improve memory usage in PEP3118 format parsing
+* `#11553 <https://github.com/numpy/numpy/pull/11553>`__: DOC: Tiny typo on numpy/reference/arrays.dtypes.html
+* `#11556 <https://github.com/numpy/numpy/pull/11556>`__: BUG: Make assert_string_equal check str equality simply without...
+* `#11559 <https://github.com/numpy/numpy/pull/11559>`__: NEP: accept nep 0015
+* `#11560 <https://github.com/numpy/numpy/pull/11560>`__: NEP: accept nep 0019
+* `#11562 <https://github.com/numpy/numpy/pull/11562>`__: DOC: update release notes for LDFLAGS append behavior (gh-11525).
+* `#11565 <https://github.com/numpy/numpy/pull/11565>`__: MAINT: convert the doctests for polynomial to regular tests
+* `#11566 <https://github.com/numpy/numpy/pull/11566>`__: BLD: Do not use gcc warnings flags when 'gcc' is actually clang.
+* `#11567 <https://github.com/numpy/numpy/pull/11567>`__: TST: Integrate codecov testing
+* `#11568 <https://github.com/numpy/numpy/pull/11568>`__: BLD: Modify cpu detection and printing to get working aarch64...
+* `#11571 <https://github.com/numpy/numpy/pull/11571>`__: DOC: Updated array2string description
+* `#11572 <https://github.com/numpy/numpy/pull/11572>`__: DOC: Updated Slice Description
+* `#11573 <https://github.com/numpy/numpy/pull/11573>`__: TST: add broadcast_arrays() kwarg unit test for TypeError
+* `#11580 <https://github.com/numpy/numpy/pull/11580>`__: MAINT: refactor ufunc iter operand flags handling
+* `#11591 <https://github.com/numpy/numpy/pull/11591>`__: MAINT: update runtests.py node id example for pytest usage
+* `#11592 <https://github.com/numpy/numpy/pull/11592>`__: DOC: add Stefan van der Walt to Steering Council
+* `#11593 <https://github.com/numpy/numpy/pull/11593>`__: ENH: handle empty matrices in qr decomposition
+* `#11594 <https://github.com/numpy/numpy/pull/11594>`__: ENH: support for empty matrices in linalg.lstsq
+* `#11595 <https://github.com/numpy/numpy/pull/11595>`__: BUG:warn on Nan in minimum,maximum for scalars, float16
+* `#11596 <https://github.com/numpy/numpy/pull/11596>`__: NEP: backwards compatibility and deprecation policy
+* `#11598 <https://github.com/numpy/numpy/pull/11598>`__: TST: Add Python 3.7 to CI testing
+* `#11601 <https://github.com/numpy/numpy/pull/11601>`__: BUG: Make np.array([[1], 2]) and np.array([1, [2]]) behave in...
+* `#11606 <https://github.com/numpy/numpy/pull/11606>`__: DOC: Post 1.15.0 release updates for master.
+* `#11607 <https://github.com/numpy/numpy/pull/11607>`__: DOC: minor clarification and typo fix to NEP 21 (outer indexing).
+* `#11610 <https://github.com/numpy/numpy/pull/11610>`__: TST: including C source line coverage for CI / codecov
+* `#11611 <https://github.com/numpy/numpy/pull/11611>`__: NEP: Add roadmap section and subdocuments to NEPs
+* `#11613 <https://github.com/numpy/numpy/pull/11613>`__: BUG: have geometric() raise ValueError on p=0
+* `#11615 <https://github.com/numpy/numpy/pull/11615>`__: BUG: Clip uses wrong memory order in output
+* `#11616 <https://github.com/numpy/numpy/pull/11616>`__: DOC: add a brief note on "Protocols for methods" to NEP 18
+* `#11621 <https://github.com/numpy/numpy/pull/11621>`__: DOC: Use "real symmetric" rather than "symmetric" in ``eigh``...
+* `#11626 <https://github.com/numpy/numpy/pull/11626>`__: DOC: Show plot in meshgrid example.
+* `#11630 <https://github.com/numpy/numpy/pull/11630>`__: DOC: Include the versionadded to the isnat documentation.
+* `#11634 <https://github.com/numpy/numpy/pull/11634>`__: MAINT: Filter Cython warnings in `__init__.py`
+* `#11637 <https://github.com/numpy/numpy/pull/11637>`__: ENH: np.angle: Remove unnecessary multiplication, and allow subclasses...
+* `#11638 <https://github.com/numpy/numpy/pull/11638>`__: ENH: Make expand_dims work on subclasses
+* `#11642 <https://github.com/numpy/numpy/pull/11642>`__: BUG: Fixes for unicode field names in Python 2
+* `#11643 <https://github.com/numpy/numpy/pull/11643>`__: DOC: Insert up to date link to Spyder website in Dev Env doc...
+* `#11644 <https://github.com/numpy/numpy/pull/11644>`__: BUG: Fix doc source links to unwrap decorators
+* `#11652 <https://github.com/numpy/numpy/pull/11652>`__: BUG: Ensure singleton dimensions are not dropped when converting...
+* `#11660 <https://github.com/numpy/numpy/pull/11660>`__: ENH: Add Nan warnings for maximum, minimum on more dtypes
+* `#11669 <https://github.com/numpy/numpy/pull/11669>`__: BUG: Fix regression in `void_getitem`
+* `#11670 <https://github.com/numpy/numpy/pull/11670>`__: MAINT: trivially refactor mapped indexing
+* `#11673 <https://github.com/numpy/numpy/pull/11673>`__: DOC: Add geomspace to "See also" of linspace
+* `#11679 <https://github.com/numpy/numpy/pull/11679>`__: TST: ignore setup.py files for codecov reports
+* `#11688 <https://github.com/numpy/numpy/pull/11688>`__: DOC: Update broadcasting doc with current exception details
+* `#11691 <https://github.com/numpy/numpy/pull/11691>`__: BUG: Make matrix_power again work for object arrays.
+* `#11692 <https://github.com/numpy/numpy/pull/11692>`__: MAINT: Remove duplicate code.
+* `#11693 <https://github.com/numpy/numpy/pull/11693>`__: NEP: Mark NEP 18 as accepted
+* `#11694 <https://github.com/numpy/numpy/pull/11694>`__: BUG: Fix pickle and memoryview for datetime64, timedelta64 scalars
+* `#11695 <https://github.com/numpy/numpy/pull/11695>`__: BUG: Add missing PyErr_NoMemory after failing malloc
+* `#11703 <https://github.com/numpy/numpy/pull/11703>`__: MAINT: Remove np.pkgload, which seems to be unusable anyway
+* `#11708 <https://github.com/numpy/numpy/pull/11708>`__: BUG: Fix regression in np.loadtxt for bz2 text files in Python...
+* `#11710 <https://github.com/numpy/numpy/pull/11710>`__: BUG: Check for compiler used in env['CC'], then config_vars['CC']
+* `#11711 <https://github.com/numpy/numpy/pull/11711>`__: BUG: Fix undefined functions on big-endian systems.
+* `#11715 <https://github.com/numpy/numpy/pull/11715>`__: TST: Fix urlopen stubbing.
+* `#11717 <https://github.com/numpy/numpy/pull/11717>`__: MAINT: Make einsum optimize default to False.
+* `#11718 <https://github.com/numpy/numpy/pull/11718>`__: BUG: Revert use of `console_scripts`.
+* `#11722 <https://github.com/numpy/numpy/pull/11722>`__: MAINT: Remove duplicate docstring and correct location of `__all__`...
+* `#11725 <https://github.com/numpy/numpy/pull/11725>`__: BUG: Fix Fortran kind detection for aarch64 & s390x.
+* `#11727 <https://github.com/numpy/numpy/pull/11727>`__: BUG: Fix printing of longdouble on ppc64le.
+* `#11729 <https://github.com/numpy/numpy/pull/11729>`__: DOC: fix capitalization of kilojoules
+* `#11731 <https://github.com/numpy/numpy/pull/11731>`__: DOC: fix typo in vectorize docstring
+* `#11733 <https://github.com/numpy/numpy/pull/11733>`__: DOC: recommend polynomial.Polynomial over np.polyfit
+* `#11735 <https://github.com/numpy/numpy/pull/11735>`__: BUG: Fix test sensitive to platform byte order.
+* `#11738 <https://github.com/numpy/numpy/pull/11738>`__: TST, MAINT: add lgtm.yml to tweak LGTM.com analysis
+* `#11739 <https://github.com/numpy/numpy/pull/11739>`__: BUG: disallow setting flag to writeable after fromstring, frombuffer
+* `#11740 <https://github.com/numpy/numpy/pull/11740>`__: BUG: Deprecation triggers segfault
+* `#11742 <https://github.com/numpy/numpy/pull/11742>`__: DOC: Reduce warnings and cleanup redundant c-api documentation
+* `#11745 <https://github.com/numpy/numpy/pull/11745>`__: DOC: Small docstring fixes for old polyfit.
+* `#11754 <https://github.com/numpy/numpy/pull/11754>`__: BUG: check return value of `_buffer_format_string`
+* `#11755 <https://github.com/numpy/numpy/pull/11755>`__: MAINT: Fix typos in random.hypergeometric's notes
+* `#11756 <https://github.com/numpy/numpy/pull/11756>`__: MAINT: Make assert_array_compare more generic.
+* `#11765 <https://github.com/numpy/numpy/pull/11765>`__: DOC: Move documentation from `help(ndarray.ctypes)` to `help(some_array.ctypes)`
+* `#11771 <https://github.com/numpy/numpy/pull/11771>`__: BUG: Make `random.shuffle` work on 1-D instances of `ndarray`...
+* `#11774 <https://github.com/numpy/numpy/pull/11774>`__: BUG: Fix regression in intersect1d.
+* `#11778 <https://github.com/numpy/numpy/pull/11778>`__: BUG: Avoid signed overflow in histogram
+* `#11783 <https://github.com/numpy/numpy/pull/11783>`__: MAINT: check `_append_char` return value
+* `#11784 <https://github.com/numpy/numpy/pull/11784>`__: MAINT: reformat line spacing before test methods
+* `#11797 <https://github.com/numpy/numpy/pull/11797>`__: DOC: Update docs after 1.15.1 release.
+* `#11800 <https://github.com/numpy/numpy/pull/11800>`__: DOC: document use when f2py is not in the PATH
+* `#11802 <https://github.com/numpy/numpy/pull/11802>`__: ENH: Use entry_points to install the f2py scripts.
+* `#11805 <https://github.com/numpy/numpy/pull/11805>`__: BUG: add type cast check for ediff1d
+* `#11806 <https://github.com/numpy/numpy/pull/11806>`__: DOC: Polybase augmented assignment notes
+* `#11812 <https://github.com/numpy/numpy/pull/11812>`__: DOC: edit setup.py docstring that is displayed on PyPI.
+* `#11813 <https://github.com/numpy/numpy/pull/11813>`__: BUG: fix array_split incorrect behavior with array size bigger...
+* `#11814 <https://github.com/numpy/numpy/pull/11814>`__: DOC, MAINT: Fixes for errstate() and README.md documentation.
+* `#11817 <https://github.com/numpy/numpy/pull/11817>`__: DOC: add examples and extend existing dos for polynomial subclasses
+* `#11818 <https://github.com/numpy/numpy/pull/11818>`__: TST: add missing tests for all polynomial subclass pow fns.
+* `#11823 <https://github.com/numpy/numpy/pull/11823>`__: TST: add test for array2string unexpected kwarg
+* `#11830 <https://github.com/numpy/numpy/pull/11830>`__: MAINT: reduce void type repr code duplication
+* `#11834 <https://github.com/numpy/numpy/pull/11834>`__: MAINT, DOC: Replace 'an' by 'a' in some docstrings.
+* `#11837 <https://github.com/numpy/numpy/pull/11837>`__: DOC: Make clear the connection between numpy types and C types
+* `#11840 <https://github.com/numpy/numpy/pull/11840>`__: BUG: Let 0-D arrays of Python timedelta convert to np.timedelta64.
+* `#11843 <https://github.com/numpy/numpy/pull/11843>`__: MAINT: remove surviving, unused, list comprehension
+* `#11849 <https://github.com/numpy/numpy/pull/11849>`__: TST: reorder duplicate mem_overlap.c compile
+* `#11850 <https://github.com/numpy/numpy/pull/11850>`__: DOC: add comment to remove fn after python 2 support is dropped
+* `#11852 <https://github.com/numpy/numpy/pull/11852>`__: BUG: timedelta64 now accepts NumPy ints
+* `#11858 <https://github.com/numpy/numpy/pull/11858>`__: DOC: add docstrings for numeric types
+* `#11862 <https://github.com/numpy/numpy/pull/11862>`__: BUG: Re-add `_ones_like` to numpy.core.umath.
+* `#11864 <https://github.com/numpy/numpy/pull/11864>`__: TST: Update travis testing to use latest virtualenv.
+* `#11865 <https://github.com/numpy/numpy/pull/11865>`__: DOC: add a Code of Conduct document.
+* `#11866 <https://github.com/numpy/numpy/pull/11866>`__: TST: Drop Python 3.4 testing
+* `#11868 <https://github.com/numpy/numpy/pull/11868>`__: MAINT: include benchmarks, complete docs, dev tool files in sdist.
+* `#11870 <https://github.com/numpy/numpy/pull/11870>`__: MAINT: dtype(unicode) should raise TypeError on failure
+* `#11874 <https://github.com/numpy/numpy/pull/11874>`__: BENCH: split out slow setup method in bench_shape_base.Block
+* `#11877 <https://github.com/numpy/numpy/pull/11877>`__: BUG: Fix memory leak in pyfragments.swg
+* `#11880 <https://github.com/numpy/numpy/pull/11880>`__: BUG: The multiarray/ufunc merge broke old wheels.
+* `#11882 <https://github.com/numpy/numpy/pull/11882>`__: DOC: Recommend the use of `np.ndim` over `np.isscalar`, and explain...
+* `#11889 <https://github.com/numpy/numpy/pull/11889>`__: BENCH: Split bench_function_base.Sort into Sort and SortWorst.
+* `#11891 <https://github.com/numpy/numpy/pull/11891>`__: MAINT: remove exec_command() from build_ext
+* `#11892 <https://github.com/numpy/numpy/pull/11892>`__: TST: Parametrize PEP3118 scalar tests.
+* `#11893 <https://github.com/numpy/numpy/pull/11893>`__: TST: Fix duplicated test name.
+* `#11894 <https://github.com/numpy/numpy/pull/11894>`__: TST: Parametrize f2py tests.
+* `#11895 <https://github.com/numpy/numpy/pull/11895>`__: TST: Parametrize some linalg tests over types.
+* `#11896 <https://github.com/numpy/numpy/pull/11896>`__: BUG: Fix matrix PendingDeprecationWarning suppression for pytest...
+* `#11898 <https://github.com/numpy/numpy/pull/11898>`__: MAINT: remove exec_command usage from ccompiler.py
+* `#11899 <https://github.com/numpy/numpy/pull/11899>`__: MAINT: remove exec_command from system_info.py
+* `#11900 <https://github.com/numpy/numpy/pull/11900>`__: MAINT: remove exec_command from gnu.py
+* `#11901 <https://github.com/numpy/numpy/pull/11901>`__: MAINT: remove exec_command usage in ibm.py
+* `#11904 <https://github.com/numpy/numpy/pull/11904>`__: Use pytest for some already-parametrized core tests
+* `#11905 <https://github.com/numpy/numpy/pull/11905>`__: TST: Start testing with "-std=c99" on travisCI.
+* `#11906 <https://github.com/numpy/numpy/pull/11906>`__: TST: add shippable ARMv8 to CI
+* `#11907 <https://github.com/numpy/numpy/pull/11907>`__: Link HOWTO_DOCUMENT to specific section on docstrings
+* `#11909 <https://github.com/numpy/numpy/pull/11909>`__: MAINT: flake8 cleanups
+* `#11910 <https://github.com/numpy/numpy/pull/11910>`__: MAINT: test, refactor design of recursive closures
+* `#11912 <https://github.com/numpy/numpy/pull/11912>`__: DOC: dtype offset and itemsize is limited by range of C int
+* `#11914 <https://github.com/numpy/numpy/pull/11914>`__: DOC: Clarify difference between PySequence_GETITEM, PyArray_GETITEM
+* `#11916 <https://github.com/numpy/numpy/pull/11916>`__: DEP: deprecate np.set_numeric_ops and friends
+* `#11920 <https://github.com/numpy/numpy/pull/11920>`__: TST: Fix 'def' test_numerictypes.py::TestSctypeDict to 'class'...
+* `#11921 <https://github.com/numpy/numpy/pull/11921>`__: MAINT: Don't rely on `__name__` in bitname - use the information...
+* `#11922 <https://github.com/numpy/numpy/pull/11922>`__: TST: Add tests for maximum_sctype
+* `#11929 <https://github.com/numpy/numpy/pull/11929>`__: DOC: #defining -> #define / Added a short explanation for Numeric
+* `#11930 <https://github.com/numpy/numpy/pull/11930>`__: DOC: fix scipy-sphinx-theme license path
+* `#11932 <https://github.com/numpy/numpy/pull/11932>`__: MAINT: Move `np.dtype.name.__get__` to python
+* `#11933 <https://github.com/numpy/numpy/pull/11933>`__: TST: Fix unit tests that used to call unittest.TestCase.fail
+* `#11934 <https://github.com/numpy/numpy/pull/11934>`__: NEP: Revert "NEP: Mark NEP 18 as accepted"
+* `#11935 <https://github.com/numpy/numpy/pull/11935>`__: MAINT: remove usage of exec_command in config.py
+* `#11937 <https://github.com/numpy/numpy/pull/11937>`__: MAINT: remove exec_command() from f2py init
+* `#11941 <https://github.com/numpy/numpy/pull/11941>`__: BUG: Ensure einsum(optimize=True) dispatches tensordot deterministically
+* `#11943 <https://github.com/numpy/numpy/pull/11943>`__: DOC: Add warning/clarification about backwards compat in NEP-18
+* `#11948 <https://github.com/numpy/numpy/pull/11948>`__: DEP: finish making all comparisons to NaT false
+* `#11949 <https://github.com/numpy/numpy/pull/11949>`__: MAINT: Small tidy-ups to `np.core._dtype`
+* `#11950 <https://github.com/numpy/numpy/pull/11950>`__: MAINT: Extract tangential improvements made in #11175
+* `#11952 <https://github.com/numpy/numpy/pull/11952>`__: MAINT: test NPY_INTERNAL_BUILD only if defined
+* `#11953 <https://github.com/numpy/numpy/pull/11953>`__: TST: codecov.yml improvements
+* `#11957 <https://github.com/numpy/numpy/pull/11957>`__: ENH: mark that large allocations can use huge pages
+* `#11958 <https://github.com/numpy/numpy/pull/11958>`__: TST: Add a test for np.pad where constant_values is an object
+* `#11959 <https://github.com/numpy/numpy/pull/11959>`__: MAINT: Explicitely cause pagefaults to happen before starting...
+* `#11961 <https://github.com/numpy/numpy/pull/11961>`__: TST: Add more tests for np.pad
+* `#11962 <https://github.com/numpy/numpy/pull/11962>`__: ENH: maximum lines of content to be read from numpy.loadtxt
+* `#11965 <https://github.com/numpy/numpy/pull/11965>`__: BENCH: Add a benchmark comparing block to copy in the 3D case
+* `#11966 <https://github.com/numpy/numpy/pull/11966>`__: MAINT: Rewrite shape normalization in pad function
+* `#11967 <https://github.com/numpy/numpy/pull/11967>`__: BUG: fix refcount leak in PyArray_AdaptFlexibleDType
+* `#11971 <https://github.com/numpy/numpy/pull/11971>`__: MAINT: Block algorithm with a single copy per call to `block`
+* `#11973 <https://github.com/numpy/numpy/pull/11973>`__: BUG: fix cached allocations without the GIL
+* `#11976 <https://github.com/numpy/numpy/pull/11976>`__: MAINT/DOC: Show the location of an empty list in np.block
+* `#11979 <https://github.com/numpy/numpy/pull/11979>`__: MAINT: Ensure that a copy of the array is returned when calling...
+* `#11989 <https://github.com/numpy/numpy/pull/11989>`__: BUG: Ensure boolean indexing of subclasses sets base correctly.
+* `#11991 <https://github.com/numpy/numpy/pull/11991>`__: MAINT: speed up `_block` by avoiding a recursive closure
+* `#11996 <https://github.com/numpy/numpy/pull/11996>`__: TST: Parametrize and break apart dtype tests
+* `#11997 <https://github.com/numpy/numpy/pull/11997>`__: MAINT: Extract string helpers to a new private file
+* `#12002 <https://github.com/numpy/numpy/pull/12002>`__: Revert "NEP: Revert "NEP: Mark NEP 18 as accepted""
+* `#12004 <https://github.com/numpy/numpy/pull/12004>`__: BUG: Fix f2py compile function testing.
+* `#12005 <https://github.com/numpy/numpy/pull/12005>`__: ENH: initial implementation of core `__array_function__` machinery
+* `#12008 <https://github.com/numpy/numpy/pull/12008>`__: MAINT: Reassociate `np.cast` with the comment describing it
+* `#12009 <https://github.com/numpy/numpy/pull/12009>`__: MAINT: Eliminate the private `numerictypes._typestr`
+* `#12011 <https://github.com/numpy/numpy/pull/12011>`__: ENH: implementation of array_reduce_ex
+* `#12012 <https://github.com/numpy/numpy/pull/12012>`__: MAINT: Extract the crazy number of type aliases to their own...
+* `#12014 <https://github.com/numpy/numpy/pull/12014>`__: TST: prefer pytest.skip() over SkipTest
+* `#12015 <https://github.com/numpy/numpy/pull/12015>`__: TST: improve warnings parallel test safety
+* `#12017 <https://github.com/numpy/numpy/pull/12017>`__: NEP: add 3 missing data NEPs rescued from 2011-2012
+* `#12018 <https://github.com/numpy/numpy/pull/12018>`__: MAINT: Simplify parts of `_type_aliases`
+* `#12019 <https://github.com/numpy/numpy/pull/12019>`__: DOC: MAINT: address comments @eric-wieser on NEP 24-26 PR.
+* `#12020 <https://github.com/numpy/numpy/pull/12020>`__: TST: Add tests for np.sctype2char
+* `#12021 <https://github.com/numpy/numpy/pull/12021>`__: DOC: Post NumPy 1.15.2 release updates.[ci skip]
+* `#12024 <https://github.com/numpy/numpy/pull/12024>`__: MAINT: Normalize axes the normal way in fftpack.py
+* `#12027 <https://github.com/numpy/numpy/pull/12027>`__: DOC: Add docstrings for abstract types in scalar type hierarchy
+* `#12030 <https://github.com/numpy/numpy/pull/12030>`__: DOC: use "import numpy as np" style
+* `#12032 <https://github.com/numpy/numpy/pull/12032>`__: BUG: check return value from PyArray_PromoteTypes
+* `#12033 <https://github.com/numpy/numpy/pull/12033>`__: TST: Mark check for f2py script xfail.
+* `#12034 <https://github.com/numpy/numpy/pull/12034>`__: MAINT: Add version deprecated to some deprecation messages.
+* `#12035 <https://github.com/numpy/numpy/pull/12035>`__: BUG: Fix memory leak in PY3K buffer code.
+* `#12041 <https://github.com/numpy/numpy/pull/12041>`__: MAINT: remove duplicate imports
+* `#12042 <https://github.com/numpy/numpy/pull/12042>`__: MAINT: cleanup and better document core/overrides.py
+* `#12045 <https://github.com/numpy/numpy/pull/12045>`__: BUG: fix memory leak of buffer format string
+* `#12048 <https://github.com/numpy/numpy/pull/12048>`__: BLD: pin sphinx to 1.7.9
+* `#12051 <https://github.com/numpy/numpy/pull/12051>`__: TST: add macos azure testing to CI
+* `#12054 <https://github.com/numpy/numpy/pull/12054>`__: MAINT: avoid modifying mutable default values
+* `#12056 <https://github.com/numpy/numpy/pull/12056>`__: MAINT: The crackfortran function is called with an extra argument
+* `#12057 <https://github.com/numpy/numpy/pull/12057>`__: MAINT: remove unused imports
+* `#12058 <https://github.com/numpy/numpy/pull/12058>`__: MAINT: remove redundant assignment
+* `#12060 <https://github.com/numpy/numpy/pull/12060>`__: MAINT: remove unused stdlib imports
+* `#12061 <https://github.com/numpy/numpy/pull/12061>`__: MAINT: remove redundant imports
+* `#12062 <https://github.com/numpy/numpy/pull/12062>`__: BUG: `OBJECT_to_*` should check for errors
+* `#12064 <https://github.com/numpy/numpy/pull/12064>`__: MAINT: delay initialization of getlimits (circular imports)
+* `#12072 <https://github.com/numpy/numpy/pull/12072>`__: BUG: test_path() now uses Path.resolve()
+* `#12073 <https://github.com/numpy/numpy/pull/12073>`__: MAINT Avoid some memory copies in numpy.polynomial.hermite
+* `#12079 <https://github.com/numpy/numpy/pull/12079>`__: MAINT: Blacklist some MSVC complex functions.
+* `#12081 <https://github.com/numpy/numpy/pull/12081>`__: TST: add Windows test matrix to Azure CI
+* `#12082 <https://github.com/numpy/numpy/pull/12082>`__: TST: Add Python 3.5 to Azure windows CI.
+* `#12088 <https://github.com/numpy/numpy/pull/12088>`__: BUG: limit default for get_num_build_jobs() to 8
+* `#12089 <https://github.com/numpy/numpy/pull/12089>`__: BUG: Fix in-place permutation
+* `#12090 <https://github.com/numpy/numpy/pull/12090>`__: TST, MAINT: Update pickling tests by making them loop over all...
+* `#12091 <https://github.com/numpy/numpy/pull/12091>`__: TST: Install pickle5 for CI testing with python 3.6/7
+* `#12093 <https://github.com/numpy/numpy/pull/12093>`__: Provide information about what kind is actually not integer kind
+* `#12099 <https://github.com/numpy/numpy/pull/12099>`__: ENH: Validate dispatcher functions in array_function_dispatch
+* `#12102 <https://github.com/numpy/numpy/pull/12102>`__: TST: improve coverage of nd_grid
+* `#12103 <https://github.com/numpy/numpy/pull/12103>`__: MAINT: Add azure-pipeline status badge to README.md
+* `#12106 <https://github.com/numpy/numpy/pull/12106>`__: TST, MAINT: Skip some f2py tests on Mac.
+* `#12108 <https://github.com/numpy/numpy/pull/12108>`__: BUG: Allow boolean subtract in histogram
+* `#12109 <https://github.com/numpy/numpy/pull/12109>`__: TST: add unit test for issctype
+* `#12112 <https://github.com/numpy/numpy/pull/12112>`__: ENH: check getfield arguments to prevent invalid memory access
+* `#12115 <https://github.com/numpy/numpy/pull/12115>`__: ENH: `__array_function__` support for most of `numpy.core`
+* `#12116 <https://github.com/numpy/numpy/pull/12116>`__: ENH: `__array_function__` support for `np.lib`, part 1/2
+* `#12117 <https://github.com/numpy/numpy/pull/12117>`__: ENH: `__array_function__` support for `np.fft` and `np.linalg`
+* `#12119 <https://github.com/numpy/numpy/pull/12119>`__: ENH: `__array_function__` support for `np.lib`, part 2/2
+* `#12120 <https://github.com/numpy/numpy/pull/12120>`__: ENH: add timedelta modulus operator support (mm)
+* `#12121 <https://github.com/numpy/numpy/pull/12121>`__: MAINT: Clarify the error message for resize failure
+* `#12123 <https://github.com/numpy/numpy/pull/12123>`__: DEP: deprecate asscalar
+* `#12124 <https://github.com/numpy/numpy/pull/12124>`__: BUG: refactor float error status to support Alpine linux
+* `#12125 <https://github.com/numpy/numpy/pull/12125>`__: TST: expand cases in test_issctype()
+* `#12127 <https://github.com/numpy/numpy/pull/12127>`__: BUG: Fix memory leak in mapping.c
+* `#12131 <https://github.com/numpy/numpy/pull/12131>`__: BUG: fix PyDataType_ISBOOL
+* `#12133 <https://github.com/numpy/numpy/pull/12133>`__: MAINT, TST refactor pickle imports and tests
+* `#12134 <https://github.com/numpy/numpy/pull/12134>`__: DOC: Remove duplicated sentence in numpy.multiply
+* `#12137 <https://github.com/numpy/numpy/pull/12137>`__: TST: error tests for fill_diagonal()
+* `#12138 <https://github.com/numpy/numpy/pull/12138>`__: TST: error tests for diag_indices_from()
+* `#12140 <https://github.com/numpy/numpy/pull/12140>`__: DOC: fixups for NEP-18 based on the implementation
+* `#12141 <https://github.com/numpy/numpy/pull/12141>`__: DOC: minor tweak to CoC (update NumFOCUS contact address).
+* `#12145 <https://github.com/numpy/numpy/pull/12145>`__: MAINT: Update ndarrayobject.h `__cplusplus` block.
+* `#12146 <https://github.com/numpy/numpy/pull/12146>`__: MAINT: Fix typo in comment
+* `#12147 <https://github.com/numpy/numpy/pull/12147>`__: MAINT: Move duplicated type_reso_error code into a helper function
+* `#12148 <https://github.com/numpy/numpy/pull/12148>`__: DOC: document NEP-18 overrides in release notes
+* `#12151 <https://github.com/numpy/numpy/pull/12151>`__: TST: byte_bounds contiguity handling
+* `#12153 <https://github.com/numpy/numpy/pull/12153>`__: DOC, TST: cover setdiff1d assume_unique
+* `#12154 <https://github.com/numpy/numpy/pull/12154>`__: ENH: `__array_function__` for `np.core.defchararray`
+* `#12155 <https://github.com/numpy/numpy/pull/12155>`__: MAINT: Define Py_SETREF for pre-3.5.2 python and use in code
+* `#12157 <https://github.com/numpy/numpy/pull/12157>`__: ENH: Add support for third-party path-like objects by backporting...
+* `#12159 <https://github.com/numpy/numpy/pull/12159>`__: MAINT: remove unused nd_grid `__len__`.
+* `#12163 <https://github.com/numpy/numpy/pull/12163>`__: ENH: `__array_function__` for `np.einsum` and `np.block`
+* `#12165 <https://github.com/numpy/numpy/pull/12165>`__: Mark NEP 22 as accepted, and add "Informational" NEPs to NEP...
+* `#12166 <https://github.com/numpy/numpy/pull/12166>`__: NEP: Add zero-rank arrays historical info NEP
+* `#12173 <https://github.com/numpy/numpy/pull/12173>`__: NEP: add notes about updates to NEP-18
+* `#12174 <https://github.com/numpy/numpy/pull/12174>`__: NEP 16 abstract arrays: rebased and marked as "Withdrawn"
+* `#12175 <https://github.com/numpy/numpy/pull/12175>`__: ENH: `__array_function__` for multiarray functions
+* `#12176 <https://github.com/numpy/numpy/pull/12176>`__: TST: add test for weighted histogram mismatch
+* `#12177 <https://github.com/numpy/numpy/pull/12177>`__: MAINT: remove unused `_assertSquareness()`
+* `#12179 <https://github.com/numpy/numpy/pull/12179>`__: MAINT: Move `_kind_to_stem` to `np.core._dtype`, so that it can...
+* `#12180 <https://github.com/numpy/numpy/pull/12180>`__: NEP: change toc titles, cross reference, mark 16 superseded
+* `#12181 <https://github.com/numpy/numpy/pull/12181>`__: MAINT: fix depreciation message typo for np.sum
+* `#12185 <https://github.com/numpy/numpy/pull/12185>`__: TST: test multi_dot with 2 arrays
+* `#12199 <https://github.com/numpy/numpy/pull/12199>`__: TST: add Azure CI triggers
+* `#12209 <https://github.com/numpy/numpy/pull/12209>`__: Delay import of distutils.msvccompiler to avoid warning on non-Windows.
+* `#12211 <https://github.com/numpy/numpy/pull/12211>`__: DOC: Clarify the examples for argmax and argmin
+* `#12212 <https://github.com/numpy/numpy/pull/12212>`__: MAINT: `ndarray.__repr__` should not rely on `__array_function__`
+* `#12214 <https://github.com/numpy/numpy/pull/12214>`__: TST: add test for tensorinv()
+* `#12215 <https://github.com/numpy/numpy/pull/12215>`__: TST: test dims match on lstsq()
+* `#12216 <https://github.com/numpy/numpy/pull/12216>`__: TST: test invalid histogram range
+* `#12217 <https://github.com/numpy/numpy/pull/12217>`__: TST: test histogram bins dims
+* `#12219 <https://github.com/numpy/numpy/pull/12219>`__: ENH: make matmul into a ufunc
+* `#12222 <https://github.com/numpy/numpy/pull/12222>`__: TST: unit tests for column_stack.
+* `#12224 <https://github.com/numpy/numpy/pull/12224>`__: BUG: Fix MaskedArray fill_value type conversion.
+* `#12229 <https://github.com/numpy/numpy/pull/12229>`__: MAINT: Fix typo in comment
+* `#12236 <https://github.com/numpy/numpy/pull/12236>`__: BUG: maximum, minimum no longer emit warnings on NAN
+* `#12240 <https://github.com/numpy/numpy/pull/12240>`__: BUG: Fix crash in repr of void subclasses
+* `#12241 <https://github.com/numpy/numpy/pull/12241>`__: TST: arg handling tests in histogramdd
+* `#12243 <https://github.com/numpy/numpy/pull/12243>`__: BUG: Fix misleading assert message in assert_almost_equal #12200
+* `#12245 <https://github.com/numpy/numpy/pull/12245>`__: TST: tests for sort_complex()
+* `#12246 <https://github.com/numpy/numpy/pull/12246>`__: DOC: Update docs after NumPy 1.15.3 release.
+* `#12249 <https://github.com/numpy/numpy/pull/12249>`__: BUG: Dealloc cached buffer info
+* `#12250 <https://github.com/numpy/numpy/pull/12250>`__: DOC: add missing docs
+* `#12251 <https://github.com/numpy/numpy/pull/12251>`__: MAINT: improved error message when no `__array_function__` implementation...
+* `#12254 <https://github.com/numpy/numpy/pull/12254>`__: MAINT: Move ctype -> dtype conversion to python
+* `#12257 <https://github.com/numpy/numpy/pull/12257>`__: BUG: Fix fill value in masked array '==' and '!=' ops.
+* `#12259 <https://github.com/numpy/numpy/pull/12259>`__: TST: simplify how the different code paths for block are tested.
+* `#12265 <https://github.com/numpy/numpy/pull/12265>`__: BUG: Revert linspace import for concatenation funcs
+* `#12266 <https://github.com/numpy/numpy/pull/12266>`__: BUG: Avoid SystemErrors by checking the return value of PyPrint
+* `#12268 <https://github.com/numpy/numpy/pull/12268>`__: DOC: add broadcasting article from scipy old-wiki
+* `#12270 <https://github.com/numpy/numpy/pull/12270>`__: MAINT: set `__module__` for more `array_function_dispatch` uses
+* `#12276 <https://github.com/numpy/numpy/pull/12276>`__: MAINT: remove unused parse_index()
+* `#12279 <https://github.com/numpy/numpy/pull/12279>`__: NEP: tweak and mark NEP 0027 as final
+* `#12280 <https://github.com/numpy/numpy/pull/12280>`__: DEP: deprecate passing a generator to stack functions
+* `#12281 <https://github.com/numpy/numpy/pull/12281>`__: NEP: revise note for NEP 27
+* `#12285 <https://github.com/numpy/numpy/pull/12285>`__: ENH: array does not need to be writable to use as input to take
+* `#12286 <https://github.com/numpy/numpy/pull/12286>`__: ENH: Do not emit compiler warning if forcing old API
+* `#12288 <https://github.com/numpy/numpy/pull/12288>`__: BUILD: force LGTM to use cython>=0.29
+* `#12291 <https://github.com/numpy/numpy/pull/12291>`__: MAINT: `_set_out_array()` syntax fix
+* `#12292 <https://github.com/numpy/numpy/pull/12292>`__: MAINT: removed unused vars in f2py test code
+* `#12299 <https://github.com/numpy/numpy/pull/12299>`__: BUILD: use system python3 in the chroot
+* `#12302 <https://github.com/numpy/numpy/pull/12302>`__: DOC: Update the docstring of asfortranarray and ascontiguousarray
+* `#12306 <https://github.com/numpy/numpy/pull/12306>`__: TST: add 32-bit linux Azure CI job
+* `#12312 <https://github.com/numpy/numpy/pull/12312>`__: MAINT, TST: unreachable Python code paths
+* `#12321 <https://github.com/numpy/numpy/pull/12321>`__: MAINT: Simple speed-ups for getting overloaded types
+* `#12326 <https://github.com/numpy/numpy/pull/12326>`__: DOC: NumPy 1.15.4 post release documentation update.
+* `#12328 <https://github.com/numpy/numpy/pull/12328>`__: MAINT: Allow subclasses in `ndarray.__array_function__`.
+* `#12330 <https://github.com/numpy/numpy/pull/12330>`__: TST: test_tofile_fromfile now uses initialized memory
+* `#12331 <https://github.com/numpy/numpy/pull/12331>`__: DEV: change ASV benchmarks to run on Python 3.6 by default
+* `#12338 <https://github.com/numpy/numpy/pull/12338>`__: DOC: add a docstring for the function 'compare_chararrays' (See...
+* `#12342 <https://github.com/numpy/numpy/pull/12342>`__: BUG: Fix for np.dtype(ctypes.Structure) does not respect _pack_...
+* `#12347 <https://github.com/numpy/numpy/pull/12347>`__: DOC: typo in docstring numpy.random.beta, shape parameters must...
+* `#12349 <https://github.com/numpy/numpy/pull/12349>`__: TST, DOC: store circleci doc artifacts
+* `#12353 <https://github.com/numpy/numpy/pull/12353>`__: BUG: test, fix for threshold='nan'
+* `#12354 <https://github.com/numpy/numpy/pull/12354>`__: BUG: Fix segfault when an error occurs in np.fromfile
+* `#12355 <https://github.com/numpy/numpy/pull/12355>`__: BUG: fix a bug in npy_PyFile_Dup2 where it didn't return immediately...
+* `#12357 <https://github.com/numpy/numpy/pull/12357>`__: MAINT: Cleanup pavement file
+* `#12358 <https://github.com/numpy/numpy/pull/12358>`__: BUG: test, fix loading structured dtypes with padding
+* `#12362 <https://github.com/numpy/numpy/pull/12362>`__: MAINT: disable `__array_function__` dispatch unless environment...
+* `#12363 <https://github.com/numpy/numpy/pull/12363>`__: MAINT: update gfortran RPATH for AIX/Windows non-support.
+* `#12364 <https://github.com/numpy/numpy/pull/12364>`__: NEP: clarify the purpose of "types" in `__array_function__`.
+* `#12366 <https://github.com/numpy/numpy/pull/12366>`__: MAINT: Refactor sorting header file
+* `#12372 <https://github.com/numpy/numpy/pull/12372>`__: BUG: random: Fix handling of a=0 for numpy.random.weibull.
+* `#12373 <https://github.com/numpy/numpy/pull/12373>`__: MAINT: Improve error message for legal but unsupported PEP3118...
+* `#12376 <https://github.com/numpy/numpy/pull/12376>`__: BUG: do not override exception on import failure
+* `#12377 <https://github.com/numpy/numpy/pull/12377>`__: NEP: move nep 15 from accepted to final
+* `#12378 <https://github.com/numpy/numpy/pull/12378>`__: TST: Update complex long double precision tests.
+* `#12380 <https://github.com/numpy/numpy/pull/12380>`__: BUG: Fix for #10533 np.dtype(ctype) does not respect endianness
+* `#12381 <https://github.com/numpy/numpy/pull/12381>`__: BUG: graceful DataSource __del__ when __init__ fails
+* `#12382 <https://github.com/numpy/numpy/pull/12382>`__: ENH: set correct __module__ for objects in numpy's public API
+* `#12388 <https://github.com/numpy/numpy/pull/12388>`__: ENH: allow arrays for start and stop in {lin,log,geom}space
+* `#12390 <https://github.com/numpy/numpy/pull/12390>`__: DEV: remove shim added in 1.4
+* `#12391 <https://github.com/numpy/numpy/pull/12391>`__: DEP: raise on a call to deprecated numpy.lib.function_base.unique
+* `#12392 <https://github.com/numpy/numpy/pull/12392>`__: DOC: Add release notes for ctypes improvements
+* `#12398 <https://github.com/numpy/numpy/pull/12398>`__: BUG: fix possible overlap issues with avx enabled
+* `#12399 <https://github.com/numpy/numpy/pull/12399>`__: DOC: Fix typo in polyint. Fixes #12386.
+* `#12405 <https://github.com/numpy/numpy/pull/12405>`__: ENH: Add support for `np.dtype(ctypes.Union)`
+* `#12407 <https://github.com/numpy/numpy/pull/12407>`__: BUG: Fall back to 'ascii' locale in build (if needed)
+* `#12408 <https://github.com/numpy/numpy/pull/12408>`__: BUG: multifield-view of MaskedArray gets bad fill_value
+* `#12409 <https://github.com/numpy/numpy/pull/12409>`__: MAINT: correct the dtype.descr docstring
+* `#12413 <https://github.com/numpy/numpy/pull/12413>`__: BUG: Do not double-quote arguments to the command line
+* `#12414 <https://github.com/numpy/numpy/pull/12414>`__: MAINT: Update cversion hash.
+* `#12417 <https://github.com/numpy/numpy/pull/12417>`__: BUG: Fix regression on np.dtype(ctypes.c_void_p)
+* `#12419 <https://github.com/numpy/numpy/pull/12419>`__: Fix PyArray_FillFunc function definitions
+* `#12420 <https://github.com/numpy/numpy/pull/12420>`__: gfortran needs -lpthread & -maix64(64 build) in AIX
+* `#12422 <https://github.com/numpy/numpy/pull/12422>`__: MNT: Reword error message about loading pickled data.
+* `#12424 <https://github.com/numpy/numpy/pull/12424>`__: BUG: Fix inconsistent cache keying in ndpointer
+* `#12429 <https://github.com/numpy/numpy/pull/12429>`__: MAINT: Update mailmap for 1.16.0 release.
+* `#12431 <https://github.com/numpy/numpy/pull/12431>`__: BUG/ENH: Fix use of ndpointer in return values
+* `#12437 <https://github.com/numpy/numpy/pull/12437>`__: MAINT: refactor datetime.c_metadata creation
+* `#12439 <https://github.com/numpy/numpy/pull/12439>`__: BUG: test, fix NPY_VISIBILITY_HIDDEN on gcc, which becomes NPY_NO_EXPORT
+* `#12440 <https://github.com/numpy/numpy/pull/12440>`__: BUG: don't override original errors when casting inside np.dot()...
+* `#12443 <https://github.com/numpy/numpy/pull/12443>`__: MAINT Use set litterals
+* `#12445 <https://github.com/numpy/numpy/pull/12445>`__: MAINT: Use list and dict comprehension when possible
+* `#12446 <https://github.com/numpy/numpy/pull/12446>`__: MAINT: Fixups to new functions in np.lib.recfunctions
+* `#12447 <https://github.com/numpy/numpy/pull/12447>`__: ENH: add back the multifield copy->view change
+* `#12448 <https://github.com/numpy/numpy/pull/12448>`__: MAINT: Review F401,F841,F842 flake8 errors (unused variables...
+* `#12455 <https://github.com/numpy/numpy/pull/12455>`__: TST: use condition directive for Azure 2.7 check
+* `#12458 <https://github.com/numpy/numpy/pull/12458>`__: MAINT, DOC: fix Azure README badge
+* `#12464 <https://github.com/numpy/numpy/pull/12464>`__: BUG: IndexError for empty list on structured MaskedArray.
+* `#12466 <https://github.com/numpy/numpy/pull/12466>`__: TST: use openblas for Windows CI
+* `#12470 <https://github.com/numpy/numpy/pull/12470>`__: MAINT: remove wrapper functions from numpy.core.multiarray
+* `#12471 <https://github.com/numpy/numpy/pull/12471>`__: ENH: override support for np.linspace and friends
+* `#12474 <https://github.com/numpy/numpy/pull/12474>`__: TST: enable dispatcher test coverage
+* `#12477 <https://github.com/numpy/numpy/pull/12477>`__: DOC: fix example for __call__. See #12451
+* `#12486 <https://github.com/numpy/numpy/pull/12486>`__: DOC: Update copyright year in the license
+* `#12488 <https://github.com/numpy/numpy/pull/12488>`__: ENH: implement matmul on NDArrayOperatorsMixin
+* `#12493 <https://github.com/numpy/numpy/pull/12493>`__: BUG: fix records.fromfile fails to read data >4 GB
+* `#12494 <https://github.com/numpy/numpy/pull/12494>`__: BUG: test, fix matmul, dot for vector array with stride[i]=0
+* `#12498 <https://github.com/numpy/numpy/pull/12498>`__: TST: sync Azure Win openblas
+* `#12501 <https://github.com/numpy/numpy/pull/12501>`__: MAINT: removed word/typo from comment in site.cfg.example
+* `#12556 <https://github.com/numpy/numpy/pull/12556>`__: BUG: only override vector size for avx code for 1.16
+* `#12562 <https://github.com/numpy/numpy/pull/12562>`__: DOC, MAINT: Make `PYVER = 3` in doc/Makefile.
+* `#12563 <https://github.com/numpy/numpy/pull/12563>`__: DOC: more doc updates for structured arrays
+* `#12564 <https://github.com/numpy/numpy/pull/12564>`__: BUG: fix an unsafe PyTuple_GET_ITEM call
+* `#12565 <https://github.com/numpy/numpy/pull/12565>`__: Fix lgtm.com C/C++ build
+* `#12567 <https://github.com/numpy/numpy/pull/12567>`__: BUG: reorder operations for VS2015
+* `#12568 <https://github.com/numpy/numpy/pull/12568>`__: BUG: fix improper use of C-API
+* `#12569 <https://github.com/numpy/numpy/pull/12569>`__: BUG: Make new-lines in compiler error messages print to the console
+* `#12570 <https://github.com/numpy/numpy/pull/12570>`__: MAINT: don't check alignment size=0 arrays (RELAXED_STRIDES)
+* `#12573 <https://github.com/numpy/numpy/pull/12573>`__: BUG: fix refcount issue caused by #12524
+* `#12580 <https://github.com/numpy/numpy/pull/12580>`__: BUG: fix segfault in ctypeslib with obj being collected
+* `#12581 <https://github.com/numpy/numpy/pull/12581>`__: TST: activate shippable maintenance branches
+* `#12582 <https://github.com/numpy/numpy/pull/12582>`__: BUG: fix f2py pep338 execution method
+* `#12587 <https://github.com/numpy/numpy/pull/12587>`__: BUG: Make `arr.ctypes.data` hold a reference to the underlying...
+* `#12588 <https://github.com/numpy/numpy/pull/12588>`__: BUG: check for errors after PyArray_DESCR_REPLACE
+* `#12590 <https://github.com/numpy/numpy/pull/12590>`__: DOC, MAINT: Prepare for 1.16.0rc1 release.
+* `#12603 <https://github.com/numpy/numpy/pull/12603>`__: DOC: Fix markup in 1.16.0 release notes.
+* `#12621 <https://github.com/numpy/numpy/pull/12621>`__: BUG: longdouble with elsize 12 is never uint alignable.
+* `#12622 <https://github.com/numpy/numpy/pull/12622>`__: BUG: Add missing free in ufunc dealloc
+* `#12623 <https://github.com/numpy/numpy/pull/12623>`__: MAINT: add test for 12-byte alignment
+* `#12655 <https://github.com/numpy/numpy/pull/12655>`__: BUG: fix uint alignment asserts in lowlevel loops
+* `#12656 <https://github.com/numpy/numpy/pull/12656>`__: BENCH: don't fail at import time with old Numpy
+* `#12657 <https://github.com/numpy/numpy/pull/12657>`__: DOC: update 2018 -> 2019
+* `#12705 <https://github.com/numpy/numpy/pull/12705>`__: ENH: Better links in documentation
+* `#12706 <https://github.com/numpy/numpy/pull/12706>`__: MAINT: Further fixups to uint alignment checks
+* `#12707 <https://github.com/numpy/numpy/pull/12707>`__: BUG: Add 'sparc' to platforms implementing 16 byte reals.
+* `#12708 <https://github.com/numpy/numpy/pull/12708>`__: TST: Fix endianness in unstuctured_to_structured test
+* `#12710 <https://github.com/numpy/numpy/pull/12710>`__: TST: pin Azure brew version for stability.
diff --git a/doc/neps/nep-0018-array-function-protocol.rst b/doc/neps/nep-0018-array-function-protocol.rst
index 988c908..ffe780c 100644
--- a/doc/neps/nep-0018-array-function-protocol.rst
+++ b/doc/neps/nep-0018-array-function-protocol.rst
@@ -340,7 +340,7 @@
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Given a function defining the above behavior, for now call it
-``array_function_implementation_or_override``, we now need to call that
+``implement_array_function``, we now need to call that
 function from within every relevant NumPy function. This is a pervasive change,
 but of fairly simple and innocuous code that should complete quickly and
 without effect if no arguments implement the ``__array_function__``
@@ -358,7 +358,7 @@
             @functools.wraps(implementation)
             def public_api(*args, **kwargs):
                 relevant_args = dispatcher(*args, **kwargs)
-                return array_function_implementation_or_override(
+                return implement_array_function(
                     implementation, public_api, relevant_args, args, kwargs)
             return public_api
         return decorator
@@ -395,11 +395,11 @@
 
 In a few cases, it would not make sense to use the ``array_function_dispatch``
 decorator directly, but override implementation in terms of
-``array_function_implementation_or_override`` should still be straightforward.
+``implement_array_function`` should still be straightforward.
 
 - Functions written entirely in C (e.g., ``np.concatenate``) can't use
   decorators, but they could still use a C equivalent of
-  ``array_function_implementation_or_override``. If performance is not a
+  ``implement_array_function``. If performance is not a
   concern, they could also be easily wrapped with a small Python wrapper.
 - ``np.einsum`` does complicated argument parsing to handle two different
   function signatures. It would probably be best to avoid the overhead of
@@ -475,7 +475,7 @@
 ``numpy.sum()`` function (2.6 us).
 
 Fortunately, we expect significantly less overhead with a C implementation of
-``array_function_implementation_or_override``, which is where the bulk of the
+``implement_array_function``, which is where the bulk of the
 runtime is. This would leave the ``array_function_dispatch`` decorator and
 dispatcher function on their own adding about 0.5 microseconds of overhead,
 for perhaps ~1 microsecond of overhead in the typical case.
@@ -503,7 +503,7 @@
 
 If we want to do this, we should expose at least the decorator
 ``array_function_dispatch()`` and possibly also the lower level
-``array_function_implementation_or_override()`` as part of NumPy's public API.
+``implement_array_function()`` as part of NumPy's public API.
 
 Non-goals
 ---------
@@ -807,7 +807,7 @@
 
 ``types`` is included because we can compute it almost for free as part of
 collecting ``__array_function__`` implementations to call in
-``array_function_implementation_or_override``. We also think it will be used
+``implement_array_function``. We also think it will be used
 by many ``__array_function__`` methods, which otherwise would need to extract
 this information themselves. It would be equivalently easy to provide single
 instances of each type, but providing only types seemed cleaner.
@@ -823,7 +823,7 @@
 - Access to the non-dispatched implementation (i.e., before wrapping with
   ``array_function_dispatch``) in ``ndarray.__array_function__`` would allow
   us to drop special case logic for that method from
-  ``array_function_implementation_or_override``.
+  ``implement_array_function``.
 - Access to the ``dispatcher`` function passed into
   ``array_function_dispatch()`` would allow ``__array_function__``
   implementations to determine the list of "array-like" arguments in a generic
diff --git a/doc/neps/nep-0020-gufunc-signature-enhancement.rst b/doc/neps/nep-0020-gufunc-signature-enhancement.rst
index 38a9fd5..a7a992c 100644
--- a/doc/neps/nep-0020-gufunc-signature-enhancement.rst
+++ b/doc/neps/nep-0020-gufunc-signature-enhancement.rst
@@ -3,7 +3,7 @@
 ===============================================================
 
 :Author: Marten van Kerkwijk <mhvk@astro.utoronto.ca>
-:Status: Accepted
+:Status: Final
 :Type: Standards Track
 :Created: 2018-06-10
 :Resolution: https://mail.python.org/pipermail/numpy-discussion/2018-April/077959.html,
diff --git a/doc/release/1.16.0-notes.rst b/doc/release/1.16.0-notes.rst
index 8d176c3..341d5f7 100644
--- a/doc/release/1.16.0-notes.rst
+++ b/doc/release/1.16.0-notes.rst
@@ -2,61 +2,83 @@
 NumPy 1.16.0 Release Notes
 ==========================
 
-This NumPy release is the last one to support Python 2.7. It will be maintained
-as a long term release with bug fixes only through 2020. To that end, the
-planned code reorganization detailed in `NEP 15`_ has been made in order to
-facilitate backporting fixes from future releases, which will now have the
-same code organization.
+This NumPy release is the last one to support Python 2.7 and will be maintained
+as a long term release with bug fixes until 2020.  Support for Python 3.4 has been
+dropped, the supported Python versions are 2.7 and 3.5-3.7. The wheels on PyPI
+are linked with OpenBLAS v0.3.4+,  which should fix the known threading issues
+found in previous OpenBLAS versions.
 
-Support for Python 3.4 been dropped in this release, the supported Python
-versions are 2.7 and 3.5-3.7. The wheels are linked with OpenBLAS v0.3.0 .
+Downstream developers building this release should use Cython >= 0.29 and, if
+using OpenBLAS, OpenBLAS > v0.3.4.
+
+This release has seen a lot of refactoring and features many bug fixes, improved
+code organization, and better cross platform compatibility. Not all of these
+improvements will be visible to users, but they should help make maintenance
+easier going forward.
 
 
 Highlights
 ==========
 
+* Experimental (opt-in only) support for overriding numpy functions,
+  see ``__array_function__`` below.
+
+* The ``matmul`` function is now a ufunc. This provides better
+  performance and allows overriding with ``__array_ufunc__``.
+
+* Improved support for the ARM and POWER architectures.
+
+* Improved support for AIX and PyPy.
+
+* Improved interop with ctypes.
+
+* Improved support for PEP 3118.
+
+
 
 New functions
 =============
 
- * New functions in the `numpy.lib.recfunctions` module to ease the structured
-   assignment changes: `assign_fields_by_name`, `structured_to_unstructured`,
-   `unstructured_to_structured`, `apply_along_fields`, and `require_fields`.
-   See the user guide at <https://docs.scipy.org/doc/numpy/user/basics.rec.html>
-   for more info.
+* New functions added to the `numpy.lib.recfunctions` module to ease the
+  structured assignment changes:
 
-Deprecations
-============
+    * ``assign_fields_by_name``
+    * ``structured_to_unstructured``
+    * ``unstructured_to_structured``
+    * ``apply_along_fields``
+    * ``require_fields``
 
-`typeNA` and `sctypeNA` have been deprecated
---------------------------------------------
-
-The type dictionaries `numpy.core.typeNA` and `numpy.core.sctypeNA` were buggy
-and not documented. They will be removed in the 1.18 release. Use
-`numpy.sctypeDict` instead.
+  See the user guide at <https://docs.scipy.org/doc/numpy/user/basics.rec.html>
+  for more info.
 
 
-``np.PackageLoader`` and ``np.pkgload`` have been removed
----------------------------------------------------------
-These were deprecated in 1.10, had no tests, and seem to no longer work in
-1.15 anyway.
+New deprecations
+================
 
-`numpy.asscalar` has been deprecated
-------------------------------------
-It is an alias to the more powerful `numpy.ndarray.item`, not tested, and fails
-for scalars.
+* The type dictionaries `numpy.core.typeNA` and `numpy.core.sctypeNA` are
+  deprecated. They were buggy and not documented and will be removed in the
+  1.18 release. Use `numpy.sctypeDict` instead.
 
-`np.set_array_ops` and `np.get_array_ops` have been deprecated
---------------------------------------------------------------
-As part of `NEP 15`, they have been deprecated along with the C-API functions
-:c:func:`PyArray_SetNumericOps` and :c:func:`PyArray_GetNumericOps`. Users who wish to override
-the inner loop functions in built-in ufuncs should use
-:c:func:`PyUFunc_ReplaceLoopBySignature`.
+* The `numpy.asscalar` function is deprecated. It is an alias to the more
+  powerful `numpy.ndarray.item`, not tested, and fails for scalars.
 
-Future Changes
-==============
+* The `numpy.set_array_ops` and `numpy.get_array_ops` functions are deprecated.
+  As part of `NEP 15`, they have been deprecated along with the C-API functions
+  :c:func:`PyArray_SetNumericOps` and :c:func:`PyArray_GetNumericOps`. Users
+  who wish to override the inner loop functions in built-in ufuncs should use
+  :c:func:`PyUFunc_ReplaceLoopBySignature`.
 
-* NumPy 1.17 will drop support for Python 2.7.
+* The `numpy.unravel_index` keyword argument ``dims`` is deprecated, use
+  ``shape`` instead.
+
+* The `numpy.histogram` ``normed`` argument is deprecated.  It was deprecated
+  previously, but no warning was issued.
+
+* The ``positive`` operator (``+``) applied to non-numerical arrays is
+  deprecated. See below for details.
+
+* Passing an iterator to the stack functions is deprecated
+
 
 Expired deprecations
 ====================
@@ -71,6 +93,16 @@
   deprecation cycle begun in NumPy 1.7. The change was previously attempted in
   NumPy 1.14 but reverted until now.
 
+* ``np.PackageLoader`` and ``np.pkgload`` have been removed. These were
+  deprecated in 1.10, had no tests, and seem to no longer work in 1.15.
+
+
+Future changes
+==============
+
+* NumPy 1.17 will drop support for Python 2.7.
+
+
 Compatibility notes
 ===================
 
@@ -78,9 +110,9 @@
 ----------------------
 On Windows, the installed script for running f2py is now an ``.exe`` file
 rather than a ``*.py`` file and should be run from the command line as ``f2py``
-whenever the ``Scripts`` directory is in the path. Folks needing compatibility
-with earler versions of Numpy should run ``f2py`` as a module: ``python -m
-numpy.f2py [...]``.
+whenever the ``Scripts`` directory is in the path. Running ``f2py`` as a module
+``python -m numpy.f2py [...]`` will work without path modification in any
+version of NumPy.
 
 NaT comparisons
 ---------------
@@ -119,20 +151,20 @@
 
 multi-field views return a view instead of a copy
 -------------------------------------------------
-Indexing a structured array with multiple fields, e.g.,
-``arr[['f1', 'f3']]``, returns a view into the original array instead of a
-copy. The returned view will often have extra padding bytes corresponding to
-intervening fields in the original array, unlike before, which will
-affect code such as ``arr[['f1', 'f3']].view('float64')``. This change has
-been planned since numpy 1.7 and such operations have emitted
-``FutureWarnings`` since then and more since 1.12.
+Indexing a structured array with multiple fields, e.g., ``arr[['f1', 'f3']]``,
+returns a view into the original array instead of a copy. The returned view
+will often have extra padding bytes corresponding to intervening fields in the
+original array, unlike before, which will affect code such as
+``arr[['f1', 'f3']].view('float64')``. This change has been planned since numpy
+1.7. Operations hitting this path have emitted ``FutureWarnings`` since then.
+Additional ``FutureWarnings`` about this change were added in 1.12.
 
 To help users update their code to account for these changes, a number of
 functions have been added to the ``numpy.lib.recfunctions`` module which
 safely allow such operations. For instance, the code above can be replaced
 with ``structured_to_unstructured(arr[['f1', 'f3']], dtype='float64')``.
 See the "accessing multiple fields" section of the
-`user guide <https://docs.scipy.org/doc/numpy/user/basics.rec.html>`__.
+`user guide <https://docs.scipy.org/doc/numpy/user/basics.rec.html#accessing-multiple-fields>`__.
 
 
 C API changes
@@ -146,15 +178,18 @@
 * :c:member:`PyUFuncObject.identity_value`
 * :c:function:`PyUFunc_FromFuncAndDataAndSignatureAndIdentity`
 
+
 New Features
 ============
 
 Integrated squared error (ISE) estimator added to ``histogram``
 ---------------------------------------------------------------
-This method (``bins='stone'``) for optimizing the bin number is a generalization of the
-Scott's rule. The Scott's rule assumes the distribution is approximately
-Normal, while the ISE is a nonparametric method based on cross-validation.
-https://en.wikipedia.org/wiki/Histogram#Minimizing_cross-validation_estimated_squared_error
+This method (``bins='stone'``) for optimizing the bin number is a
+generalization of the Scott's rule. The Scott's rule assumes the distribution
+is approximately Normal, while the ISE_ is a non-parametric method based on
+cross-validation.
+
+.. _ISE: https://en.wikipedia.org/wiki/Histogram#Minimizing_cross-validation_estimated_squared_error
 
 ``max_rows`` keyword added for ``np.loadtxt``
 ---------------------------------------------
@@ -174,11 +209,10 @@
 no-copy pickling of numpy arrays
 --------------------------------
 Up to protocol 4, numpy array pickling created 2 spurious copies of the data
-being serlialized.
-With pickle protocol 5, and the ``PickleBuffer`` API, a large variety of numpy
-arrays can now be serialized without any copy using out-of-band buffers,
-and with one less copy using in-band buffers. This results, for large arrays,
-in an up to 66% drop in peak memory usage.
+being serialized.  With pickle protocol 5, and the ``PickleBuffer`` API, a
+large variety of numpy arrays can now be serialized without any copy using
+out-of-band buffers, and with one less copy using in-band buffers. This
+results, for large arrays, in an up to 66% drop in peak memory usage.
 
 build shell independence
 ------------------------
@@ -186,10 +220,8 @@
 shell directly. ``exec_command`` has been replaced with
 ``subprocess.check_output`` where appropriate.
 
-
 `np.polynomial.Polynomial` classes render in LaTeX in Jupyter notebooks
 -----------------------------------------------------------------------
-
 When used in a front-end that supports it, `Polynomial` instances are now
 rendered through LaTeX. The current format is experimental, and is subject to
 change.
@@ -201,23 +233,41 @@
 distribution. This has been fixed so that e.g.
 ``np.random.choice([], 0) == np.array([], dtype=float64)``.
 
-``linalg.lstsq`` and ``linalg.qr`` now work with empty matrices
----------------------------------------------------------------
+``linalg.lstsq``, ``linalg.qr``, and ``linalg.svd`` now work with empty arrays
+------------------------------------------------------------------------------
 Previously, a ``LinAlgError`` would be raised when an empty matrix/empty
 matrices (with zero rows and/or columns) is/are passed in. Now outputs of
 appropriate shapes are returned.
 
+Chain exceptions to give better error messages for invalid PEP3118 format strings
+---------------------------------------------------------------------------------
+This should help track down problems.
+
+Einsum optimization path updates and efficiency improvements
+------------------------------------------------------------
+Einsum was synchronized with the current upstream work.
+
+`numpy.angle` and `numpy.expand_dims` now work on ``ndarray`` subclasses
+------------------------------------------------------------------------
+In particular, they now work for masked arrays.
+
+``NPY_NO_DEPRECATED_API`` compiler warning suppression
+------------------------------------------------------
+Setting ``NPY_NO_DEPRECATED_API`` to a value of 0 will suppress the current compiler
+warnings when the deprecated numpy API is used.
+
 ``np.diff`` Added kwargs prepend and append
 -------------------------------------------
-Add kwargs prepend and append, allowing for values to be inserted
-on either end of the differences.  Similar to options for ediff1d.
-Allows for the inverse of cumsum easily via prepend=0
+New kwargs ``prepend`` and ``append``, allow for values to be inserted on
+either end of the differences.  Similar to options for `ediff1d`. Now the
+inverse of `cumsum` can be obtained easily via ``prepend=0``.
 
 ARM support updated
 -------------------
 Support for ARM CPUs has been updated to accommodate 32 and 64 bit targets,
 and also big and little endian byte ordering. AARCH32 memory alignment issues
-have been addressed.
+have been addressed. CI testing has been expanded to include AARCH64 targets
+via the services of shippable.com.
 
 Appending to build flags
 ------------------------
@@ -245,7 +295,6 @@
 
 Generalized ufunc signatures now allow flexible dimensions
 ----------------------------------------------------------
-
 Some functions, in particular numpy's implementation of ``@`` as ``matmul``,
 are very similar to generalized ufuncs in that they operate over core
 dimensions, but one could not present them as such because they were able to
@@ -272,11 +321,11 @@
 The ``out`` argument to these functions is now always tested for memory overlap
 to avoid corrupted results when memory overlap occurs.
 
-New value ``unscaled`` for option ``cov`` in ``np.polyfit''
+New value ``unscaled`` for option ``cov`` in ``np.polyfit``
 -----------------------------------------------------------
 A further possible value has been added to the ``cov`` parameter of the
 ``np.polyfit`` function. With ``cov='unscaled'`` the scaling of the covariance
-matrix is disabled completely (similar to setting ``absolute_sigma=True'' in
+matrix is disabled completely (similar to setting ``absolute_sigma=True`` in
 ``scipy.optimize.curve_fit``). This would be useful in occasions, where the
 weights are given by 1/sigma with sigma being the (known) standard errors of
 (Gaussian distributed) data points, in which case the unscaled matrix is
@@ -284,9 +333,9 @@
 
 Detailed docstrings for scalar numeric types
 --------------------------------------------
-The ``help`` function, when applied to numeric types such as `np.intc`,
-`np.int_`, and `np.longlong`, now lists all of the aliased names for that type,
-distinguishing between platform -dependent and -independent aliases.
+The ``help`` function, when applied to numeric types such as `numpy.intc`,
+`numpy.int_`, and `numpy.longlong`, now lists all of the aliased names for that
+type, distinguishing between platform -dependent and -independent aliases.
 
 ``__module__`` attribute now points to public modules
 -----------------------------------------------------
@@ -302,9 +351,8 @@
 On systems that support transparent hugepages over the madvise system call
 numpy now marks that large memory allocations can be backed by hugepages which
 reduces page fault overhead and can in some fault heavy cases improve
-performance significantly.
-On Linux for huge pages to be used the setting
-`/sys/kernel/mm/transparent_hugepage/enabled` must be at least `madvise`.
+performance significantly. On Linux the setting for huge pages to be used,
+`/sys/kernel/mm/transparent_hugepage/enabled`, must be at least `madvise`.
 Systems which already have it set to `always` will not see much difference as
 the kernel will automatically use huge pages where appropriate.
 
@@ -326,6 +374,11 @@
 This results in significant speedups for these large arrays, particularly for
 arrays being blocked along more than 2 dimensions.
 
+``arr.ctypes.data_as(...)`` holds a reference to arr
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Previously the caller was responsible for keeping the array alive for the
+lifetime of the pointer.
+
 Speedup ``np.take`` for read-only arrays
 ----------------------------------------
 The implementation of ``np.take`` no longer makes an unnecessary copy of the
@@ -335,8 +388,8 @@
 --------------------------------------------
 The ``np.core.records.fromfile`` function now supports ``pathlib.Path``
 and other path-like objects in addition to a file object. Furthermore, the
-``np.load`` function now also supports path-like objects when
-using memory mapping (``mmap_mode`` keyword argument).
+``np.load`` function now also supports path-like objects when using memory
+mapping (``mmap_mode`` keyword argument).
 
 Better behaviour of ufunc identities during reductions
 ------------------------------------------------------
@@ -344,10 +397,10 @@
 called on an empty axis.
 
 As of this release, the logical binary ufuncs, `logical_and`, `logical_or`,
-and `logical_xor`, now have ``identity``s of type `bool`, where previously they
-were of type `int`. This restores the 1.14 behavior of getting ``bool``s when
+and `logical_xor`, now have ``identity`` s of type `bool`, where previously they
+were of type `int`. This restores the 1.14 behavior of getting ``bool`` s when
 reducing empty object arrays with these ufuncs, while also keeping the 1.15
-behavior of getting ``int``s when reducing empty object arrays with arithmetic
+behavior of getting ``int`` s when reducing empty object arrays with arithmetic
 ufuncs like ``add`` and ``multiply``.
 
 Additionally, `logaddexp` now has an identity of ``-inf``, allowing it to be
@@ -367,26 +420,25 @@
   ``__attribute__((packed))``, is respected.
 * Endianness of all ctypes objects is preserved
 * ``ctypes.Union`` is supported
-* Unrepresentable constructs raise exceptions, rather than producing
+* Non-representable constructs raise exceptions, rather than producing
   dangerously incorrect results:
+
   * Bitfields are no longer interpreted as sub-arrays
   * Pointers are no longer replaced with the type that they point to
 
 A new ``ndpointer.contents`` member
 -----------------------------------
 This matches the ``.contents`` member of normal ctypes arrays, and can be used
-to construct an ``np.array`` around the pointers contents.
-
-This replaces ``np.array(some_nd_pointer)``, which stopped working in 1.15.
-
-As a side effect of this change, ``ndpointer`` now supports dtypes with
-overlapping fields and padding.
+to construct an ``np.array`` around the pointer's contents.  This replaces
+``np.array(some_nd_pointer)``, which stopped working in 1.15.  As a side effect
+of this change, ``ndpointer`` now supports dtypes with overlapping fields and
+padding.
 
 ``matmul`` is now a ``ufunc``
 -----------------------------
 `numpy.matmul` is now a ufunc which means that both the function and the
 ``__matmul__`` operator can now be overridden by ``__array_ufunc__``. Its
-implementation has also changed, ensuring it uses the same BLAS routines as
+implementation has also changed. It uses the same BLAS routines as
 `numpy.dot`, ensuring its performance is similar for large matrices.
 
 Start and stop arrays for ``linspace``, ``logspace`` and ``geomspace``
@@ -396,6 +448,18 @@
 which has one axis prepended.  This can be used, e.g., to obtain linearly
 interpolated points between sets of points.
 
+CI extended with additional services
+------------------------------------
+We now use additional free CI services, thanks to the companies that provide:
+
+* Codecoverage testing via codecov.io
+* Arm testing via shippable.com
+* Additional test runs on azure pipelines
+
+These are in addition to our continued use of travis, appveyor (for wheels) and
+LGTM.
+
+
 Changes
 =======
 
@@ -420,16 +484,16 @@
 --------------------------------------------------------------
 Previously, ``np.lib.mixins.NDArrayOperatorsMixin`` did not implement the
 special methods for Python's matrix multiplication operator (``@``). This has
-changed now that ``matmul`` is a ufunc and can be overriden using
+changed now that ``matmul`` is a ufunc and can be overridden using
 ``__array_ufunc__``.
 
 The scaling of the covariance matrix in ``np.polyfit`` is different
 -------------------------------------------------------------------
 So far, ``np.polyfit`` used a non-standard factor in the scaling of the the
-covariance matrix. Namely, rather than using the standard chisq/(M-N), it
-scales it with chisq/(M-N-2) where M is the number of data points and N is the
+covariance matrix. Namely, rather than using the standard ``chisq/(M-N)``, it
+scaled it with ``chisq/(M-N-2)`` where M is the number of data points and N is the
 number of parameters.  This scaling is inconsistent with other fitting programs
-such as e.g. ``scipy.optimize.curve_fit`` and was changed to chisq/(M-N).
+such as e.g. ``scipy.optimize.curve_fit`` and was changed to ``chisq/(M-N)``.
 
 ``maximum`` and ``minimum`` no longer emit warnings
 ---------------------------------------------------
@@ -441,29 +505,31 @@
 
 Umath and multiarray c-extension modules merged into a single module
 --------------------------------------------------------------------
-The two modules were merged, according to the first step in `NEP 15`_.
-Previously `np.core.umath` and `np.core.multiarray` were the c-extension
-modules, they are now python wrappers to the single `np.core/_multiarray_math`
-c-extension module.
+The two modules were merged, according to `NEP 15`_. Previously `np.core.umath`
+and `np.core.multiarray` were separate c-extension modules. They are now python
+wrappers to the single `np.core/_multiarray_math` c-extension module.
 
 .. _`NEP 15` : http://www.numpy.org/neps/nep-0015-merge-multiarray-umath.html
 
 ``getfield`` validity checks extended
-----------------------------------------
+-------------------------------------
 `numpy.ndarray.getfield` now checks the dtype and offset arguments to prevent
 accessing invalid memory locations.
 
 NumPy functions now support overrides with ``__array_function__``
 -----------------------------------------------------------------
-It is now possible to override the implementation of almost all NumPy functions
-on non-NumPy arrays by defining a ``__array_function__`` method, as described
-in `NEP 18`_. The sole exception are functions for explicitly casting to NumPy
-arrays such as ``np.array``. As noted in the NEP, this feature remains
-experimental and the details of how to implement such overrides may change in
-the future.
+NumPy has a new experimental mechanism for overriding the implementation of
+almost all NumPy functions on non-NumPy arrays by defining an
+``__array_function__`` method, as described in `NEP 18`_.
 
-.. _`NEP 15` : http://www.numpy.org/neps/nep-0015-merge-multiarray-umath.html
+This feature has not yet been enabled by default, but has been released to
+facilitate experimentation by potential users. See the NEP for details on
+setting the appropriate environment variable. We expect the NumPy 1.17 release
+will enable overrides by default, which will also be more performant due to a
+new implementation written in C.
+
 .. _`NEP 18` : http://www.numpy.org/neps/nep-0018-array-function-protocol.html
+
 Arrays based off readonly buffers cannot be set ``writeable``
 -------------------------------------------------------------
 We now disallow setting the ``writeable`` flag True on arrays created
diff --git a/doc/release/1.17.0-notes.rst b/doc/release/1.17.0-notes.rst
new file mode 100644
index 0000000..73de0b1
--- /dev/null
+++ b/doc/release/1.17.0-notes.rst
@@ -0,0 +1,114 @@
+==========================
+NumPy 1.17.0 Release Notes
+==========================
+
+
+Highlights
+==========
+
+* NumPy's FFT implementation has switched to pocketfft
+
+New functions
+=============
+
+
+Deprecations
+============
+
+
+Future Changes
+==============
+
+
+Expired deprecations
+====================
+
+
+Compatibility notes
+===================
+
+
+C API changes
+=============
+
+
+New Features
+============
+
+``np.ufunc.reduce`` and related functions now accept a ``where`` mask
+---------------------------------------------------------------------
+``np.ufunc.reduce``, ``np.sum``, ``np.prod``, ``np.min``, ``np.max`` all
+now accept a ``where`` keyword argument, which can be used to tell which
+elements to include in the reduction.  For reductions that do not have an
+identity, it is necessary to also pass in an initial value (e.g.,
+``initial=np.inf`` for ``np.min``).  For instance, the equivalent of
+``nansum`` would be, ``np.sum(a, where=~np.isnan(a))``.
+
+
+``np.linalg.svd`` and ``np.linalg.pinv`` can be faster on hermitian inputs
+--------------------------------------------------------------------------
+These functions now accept a ``hermitian`` argument, matching the one added
+to ``np.linalg.matrix_rank`` in 1.14.0.
+
+
+Improvements
+============
+
+Array comparison assertions include maximum differences
+-------------------------------------------------------
+Error messages from array comparison tests such as
+`np.testing.assert_allclose` now include "max absolute difference" and
+"max relative difference," in addition to the previous "mismatch" percentage.
+This information makes it easier to update absolute and relative error
+tolerances.
+
+Replacement of the `fftpack`-based FFT module by the `pocketfft` library
+------------------------------------------------------------------------
+
+Both implementations have the same ancestor (Fortran77 `FFTPACK` by Paul N.
+Swarztrauber), but `pocketfft` contains additional modifications which
+improve both accuracy and performance in some circumstances. For FFT lengths
+containing large prime factors, `pocketfft` uses Bluestein's algorithm, which
+maintains `O(N log N)` run time complexity instead of deteriorating towards
+`O(N*N)` for prime lengths. Also, accuracy for real-valued FFTs with near-prime
+lengths has improved and is on par with complex-valued FFTs.
+
+Further improvements to ``ctypes`` support in ``np.ctypeslib``
+--------------------------------------------------------------
+A new ``np.ctypeslib.as_ctypes_type`` function has been added, which can be
+used to convert a `dtype` into a best-guess `ctypes` type. Thanks to this
+new function, ``np.ctypeslib.as_ctypes`` now supports a much wider range of
+array types, including structures, booleans, and integers of non-native
+endianness.
+
+`numpy.errstate` is now also a function decorator
+-----------------------------------------------
+
+Currently, if you have a function like::
+
+    def foo():
+        pass
+
+and you want to wrap the whole thing in `errstate`, you have to rewrite it like so::
+
+    def foo():
+        with np.errstate(...):
+            pass
+
+but with this change, you can do::
+
+    @np.errstate(...)
+    def foo():
+        pass
+
+thereby saving a level of indentation.
+
+Changes
+=======
+
+``median`` and ``percentile`` family of functions no longer warn about ``nan``
+------------------------------------------------------------------------------
+
+`numpy.median`, `numpy.percentile`, and `numpy.quantile` used to emit a
+``RuntimeWarning`` when encountering an `numpy.nan`. Since they return the
+``nan`` value, the warning is redundant and has been removed.
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 455e974..072a3b4 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -39,7 +39,7 @@
 
 # General substitutions.
 project = 'NumPy'
-copyright = '2008-2018, The SciPy community'
+copyright = '2008-2019, The SciPy community'
 
 # The default replacements for |version| and |release|, also used in various
 # other places throughout the built documents.
diff --git a/doc/source/dev/development_environment.rst b/doc/source/dev/development_environment.rst
index aa4326f..f9b438b 100644
--- a/doc/source/dev/development_environment.rst
+++ b/doc/source/dev/development_environment.rst
@@ -8,7 +8,9 @@
 
 Since NumPy contains parts written in C and Cython that need to be
 compiled before use, make sure you have the necessary compilers and Python
-development headers installed - see :ref:`building-from-source`.
+development headers installed - see :ref:`building-from-source`. Building
+NumPy as of version ``1.17`` requires a C99 compliant compiler. For
+some older compilers this may require ``export CFLAGS='-std=c99'``.
 
 Having compiled code also means that importing NumPy from the development
 sources needs some additional steps, which are explained below.  For the rest
diff --git a/doc/source/f2py/run_main_session.dat b/doc/source/f2py/run_main_session.dat
index b9a7e1b..be6cacd 100644
--- a/doc/source/f2py/run_main_session.dat
+++ b/doc/source/f2py/run_main_session.dat
@@ -8,7 +8,7 @@
 Building modules...
         Building module "scalar"...
         Wrote C/API module "scalar" to file "./scalarmodule.c"
->>> printr(r)
+>>> print(r)
 {'scalar': {'h': ['/home/users/pearu/src_cvs/f2py/src/fortranobject.h'],
 	 'csrc': ['./scalarmodule.c', 
                   '/home/users/pearu/src_cvs/f2py/src/fortranobject.c']}}
diff --git a/doc/source/f2py/usage.rst b/doc/source/f2py/usage.rst
index 0f5068e..5043ec4 100644
--- a/doc/source/f2py/usage.rst
+++ b/doc/source/f2py/usage.rst
@@ -214,32 +214,7 @@
   The current Python interface to the ``f2py`` module is not mature and
   may change in the future.
 
-The following functions are provided by the ``numpy.f2py`` module:
 
-``run_main(<list>)``
-  Equivalent to running::
+.. automodule:: numpy.f2py
+    :members:
 
-    f2py <args>
-
-  where ``<args>=string.join(<list>,' ')``, but in Python.  Unless
-  ``-h`` is used, this function returns a dictionary containing
-  information on generated modules and their dependencies on source
-  files.  For example, the command ``f2py -m scalar scalar.f`` can be
-  executed from Python as follows
-
-  .. include:: run_main_session.dat
-     :literal:
-
-  You cannot build extension modules with this function, that is,
-  using ``-c`` is not allowed. Use ``compile`` command instead, see
-  below.
-
-``compile(source, modulename='untitled', extra_args='', verbose=1, source_fn=None)``
-  Build extension module from Fortran 77 source string ``source``.
-  Return 0 if successful.
-  Note that this function actually calls ``f2py -c ..`` from shell to
-  ensure safety of the current Python process.
-  For example,
-
-  .. include:: compile_session.dat
-    :literal:
diff --git a/doc/source/reference/alignment.rst b/doc/source/reference/alignment.rst
index c749972..ebc8f35 100644
--- a/doc/source/reference/alignment.rst
+++ b/doc/source/reference/alignment.rst
@@ -34,6 +34,14 @@
 alignment of 4 and "uint" alignment of 8 (equal to the true alignment of
 ``uint64``).
 
+Some cases where uint and true alignment are different (default gcc linux):
+   arch     type        true-aln    uint-aln
+   ----     ----        --------    --------
+   x86_64   complex64          4           8
+   x86_64   float128          16           8
+   x86      float96            4           -
+
+
 Variables in Numpy which control and describe alignment
 -------------------------------------------------------
 
@@ -82,17 +90,15 @@
     appropriate N. Otherwise numpy copies by doing ``memcpy(dst, src, N)``.
  5. Nditer code: Since this often calls the strided copy code, it must
     check for "uint alignment".
- 6. Cast code: if the array is "uint aligned" this will essentially do
-    ``*dst = CASTFUNC(*src)``. If not, it does
+ 6. Cast code: This checks for "true" alignment, as it does
+    ``*dst = CASTFUNC(*src)`` if aligned. Otherwise, it does
     ``memmove(srcval, src); dstval = CASTFUNC(srcval); memmove(dst, dstval)``
     where dstval/srcval are aligned.
 
-Note that in principle, only "true alignment" is required for casting code.
-However, because the casting code and copy code are deeply intertwined they
-both use "uint" alignment. This should be safe assuming uint alignment is
-always larger than true alignment, though it can cause unnecessary buffering if
-an array is "true aligned" but not "uint aligned". If there is ever a big
-rewrite of this code it would be good to allow them to use different
-alignments.
+Note that the strided-copy and strided-cast code are deeply intertwined and so
+any arrays being processed by them must be both uint and true aligned, even
+though the copy-code only needs uint alignment and the cast code only true
+alignment.  If there is ever a big rewrite of this code it would be good to
+allow them to use different alignments.
 
 
diff --git a/doc/source/reference/arrays.indexing.rst b/doc/source/reference/arrays.indexing.rst
index 62d36e2..3a319ec 100644
--- a/doc/source/reference/arrays.indexing.rst
+++ b/doc/source/reference/arrays.indexing.rst
@@ -111,9 +111,10 @@
               [5],
               [6]]])
 
-- :const:`Ellipsis` expand to the number of ``:`` objects needed to
-  make a selection tuple of the same length as ``x.ndim``. There may
-  only be a single ellipsis present.
+- :const:`Ellipsis` expands to the number of ``:`` objects needed for the
+  selection tuple to index all dimensions. In most cases, this means that
+  length of the expanded selection tuple is ``x.ndim``. There may only be a
+  single ellipsis present.
 
   .. admonition:: Example
 
@@ -513,14 +514,10 @@
 :ref:`record array <arrays.classes.rec>` scalars can be "indexed" this way.
 
 Indexing into a structured array can also be done with a list of field names,
-*e.g.* ``x[['field-name1','field-name2']]``. Currently this returns a new
-array containing a copy of the values in the fields specified in the list.
-As of NumPy 1.7, returning a copy is being deprecated in favor of returning
-a view. A copy will continue to be returned for now, but a FutureWarning
-will be issued when writing to the copy. If you depend on the current
-behavior, then we suggest copying the returned array explicitly, i.e. use
-x[['field-name1','field-name2']].copy(). This will work with both past and
-future versions of NumPy.
+*e.g.* ``x[['field-name1','field-name2']]``. As of NumPy 1.16 this returns a
+view containing only those fields. In older versions of numpy it returned a
+copy. See the user guide section on :ref:`structured_arrays` for more
+information on multifield indexing.
 
 If the accessed field is a sub-array, the dimensions of the sub-array
 are appended to the shape of the result.
diff --git a/doc/source/reference/c-api.array.rst b/doc/source/reference/c-api.array.rst
index 76aa680..7c298e1 100644
--- a/doc/source/reference/c-api.array.rst
+++ b/doc/source/reference/c-api.array.rst
@@ -307,10 +307,10 @@
 .. c:function:: PyObject* PyArray_SimpleNewFromDescr( \
         int nd, npy_intp* dims, PyArray_Descr* descr)
 
-    This function steals a reference to *descr* if it is not NULL.
+    This function steals a reference to *descr*.
 
-    Create a new array with the provided data-type descriptor, *descr*
-    , of the shape determined by *nd* and *dims*.
+    Create a new array with the provided data-type descriptor, *descr*,
+    of the shape determined by *nd* and *dims*.
 
 .. c:function:: PyArray_FILLWBYTE(PyObject* obj, int val)
 
@@ -1904,10 +1904,10 @@
         all values are clipped to the region [0, len(*op*) ).
 
 
-.. c:function:: PyObject* PyArray_Sort(PyArrayObject* self, int axis)
+.. c:function:: PyObject* PyArray_Sort(PyArrayObject* self, int axis, NPY_SORTKIND kind)
 
-    Equivalent to :meth:`ndarray.sort<numpy.ndarray.sort>` (*self*, *axis*). Return an array with
-    the items of *self* sorted along *axis*.
+    Equivalent to :meth:`ndarray.sort<numpy.ndarray.sort>` (*self*, *axis*, *kind*). Return an array with
+the items of *self* sorted along *axis*. The array is sorted according to *kind*, which is an integer/enum specifying the type of sorting algorithm used.
 
 .. c:function:: PyObject* PyArray_ArgSort(PyArrayObject* self, int axis)
 
diff --git a/doc/source/reference/c-api.coremath.rst b/doc/source/reference/c-api.coremath.rst
index 691f732..bf08d48 100644
--- a/doc/source/reference/c-api.coremath.rst
+++ b/doc/source/reference/c-api.coremath.rst
@@ -80,8 +80,9 @@
 Useful math constants
 ~~~~~~~~~~~~~~~~~~~~~
 
-The following math constants are available in npy_math.h. Single and extended
-precision are also available by adding the F and L suffixes respectively.
+The following math constants are available in ``npy_math.h``. Single
+and extended precision are also available by adding the ``f`` and
+``l`` suffixes respectively.
 
 .. c:var:: NPY_E
 
diff --git a/doc/source/reference/maskedarray.generic.rst b/doc/source/reference/maskedarray.generic.rst
index 07ad6c2..7375d60 100644
--- a/doc/source/reference/maskedarray.generic.rst
+++ b/doc/source/reference/maskedarray.generic.rst
@@ -2,7 +2,7 @@
 
 .. _maskedarray.generic:
 
-
+.. module:: numpy.ma
 
 The :mod:`numpy.ma` module
 ==========================
diff --git a/doc/source/reference/routines.ctypeslib.rst b/doc/source/reference/routines.ctypeslib.rst
index b04713b..71b944a 100644
--- a/doc/source/reference/routines.ctypeslib.rst
+++ b/doc/source/reference/routines.ctypeslib.rst
@@ -1,3 +1,5 @@
+.. module:: numpy.ctypeslib
+
 ***********************************************************
 C-Types Foreign Function Interface (:mod:`numpy.ctypeslib`)
 ***********************************************************
diff --git a/doc/source/reference/routines.linalg.rst b/doc/source/reference/routines.linalg.rst
index 0520df4..c6bffc8 100644
--- a/doc/source/reference/routines.linalg.rst
+++ b/doc/source/reference/routines.linalg.rst
@@ -1,5 +1,7 @@
 .. _routines.linalg:
 
+.. module:: numpy.linalg
+
 Linear algebra (:mod:`numpy.linalg`)
 ************************************
 
diff --git a/doc/source/reference/routines.matlib.rst b/doc/source/reference/routines.matlib.rst
index a35eaec..c7f6754 100644
--- a/doc/source/reference/routines.matlib.rst
+++ b/doc/source/reference/routines.matlib.rst
@@ -1,3 +1,5 @@
+.. module:: numpy.matlib
+
 Matrix library (:mod:`numpy.matlib`)
 ************************************
 
diff --git a/doc/source/reference/routines.polynomials.package.rst b/doc/source/reference/routines.polynomials.package.rst
index 61cb57f..7e40d9f 100644
--- a/doc/source/reference/routines.polynomials.package.rst
+++ b/doc/source/reference/routines.polynomials.package.rst
@@ -1,3 +1,5 @@
+.. module:: numpy.polynomial
+
 Polynomial Package
 ==================
 
diff --git a/doc/source/reference/routines.polynomials.polynomial.rst b/doc/source/reference/routines.polynomials.polynomial.rst
index 8194ca8..365c8da 100644
--- a/doc/source/reference/routines.polynomials.polynomial.rst
+++ b/doc/source/reference/routines.polynomials.polynomial.rst
@@ -1,3 +1,5 @@
+.. module:: numpy.polynomial.polynomial
+
 Polynomial Module (:mod:`numpy.polynomial.polynomial`)
 ======================================================
 
diff --git a/doc/source/reference/routines.random.rst b/doc/source/reference/routines.random.rst
index c8b097d..cda4e2b 100644
--- a/doc/source/reference/routines.random.rst
+++ b/doc/source/reference/routines.random.rst
@@ -1,5 +1,7 @@
 .. _routines.random:
 
+.. module:: numpy.random
+
 Random sampling (:mod:`numpy.random`)
 *************************************
 
diff --git a/doc/source/reference/routines.testing.rst b/doc/source/reference/routines.testing.rst
index 5a52a40..77c0467 100644
--- a/doc/source/reference/routines.testing.rst
+++ b/doc/source/reference/routines.testing.rst
@@ -1,5 +1,7 @@
 .. _numpy-testing:
 
+.. module:: numpy.testing
+
 Test Support (:mod:`numpy.testing`)
 ===================================
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 1cf2155..11a25d1 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -2,6 +2,7 @@
 Release Notes
 *************
 
+.. include:: ../release/1.17.0-notes.rst
 .. include:: ../release/1.16.0-notes.rst
 .. include:: ../release/1.15.4-notes.rst
 .. include:: ../release/1.15.3-notes.rst
diff --git a/doc/source/user/numpy-for-matlab-users.rst b/doc/source/user/numpy-for-matlab-users.rst
index 399237c..16ee48c 100644
--- a/doc/source/user/numpy-for-matlab-users.rst
+++ b/doc/source/user/numpy-for-matlab-users.rst
@@ -547,7 +547,7 @@
      - eigenvalues and eigenvectors of ``a``
 
    * - ``[V,D]=eig(a,b)``
-     - ``V,D = np.linalg.eig(a,b)``
+     - ``D,V = scipy.linalg.eig(a,b)``
      - eigenvalues and eigenvectors of ``a``, ``b``
 
    * - ``[V,D]=eigs(a,k)``
@@ -693,19 +693,19 @@
 
 ::
 
-    # Make all numpy available via shorter 'num' prefix
-    import numpy as num
+    # Make all numpy available via shorter 'np' prefix
+    import numpy as np
     # Make all matlib functions accessible at the top level via M.func()
     import numpy.matlib as M
     # Make some matlib functions accessible directly at the top level via, e.g. rand(3,3)
     from numpy.matlib import rand,zeros,ones,empty,eye
     # Define a Hermitian function
     def hermitian(A, **kwargs):
-        return num.transpose(A,**kwargs).conj()
+        return np.transpose(A,**kwargs).conj()
     # Make some shortcuts for transpose,hermitian:
-    #    num.transpose(A) --> T(A)
+    #    np.transpose(A) --> T(A)
     #    hermitian(A) --> H(A)
-    T = num.transpose
+    T = np.transpose
     H = hermitian
 
 Links
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 668aee9..0727a72 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -49,7 +49,7 @@
     >>> x = np.arange(6).reshape(2, 3)
     >>> fl = x.flat
     >>> type(fl)
-    <type 'numpy.flatiter'>
+    <class 'numpy.flatiter'>
     >>> for item in fl:
     ...     print(item)
     ...
@@ -320,71 +320,68 @@
     Here is how we might write an ``iter_add`` function, using the
     Python iterator protocol::
 
-        def iter_add_py(x, y, out=None):
-            addop = np.add
-            it = np.nditer([x, y, out], [],
-                        [['readonly'], ['readonly'], ['writeonly','allocate']])
-            with it:
-                for (a, b, c) in it:
-                    addop(a, b, out=c)
-            return it.operands[2]
+    >>> def iter_add_py(x, y, out=None):
+    ...     addop = np.add
+    ...     it = np.nditer([x, y, out], [],
+    ...                 [['readonly'], ['readonly'], ['writeonly','allocate']])
+    ...     with it:
+    ...         for (a, b, c) in it:
+    ...             addop(a, b, out=c)
+    ...     return it.operands[2]
 
     Here is the same function, but following the C-style pattern::
 
-        def iter_add(x, y, out=None):
-            addop = np.add
-
-            it = np.nditer([x, y, out], [],
-                        [['readonly'], ['readonly'], ['writeonly','allocate']])
-            with it:
-                while not it.finished:
-                    addop(it[0], it[1], out=it[2])
-                    it.iternext()
-
-                return it.operands[2]
+    >>> def iter_add(x, y, out=None):
+    ...    addop = np.add
+    ...    it = np.nditer([x, y, out], [],
+    ...                [['readonly'], ['readonly'], ['writeonly','allocate']])
+    ...    with it:
+    ...        while not it.finished:
+    ...            addop(it[0], it[1], out=it[2])
+    ...            it.iternext()
+    ...        return it.operands[2]
 
     Here is an example outer product function::
 
-        def outer_it(x, y, out=None):
-            mulop = np.multiply
+    >>> def outer_it(x, y, out=None):
+    ...     mulop = np.multiply
+    ...     it = np.nditer([x, y, out], ['external_loop'],
+    ...             [['readonly'], ['readonly'], ['writeonly', 'allocate']],
+    ...             op_axes=[list(range(x.ndim)) + [-1] * y.ndim,
+    ...                      [-1] * x.ndim + list(range(y.ndim)),
+    ...                      None])
+    ...     with it:
+    ...         for (a, b, c) in it:
+    ...             mulop(a, b, out=c)
+    ...         return it.operands[2]
 
-            it = np.nditer([x, y, out], ['external_loop'],
-                    [['readonly'], ['readonly'], ['writeonly', 'allocate']],
-                    op_axes=[list(range(x.ndim)) + [-1] * y.ndim,
-                             [-1] * x.ndim + list(range(y.ndim)),
-                             None])
-            with it:
-                for (a, b, c) in it:
-                    mulop(a, b, out=c)
-                return it.operands[2]
-
-        >>> a = np.arange(2)+1
-        >>> b = np.arange(3)+1
-        >>> outer_it(a,b)
-        array([[1, 2, 3],
-               [2, 4, 6]])
+    >>> a = np.arange(2)+1
+    >>> b = np.arange(3)+1
+    >>> outer_it(a,b)
+    array([[1, 2, 3],
+           [2, 4, 6]])
 
     Here is an example function which operates like a "lambda" ufunc::
 
-        def luf(lamdaexpr, *args, **kwargs):
-            "luf(lambdaexpr, op1, ..., opn, out=None, order='K', casting='safe', buffersize=0)"
-            nargs = len(args)
-            op = (kwargs.get('out',None),) + args
-            it = np.nditer(op, ['buffered','external_loop'],
-                    [['writeonly','allocate','no_broadcast']] +
-                                    [['readonly','nbo','aligned']]*nargs,
-                    order=kwargs.get('order','K'),
-                    casting=kwargs.get('casting','safe'),
-                    buffersize=kwargs.get('buffersize',0))
-            while not it.finished:
-                it[0] = lamdaexpr(*it[1:])
-                it.iternext()
-                return it.operands[0]
+    >>> def luf(lamdaexpr, *args, **kwargs):
+    ...    '''luf(lambdaexpr, op1, ..., opn, out=None, order='K', casting='safe', buffersize=0)'''
+    ...    nargs = len(args)
+    ...    op = (kwargs.get('out',None),) + args
+    ...    it = np.nditer(op, ['buffered','external_loop'],
+    ...            [['writeonly','allocate','no_broadcast']] +
+    ...                            [['readonly','nbo','aligned']]*nargs,
+    ...            order=kwargs.get('order','K'),
+    ...            casting=kwargs.get('casting','safe'),
+    ...            buffersize=kwargs.get('buffersize',0))
+    ...    while not it.finished:
+    ...        it[0] = lamdaexpr(*it[1:])
+    ...        it.iternext()
+    ...        return it.operands[0]
 
-        >>> a = np.arange(5)
-        >>> b = np.ones(5)
-        >>> luf(lambda i,j:i*i + j/2, a, b)
-        array([  0.5,   1.5,   4.5,   9.5,  16.5])
+    >>> a = np.arange(5)
+    >>> b = np.ones(5)
+    >>> luf(lambda i,j:i*i + j/2, a, b)
+    array([  0.5,   1.5,   4.5,   9.5,  16.5])
 
     If operand flags `"writeonly"` or `"readwrite"` are used the operands may
     be views into the original data with the `WRITEBACKIFCOPY` flag. In this case
@@ -393,16 +390,16 @@
     data will be written back to the original data when the `__exit__`
     function is called but not before:
 
-        >>> a = np.arange(6, dtype='i4')[::-2]
-        >>> with nditer(a, [],
-        ...        [['writeonly', 'updateifcopy']],
-        ...        casting='unsafe',
-        ...        op_dtypes=[np.dtype('f4')]) as i:
-        ...    x = i.operands[0]
-        ...    x[:] = [-1, -2, -3]
-        ...    # a still unchanged here
-        >>> a, x
-        array([-1, -2, -3]), array([-1, -2, -3])
+    >>> a = np.arange(6, dtype='i4')[::-2]
+    >>> with np.nditer(a, [],
+    ...        [['writeonly', 'updateifcopy']],
+    ...        casting='unsafe',
+    ...        op_dtypes=[np.dtype('f4')]) as i:
+    ...    x = i.operands[0]
+    ...    x[:] = [-1, -2, -3]
+    ...    # a still unchanged here
+    >>> a, x
+    (array([-1, -2, -3], dtype=int32), array([-1., -2., -3.], dtype=float32))
 
     It is important to note that once the iterator is exited, dangling
     references (like `x` in the example) may or may not share data with
@@ -428,10 +425,10 @@
     >>> x = np.arange(10)
     >>> y = x + 1
     >>> it = np.nditer([x, y])
-    >>> it.next()
+    >>> next(it)
     (array(0), array(1))
     >>> it2 = it.copy()
-    >>> it2.next()
+    >>> next(it2)
     (array(1), array(2))
 
     """))
@@ -544,7 +541,6 @@
     ...      print(i.multi_index)
     ...      for y in j:
     ...          print('', j.multi_index, y)
-
     (0,)
      (0, 0) 0
      (0, 1) 1
@@ -617,9 +613,9 @@
     >>> out = np.empty(b.shape)
     >>> out.flat = [u+v for (u,v) in b]
     >>> out
-    array([[ 5.,  6.,  7.],
-           [ 6.,  7.,  8.],
-           [ 7.,  8.,  9.]])
+    array([[5.,  6.,  7.],
+           [6.,  7.,  8.],
+           [7.,  8.,  9.]])
 
     Compare against built-in broadcasting:
 
@@ -643,7 +639,7 @@
     >>> b = np.broadcast(x, y)
     >>> b.index
     0
-    >>> b.next(), b.next(), b.next()
+    >>> next(b), next(b), next(b)
     ((1, 4), (1, 5), (1, 6))
     >>> b.index
     3
@@ -762,11 +758,11 @@
     Examples
     --------
     >>> x = np.array([1, 2, 3])
-    >>> y = np.array([[4], [5], [6]]
+    >>> y = np.array([[4], [5], [6]])
     >>> b = np.broadcast(x, y)
     >>> b.index
     0
-    >>> b.next(), b.next(), b.next()
+    >>> next(b), next(b), next(b)
     ((1, 4), (2, 4), (3, 4))
     >>> b.index
     3
@@ -1189,32 +1185,32 @@
     --------
     Construct an ndarray:
 
-    >>> dt = np.dtype([('time', [('min', int), ('sec', int)]),
+    >>> dt = np.dtype([('time', [('min', np.int64), ('sec', np.int64)]),
     ...                ('temp', float)])
     >>> x = np.zeros((1,), dtype=dt)
     >>> x['time']['min'] = 10; x['temp'] = 98.25
     >>> x
     array([((10, 0), 98.25)],
-          dtype=[('time', [('min', '<i4'), ('sec', '<i4')]), ('temp', '<f8')])
+          dtype=[('time', [('min', '<i8'), ('sec', '<i8')]), ('temp', '<f8')])
 
     Save the raw data to disk:
 
-    >>> import os
-    >>> fname = os.tmpnam()
+    >>> import tempfile
+    >>> fname = tempfile.mkstemp()[1]
     >>> x.tofile(fname)
 
     Read the raw data from disk:
 
     >>> np.fromfile(fname, dtype=dt)
     array([((10, 0), 98.25)],
-          dtype=[('time', [('min', '<i4'), ('sec', '<i4')]), ('temp', '<f8')])
+          dtype=[('time', [('min', '<i8'), ('sec', '<i8')]), ('temp', '<f8')])
 
     The recommended way to store and load data:
 
     >>> np.save(fname, x)
     >>> np.load(fname + '.npy')
     array([((10, 0), 98.25)],
-          dtype=[('time', [('min', '<i4'), ('sec', '<i4')]), ('temp', '<f8')])
+          dtype=[('time', [('min', '<i8'), ('sec', '<i8')]), ('temp', '<f8')])
 
     """)
 
@@ -1242,17 +1238,16 @@
 
       >>> dt = np.dtype(int)
       >>> dt = dt.newbyteorder('>')
-      >>> np.frombuffer(buf, dtype=dt)
+      >>> np.frombuffer(buf, dtype=dt) # doctest: +SKIP
 
     The data of the resulting array will not be byteswapped, but will be
     interpreted correctly.
 
     Examples
     --------
-    >>> s = 'hello world'
+    >>> s = b'hello world'
     >>> np.frombuffer(s, dtype='S1', count=5, offset=6)
-    array(['w', 'o', 'r', 'l', 'd'],
-          dtype='|S1')
+    array([b'w', b'o', b'r', b'l', b'd'], dtype='|S1')
 
     >>> np.frombuffer(b'\\x01\\x02', dtype=np.uint8)
     array([1, 2], dtype=uint8)
@@ -1941,8 +1936,8 @@
     First mode, `buffer` is None:
 
     >>> np.ndarray(shape=(2,2), dtype=float, order='F')
-    array([[ -1.13698227e+002,   4.25087011e-303],
-           [  2.88528414e-306,   3.27025015e-309]])         #random
+    array([[0.0e+000, 0.0e+000], # random
+           [     nan, 2.5e-323]])
 
     Second mode:
 
@@ -2047,14 +2042,6 @@
 
     .. automethod:: numpy.core._internal._ctypes.strides_as
 
-    Be careful using the ctypes attribute - especially on temporary
-    arrays or arrays constructed on the fly. For example, calling
-    ``(a+b).ctypes.data_as(ctypes.c_void_p)`` returns a pointer to memory
-    that is invalid because the array created as (a+b) is deallocated
-    before the next Python statement. You can avoid this problem using
-    either ``c=a+b`` or ``ct=(a+b).ctypes``. In the latter case, ct will
-    hold a reference to the array until ct is deleted or re-assigned.
-
     If the ctypes module is not available, then the ctypes attribute
     of array objects still returns something useful, but ctypes objects
     are not returned and errors may be raised instead. In particular,
@@ -2256,7 +2243,7 @@
     >>> x.T.flat[3]
     5
     >>> type(x.flat)
-    <type 'numpy.flatiter'>
+    <class 'numpy.flatiter'>
 
     An assignment example:
 
@@ -2706,7 +2693,7 @@
     --------
     >>> x = np.array([1, 2, 2.5])
     >>> x
-    array([ 1. ,  2. ,  2.5])
+    array([1. ,  2. ,  2.5])
 
     >>> x.astype(int)
     array([1, 2, 2])
@@ -2737,19 +2724,20 @@
     Examples
     --------
     >>> A = np.array([1, 256, 8755], dtype=np.int16)
-    >>> map(hex, A)
+    >>> list(map(hex, A))
     ['0x1', '0x100', '0x2233']
     >>> A.byteswap(inplace=True)
     array([  256,     1, 13090], dtype=int16)
-    >>> map(hex, A)
+    >>> list(map(hex, A))
     ['0x100', '0x1', '0x3322']
 
     Arrays of strings are not swapped
 
     >>> A = np.array(['ceg', 'fac'])
     >>> A.byteswap()
-    array(['ceg', 'fac'],
-          dtype='|S3')
+    Traceback (most recent call last):
+        ...
+    UnicodeDecodeError: ...
 
     """))
 
@@ -2937,14 +2925,14 @@
     >>> a = np.eye(2)
     >>> b = np.ones((2, 2)) * 2
     >>> a.dot(b)
-    array([[ 2.,  2.],
-           [ 2.,  2.]])
+    array([[2.,  2.],
+           [2.,  2.]])
 
     This array method can be conveniently chained:
 
     >>> a.dot(b).dot(b)
-    array([[ 8.,  8.],
-           [ 8.,  8.]])
+    array([[8.,  8.],
+           [8.,  8.]])
 
     """))
 
@@ -2997,7 +2985,7 @@
     >>> a = np.empty(2)
     >>> a.fill(1)
     >>> a
-    array([ 1.,  1.])
+    array([1.,  1.])
 
     """))
 
@@ -3066,18 +3054,18 @@
     >>> x = np.diag([1.+1.j]*2)
     >>> x[1, 1] = 2 + 4.j
     >>> x
-    array([[ 1.+1.j,  0.+0.j],
-           [ 0.+0.j,  2.+4.j]])
+    array([[1.+1.j,  0.+0.j],
+           [0.+0.j,  2.+4.j]])
     >>> x.getfield(np.float64)
-    array([[ 1.,  0.],
-           [ 0.,  2.]])
+    array([[1.,  0.],
+           [0.,  2.]])
 
     By choosing an offset of 8 bytes we can select the complex part of the
     array for our view:
 
     >>> x.getfield(np.float64, offset=8)
-    array([[ 1.,  0.],
-       [ 0.,  4.]])
+    array([[1.,  0.],
+           [0.,  4.]])
 
     """))
 
@@ -3123,19 +3111,20 @@
 
     Examples
     --------
+    >>> np.random.seed(123)
     >>> x = np.random.randint(9, size=(3, 3))
     >>> x
-    array([[3, 1, 7],
-           [2, 8, 3],
-           [8, 5, 3]])
+    array([[2, 2, 6],
+           [1, 3, 6],
+           [1, 0, 1]])
     >>> x.item(3)
-    2
-    >>> x.item(7)
-    5
-    >>> x.item((0, 1))
     1
+    >>> x.item(7)
+    0
+    >>> x.item((0, 1))
+    2
     >>> x.item((2, 2))
-    3
+    1
 
     """))
 
@@ -3171,24 +3160,25 @@
 
     Examples
     --------
+    >>> np.random.seed(123)
     >>> x = np.random.randint(9, size=(3, 3))
     >>> x
-    array([[3, 1, 7],
-           [2, 8, 3],
-           [8, 5, 3]])
+    array([[2, 2, 6],
+           [1, 3, 6],
+           [1, 0, 1]])
     >>> x.itemset(4, 0)
     >>> x.itemset((2, 2), 9)
     >>> x
-    array([[3, 1, 7],
-           [2, 0, 3],
-           [8, 5, 9]])
+    array([[2, 2, 6],
+           [1, 0, 6],
+           [1, 0, 9]])
 
     """))
 
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('max',
     """
-    a.max(axis=None, out=None, keepdims=False)
+    a.max(axis=None, out=None, keepdims=False, initial=<no value>, where=True)
 
     Return the maximum along a given axis.
 
@@ -3218,7 +3208,7 @@
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('min',
     """
-    a.min(axis=None, out=None, keepdims=False)
+    a.min(axis=None, out=None, keepdims=False, initial=<no value>, where=True)
 
     Return the minimum along a given axis.
 
@@ -3371,7 +3361,7 @@
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('prod',
     """
-    a.prod(axis=None, dtype=None, out=None, keepdims=False)
+    a.prod(axis=None, dtype=None, out=None, keepdims=False, initial=1, where=True)
 
     Return the product of the array elements over the given axis
 
@@ -3622,7 +3612,7 @@
     >>> a.resize((1, 1))
     Traceback (most recent call last):
     ...
-    ValueError: cannot resize an array that has been referenced ...
+    ValueError: cannot resize an array that references or is referenced ...
 
     Unless `refcheck` is False:
 
@@ -3695,23 +3685,23 @@
     --------
     >>> x = np.eye(3)
     >>> x.getfield(np.float64)
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
+    array([[1.,  0.,  0.],
+           [0.,  1.,  0.],
+           [0.,  0.,  1.]])
     >>> x.setfield(3, np.int32)
     >>> x.getfield(np.int32)
     array([[3, 3, 3],
            [3, 3, 3],
-           [3, 3, 3]])
+           [3, 3, 3]], dtype=int32)
     >>> x
-    array([[  1.00000000e+000,   1.48219694e-323,   1.48219694e-323],
-           [  1.48219694e-323,   1.00000000e+000,   1.48219694e-323],
-           [  1.48219694e-323,   1.48219694e-323,   1.00000000e+000]])
+    array([[1.0e+000, 1.5e-323, 1.5e-323],
+           [1.5e-323, 1.0e+000, 1.5e-323],
+           [1.5e-323, 1.5e-323, 1.0e+000]])
     >>> x.setfield(np.eye(3), np.int32)
     >>> x
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
+    array([[1.,  0.,  0.],
+           [0.,  1.,  0.],
+           [0.,  0.,  1.]])
 
     """))
 
@@ -3764,6 +3754,9 @@
 
     Examples
     --------
+    >>> y = np.array([[3, 1, 7],
+    ...               [2, 0, 0],
+    ...               [8, 5, 9]])
     >>> y
     array([[3, 1, 7],
            [2, 0, 0],
@@ -3843,8 +3836,8 @@
     >>> a = np.array([('a', 2), ('c', 1)], dtype=[('x', 'S1'), ('y', int)])
     >>> a.sort(order='y')
     >>> a
-    array([('c', 1), ('a', 2)],
-          dtype=[('x', '|S1'), ('y', '<i4')])
+    array([(b'c', 1), (b'a', 2)],
+          dtype=[('x', 'S1'), ('y', '<i8')])
 
     """))
 
@@ -3900,6 +3893,7 @@
     array([2, 1, 3, 4])
 
     >>> a.partition((1, 3))
+    >>> a
     array([1, 2, 3, 4])
     """))
 
@@ -3936,7 +3930,7 @@
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('sum',
     """
-    a.sum(axis=None, dtype=None, out=None, keepdims=False)
+    a.sum(axis=None, dtype=None, out=None, keepdims=False, initial=0, where=True)
 
     Return the sum of the array elements over the given axis.
 
@@ -4023,10 +4017,14 @@
     """
     a.tolist()
 
-    Return the array as a (possibly nested) list.
+    Return the array as an ``a.ndim``-levels deep nested list of Python scalars.
 
     Return a copy of the array data as a (nested) Python list.
-    Data items are converted to the nearest compatible Python type.
+    Data items are converted to the nearest compatible builtin Python type, via
+    the `~numpy.ndarray.item` function.
+
+    If ``a.ndim`` is 0, then since the depth of the nested list is 0, it will
+    not be a list at all, but a simple Python scalar.
 
     Parameters
     ----------
@@ -4034,24 +4032,41 @@
 
     Returns
     -------
-    y : list
+    y : object, or list of object, or list of list of object, or ...
         The possibly nested list of array elements.
 
     Notes
     -----
-    The array may be recreated, ``a = np.array(a.tolist())``.
+    The array may be recreated via ``a = np.array(a.tolist())``, although this
+    may sometimes lose precision.
 
     Examples
     --------
+    For a 1D array, ``a.tolist()`` is almost the same as ``list(a)``:
+
     >>> a = np.array([1, 2])
+    >>> list(a)
+    [1, 2]
     >>> a.tolist()
     [1, 2]
+
+    However, for a 2D array, ``tolist`` applies recursively:
+
     >>> a = np.array([[1, 2], [3, 4]])
     >>> list(a)
     [array([1, 2]), array([3, 4])]
     >>> a.tolist()
     [[1, 2], [3, 4]]
 
+    The base case for this recursion is a 0D array:
+
+    >>> a = np.array(1)
+    >>> list(a)
+    Traceback (most recent call last):
+      ...
+    TypeError: iteration over a 0-d array
+    >>> a.tolist()
+    1
     """))
 
 
@@ -4081,13 +4096,13 @@
 
     Examples
     --------
-    >>> x = np.array([[0, 1], [2, 3]])
+    >>> x = np.array([[0, 1], [2, 3]], dtype='<u2')
     >>> x.tobytes()
-    b'\\x00\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00\\x03\\x00\\x00\\x00'
+    b'\\x00\\x00\\x01\\x00\\x02\\x00\\x03\\x00'
     >>> x.tobytes('C') == x.tobytes()
     True
     >>> x.tobytes('F')
-    b'\\x00\\x00\\x00\\x00\\x02\\x00\\x00\\x00\\x01\\x00\\x00\\x00\\x03\\x00\\x00\\x00'
+    b'\\x00\\x00\\x02\\x00\\x01\\x00\\x03\\x00'
 
     """
 
@@ -4237,7 +4252,7 @@
     >>> y
     matrix([[513]], dtype=int16)
     >>> print(type(y))
-    <class 'numpy.matrixlib.defmatrix.matrix'>
+    <class 'numpy.matrix'>
 
     Creating a view on a structured array so it can be used in calculations
 
@@ -4247,19 +4262,19 @@
     array([[1, 2],
            [3, 4]], dtype=int8)
     >>> xv.mean(0)
-    array([ 2.,  3.])
+    array([2.,  3.])
 
     Making changes to the view changes the underlying array
 
     >>> xv[0,1] = 20
-    >>> print(x)
-    [(1, 20) (3, 4)]
+    >>> x
+    array([(1, 20), (3,  4)], dtype=[('a', 'i1'), ('b', 'i1')])
 
     Using a view to convert an array to a recarray:
 
     >>> z = x.view(np.recarray)
     >>> z.a
-    array([1], dtype=int8)
+    array([1, 3], dtype=int8)
 
     Views share data:
 
@@ -4277,8 +4292,8 @@
            [4, 5]], dtype=int16)
     >>> y.view(dtype=[('width', np.int16), ('length', np.int16)])
     Traceback (most recent call last):
-      File "<stdin>", line 1, in <module>
-    ValueError: new type not compatible with array.
+        ...
+    ValueError: To change to a dtype of a different size, the array must be C-contiguous
     >>> z = y.copy()
     >>> z.view(dtype=[('width', np.int16), ('length', np.int16)])
     array([[(1, 2)],
@@ -4329,10 +4344,9 @@
 
     >>> oct_array = np.frompyfunc(oct, 1, 1)
     >>> oct_array(np.array((10, 30, 100)))
-    array([012, 036, 0144], dtype=object)
+    array(['0o12', '0o36', '0o144'], dtype=object)
     >>> np.array((oct(10), oct(30), oct(100))) # for comparison
-    array(['012', '036', '0144'],
-          dtype='|S4')
+    array(['0o12', '0o36', '0o144'], dtype='<U5')
 
     """)
 
@@ -4394,7 +4408,7 @@
     >>> np.base_repr(np.geterrobj()[1], 8)
     '0'
     >>> old_err = np.seterr(divide='warn', over='log', under='call',
-                            invalid='print')
+    ...                     invalid='print')
     >>> np.base_repr(np.geterrobj()[1], 8)
     '4351'
 
@@ -4540,7 +4554,10 @@
     ...                [0,0,1]]])
     >>> b = np.packbits(a, axis=-1)
     >>> b
-    array([[[160],[64]],[[192],[32]]], dtype=uint8)
+    array([[[160],
+            [ 64]],
+           [[192],
+            [ 32]]], dtype=uint8)
 
     Note that in binary 160 = 1010 0000, 64 = 0100 0000, 192 = 1100 0000,
     and 32 = 0010 0000.
@@ -4880,7 +4897,7 @@
 
 add_newdoc('numpy.core', 'ufunc', ('reduce',
     """
-    reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial)
+    reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial=<no value>, where=True)
 
     Reduces `a`'s dimension by one, by applying ufunc along one axis.
 
@@ -4945,6 +4962,14 @@
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        A boolean array which is broadcasted to match the dimensions
+        of `a`, and selects elements to include in the reduction. Note
+        that for ufuncs like ``minimum`` that do not have an identity
+        defined, one must also pass in ``initial``.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     r : ndarray
@@ -4976,19 +5001,24 @@
     array([[ 1,  5],
            [ 9, 13]])
 
-    You can use the ``initial`` keyword argument to initialize the reduction with a
-    different value.
+    You can use the ``initial`` keyword argument to initialize the reduction
+    with a different value, and ``where`` to select specific elements to include:
 
     >>> np.add.reduce([10], initial=5)
     15
-    >>> np.add.reduce(np.ones((2, 2, 2)), axis=(0, 2), initializer=10)
+    >>> np.add.reduce(np.ones((2, 2, 2)), axis=(0, 2), initial=10)
     array([14., 14.])
+    >>> a = np.array([10., np.nan, 10])
+    >>> np.add.reduce(a, where=~np.isnan(a))
+    20.0
 
     Allows reductions of empty arrays where they would normally fail, i.e.
     for ufuncs without an identity.
 
     >>> np.minimum.reduce([], initial=np.inf)
     inf
+    >>> np.minimum.reduce([[1., 2.], [3., 4.]], initial=10., where=[True, False])
+    array([ 1., 10.])
     >>> np.minimum.reduce([])
     Traceback (most recent call last):
         ...
@@ -5054,23 +5084,23 @@
 
     >>> I = np.eye(2)
     >>> I
-    array([[ 1.,  0.],
-           [ 0.,  1.]])
+    array([[1.,  0.],
+           [0.,  1.]])
 
     Accumulate along axis 0 (rows), down columns:
 
     >>> np.add.accumulate(I, 0)
-    array([[ 1.,  0.],
-           [ 1.,  1.]])
+    array([[1.,  0.],
+           [1.,  1.]])
     >>> np.add.accumulate(I) # no axis specified = axis zero
-    array([[ 1.,  0.],
-           [ 1.,  1.]])
+    array([[1.,  0.],
+           [1.,  1.]])
 
     Accumulate along axis 1 (columns), through rows:
 
     >>> np.add.accumulate(I, 1)
-    array([[ 1.,  1.],
-           [ 0.,  1.]])
+    array([[1.,  1.],
+           [0.,  1.]])
 
     """))
 
@@ -5147,10 +5177,10 @@
 
     >>> x = np.linspace(0, 15, 16).reshape(4,4)
     >>> x
-    array([[  0.,   1.,   2.,   3.],
-           [  4.,   5.,   6.,   7.],
-           [  8.,   9.,  10.,  11.],
-           [ 12.,  13.,  14.,  15.]])
+    array([[ 0.,   1.,   2.,   3.],
+           [ 4.,   5.,   6.,   7.],
+           [ 8.,   9.,  10.,  11.],
+           [12.,  13.,  14.,  15.]])
 
     ::
 
@@ -5162,11 +5192,11 @@
      # [row1 + row2 + row3 + row4]
 
     >>> np.add.reduceat(x, [0, 3, 1, 2, 0])
-    array([[ 12.,  15.,  18.,  21.],
-           [ 12.,  13.,  14.,  15.],
-           [  4.,   5.,   6.,   7.],
-           [  8.,   9.,  10.,  11.],
-           [ 24.,  28.,  32.,  36.]])
+    array([[12.,  15.,  18.,  21.],
+           [12.,  13.,  14.,  15.],
+           [ 4.,   5.,   6.,   7.],
+           [ 8.,   9.,  10.,  11.],
+           [24.,  28.,  32.,  36.]])
 
     ::
 
@@ -5174,10 +5204,10 @@
      # [col1 * col2 * col3, col4]
 
     >>> np.multiply.reduceat(x, [0, 3], 1)
-    array([[    0.,     3.],
-           [  120.,     7.],
-           [  720.,    11.],
-           [ 2184.,    15.]])
+    array([[   0.,     3.],
+           [ 120.,     7.],
+           [ 720.,    11.],
+           [2184.,    15.]])
 
     """))
 
@@ -5276,14 +5306,14 @@
 
     >>> a = np.array([1, 2, 3, 4])
     >>> np.negative.at(a, [0, 1])
-    >>> print(a)
-    array([-1, -2, 3, 4])
+    >>> a
+    array([-1, -2,  3,  4])
 
     Increment items 0 and 1, and increment item 2 twice:
 
     >>> a = np.array([1, 2, 3, 4])
     >>> np.add.at(a, [0, 1, 2, 2], 1)
-    >>> print(a)
+    >>> a
     array([2, 3, 5, 4])
 
     Add items 0 and 1 in first array to second array,
@@ -5292,7 +5322,7 @@
     >>> a = np.array([1, 2, 3, 4])
     >>> b = np.array([1, 2])
     >>> np.add.at(a, [0, 1], b)
-    >>> print(a)
+    >>> a
     array([2, 4, 3, 4])
 
     """))
@@ -5357,13 +5387,13 @@
     Structured type, two fields: the first field contains an unsigned int, the
     second an int32:
 
-    >>> np.dtype([('f1', np.uint), ('f2', np.int32)])
-    dtype([('f1', '<u4'), ('f2', '<i4')])
+    >>> np.dtype([('f1', np.uint64), ('f2', np.int32)])
+    dtype([('f1', '<u8'), ('f2', '<i4')])
 
     Using array-protocol type strings:
 
     >>> np.dtype([('a','f8'),('b','S10')])
-    dtype([('a', '<f8'), ('b', '|S10')])
+    dtype([('a', '<f8'), ('b', 'S10')])
 
     Using comma-separated field formats.  The shape is (2,3):
 
@@ -5373,24 +5403,24 @@
     Using tuples.  ``int`` is a fixed type, 3 the field's shape.  ``void``
     is a flexible type, here of size 10:
 
-    >>> np.dtype([('hello',(int,3)),('world',np.void,10)])
-    dtype([('hello', '<i4', 3), ('world', '|V10')])
+    >>> np.dtype([('hello',(np.int64,3)),('world',np.void,10)])
+    dtype([('hello', '<i8', (3,)), ('world', 'V10')])
 
     Subdivide ``int16`` into 2 ``int8``'s, called x and y.  0 and 1 are
     the offsets in bytes:
 
     >>> np.dtype((np.int16, {'x':(np.int8,0), 'y':(np.int8,1)}))
-    dtype(('<i2', [('x', '|i1'), ('y', '|i1')]))
+    dtype((numpy.int16, [('x', 'i1'), ('y', 'i1')]))
 
     Using dictionaries.  Two fields named 'gender' and 'age':
 
     >>> np.dtype({'names':['gender','age'], 'formats':['S1',np.uint8]})
-    dtype([('gender', '|S1'), ('age', '|u1')])
+    dtype([('gender', 'S1'), ('age', 'u1')])
 
     Offsets in bytes, here 0 and 25:
 
     >>> np.dtype({'surname':('S25',0),'age':(np.uint8,25)})
-    dtype([('surname', '|S25'), ('age', '|u1')])
+    dtype([('surname', 'S25'), ('age', 'u1')])
 
     """)
 
@@ -5794,7 +5824,7 @@
     ...             holidays=['2011-07-01', '2011-07-04', '2011-07-17'])
     >>> # Default is Monday to Friday weekdays
     ... bdd.weekmask
-    array([ True,  True,  True,  True,  True, False, False], dtype='bool')
+    array([ True,  True,  True,  True,  True, False, False])
     >>> # Any holidays already on the weekend are removed
     ... bdd.holidays
     array(['2011-07-01', '2011-07-04'], dtype='datetime64[D]')
@@ -5891,7 +5921,7 @@
     as a timedelta
 
     >>> np.datetime64('2010', np.datetime_data(dt_25s))
-    numpy.datetime64('2010-01-01T00:00:00', '25s')
+    numpy.datetime64('2010-01-01T00:00:00','25s')
     """)
 
 
@@ -6725,25 +6755,25 @@
 add_newdoc('numpy.core.numerictypes', 'number',
     """
     Abstract base class of all numeric scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'integer',
     """
     Abstract base class of all integer scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'signedinteger',
     """
     Abstract base class of all signed integer scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'unsignedinteger',
     """
     Abstract base class of all unsigned integer scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'inexact',
@@ -6751,20 +6781,20 @@
     Abstract base class of all numeric scalar types with a (potentially)
     inexact representation of the values in its range, such as
     floating-point numbers.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'floating',
     """
     Abstract base class of all floating-point scalar types.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'complexfloating',
     """
     Abstract base class of all complex number scalar types that are made up of
     floating-point numbers.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'flexible',
@@ -6772,13 +6802,13 @@
     Abstract base class of all scalar types without predefined length.
     The actual size of these types depends on the specific `np.dtype`
     instantiation.
-    
+
     """)
 
 add_newdoc('numpy.core.numerictypes', 'character',
     """
     Abstract base class of all character string scalar types.
-    
+
     """)
 
 
diff --git a/numpy/core/_exceptions.py b/numpy/core/_exceptions.py
new file mode 100644
index 0000000..5e0105b
--- /dev/null
+++ b/numpy/core/_exceptions.py
@@ -0,0 +1,100 @@
+"""
+Various richly-typed exceptions, that also help us deal with string formatting
+in python where it's easier.
+
+By putting the formatting in `__str__`, we also avoid paying the cost for
+users who silence the exceptions.
+"""
+from numpy.core.overrides import set_module
+
+
+def _unpack_tuple(tup):
+    if len(tup) == 1:
+        return tup[0]
+    else:
+        return tup
+
+
+def _display_as_base(cls):
+    """
+    A decorator that makes an exception class look like its base.
+
+    We use this to hide subclasses that are implementation details - the user
+    should catch the base type, which is what the traceback will show them.
+
+    Classes decorated with this decorator are subject to removal without a
+    deprecation warning.
+    """
+    assert issubclass(cls, Exception)
+    cls.__name__ = cls.__base__.__name__
+    cls.__qualname__ = cls.__base__.__qualname__
+    return cls
+
+
+class UFuncTypeError(TypeError):
+    """ Base class for all ufunc exceptions """
+    def __init__(self, ufunc):
+        self.ufunc = ufunc
+
+
+@_display_as_base
+class _UFuncNoLoopError(UFuncTypeError):
+    """ Thrown when a ufunc loop cannot be found """
+    def __init__(self, ufunc, dtypes):
+        super().__init__(ufunc)
+        self.dtypes = tuple(dtypes)
+
+    def __str__(self):
+        return (
+            "ufunc {!r} did not contain a loop with signature matching types "
+            "{!r} -> {!r}"
+        ).format(
+            self.ufunc.__name__,
+            _unpack_tuple(self.dtypes[:self.ufunc.nin]),
+            _unpack_tuple(self.dtypes[self.ufunc.nin:])
+        )
+
+
+@_display_as_base
+class _UFuncCastingError(UFuncTypeError):
+    def __init__(self, ufunc, casting, from_, to):
+        super().__init__(ufunc)
+        self.casting = casting
+        self.from_ = from_
+        self.to = to
+
+
+@_display_as_base
+class _UFuncInputCastingError(_UFuncCastingError):
+    """ Thrown when a ufunc input cannot be casted """
+    def __init__(self, ufunc, casting, from_, to, i):
+        super().__init__(ufunc, casting, from_, to)
+        self.in_i = i
+
+    def __str__(self):
+        # only show the number if more than one input exists
+        i_str = "{} ".format(self.in_i) if self.ufunc.nin != 1 else ""
+        return (
+            "Cannot cast ufunc {!r} input {}from {!r} to {!r} with casting "
+            "rule {!r}"
+        ).format(
+            self.ufunc.__name__, i_str, self.from_, self.to, self.casting
+        )
+
+
+@_display_as_base
+class _UFuncOutputCastingError(_UFuncCastingError):
+    """ Thrown when a ufunc output cannot be casted """
+    def __init__(self, ufunc, casting, from_, to, i):
+        super().__init__(ufunc, casting, from_, to)
+        self.out_i = i
+
+    def __str__(self):
+        # only show the number if more than one output exists
+        i_str = "{} ".format(self.out_i) if self.ufunc.nout != 1 else ""
+        return (
+            "Cannot cast ufunc {!r} output {}from {!r} to {!r} with casting "
+            "rule {!r}"
+        ).format(
+            self.ufunc.__name__, i_str, self.from_, self.to, self.casting
+        )
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 59da602..1d3bb55 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -238,19 +238,68 @@
 
 class _missing_ctypes(object):
     def cast(self, num, obj):
-        return num
+        return num.value
 
-    def c_void_p(self, num):
-        return num
+    class c_void_p(object):
+        def __init__(self, ptr):
+            self.value = ptr
+
+
+class _unsafe_first_element_pointer(object):
+    """
+    Helper to allow viewing an array as a ctypes pointer to the first element
+
+    This avoids:
+      * dealing with strides
+      * `.view` rejecting object-containing arrays
+      * `memoryview` not supporting overlapping fields
+    """
+    def __init__(self, arr):
+        self.base = arr
+
+    @property
+    def __array_interface__(self):
+        i = dict(
+            shape=(),
+            typestr='|V0',
+            data=(self.base.__array_interface__['data'][0], False),
+            strides=(),
+            version=3,
+        )
+        return i
+
+
+def _get_void_ptr(arr):
+    """
+    Get a `ctypes.c_void_p` to arr.data, that keeps a reference to the array
+    """
+    import numpy as np
+    # convert to a 0d array that has a data pointer referrign to the start
+    # of arr. This holds a reference to arr.
+    simple_arr = np.asarray(_unsafe_first_element_pointer(arr))
+
+    # create a `char[0]` using the same memory.
+    c_arr = (ctypes.c_char * 0).from_buffer(simple_arr)
+
+    # finally cast to void*
+    return ctypes.cast(ctypes.pointer(c_arr), ctypes.c_void_p)
+
 
 class _ctypes(object):
     def __init__(self, array, ptr=None):
+        self._arr = array
+
         if ctypes:
             self._ctypes = ctypes
+            # get a void pointer to the buffer, which keeps the array alive
+            self._data = _get_void_ptr(array)
+            assert self._data.value == ptr
         else:
+            # fake a pointer-like object that holds onto the reference
             self._ctypes = _missing_ctypes()
-        self._arr = array
-        self._data = ptr
+            self._data = self._ctypes.c_void_p(ptr)
+            self._data._objects = array
+
         if self._arr.ndim == 0:
             self._zerod = True
         else:
@@ -263,6 +312,8 @@
         ``self.data_as(ctypes.c_void_p)``. Perhaps you want to use the data as a
         pointer to a ctypes array of floating-point data:
         ``self.data_as(ctypes.POINTER(ctypes.c_double))``.
+
+        The returned pointer will keep a reference to the array.
         """
         return self._ctypes.cast(self._data, obj)
 
@@ -284,7 +335,8 @@
             return None
         return (obj*self._arr.ndim)(*self._arr.strides)
 
-    def get_data(self):
+    @property
+    def data(self):
         """
         A pointer to the memory area of the array as a Python integer.
         This memory area may contain data that is not aligned, or not in correct
@@ -293,10 +345,16 @@
         attribute to arbitrary C-code to avoid trouble that can include Python
         crashing. User Beware! The value of this attribute is exactly the same
         as ``self._array_interface_['data'][0]``.
-        """
-        return self._data
 
-    def get_shape(self):
+        Note that unlike `data_as`, a reference will not be kept to the array:
+        code like ``ctypes.c_void_p((a + b).ctypes.data)`` will result in a
+        pointer to a deallocated array, and should be spelt
+        ``(a + b).ctypes.data_as(ctypes.c_void_p)``
+        """
+        return self._data.value
+
+    @property
+    def shape(self):
         """
         (c_intp*self.ndim): A ctypes array of length self.ndim where
         the basetype is the C-integer corresponding to ``dtype('p')`` on this
@@ -307,7 +365,8 @@
         """
         return self.shape_as(_getintp_ctype())
 
-    def get_strides(self):
+    @property
+    def strides(self):
         """
         (c_intp*self.ndim): A ctypes array of length self.ndim where
         the basetype is the same as for the shape attribute. This ctypes array
@@ -317,13 +376,20 @@
         """
         return self.strides_as(_getintp_ctype())
 
-    def get_as_parameter(self):
-        return self._ctypes.c_void_p(self._data)
+    @property
+    def _as_parameter_(self):
+        """
+        Overrides the ctypes semi-magic method
 
-    data = property(get_data)
-    shape = property(get_shape)
-    strides = property(get_strides)
-    _as_parameter_ = property(get_as_parameter, None, doc="_as parameter_")
+        Enables `c_func(some_array.ctypes)`
+        """
+        return self._data
+
+    # kept for compatibility
+    get_data = data.fget
+    get_shape = shape.fget
+    get_strides = strides.fget
+    get_as_parameter = _as_parameter_.fget
 
 
 def _newnames(datatype, order):
@@ -764,6 +830,13 @@
             .format(ufunc, method, args_string, types_string))
 
 
+def array_function_errmsg_formatter(public_api, types):
+    """ Format the error message for when __array_ufunc__ gives up. """
+    func_name = '{}.{}'.format(public_api.__module__, public_api.__name__)
+    return ("no implementation found for '{}' on types that implement "
+            '__array_function__: {}'.format(func_name, list(types)))
+
+
 def _ufunc_doc_signature_formatter(ufunc):
     """
     Builds a signature string which resembles PEP 457
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index baeab63..51362c7 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -24,20 +24,20 @@
 # avoid keyword arguments to speed up parsing, saves about 15%-20% for very
 # small reductions
 def _amax(a, axis=None, out=None, keepdims=False,
-          initial=_NoValue):
-    return umr_maximum(a, axis, None, out, keepdims, initial)
+          initial=_NoValue, where=True):
+    return umr_maximum(a, axis, None, out, keepdims, initial, where)
 
 def _amin(a, axis=None, out=None, keepdims=False,
-          initial=_NoValue):
-    return umr_minimum(a, axis, None, out, keepdims, initial)
+          initial=_NoValue, where=True):
+    return umr_minimum(a, axis, None, out, keepdims, initial, where)
 
 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
-         initial=_NoValue):
-    return umr_sum(a, axis, dtype, out, keepdims, initial)
+         initial=_NoValue, where=True):
+    return umr_sum(a, axis, dtype, out, keepdims, initial, where)
 
 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
-          initial=_NoValue):
-    return umr_prod(a, axis, dtype, out, keepdims, initial)
+          initial=_NoValue, where=True):
+    return umr_prod(a, axis, dtype, out, keepdims, initial, where)
 
 def _any(a, axis=None, dtype=None, out=None, keepdims=False):
     return umr_any(a, axis, dtype, out, keepdims)
@@ -154,15 +154,3 @@
         umr_minimum(a, axis, None, None, keepdims),
         out
     )
-
-_NDARRAY_ARRAY_FUNCTION = mu.ndarray.__array_function__
-
-def _array_function(self, func, types, args, kwargs):
-    # TODO: rewrite this in C
-    # Cannot handle items that have __array_function__ other than our own.
-    for t in types:
-        if not issubclass(t, mu.ndarray) and hasattr(t, '__array_function__'):
-            return NotImplemented
-
-    # The regular implementation can handle this, so we call it directly.
-    return func.__wrapped__(*args, **kwargs)
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 6a71de2..7d8785c 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -201,21 +201,21 @@
     Floating point precision can be set:
 
     >>> np.set_printoptions(precision=4)
-    >>> print(np.array([1.123456789]))
-    [ 1.1235]
+    >>> np.array([1.123456789])
+    [1.1235]
 
     Long arrays can be summarised:
 
     >>> np.set_printoptions(threshold=5)
-    >>> print(np.arange(10))
-    [0 1 2 ..., 7 8 9]
+    >>> np.arange(10)
+    array([0, 1, 2, ..., 7, 8, 9])
 
     Small results can be suppressed:
 
     >>> eps = np.finfo(float).eps
     >>> x = np.arange(4.)
     >>> x**2 - (x + eps)**2
-    array([ -4.9304e-32,  -4.4409e-16,   0.0000e+00,   0.0000e+00])
+    array([-4.9304e-32, -4.4409e-16,  0.0000e+00,  0.0000e+00])
     >>> np.set_printoptions(suppress=True)
     >>> x**2 - (x + eps)**2
     array([-0., -0.,  0.,  0.])
@@ -299,9 +299,10 @@
     Examples
     --------
 
+    >>> from numpy.testing import assert_equal
     >>> with np.printoptions(precision=2):
-    ...     print(np.array([2.0])) / 3
-    [0.67]
+    ...     np.array([2.0]) / 3
+    array([0.67])
 
     The `as`-clause of the `with`-statement gives the current print options:
 
@@ -644,9 +645,9 @@
     Examples
     --------
     >>> x = np.array([1e-16,1,2,3])
-    >>> print(np.array2string(x, precision=2, separator=',',
-    ...                       suppress_small=True))
-    [ 0., 1., 2., 3.]
+    >>> np.array2string(x, precision=2, separator=',',
+    ...                       suppress_small=True)
+    '[0.,1.,2.,3.]'
 
     >>> x  = np.arange(3.)
     >>> np.array2string(x, formatter={'float_kind':lambda x: "%.2f" % x})
@@ -654,7 +655,7 @@
 
     >>> x  = np.arange(3)
     >>> np.array2string(x, formatter={'int':lambda x: hex(x)})
-    '[0x0L 0x1L 0x2L]'
+    '[0x0 0x1 0x2]'
 
     """
     legacy = kwarg.pop('legacy', None)
@@ -1357,7 +1358,7 @@
     >>> np.core.arrayprint.dtype_is_implied(np.int8)
     False
     >>> np.array([1, 2, 3], np.int8)
-    array([1, 2, 3], dtype=np.int8)
+    array([1, 2, 3], dtype=int8)
     """
     dtype = np.dtype(dtype)
     if _format_options['legacy'] == '1.13' and dtype.type == bool_:
@@ -1377,6 +1378,7 @@
     The intent is roughly that the following holds
 
     >>> from numpy import *
+    >>> dt = np.int64([1, 2]).dtype
     >>> assert eval(dtype_short_repr(dt)) == dt
     """
     if dtype.names is not None:
@@ -1480,13 +1482,13 @@
     >>> np.array_repr(np.array([1,2]))
     'array([1, 2])'
     >>> np.array_repr(np.ma.array([0.]))
-    'MaskedArray([ 0.])'
+    'MaskedArray([0.])'
     >>> np.array_repr(np.array([], np.int32))
     'array([], dtype=int32)'
 
     >>> x = np.array([1e-6, 4e-7, 2, 3])
     >>> np.array_repr(x, precision=6, suppress_small=True)
-    'array([ 0.000001,  0.      ,  2.      ,  3.      ])'
+    'array([0.000001,  0.      ,  2.      ,  3.      ])'
 
     """
     return _array_repr_implementation(
@@ -1597,8 +1599,8 @@
     >>> a = np.arange(10)
     >>> a
     HA! - What are you going to do now?
-    >>> print(a)
-    [0 1 2 3 4 5 6 7 8 9]
+    >>> _ = a
+    >>> # [0 1 2 3 4 5 6 7 8 9]
 
     We can reset the function to the default:
 
@@ -1616,7 +1618,7 @@
     >>> x.__str__()
     'random'
     >>> x.__repr__()
-    'array([     0,      1,      2,      3])'
+    'array([0, 1, 2, 3])'
 
     """
     if f is None:
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 1d2cd25..4aca237 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -19,6 +19,7 @@
 
 # The files under src/ that are scanned for API functions
 API_FILES = [join('multiarray', 'alloc.c'),
+             join('multiarray', 'arrayfunction_override.c'),
              join('multiarray', 'array_assign_array.c'),
              join('multiarray', 'array_assign_scalar.c'),
              join('multiarray', 'arrayobject.c'),
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f5ee02c..108fff6 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -315,7 +315,7 @@
           TD(intfltcmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
-           #TypeDescription('m', FullTypeDescr, 'mm', 'd'),
+           TypeDescription('m', FullTypeDescr, 'mm', 'q'),
           ],
           TD(O, f='PyNumber_FloorDivide'),
           ),
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 8a690c4..6dd6982 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -32,6 +32,9 @@
             For other keyword-only arguments, see the
             :ref:`ufunc docs <ufuncs.kwargs>`.
     """).strip(),
+    'BROADCASTABLE_2': ("If ``x1.shape != x2.shape``, they must be "
+                        "broadcastable to a common shape (which becomes the "
+                        "shape of the output)."),
     'OUT_SCALAR_1': "This is a scalar if `x` is a scalar.",
     'OUT_SCALAR_2': "This is a scalar if both `x1` and `x2` are scalars.",
 }
@@ -104,9 +107,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays to be added.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        The arrays to be added. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -432,8 +433,7 @@
     x1 : array_like, real-valued
         `y`-coordinates.
     x2 : array_like, real-valued
-        `x`-coordinates. `x2` must be broadcastable to match the shape of
-        `x1` or vice versa.
+        `x`-coordinates. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -556,7 +556,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Only integer and boolean types are handled.
+        Only integer and boolean types are handled. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -609,7 +609,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Only integer and boolean types are handled.
+        Only integer and boolean types are handled. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -648,8 +648,8 @@
     array([  6,   5, 255])
     >>> np.array([2, 5, 255]) | np.array([4, 4, 4])
     array([  6,   5, 255])
-    >>> np.bitwise_or(np.array([2, 5, 255, 2147483647L], dtype=np.int32),
-    ...               np.array([4, 4, 4, 2147483647L], dtype=np.int32))
+    >>> np.bitwise_or(np.array([2, 5, 255, 2147483647], dtype=np.int32),
+    ...               np.array([4, 4, 4, 2147483647], dtype=np.int32))
     array([         6,          5,        255, 2147483647])
     >>> np.bitwise_or([True, True], [False, True])
     array([ True,  True])
@@ -667,7 +667,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Only integer and boolean types are handled.
+        Only integer and boolean types are handled. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -837,6 +837,7 @@
     array([  1.00000000e+00,   6.12303177e-17,  -1.00000000e+00])
     >>>
     >>> # Example of providing the optional output parameter
+    >>> out1 = np.array([0], dtype='d')
     >>> out2 = np.cos([0.1], out1)
     >>> out2 is out1
     True
@@ -845,7 +846,7 @@
     >>> np.cos(np.zeros((3,3)),np.zeros((2,2)))
     Traceback (most recent call last):
       File "<stdin>", line 1, in <module>
-    ValueError: invalid return array shape
+    ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
 
     """)
 
@@ -912,7 +913,7 @@
             270.,  300.,  330.])
 
     >>> out = np.zeros((rad.shape))
-    >>> r = degrees(rad, out)
+    >>> r = np.degrees(rad, out)
     >>> np.all(r == out)
     True
 
@@ -969,7 +970,7 @@
     x1 : array_like
         Input values.
     x2 : array_like
-        The value of the function when x1 is 0.
+        The value of the function when x1 is 0. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1004,7 +1005,7 @@
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1073,7 +1074,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays of the same shape.
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1320,7 +1321,7 @@
     x1 : array_like
         Numerator.
     x2 : array_like
-        Denominator.
+        Denominator. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1360,7 +1361,7 @@
     x1 : array_like
         Dividend.
     x2 : array_like
-        Divisor.
+        Divisor. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1410,9 +1411,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1448,9 +1447,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1483,7 +1480,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Leg of the triangle(s).
+        Leg of the triangle(s). $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1559,33 +1556,31 @@
     We've seen that 13 is represented by ``00001101``.
     The invert or bit-wise NOT of 13 is then:
 
-    >>> np.invert(np.array([13], dtype=uint8))
-    array([242], dtype=uint8)
+    >>> x = np.invert(np.array(13, dtype=np.uint8))
+    >>> x
+    242
     >>> np.binary_repr(x, width=8)
-    '00001101'
-    >>> np.binary_repr(242, width=8)
     '11110010'
 
     The result depends on the bit-width:
 
-    >>> np.invert(np.array([13], dtype=uint16))
-    array([65522], dtype=uint16)
+    >>> x = np.invert(np.array(13, dtype=np.uint16))
+    >>> x
+    65522
     >>> np.binary_repr(x, width=16)
-    '0000000000001101'
-    >>> np.binary_repr(65522, width=16)
     '1111111111110010'
 
     When using signed integer types the result is the two's complement of
     the result for the unsigned type:
 
-    >>> np.invert(np.array([13], dtype=int8))
+    >>> np.invert(np.array([13], dtype=np.int8))
     array([-14], dtype=int8)
     >>> np.binary_repr(-14, width=8)
     '11110010'
 
     Booleans are accepted as well:
 
-    >>> np.invert(array([True, False]))
+    >>> np.invert(np.array([True, False]))
     array([False,  True])
 
     """)
@@ -1784,6 +1779,7 @@
         Input values.
     x2 : array_like of integer type
         Number of zeros to append to `x1`. Has to be non-negative.
+        $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1819,9 +1815,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1849,9 +1843,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.  If ``x1.shape != x2.shape``, they must be
-        broadcastable to a common shape (which may be the shape of one or
-        the other).
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -1969,7 +1961,7 @@
     Examples
     --------
     >>> np.log10([1e-15, -3.])
-    array([-15.,  NaN])
+    array([-15.,  nan])
 
     """)
 
@@ -2035,7 +2027,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input values.
+        Input values. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2077,7 +2069,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input values.
+        Input values. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2168,14 +2160,14 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays. `x1` and `x2` must be of the same shape.
+        Input arrays. $BROADCASTABLE_2
     $PARAMS
 
     Returns
     -------
     y : ndarray or bool
-        Boolean result with the same shape as `x1` and `x2` of the logical
-        AND operation on corresponding elements of `x1` and `x2`.
+        Boolean result of the logical OR operation applied to the elements
+        of `x1` and `x2`; the shape is determined by broadcasting.
         $OUT_SCALAR_2
 
     See Also
@@ -2238,14 +2230,14 @@
     ----------
     x1, x2 : array_like
         Logical OR is applied to the elements of `x1` and `x2`.
-        They have to be of the same shape.
+        $BROADCASTABLE_2
     $PARAMS
 
     Returns
     -------
     y : ndarray or bool
-        Boolean result with the same shape as `x1` and `x2` of the logical
-        OR operation on elements of `x1` and `x2`.
+        Boolean result of the logical OR operation applied to the elements
+        of `x1` and `x2`; the shape is determined by broadcasting.
         $OUT_SCALAR_2
 
     See Also
@@ -2273,16 +2265,14 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Logical XOR is applied to the elements of `x1` and `x2`.  They must
-        be broadcastable to the same shape.
+        Logical XOR is applied to the elements of `x1` and `x2`. $BROADCASTABLE_2
     $PARAMS
 
     Returns
     -------
     y : bool or ndarray of bool
         Boolean result of the logical XOR operation applied to the elements
-        of `x1` and `x2`; the shape is determined by whether or not
-        broadcasting of one or both arrays was required.
+        of `x1` and `x2`; the shape is determined by broadcasting.
         $OUT_SCALAR_2
 
     See Also
@@ -2322,8 +2312,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape, or shapes that can be broadcast to a single shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2361,7 +2350,7 @@
            [ 0.5,  2. ]])
 
     >>> np.maximum([np.nan, 0, np.nan], [0, np.nan, np.nan])
-    array([ NaN,  NaN,  NaN])
+    array([nan, nan, nan])
     >>> np.maximum(np.Inf, 1)
     inf
 
@@ -2381,8 +2370,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape, or shapes that can be broadcast to a single shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2420,7 +2408,7 @@
            [ 0. ,  1. ]])
 
     >>> np.minimum([np.nan, 0, np.nan],[0, np.nan, np.nan])
-    array([ NaN,  NaN,  NaN])
+    array([nan, nan, nan])
     >>> np.minimum(-np.Inf, 1)
     -inf
 
@@ -2440,8 +2428,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2480,7 +2467,7 @@
            [ 0.5,  2. ]])
 
     >>> np.fmax([np.nan, 0, np.nan],[0, np.nan, np.nan])
-    array([  0.,   0.,  NaN])
+    array([ 0.,  0., nan])
 
     """)
 
@@ -2498,8 +2485,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays holding the elements to be compared. They must have
-        the same shape.
+        The arrays holding the elements to be compared. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2538,7 +2524,7 @@
            [ 0. ,  1. ]])
 
     >>> np.fmin([np.nan, 0, np.nan],[0, np.nan, np.nan])
-    array([  0.,   0.,  NaN])
+    array([ 0.,  0., nan])
 
     """)
 
@@ -2604,12 +2590,13 @@
     - Stacks of matrices are broadcast together as if the matrices
       were elements, respecting the signature ``(n,k),(k,m)->(n,m)``:
 
-      >>> a = a = np.full([9,5,7,3], True, dtype=bool)
-      >>> c = np.full([9, 5, 4,3], True, dtype=bool)
+      >>> a = np.ones([9, 5, 7, 4])
+      >>> c = np.ones([9, 5, 4, 3])
       >>> np.dot(a, c).shape
-      (9, 5, 7, 9, 5, 4)
-      >>> np.matmul(a, c).shape # n is 5, k is 3, m is 4
-      (9, 5, 7, 4)
+      (9, 5, 7, 9, 5, 3)
+      >>> np.matmul(a, c).shape
+      (9, 5, 7, 3)
+      >>> # n is 7, k is 4, m is 3
 
     The matmul function implements the semantics of the `@` operator introduced
     in Python 3.5 following PEP465.
@@ -2621,7 +2608,7 @@
     >>> a = np.array([[1, 0],
     ...               [0, 1]])
     >>> b = np.array([[4, 1], 
-    ...               [2, 2]]
+    ...               [2, 2]])
     >>> np.matmul(a, b)
     array([[4, 1],
            [2, 2]])
@@ -2629,7 +2616,7 @@
     For 2-D mixed with 1-D, the result is the usual.
 
     >>> a = np.array([[1, 0],
-    ...               [0, 1]]
+    ...               [0, 1]])
     >>> b = np.array([1, 2])
     >>> np.matmul(a, b)
     array([1, 2])
@@ -2711,7 +2698,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays to be multiplied.
+        Input arrays to be multiplied. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2792,7 +2779,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        Input arrays.
+        Input arrays.  $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2841,7 +2828,7 @@
     x1 : array_like
         The bases.
     x2 : array_like
-        The exponents.
+        The exponents. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -2900,7 +2887,7 @@
     x1 : array_like
         The bases.
     x2 : array_like
-        The exponents.
+        The exponents. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3072,7 +3059,7 @@
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3117,7 +3104,7 @@
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3156,7 +3143,7 @@
     x1 : array_like, int
         Input values.
     x2 : array_like, int
-        Number of bits to remove at the right of `x1`.
+        Number of bits to remove at the right of `x1`. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3283,16 +3270,14 @@
     """
     Change the sign of x1 to that of x2, element-wise.
 
-    If both arguments are arrays or sequences, they have to be of the same
-    length. If `x2` is a scalar, its sign will be copied to all elements of
-    `x1`.
+    If `x2` is a scalar, its sign will be copied to all elements of `x1`.
 
     Parameters
     ----------
     x1 : array_like
         Values to change the sign of.
     x2 : array_like
-        The sign of `x2` is copied to `x1`.
+        The sign of `x2` is copied to `x1`. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3327,6 +3312,7 @@
         Values to find the next representable value of.
     x2 : array_like
         The direction where to look for the next representable value of `x1`.
+        $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3475,6 +3461,7 @@
     >>> # Discrepancy due to vagaries of floating point arithmetic.
 
     >>> # Example of providing the optional output parameter
+    >>> out1 = np.array([0], dtype='d')
     >>> out2 = np.sinh([0.1], out1)
     >>> out2 is out1
     True
@@ -3483,7 +3470,7 @@
     >>> np.sinh(np.zeros((3,3)),np.zeros((2,2)))
     Traceback (most recent call last):
       File "<stdin>", line 1, in <module>
-    ValueError: invalid return array shape
+    ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
 
     """)
 
@@ -3528,8 +3515,8 @@
     >>> np.sqrt([4, -1, -3+4J])
     array([ 2.+0.j,  0.+1.j,  1.+2.j])
 
-    >>> np.sqrt([4, -1, numpy.inf])
-    array([  2.,  NaN,  Inf])
+    >>> np.sqrt([4, -1, np.inf])
+    array([ 2., nan, inf])
 
     """)
 
@@ -3597,7 +3584,7 @@
     Parameters
     ----------
     x1, x2 : array_like
-        The arrays to be subtracted from each other.
+        The arrays to be subtracted from each other. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3660,6 +3647,7 @@
     >>>
     >>> # Example of providing the optional output parameter illustrating
     >>> # that what is returned is a reference to said parameter
+    >>> out1 = np.array([0], dtype='d')
     >>> out2 = np.cos([0.1], out1)
     >>> out2 is out1
     True
@@ -3668,7 +3656,7 @@
     >>> np.cos(np.zeros((3,3)),np.zeros((2,2)))
     Traceback (most recent call last):
       File "<stdin>", line 1, in <module>
-    ValueError: invalid return array shape
+    ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
 
     """)
 
@@ -3711,6 +3699,7 @@
 
     >>> # Example of providing the optional output parameter illustrating
     >>> # that what is returned is a reference to said parameter
+    >>> out1 = np.array([0], dtype='d')
     >>> out2 = np.tanh([0.1], out1)
     >>> out2 is out1
     True
@@ -3719,7 +3708,7 @@
     >>> np.tanh(np.zeros((3,3)),np.zeros((2,2)))
     Traceback (most recent call last):
       File "<stdin>", line 1, in <module>
-    ValueError: invalid return array shape
+    ValueError: operands could not be broadcast together with shapes (3,3) (2,2)
 
     """)
 
@@ -3736,7 +3725,7 @@
     x1 : array_like
         Dividend array.
     x2 : array_like
-        Divisor array.
+        Divisor array. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3761,8 +3750,6 @@
     >>> np.true_divide(x, 4)
     array([ 0.  ,  0.25,  0.5 ,  0.75,  1.  ])
 
-    >>> x/4
-    array([0, 0, 0, 0, 1])
     >>> x//4
     array([0, 0, 0, 0, 1])
 
@@ -3835,7 +3822,7 @@
     x1 : array_like
         Array of multipliers.
     x2 : array_like, int
-        Array of twos exponents.
+        Array of twos exponents. $BROADCASTABLE_2
     $PARAMS
 
     Returns
@@ -3858,7 +3845,7 @@
     Examples
     --------
     >>> np.ldexp(5, np.arange(4))
-    array([  5.,  10.,  20.,  40.], dtype=float32)
+    array([ 5., 10., 20., 40.], dtype=float16)
 
     >>> x = np.arange(6)
     >>> np.ldexp(*np.frexp(x))
@@ -3873,7 +3860,7 @@
     Parameters
     ----------
     x1, x2 : array_like, int
-        Arrays of values
+        Arrays of values. $BROADCASTABLE_2
 
     Returns
     -------
@@ -3903,7 +3890,7 @@
     Parameters
     ----------
     x1, x2 : array_like, int
-        Arrays of values
+        Arrays of values. $BROADCASTABLE_2
 
     Returns
     -------
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 12ba3f0..007fc61 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -498,8 +498,7 @@
     --------
     >>> c = np.array(['aAaAaA', '  aA  ', 'abBABba'])
     >>> c
-    array(['aAaAaA', '  aA  ', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
     >>> np.char.count(c, 'A')
     array([3, 1, 1])
     >>> np.char.count(c, 'aA')
@@ -552,8 +551,7 @@
     --------
     >>> c = np.array(['aAaAaA', '  aA  ', 'abBABba'])
     >>> c
-    array(['aAaAaA', '  aA  ', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
     >>> np.char.encode(c, encoding='cp037')
     array(['\\x81\\xc1\\x81\\xc1\\x81\\xc1', '@@\\x81\\xc1@@',
         '\\x81\\x82\\xc2\\xc1\\xc2\\x82\\x81'],
@@ -637,8 +635,7 @@
     >>> s[0] = 'foo'
     >>> s[1] = 'bar'
     >>> s
-    array(['foo', 'bar'],
-        dtype='|S3')
+    array(['foo', 'bar'], dtype='<U3')
     >>> np.char.endswith(s, 'ar')
     array([False,  True])
     >>> np.char.endswith(s, 'a', start=1, end=2)
@@ -1036,11 +1033,9 @@
     Examples
     --------
     >>> c = np.array(['A1B C', '1BCA', 'BCA1']); c
-    array(['A1B C', '1BCA', 'BCA1'],
-          dtype='|S5')
+    array(['A1B C', '1BCA', 'BCA1'], dtype='<U5')
     >>> np.char.lower(c)
-    array(['a1b c', '1bca', 'bca1'],
-          dtype='|S5')
+    array(['a1b c', '1bca', 'bca1'], dtype='<U5')
 
     """
     a_arr = numpy.asarray(a)
@@ -1084,23 +1079,20 @@
     --------
     >>> c = np.array(['aAaAaA', '  aA  ', 'abBABba'])
     >>> c
-    array(['aAaAaA', '  aA  ', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
 
     The 'a' variable is unstripped from c[1] because whitespace leading.
 
     >>> np.char.lstrip(c, 'a')
-    array(['AaAaA', '  aA  ', 'bBABba'],
-        dtype='|S7')
+    array(['AaAaA', '  aA  ', 'bBABba'], dtype='<U7')
 
 
     >>> np.char.lstrip(c, 'A') # leaves c unchanged
-    array(['aAaAaA', '  aA  ', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
     >>> (np.char.lstrip(c, ' ') == np.char.lstrip(c, '')).all()
-    ... # XXX: is this a regression? this line now returns False
+    ... # XXX: is this a regression? This used to return True
     ... # np.char.lstrip(c,'') does not modify c at all.
-    True
+    False
     >>> (np.char.lstrip(c, ' ') == np.char.lstrip(c, None)).all()
     True
 
@@ -1400,10 +1392,10 @@
     >>> c = np.array(['aAaAaA', 'abBABba'], dtype='S7'); c
     array(['aAaAaA', 'abBABba'],
         dtype='|S7')
-    >>> np.char.rstrip(c, 'a')
+    >>> np.char.rstrip(c, b'a')
     array(['aAaAaA', 'abBABb'],
         dtype='|S7')
-    >>> np.char.rstrip(c, 'A')
+    >>> np.char.rstrip(c, b'A')
     array(['aAaAa', 'abBABba'],
         dtype='|S7')
 
@@ -1549,17 +1541,13 @@
     --------
     >>> c = np.array(['aAaAaA', '  aA  ', 'abBABba'])
     >>> c
-    array(['aAaAaA', '  aA  ', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAaA', '  aA  ', 'abBABba'], dtype='<U7')
     >>> np.char.strip(c)
-    array(['aAaAaA', 'aA', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAaA', 'aA', 'abBABba'], dtype='<U7')
     >>> np.char.strip(c, 'a') # 'a' unstripped from c[1] because whitespace leads
-    array(['AaAaA', '  aA  ', 'bBABb'],
-        dtype='|S7')
+    array(['AaAaA', '  aA  ', 'bBABb'], dtype='<U7')
     >>> np.char.strip(c, 'A') # 'A' unstripped from c[1] because (unprinted) ws trails
-    array(['aAaAa', '  aA  ', 'abBABba'],
-        dtype='|S7')
+    array(['aAaAa', '  aA  ', 'abBABba'], dtype='<U7')
 
     """
     a_arr = numpy.asarray(a)
@@ -1711,11 +1699,9 @@
     Examples
     --------
     >>> c = np.array(['a1b c', '1bca', 'bca1']); c
-    array(['a1b c', '1bca', 'bca1'],
-        dtype='|S5')
+    array(['a1b c', '1bca', 'bca1'], dtype='<U5')
     >>> np.char.upper(c)
-    array(['A1B C', '1BCA', 'BCA1'],
-        dtype='|S5')
+    array(['A1B C', '1BCA', 'BCA1'], dtype='<U5')
 
     """
     a_arr = numpy.asarray(a)
@@ -1950,18 +1936,16 @@
     >>> charar = np.chararray((3, 3))
     >>> charar[:] = 'a'
     >>> charar
-    chararray([['a', 'a', 'a'],
-           ['a', 'a', 'a'],
-           ['a', 'a', 'a']],
-          dtype='|S1')
+    chararray([[b'a', b'a', b'a'],
+               [b'a', b'a', b'a'],
+               [b'a', b'a', b'a']], dtype='|S1')
 
     >>> charar = np.chararray(charar.shape, itemsize=5)
     >>> charar[:] = 'abc'
     >>> charar
-    chararray([['abc', 'abc', 'abc'],
-           ['abc', 'abc', 'abc'],
-           ['abc', 'abc', 'abc']],
-          dtype='|S5')
+    chararray([[b'abc', b'abc', b'abc'],
+               [b'abc', b'abc', b'abc'],
+               [b'abc', b'abc', b'abc']], dtype='|S5')
 
     """
     def __new__(subtype, shape, itemsize=1, unicode=False, buffer=None,
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index c4fc77e..83b7d82 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -41,10 +41,10 @@
     --------
 
     >>> _flop_count('abc', False, 1, {'a': 2, 'b':3, 'c':5})
-    90
+    30
 
     >>> _flop_count('abc', True, 2, {'a': 2, 'b':3, 'c':5})
-    270
+    60
 
     """
 
@@ -171,7 +171,7 @@
     >>> isets = [set('abd'), set('ac'), set('bdc')]
     >>> oset = set()
     >>> idx_sizes = {'a': 1, 'b':2, 'c':3, 'd':4}
-    >>> _path__optimal_path(isets, oset, idx_sizes, 5000)
+    >>> _optimal_path(isets, oset, idx_sizes, 5000)
     [(0, 2), (0, 1)]
     """
 
@@ -342,7 +342,7 @@
     >>> isets = [set('abd'), set('ac'), set('bdc')]
     >>> oset = set()
     >>> idx_sizes = {'a': 1, 'b':2, 'c':3, 'd':4}
-    >>> _path__greedy_path(isets, oset, idx_sizes, 5000)
+    >>> _greedy_path(isets, oset, idx_sizes, 5000)
     [(0, 2), (0, 1)]
     """
 
@@ -539,13 +539,14 @@
     --------
     The operand list is simplified to reduce printing:
 
+    >>> np.random.seed(123)
     >>> a = np.random.rand(4, 4)
     >>> b = np.random.rand(4, 4, 4)
-    >>> __parse_einsum_input(('...a,...a->...', a, b))
-    ('za,xza', 'xz', [a, b])
+    >>> _parse_einsum_input(('...a,...a->...', a, b))
+    ('za,xza', 'xz', [a, b]) # may vary
 
-    >>> __parse_einsum_input((a, [Ellipsis, 0], b, [Ellipsis, 0]))
-    ('za,xza', 'xz', [a, b])
+    >>> _parse_einsum_input((a, [Ellipsis, 0], b, [Ellipsis, 0]))
+    ('za,xza', 'xz', [a, b]) # may vary
     """
 
     if len(operands) == 0:
@@ -763,6 +764,7 @@
     of the contraction and the remaining contraction ``(0, 1)`` is then
     completed.
 
+    >>> np.random.seed(123)
     >>> a = np.random.rand(2, 2)
     >>> b = np.random.rand(2, 5)
     >>> c = np.random.rand(5, 2)
@@ -770,7 +772,7 @@
     >>> print(path_info[0])
     ['einsum_path', (1, 2), (0, 1)]
     >>> print(path_info[1])
-      Complete contraction:  ij,jk,kl->il
+      Complete contraction:  ij,jk,kl->il # may vary
              Naive scaling:  4
          Optimized scaling:  3
           Naive FLOP count:  1.600e+02
@@ -789,12 +791,12 @@
     >>> I = np.random.rand(10, 10, 10, 10)
     >>> C = np.random.rand(10, 10)
     >>> path_info = np.einsum_path('ea,fb,abcd,gc,hd->efgh', C, C, I, C, C,
-                                   optimize='greedy')
+    ...                            optimize='greedy')
 
     >>> print(path_info[0])
     ['einsum_path', (0, 2), (0, 3), (0, 2), (0, 1)]
-    >>> print(path_info[1])
-      Complete contraction:  ea,fb,abcd,gc,hd->efgh
+    >>> print(path_info[1]) 
+      Complete contraction:  ea,fb,abcd,gc,hd->efgh # may vary
              Naive scaling:  8
          Optimized scaling:  5
           Naive FLOP count:  8.000e+08
@@ -1274,32 +1276,32 @@
     >>> a = np.arange(60.).reshape(3,4,5)
     >>> b = np.arange(24.).reshape(4,3,2)
     >>> np.einsum('ijk,jil->kl', a, b)
-    array([[ 4400.,  4730.],
-           [ 4532.,  4874.],
-           [ 4664.,  5018.],
-           [ 4796.,  5162.],
-           [ 4928.,  5306.]])
+    array([[4400., 4730.],
+           [4532., 4874.],
+           [4664., 5018.],
+           [4796., 5162.],
+           [4928., 5306.]])
     >>> np.einsum(a, [0,1,2], b, [1,0,3], [2,3])
-    array([[ 4400.,  4730.],
-           [ 4532.,  4874.],
-           [ 4664.,  5018.],
-           [ 4796.,  5162.],
-           [ 4928.,  5306.]])
+    array([[4400., 4730.],
+           [4532., 4874.],
+           [4664., 5018.],
+           [4796., 5162.],
+           [4928., 5306.]])
     >>> np.tensordot(a,b, axes=([1,0],[0,1]))
-    array([[ 4400.,  4730.],
-           [ 4532.,  4874.],
-           [ 4664.,  5018.],
-           [ 4796.,  5162.],
-           [ 4928.,  5306.]])
+    array([[4400., 4730.],
+           [4532., 4874.],
+           [4664., 5018.],
+           [4796., 5162.],
+           [4928., 5306.]])
 
     Writeable returned arrays (since version 1.10.0):
 
     >>> a = np.zeros((3, 3))
     >>> np.einsum('ii->i', a)[:] = 1
     >>> a
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
+    array([[1., 0., 0.],
+           [0., 1., 0.],
+           [0., 0., 1.]])
 
     Example of ellipsis use:
 
@@ -1322,19 +1324,27 @@
     particularly significant with larger arrays:
 
     >>> a = np.ones(64).reshape(2,4,8)
-    # Basic `einsum`: ~1520ms  (benchmarked on 3.1GHz Intel i5.)
+
+    Basic `einsum`: ~1520ms  (benchmarked on 3.1GHz Intel i5.)
+
     >>> for iteration in range(500):
-    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a)
-    # Sub-optimal `einsum` (due to repeated path calculation time): ~330ms
+    ...     _ = np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a)
+
+    Sub-optimal `einsum` (due to repeated path calculation time): ~330ms
+
     >>> for iteration in range(500):
-    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')
-    # Greedy `einsum` (faster optimal path approximation): ~160ms
+    ...     _ = np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')
+
+    Greedy `einsum` (faster optimal path approximation): ~160ms
+
     >>> for iteration in range(500):
-    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='greedy')
-    # Optimal `einsum` (best usage pattern in some use cases): ~110ms
+    ...     _ = np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='greedy')
+
+    Optimal `einsum` (best usage pattern in some use cases): ~110ms
+
     >>> path = np.einsum_path('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize='optimal')[0]
     >>> for iteration in range(500):
-    ...     np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path)
+    ...     _ = np.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path)
 
     """
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 59a820d..d943729 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -240,12 +240,16 @@
     you should assign the new shape to the shape attribute of the array::
 
      >>> a = np.zeros((10, 2))
+
      # A transpose makes the array non-contiguous
      >>> b = a.T
+
      # Taking a view makes it possible to modify the shape without modifying
      # the initial object.
      >>> c = b.view()
      >>> c.shape = (20)
+     Traceback (most recent call last):
+        ...
      AttributeError: incompatible shape for a non-contiguous array
 
     The `order` keyword gives the index ordering both for *fetching* the values
@@ -1448,7 +1452,7 @@
         same type as `a` is returned unless `a` is a `matrix`, in which case
         a 1-D array rather than a (2-D) `matrix` is returned in order to
         maintain backward compatibility.
-        
+
         If ``a.ndim > 2``, then the dimensions specified by `axis1` and `axis2`
         are removed, and a new axis inserted at the end corresponding to the
         diagonal.
@@ -1644,21 +1648,21 @@
     It is equivalent to ``reshape(-1, order=order)``.
 
     >>> x = np.array([[1, 2, 3], [4, 5, 6]])
-    >>> print(np.ravel(x))
-    [1 2 3 4 5 6]
+    >>> np.ravel(x)
+    array([1, 2, 3, 4, 5, 6])
 
-    >>> print(x.reshape(-1))
-    [1 2 3 4 5 6]
+    >>> x.reshape(-1)
+    array([1, 2, 3, 4, 5, 6])
 
-    >>> print(np.ravel(x, order='F'))
-    [1 4 2 5 3 6]
+    >>> np.ravel(x, order='F')
+    array([1, 4, 2, 5, 3, 6])
 
     When ``order`` is 'A', it will preserve the array's 'C' or 'F' ordering:
 
-    >>> print(np.ravel(x.T))
-    [1 4 2 5 3 6]
-    >>> print(np.ravel(x.T, order='A'))
-    [1 2 3 4 5 6]
+    >>> np.ravel(x.T)
+    array([1, 4, 2, 5, 3, 6])
+    >>> np.ravel(x.T, order='A')
+    array([1, 2, 3, 4, 5, 6])
 
     When ``order`` is 'K', it will preserve orderings that are neither 'C'
     nor 'F', but won't reverse axes:
@@ -1747,7 +1751,7 @@
     array([[0, 0],
            [1, 1],
            [2, 0],
-           [2, 1])
+           [2, 1]])
 
     A common use for ``nonzero`` is to find the indices of an array, where
     a condition is True.  Given an array `a`, the condition `a` > 3 is a
@@ -1959,12 +1963,13 @@
 
 
 def _sum_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
-                    initial=None):
+                    initial=None, where=None):
     return (a, out)
 
 
 @array_function_dispatch(_sum_dispatcher)
-def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
+        initial=np._NoValue, where=np._NoValue):
     """
     Sum of array elements over a given axis.
 
@@ -2008,6 +2013,11 @@
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to include in the sum. See `~numpy.ufunc.reduce` for details.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     sum_along_axis : ndarray
@@ -2048,6 +2058,8 @@
     array([0, 6])
     >>> np.sum([[0, 1], [0, 5]], axis=1)
     array([1, 5])
+    >>> np.sum([[0, 1], [np.nan, 5]], where=[False, True], axis=1)
+    array([1., 5.])
 
     If the accumulator is too small, overflow occurs:
 
@@ -2073,7 +2085,7 @@
         return res
 
     return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
-                          initial=initial)
+                          initial=initial, where=where)
 
 
 def _any_dispatcher(a, axis=None, out=None, keepdims=None):
@@ -2150,10 +2162,10 @@
     >>> np.any(np.nan)
     True
 
-    >>> o=np.array([False])
+    >>> o=np.array(False)
     >>> z=np.any([-1, 4, 5], out=o)
     >>> z, o
-    (array([ True]), array([ True]))
+    (array(True), array(True))
     >>> # Check now that z is a reference to o
     >>> z is o
     True
@@ -2236,10 +2248,10 @@
     >>> np.all([1.0, np.nan])
     True
 
-    >>> o=np.array([False])
+    >>> o=np.array(False)
     >>> z=np.all([-1, 4, 5], out=o)
-    >>> id(z), id(o), z                             # doctest: +SKIP
-    (28293632, 28293632, array([ True]))
+    >>> id(z), id(o), z
+    (28293632, 28293632, array(True)) # may vary
 
     """
     return _wrapreduction(a, np.logical_and, 'all', axis, None, out, keepdims=keepdims)
@@ -2390,12 +2402,14 @@
     return _methods._ptp(a, axis=axis, out=out, **kwargs)
 
 
-def _amax_dispatcher(a, axis=None, out=None, keepdims=None, initial=None):
+def _amax_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+                     where=None):
     return (a, out)
 
 
 @array_function_dispatch(_amax_dispatcher)
-def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+         where=np._NoValue):
     """
     Return the maximum of an array or maximum along an axis.
 
@@ -2433,6 +2447,11 @@
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to compare for the maximum. See `~numpy.ufunc.reduce`
+        for details.
+
+        .. versionadded:: 1.17.0
 
     Returns
     -------
@@ -2478,11 +2497,14 @@
     array([2, 3])
     >>> np.amax(a, axis=1)   # Maxima along the second axis
     array([1, 3])
-
+    >>> np.amax(a, where=[False, True], initial=-1, axis=0)
+    array([-1,  3])
     >>> b = np.arange(5, dtype=float)
     >>> b[2] = np.NaN
     >>> np.amax(b)
     nan
+    >>> np.amax(b, where=~np.isnan(b), initial=-1)
+    4.0
     >>> np.nanmax(b)
     4.0
 
@@ -2501,16 +2523,18 @@
     >>> max([5], default=6)
     5
     """
-    return _wrapreduction(a, np.maximum, 'max', axis, None, out, keepdims=keepdims,
-                          initial=initial)
+    return _wrapreduction(a, np.maximum, 'max', axis, None, out,
+                          keepdims=keepdims, initial=initial, where=where)
 
 
-def _amin_dispatcher(a, axis=None, out=None, keepdims=None, initial=None):
+def _amin_dispatcher(a, axis=None, out=None, keepdims=None, initial=None,
+                     where=None):
     return (a, out)
 
 
 @array_function_dispatch(_amin_dispatcher)
-def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
+         where=np._NoValue):
     """
     Return the minimum of an array or minimum along an axis.
 
@@ -2548,6 +2572,12 @@
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to compare for the minimum. See `~numpy.ufunc.reduce`
+        for details.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     amin : ndarray or scalar
@@ -2592,11 +2622,15 @@
     array([0, 1])
     >>> np.amin(a, axis=1)   # Minima along the second axis
     array([0, 2])
+    >>> np.amin(a, where=[False, True], initial=10, axis=0)
+    array([10,  1])
 
     >>> b = np.arange(5, dtype=float)
     >>> b[2] = np.NaN
     >>> np.amin(b)
     nan
+    >>> np.amin(b, where=~np.isnan(b), initial=10)
+    0.0
     >>> np.nanmin(b)
     0.0
 
@@ -2614,8 +2648,8 @@
     >>> min([6], default=5)
     6
     """
-    return _wrapreduction(a, np.minimum, 'min', axis, None, out, keepdims=keepdims,
-                          initial=initial)
+    return _wrapreduction(a, np.minimum, 'min', axis, None, out,
+                          keepdims=keepdims, initial=initial, where=where)
 
 
 def _alen_dispathcer(a):
@@ -2656,13 +2690,14 @@
         return len(array(a, ndmin=1))
 
 
-def _prod_dispatcher(
-        a, axis=None, dtype=None, out=None, keepdims=None, initial=None):
+def _prod_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None,
+                     initial=None, where=None):
     return (a, out)
 
 
 @array_function_dispatch(_prod_dispatcher)
-def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue):
+def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
+         initial=np._NoValue, where=np._NoValue):
     """
     Return the product of array elements over a given axis.
 
@@ -2707,6 +2742,11 @@
 
         .. versionadded:: 1.15.0
 
+    where : array_like of bool, optional
+        Elements to include in the product. See `~numpy.ufunc.reduce` for details.
+
+        .. versionadded:: 1.17.0
+
     Returns
     -------
     product_along_axis : ndarray, see `dtype` parameter above.
@@ -2724,8 +2764,8 @@
     raised on overflow.  That means that, on a 32-bit platform:
 
     >>> x = np.array([536870910, 536870910, 536870910, 536870910])
-    >>> np.prod(x)  # random
-    16
+    >>> np.prod(x)
+    16 # may vary
 
     The product of an empty array is the neutral element 1:
 
@@ -2749,6 +2789,11 @@
     >>> np.prod([[1.,2.],[3.,4.]], axis=1)
     array([  2.,  12.])
 
+    Or select specific elements to include:
+
+    >>> np.prod([1., np.nan, 3.], where=[True, False, True])
+    3.0
+
     If the type of `x` is unsigned, then the output type is
     the unsigned platform integer:
 
@@ -2768,8 +2813,8 @@
     >>> np.prod([1, 2], initial=5)
     10
     """
-    return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out, keepdims=keepdims,
-                          initial=initial)
+    return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out,
+                          keepdims=keepdims, initial=initial, where=where)
 
 
 def _cumprod_dispatcher(a, axis=None, dtype=None, out=None):
@@ -2993,11 +3038,11 @@
     Examples
     --------
     >>> np.around([0.37, 1.64])
-    array([ 0.,  2.])
+    array([0.,  2.])
     >>> np.around([0.37, 1.64], decimals=1)
-    array([ 0.4,  1.6])
+    array([0.4,  1.6])
     >>> np.around([.5, 1.5, 2.5, 3.5, 4.5]) # rounds to nearest even value
-    array([ 0.,  2.,  2.,  4.,  4.])
+    array([0.,  2.,  2.,  4.,  4.])
     >>> np.around([1,2,3,11], decimals=1) # ndarray of ints is returned
     array([ 1,  2,  3, 11])
     >>> np.around([1,2,3,11], decimals=-1)
@@ -3085,9 +3130,9 @@
     >>> np.mean(a)
     2.5
     >>> np.mean(a, axis=0)
-    array([ 2.,  3.])
+    array([2., 3.])
     >>> np.mean(a, axis=1)
-    array([ 1.5,  3.5])
+    array([1.5, 3.5])
 
     In single precision, `mean` can be inaccurate:
 
@@ -3100,7 +3145,7 @@
     Computing the mean in float64 is more accurate:
 
     >>> np.mean(a, dtype=np.float64)
-    0.55000000074505806
+    0.55000000074505806 # may vary
 
     """
     kwargs = {}
@@ -3206,11 +3251,11 @@
     --------
     >>> a = np.array([[1, 2], [3, 4]])
     >>> np.std(a)
-    1.1180339887498949
+    1.1180339887498949 # may vary
     >>> np.std(a, axis=0)
-    array([ 1.,  1.])
+    array([1.,  1.])
     >>> np.std(a, axis=1)
-    array([ 0.5,  0.5])
+    array([0.5,  0.5])
 
     In single precision, std() can be inaccurate:
 
@@ -3223,7 +3268,7 @@
     Computing the standard deviation in float64 is more accurate:
 
     >>> np.std(a, dtype=np.float64)
-    0.44999999925494177
+    0.44999999925494177 # may vary
 
     """
     kwargs = {}
@@ -3330,9 +3375,9 @@
     >>> np.var(a)
     1.25
     >>> np.var(a, axis=0)
-    array([ 1.,  1.])
+    array([1.,  1.])
     >>> np.var(a, axis=1)
-    array([ 0.25,  0.25])
+    array([0.25,  0.25])
 
     In single precision, var() can be inaccurate:
 
@@ -3345,7 +3390,7 @@
     Computing the variance in float64 is more accurate:
 
     >>> np.var(a, dtype=np.float64)
-    0.20249999932944759
+    0.20249999932944759 # may vary
     >>> ((1-0.55)**2 + (0.1-0.55)**2)/2
     0.2025
 
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index b68fd40..f8800b8 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -102,11 +102,11 @@
     Examples
     --------
     >>> np.linspace(2.0, 3.0, num=5)
-    array([ 2.  ,  2.25,  2.5 ,  2.75,  3.  ])
+    array([2.  , 2.25, 2.5 , 2.75, 3.  ])
     >>> np.linspace(2.0, 3.0, num=5, endpoint=False)
-    array([ 2. ,  2.2,  2.4,  2.6,  2.8])
+    array([2. ,  2.2,  2.4,  2.6,  2.8])
     >>> np.linspace(2.0, 3.0, num=5, retstep=True)
-    (array([ 2.  ,  2.25,  2.5 ,  2.75,  3.  ]), 0.25)
+    (array([2.  ,  2.25,  2.5 ,  2.75,  3.  ]), 0.25)
 
     Graphical illustration:
 
@@ -252,11 +252,11 @@
     Examples
     --------
     >>> np.logspace(2.0, 3.0, num=4)
-    array([  100.        ,   215.443469  ,   464.15888336,  1000.        ])
+    array([ 100.        ,  215.443469  ,  464.15888336, 1000.        ])
     >>> np.logspace(2.0, 3.0, num=4, endpoint=False)
-    array([ 100.        ,  177.827941  ,  316.22776602,  562.34132519])
+    array([100.        ,  177.827941  ,  316.22776602,  562.34132519])
     >>> np.logspace(2.0, 3.0, num=4, base=2.0)
-    array([ 4.        ,  5.0396842 ,  6.34960421,  8.        ])
+    array([4.        ,  5.0396842 ,  6.34960421,  8.        ])
 
     Graphical illustration:
 
@@ -361,15 +361,15 @@
     Negative, decreasing, and complex inputs are allowed:
 
     >>> np.geomspace(1000, 1, num=4)
-    array([ 1000.,   100.,    10.,     1.])
+    array([1000.,  100.,   10.,    1.])
     >>> np.geomspace(-1000, -1, num=4)
     array([-1000.,  -100.,   -10.,    -1.])
     >>> np.geomspace(1j, 1000j, num=4)  # Straight line
-    array([ 0.   +1.j,  0.  +10.j,  0. +100.j,  0.+1000.j])
+    array([0.   +1.j, 0.  +10.j, 0. +100.j, 0.+1000.j])
     >>> np.geomspace(-1+0j, 1+0j, num=5)  # Circle
-    array([-1.00000000+0.j        , -0.70710678+0.70710678j,
-            0.00000000+1.j        ,  0.70710678+0.70710678j,
-            1.00000000+0.j        ])
+    array([-1.00000000e+00+1.22464680e-16j, -7.07106781e-01+7.07106781e-01j,
+            6.12323400e-17+1.00000000e+00j,  7.07106781e-01+7.07106781e-01j,
+            1.00000000e+00+0.00000000e+00j])
 
     Graphical illustration of ``endpoint`` parameter:
 
@@ -377,8 +377,11 @@
     >>> N = 10
     >>> y = np.zeros(N)
     >>> plt.semilogx(np.geomspace(1, 1000, N, endpoint=True), y + 1, 'o')
+    [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.semilogx(np.geomspace(1, 1000, N, endpoint=False), y + 2, 'o')
+    [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.axis([0.5, 2000, 0, 3])
+    [0.5, 2000, 0, 3]
     >>> plt.grid(True, color='0.7', linestyle='-', which='both', axis='both')
     >>> plt.show()
 
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index b0b749c..cfa1dba 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -950,12 +950,12 @@
  */
 
 
-#define PyArray_ISCONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS)
-#define PyArray_ISWRITEABLE(m) PyArray_CHKFLAGS(m, NPY_ARRAY_WRITEABLE)
-#define PyArray_ISALIGNED(m) PyArray_CHKFLAGS(m, NPY_ARRAY_ALIGNED)
+#define PyArray_ISCONTIGUOUS(m) PyArray_CHKFLAGS((m), NPY_ARRAY_C_CONTIGUOUS)
+#define PyArray_ISWRITEABLE(m) PyArray_CHKFLAGS((m), NPY_ARRAY_WRITEABLE)
+#define PyArray_ISALIGNED(m) PyArray_CHKFLAGS((m), NPY_ARRAY_ALIGNED)
 
-#define PyArray_IS_C_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS)
-#define PyArray_IS_F_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS)
+#define PyArray_IS_C_CONTIGUOUS(m) PyArray_CHKFLAGS((m), NPY_ARRAY_C_CONTIGUOUS)
+#define PyArray_IS_F_CONTIGUOUS(m) PyArray_CHKFLAGS((m), NPY_ARRAY_F_CONTIGUOUS)
 
 /* the variable is used in some places, so always define it */
 #define NPY_BEGIN_THREADS_DEF PyThreadState *_save=NULL;
@@ -965,15 +965,15 @@
 #define NPY_BEGIN_THREADS do {_save = PyEval_SaveThread();} while (0);
 #define NPY_END_THREADS   do { if (_save) \
                 { PyEval_RestoreThread(_save); _save = NULL;} } while (0);
-#define NPY_BEGIN_THREADS_THRESHOLDED(loop_size) do { if (loop_size > 500) \
+#define NPY_BEGIN_THREADS_THRESHOLDED(loop_size) do { if ((loop_size) > 500) \
                 { _save = PyEval_SaveThread();} } while (0);
 
 #define NPY_BEGIN_THREADS_DESCR(dtype) \
-        do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \
+        do {if (!(PyDataType_FLAGCHK((dtype), NPY_NEEDS_PYAPI))) \
                 NPY_BEGIN_THREADS;} while (0);
 
 #define NPY_END_THREADS_DESCR(dtype) \
-        do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \
+        do {if (!(PyDataType_FLAGCHK((dtype), NPY_NEEDS_PYAPI))) \
                 NPY_END_THREADS; } while (0);
 
 #define NPY_ALLOW_C_API_DEF  PyGILState_STATE __save__;
@@ -1110,7 +1110,7 @@
 
 
 /* Iterator API */
-#define PyArrayIter_Check(op) PyObject_TypeCheck(op, &PyArrayIter_Type)
+#define PyArrayIter_Check(op) PyObject_TypeCheck((op), &PyArrayIter_Type)
 
 #define _PyAIT(it) ((PyArrayIterObject *)(it))
 #define PyArray_ITER_RESET(it) do { \
@@ -1188,7 +1188,7 @@
 
 #define PyArray_ITER_GOTO1D(it, ind) do { \
         int __npy_i; \
-        npy_intp __npy_ind = (npy_intp) (ind); \
+        npy_intp __npy_ind = (npy_intp)(ind); \
         if (__npy_ind < 0) __npy_ind += _PyAIT(it)->size; \
         _PyAIT(it)->index = __npy_ind; \
         if (_PyAIT(it)->nd_m1 == 0) { \
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index 90d837a..15dcdf0 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -120,7 +120,11 @@
          */
         int nin, nout, nargs;
 
-        /* Identity for reduction, either PyUFunc_One or PyUFunc_Zero */
+        /*
+         * Identity for reduction, any of PyUFunc_One, PyUFunc_Zero
+         * PyUFunc_MinusOne, PyUFunc_None, PyUFunc_ReorderableNone,
+         * PyUFunc_IdentityValue.
+         */
         int identity;
 
         /* Array of one-dimensional core loops */
@@ -301,7 +305,7 @@
  */
 #define PyUFunc_ReorderableNone -2
 /*
- * UFunc unit is in identity_value, and the order of operations can be reordered
+ * UFunc unit is an identity_value, and the order of operations can be reordered
  * This case allows reduction with multiple axes at once.
  */
 #define PyUFunc_IdentityValue -3
diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py
index 82bc470..9ba4817 100644
--- a/numpy/core/memmap.py
+++ b/numpy/core/memmap.py
@@ -135,9 +135,9 @@
 
     >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
     >>> fp
-    memmap([[ 0.,  0.,  0.,  0.],
-            [ 0.,  0.,  0.,  0.],
-            [ 0.,  0.,  0.,  0.]], dtype=float32)
+    memmap([[0., 0., 0., 0.],
+            [0., 0., 0., 0.],
+            [0., 0., 0., 0.]], dtype=float32)
 
     Write data to memmap array:
 
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index df0ed2d..4c27158 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -117,11 +117,11 @@
     --------
     >>> a = ([1,2,3], [4,5,6])                         # a is array-like
     >>> np.empty_like(a)
-    array([[-1073741821, -1073741821,           3],    #random
+    array([[-1073741821, -1073741821,           3],    # random
            [          0,           0, -1073741821]])
     >>> a = np.array([[1., 2., 3.],[4.,5.,6.]])
     >>> np.empty_like(a)
-    array([[ -2.00000715e+000,   1.48219694e-323,  -2.00000572e+000],#random
+    array([[ -2.00000715e+000,   1.48219694e-323,  -2.00000572e+000], # random
            [  4.38791518e-305,  -2.00000715e+000,   4.17269252e-309]])
 
     """
@@ -211,9 +211,11 @@
            fill_value=999999)
 
     """
-    for array in arrays:
-        yield array
-    yield out
+    if out is not None:
+        # optimize for the typical case where only arrays is provided
+        arrays = list(arrays)
+        arrays.append(out)
+    return arrays
 
 
 @array_function_from_c_func_and_dispatcher(_multiarray_umath.inner)
@@ -286,8 +288,8 @@
     An example where `b` is a scalar:
 
     >>> np.inner(np.eye(2), 7)
-    array([[ 7.,  0.],
-           [ 0.,  7.]])
+    array([[7., 0.],
+           [0., 7.]])
 
     """
     return (a, b)
@@ -421,8 +423,8 @@
     >>> a = [1,5,1,4,3,4,4] # First column
     >>> b = [9,4,0,4,0,2,1] # Second column
     >>> ind = np.lexsort((b,a)) # Sort by a, then by b
-    >>> print(ind)
-    [2 0 4 6 5 3 1]
+    >>> ind
+    array([2, 0, 4, 6, 5, 3, 1])
 
     >>> [(a[i],b[i]) for i in ind]
     [(1, 0), (1, 9), (3, 0), (4, 1), (4, 2), (4, 4), (5, 4)]
@@ -1139,7 +1141,10 @@
     ...                [0,0,1]]])
     >>> b = np.packbits(a, axis=-1)
     >>> b
-    array([[[160],[64]],[[192],[32]]], dtype=uint8)
+    array([[[160],
+            [ 64]],
+           [[192],
+            [ 32]]], dtype=uint8)
 
     Note that in binary 160 = 1010 0000, 64 = 0100 0000, 192 = 1100 0000,
     and 32 = 0010 0000.
@@ -1329,7 +1334,7 @@
     >>> # The weekdays are Friday, Saturday, and Monday
     ... np.is_busday(['2011-07-01', '2011-07-02', '2011-07-18'],
     ...                 holidays=['2011-07-01', '2011-07-04', '2011-07-17'])
-    array([False, False,  True], dtype='bool')
+    array([False, False,  True])
     """
     return (dates, weekmask, holidays, out)
 
@@ -1403,27 +1408,27 @@
     --------
     >>> # First business day in October 2011 (not accounting for holidays)
     ... np.busday_offset('2011-10', 0, roll='forward')
-    numpy.datetime64('2011-10-03','D')
+    numpy.datetime64('2011-10-03')
     >>> # Last business day in February 2012 (not accounting for holidays)
     ... np.busday_offset('2012-03', -1, roll='forward')
-    numpy.datetime64('2012-02-29','D')
+    numpy.datetime64('2012-02-29')
     >>> # Third Wednesday in January 2011
     ... np.busday_offset('2011-01', 2, roll='forward', weekmask='Wed')
-    numpy.datetime64('2011-01-19','D')
+    numpy.datetime64('2011-01-19')
     >>> # 2012 Mother's Day in Canada and the U.S.
     ... np.busday_offset('2012-05', 1, roll='forward', weekmask='Sun')
-    numpy.datetime64('2012-05-13','D')
+    numpy.datetime64('2012-05-13')
 
     >>> # First business day on or after a date
     ... np.busday_offset('2011-03-20', 0, roll='forward')
-    numpy.datetime64('2011-03-21','D')
+    numpy.datetime64('2011-03-21')
     >>> np.busday_offset('2011-03-22', 0, roll='forward')
-    numpy.datetime64('2011-03-22','D')
+    numpy.datetime64('2011-03-22')
     >>> # First business day after a date
     ... np.busday_offset('2011-03-20', 1, roll='backward')
-    numpy.datetime64('2011-03-21','D')
+    numpy.datetime64('2011-03-21')
     >>> np.busday_offset('2011-03-22', 1, roll='backward')
-    numpy.datetime64('2011-03-23','D')
+    numpy.datetime64('2011-03-23')
     """
     return (dates, offsets, weekmask, holidays, out)
 
@@ -1487,7 +1492,7 @@
     ... np.busday_count('2011-01', '2011-02')
     21
     >>> # Number of weekdays in 2011
-    ...  np.busday_count('2011', '2012')
+    >>> np.busday_count('2011', '2012')
     260
     >>> # Number of Saturdays in 2011
     ... np.busday_count('2011', '2012', weekmask='Sat')
@@ -1525,6 +1530,7 @@
 
     Examples
     --------
+    >>> import pytz
     >>> d = np.arange('2002-10-27T04:30', 4*60, 60, dtype='M8[m]')
     >>> d
     array(['2002-10-27T04:30', '2002-10-27T05:30', '2002-10-27T06:30',
@@ -1555,6 +1561,8 @@
     'casting' can be used to specify whether precision can be changed
 
     >>> np.datetime_as_string(d, unit='h', casting='safe')
+    Traceback (most recent call last):
+        ...
     TypeError: Cannot create a datetime string as units 'h' from a NumPy
     datetime with units 'm' according to the rule 'safe'
     """
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 8768cbe..1b8f36c 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -12,6 +12,7 @@
 import sys
 import warnings
 import numbers
+import contextlib
 
 import numpy as np
 from . import multiarray
@@ -160,9 +161,9 @@
 
     >>> y = np.arange(3, dtype=float)
     >>> y
-    array([ 0.,  1.,  2.])
+    array([0., 1., 2.])
     >>> np.zeros_like(y)
-    array([ 0.,  0.,  0.])
+    array([0.,  0.,  0.])
 
     """
     res = empty_like(a, dtype=dtype, order=order, subok=subok)
@@ -205,19 +206,19 @@
     Examples
     --------
     >>> np.ones(5)
-    array([ 1.,  1.,  1.,  1.,  1.])
+    array([1., 1., 1., 1., 1.])
 
     >>> np.ones((5,), dtype=int)
     array([1, 1, 1, 1, 1])
 
     >>> np.ones((2, 1))
-    array([[ 1.],
-           [ 1.]])
+    array([[1.],
+           [1.]])
 
     >>> s = (2,2)
     >>> np.ones(s)
-    array([[ 1.,  1.],
-           [ 1.,  1.]])
+    array([[1.,  1.],
+           [1.,  1.]])
 
     """
     a = empty(shape, dtype, order)
@@ -280,9 +281,9 @@
 
     >>> y = np.arange(3, dtype=float)
     >>> y
-    array([ 0.,  1.,  2.])
+    array([0., 1., 2.])
     >>> np.ones_like(y)
-    array([ 1.,  1.,  1.])
+    array([1.,  1.,  1.])
 
     """
     res = empty_like(a, dtype=dtype, order=order, subok=subok)
@@ -323,8 +324,8 @@
     Examples
     --------
     >>> np.full((2, 2), np.inf)
-    array([[ inf,  inf],
-           [ inf,  inf]])
+    array([[inf, inf],
+           [inf, inf]])
     >>> np.full((2, 2), 10)
     array([[10, 10],
            [10, 10]])
@@ -385,13 +386,13 @@
     >>> np.full_like(x, 0.1)
     array([0, 0, 0, 0, 0, 0])
     >>> np.full_like(x, 0.1, dtype=np.double)
-    array([ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1])
+    array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
     >>> np.full_like(x, np.nan, dtype=np.double)
-    array([ nan,  nan,  nan,  nan,  nan,  nan])
+    array([nan, nan, nan, nan, nan, nan])
 
     >>> y = np.arange(6, dtype=np.double)
     >>> np.full_like(y, 0.1)
-    array([ 0.1,  0.1,  0.1,  0.1,  0.1,  0.1])
+    array([0.1,  0.1,  0.1,  0.1,  0.1,  0.1])
 
     """
     res = empty_like(a, dtype=dtype, order=order, subok=subok)
@@ -620,8 +621,8 @@
     --------
     >>> x = np.arange(6).reshape(2,3)
     >>> np.ascontiguousarray(x, dtype=np.float32)
-    array([[ 0.,  1.,  2.],
-           [ 3.,  4.,  5.]], dtype=float32)
+    array([[0., 1., 2.],
+           [3., 4., 5.]], dtype=float32)
     >>> x.flags['C_CONTIGUOUS']
     True
 
@@ -802,7 +803,7 @@
     >>> np.isfortran(a)
     False
 
-    >>> b = np.array([[1, 2, 3], [4, 5, 6]], order='FORTRAN')
+    >>> b = np.array([[1, 2, 3], [4, 5, 6]], order='F')
     >>> b
     array([[1, 2, 3],
            [4, 5, 6]])
@@ -987,11 +988,11 @@
     Examples
     --------
     >>> np.correlate([1, 2, 3], [0, 1, 0.5])
-    array([ 3.5])
+    array([3.5])
     >>> np.correlate([1, 2, 3], [0, 1, 0.5], "same")
-    array([ 2. ,  3.5,  3. ])
+    array([2. ,  3.5,  3. ])
     >>> np.correlate([1, 2, 3], [0, 1, 0.5], "full")
-    array([ 0.5,  2. ,  3.5,  3. ,  0. ])
+    array([0.5,  2. ,  3.5,  3. ,  0. ])
 
     Using complex sequences:
 
@@ -1087,20 +1088,20 @@
     before "sliding" the two across one another:
 
     >>> np.convolve([1, 2, 3], [0, 1, 0.5])
-    array([ 0. ,  1. ,  2.5,  4. ,  1.5])
+    array([0. , 1. , 2.5, 4. , 1.5])
 
     Only return the middle values of the convolution.
     Contains boundary effects, where zeros are taken
     into account:
 
     >>> np.convolve([1,2,3],[0,1,0.5], 'same')
-    array([ 1. ,  2.5,  4. ])
+    array([1. ,  2.5,  4. ])
 
     The two arrays are of the same length, so there
     is only one position where they completely overlap:
 
     >>> np.convolve([1,2,3],[0,1,0.5], 'valid')
-    array([ 2.5])
+    array([2.5])
 
     """
     a, v = array(a, copy=False, ndmin=1), array(v, copy=False, ndmin=1)
@@ -1176,11 +1177,11 @@
            [-2., -1.,  0.,  1.,  2.]])
     >>> im = np.outer(1j*np.linspace(2, -2, 5), np.ones((5,)))
     >>> im
-    array([[ 0.+2.j,  0.+2.j,  0.+2.j,  0.+2.j,  0.+2.j],
-           [ 0.+1.j,  0.+1.j,  0.+1.j,  0.+1.j,  0.+1.j],
-           [ 0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j,  0.+0.j],
-           [ 0.-1.j,  0.-1.j,  0.-1.j,  0.-1.j,  0.-1.j],
-           [ 0.-2.j,  0.-2.j,  0.-2.j,  0.-2.j,  0.-2.j]])
+    array([[0.+2.j, 0.+2.j, 0.+2.j, 0.+2.j, 0.+2.j],
+           [0.+1.j, 0.+1.j, 0.+1.j, 0.+1.j, 0.+1.j],
+           [0.+0.j, 0.+0.j, 0.+0.j, 0.+0.j, 0.+0.j],
+           [0.-1.j, 0.-1.j, 0.-1.j, 0.-1.j, 0.-1.j],
+           [0.-2.j, 0.-2.j, 0.-2.j, 0.-2.j, 0.-2.j]])
     >>> grid = rl + im
     >>> grid
     array([[-2.+2.j, -1.+2.j,  0.+2.j,  1.+2.j,  2.+2.j],
@@ -1193,9 +1194,9 @@
 
     >>> x = np.array(['a', 'b', 'c'], dtype=object)
     >>> np.outer(x, [1, 2, 3])
-    array([[a, aa, aaa],
-           [b, bb, bbb],
-           [c, cc, ccc]], dtype=object)
+    array([['a', 'aa', 'aaa'],
+           ['b', 'bb', 'bbb'],
+           ['c', 'cc', 'ccc']], dtype=object)
 
     """
     a = asarray(a)
@@ -1264,11 +1265,11 @@
     >>> c.shape
     (5, 2)
     >>> c
-    array([[ 4400.,  4730.],
-           [ 4532.,  4874.],
-           [ 4664.,  5018.],
-           [ 4796.,  5162.],
-           [ 4928.,  5306.]])
+    array([[4400., 4730.],
+           [4532., 4874.],
+           [4664., 5018.],
+           [4796., 5162.],
+           [4928., 5306.]])
     >>> # A slower but equivalent way of computing the same...
     >>> d = np.zeros((5,2))
     >>> for i in range(5):
@@ -1294,40 +1295,40 @@
             [3, 4]],
            [[5, 6],
             [7, 8]]])
-    array([[a, b],
-           [c, d]], dtype=object)
+    array([['a', 'b'],
+           ['c', 'd']], dtype=object)
 
     >>> np.tensordot(a, A) # third argument default is 2 for double-contraction
-    array([abbcccdddd, aaaaabbbbbbcccccccdddddddd], dtype=object)
+    array(['abbcccdddd', 'aaaaabbbbbbcccccccdddddddd'], dtype=object)
 
     >>> np.tensordot(a, A, 1)
-    array([[[acc, bdd],
-            [aaacccc, bbbdddd]],
-           [[aaaaacccccc, bbbbbdddddd],
-            [aaaaaaacccccccc, bbbbbbbdddddddd]]], dtype=object)
+    array([[['acc', 'bdd'],
+            ['aaacccc', 'bbbdddd']],
+           [['aaaaacccccc', 'bbbbbdddddd'],
+            ['aaaaaaacccccccc', 'bbbbbbbdddddddd']]], dtype=object)
 
     >>> np.tensordot(a, A, 0) # tensor product (result too long to incl.)
-    array([[[[[a, b],
-              [c, d]],
+    array([[[[['a', 'b'],
+              ['c', 'd']],
               ...
 
     >>> np.tensordot(a, A, (0, 1))
-    array([[[abbbbb, cddddd],
-            [aabbbbbb, ccdddddd]],
-           [[aaabbbbbbb, cccddddddd],
-            [aaaabbbbbbbb, ccccdddddddd]]], dtype=object)
+    array([[['abbbbb', 'cddddd'],
+            ['aabbbbbb', 'ccdddddd']],
+           [['aaabbbbbbb', 'cccddddddd'],
+            ['aaaabbbbbbbb', 'ccccdddddddd']]], dtype=object)
 
     >>> np.tensordot(a, A, (2, 1))
-    array([[[abb, cdd],
-            [aaabbbb, cccdddd]],
-           [[aaaaabbbbbb, cccccdddddd],
-            [aaaaaaabbbbbbbb, cccccccdddddddd]]], dtype=object)
+    array([[['abb', 'cdd'],
+            ['aaabbbb', 'cccdddd']],
+           [['aaaaabbbbbb', 'cccccdddddd'],
+            ['aaaaaaabbbbbbbb', 'cccccccdddddddd']]], dtype=object)
 
     >>> np.tensordot(a, A, ((0, 1), (0, 1)))
-    array([abbbcccccddddddd, aabbbbccccccdddddddd], dtype=object)
+    array(['abbbcccccddddddd', 'aabbbbccccccdddddddd'], dtype=object)
 
     >>> np.tensordot(a, A, ((2, 1), (1, 0)))
-    array([acccbbdddd, aaaaacccccccbbbbbbdddddddd], dtype=object)
+    array(['acccbbdddd', 'aaaaacccccccbbbbbbdddddddd'], dtype=object)
 
     """
     try:
@@ -1780,7 +1781,7 @@
     >>> x = [1,2]
     >>> y = [4,5]
     >>> np.cross(x, y)
-    -3
+    array(-3)
 
     Multiple vector cross-products. Note that the direction of the cross
     product vector is defined by the `right-hand rule`.
@@ -2097,10 +2098,10 @@
     NumPy supports PEP 3141 numbers:
 
     >>> from fractions import Fraction
-    >>> isscalar(Fraction(5, 17))
+    >>> np.isscalar(Fraction(5, 17))
     True
     >>> from numbers import Number
-    >>> isscalar(Number())
+    >>> np.isscalar(Number())
     True
 
     """
@@ -2339,9 +2340,9 @@
     Examples
     --------
     >>> np.identity(3)
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
+    array([[1.,  0.,  0.],
+           [0.,  1.,  0.],
+           [0.,  0.,  1.]])
 
     """
     from numpy import eye
@@ -2487,23 +2488,23 @@
     Examples
     --------
     >>> np.isclose([1e10,1e-7], [1.00001e10,1e-8])
-    array([True, False])
+    array([ True, False])
     >>> np.isclose([1e10,1e-8], [1.00001e10,1e-9])
-    array([True, True])
+    array([ True, True])
     >>> np.isclose([1e10,1e-8], [1.0001e10,1e-9])
-    array([False, True])
+    array([False,  True])
     >>> np.isclose([1.0, np.nan], [1.0, np.nan])
-    array([True, False])
+    array([ True, False])
     >>> np.isclose([1.0, np.nan], [1.0, np.nan], equal_nan=True)
-    array([True, True])
+    array([ True, True])
     >>> np.isclose([1e-8, 1e-7], [0.0, 0.0])
-    array([ True, False], dtype=bool)
+    array([ True, False])
     >>> np.isclose([1e-100, 1e-7], [0.0, 0.0], atol=0.0)
-    array([False, False], dtype=bool)
+    array([False, False])
     >>> np.isclose([1e-10, 1e-10], [1e-20, 0.0])
-    array([ True,  True], dtype=bool)
+    array([ True,  True])
     >>> np.isclose([1e-10, 1e-10], [1e-20, 0.999999e-10], atol=0.0)
-    array([False,  True], dtype=bool)
+    array([False,  True])
     """
     def within_tol(x, y, atol, rtol):
         with errstate(invalid='ignore'):
@@ -2710,11 +2711,9 @@
     --------
     >>> old_settings = np.seterr(all='ignore')  #seterr to known value
     >>> np.seterr(over='raise')
-    {'over': 'ignore', 'divide': 'ignore', 'invalid': 'ignore',
-     'under': 'ignore'}
+    {'divide': 'ignore', 'over': 'ignore', 'under': 'ignore', 'invalid': 'ignore'}
     >>> np.seterr(**old_settings)  # reset to default
-    {'over': 'raise', 'divide': 'ignore', 'invalid': 'ignore',
-     'under': 'ignore'}
+    {'divide': 'ignore', 'over': 'raise', 'under': 'ignore', 'invalid': 'ignore'}
 
     >>> np.int16(32000) * np.int16(3)
     30464
@@ -2724,11 +2723,11 @@
       File "<stdin>", line 1, in <module>
     FloatingPointError: overflow encountered in short_scalars
 
+    >>> from collections import OrderedDict
     >>> old_settings = np.seterr(all='print')
-    >>> np.geterr()
-    {'over': 'print', 'divide': 'print', 'invalid': 'print', 'under': 'print'}
+    >>> OrderedDict(np.geterr())
+    OrderedDict([('divide', 'print'), ('over', 'print'), ('under', 'print'), ('invalid', 'print')])
     >>> np.int16(32000) * np.int16(3)
-    Warning: overflow encountered in short_scalars
     30464
 
     """
@@ -2779,18 +2778,17 @@
 
     Examples
     --------
-    >>> np.geterr()
-    {'over': 'warn', 'divide': 'warn', 'invalid': 'warn',
-    'under': 'ignore'}
+    >>> from collections import OrderedDict
+    >>> sorted(np.geterr().items())
+    [('divide', 'warn'), ('invalid', 'warn'), ('over', 'warn'), ('under', 'ignore')]
     >>> np.arange(3.) / np.arange(3.)
-    array([ NaN,   1.,   1.])
+    array([nan,  1.,  1.])
 
     >>> oldsettings = np.seterr(all='warn', over='raise')
-    >>> np.geterr()
-    {'over': 'raise', 'divide': 'warn', 'invalid': 'warn', 'under': 'warn'}
+    >>> OrderedDict(sorted(np.geterr().items()))
+    OrderedDict([('divide', 'warn'), ('invalid', 'warn'), ('over', 'raise'), ('under', 'warn')])
     >>> np.arange(3.) / np.arange(3.)
-    __main__:1: RuntimeWarning: invalid value encountered in divide
-    array([ NaN,   1.,   1.])
+    array([nan,  1.,  1.])
 
     """
     maskvalue = umath.geterrobj()[1]
@@ -2897,15 +2895,16 @@
 
     >>> saved_handler = np.seterrcall(err_handler)
     >>> save_err = np.seterr(all='call')
+    >>> from collections import OrderedDict
 
     >>> np.array([1, 2, 3]) / 0.0
     Floating point error (divide by zero), with flag 1
-    array([ Inf,  Inf,  Inf])
+    array([inf, inf, inf])
 
     >>> np.seterrcall(saved_handler)
     <function err_handler at 0x...>
-    >>> np.seterr(**save_err)
-    {'over': 'call', 'divide': 'call', 'invalid': 'call', 'under': 'call'}
+    >>> OrderedDict(sorted(np.seterr(**save_err).items()))
+    OrderedDict([('divide', 'call'), ('invalid', 'call'), ('over', 'call'), ('under', 'call')])
 
     Log error message:
 
@@ -2919,14 +2918,13 @@
     >>> save_err = np.seterr(all='log')
 
     >>> np.array([1, 2, 3]) / 0.0
-    LOG: Warning: divide by zero encountered in divide
-    <BLANKLINE>
-    array([ Inf,  Inf,  Inf])
+    LOG: Warning: divide by zero encountered in true_divide
+    array([inf, inf, inf])
 
     >>> np.seterrcall(saved_handler)
-    <__main__.Log object at 0x...>
-    >>> np.seterr(**save_err)
-    {'over': 'log', 'divide': 'log', 'invalid': 'log', 'under': 'log'}
+    <numpy.core.numeric.Log object at 0x...>
+    >>> OrderedDict(sorted(np.seterr(**save_err).items()))
+    OrderedDict([('divide', 'log'), ('invalid', 'log'), ('over', 'log'), ('under', 'log')])
 
     """
     if func is not None and not isinstance(func, collections_abc.Callable):
@@ -2975,7 +2973,7 @@
     >>> oldhandler = np.seterrcall(err_handler)
     >>> np.array([1, 2, 3]) / 0.0
     Floating point error (divide by zero), with flag 1
-    array([ Inf,  Inf,  Inf])
+    array([inf, inf, inf])
 
     >>> cur_handler = np.geterrcall()
     >>> cur_handler is err_handler
@@ -2993,7 +2991,7 @@
 
 
 @set_module('numpy')
-class errstate(object):
+class errstate(contextlib.ContextDecorator):
     """
     errstate(**kwargs)
 
@@ -3003,7 +3001,12 @@
     that context to execute with a known error handling behavior. Upon entering
     the context the error handling is set with `seterr` and `seterrcall`, and
     upon exiting it is reset to what it was before.
-
+    
+    ..  versionchanged:: 1.17.0
+        `errstate` is also usable as a function decorator, saving
+        a level of indentation if an entire function is wrapped.
+        See :py:class:`contextlib.ContextDecorator` for more information. 
+    
     Parameters
     ----------
     kwargs : {divide, over, under, invalid}
@@ -3023,15 +3026,14 @@
 
     Examples
     --------
+    >>> from collections import OrderedDict
     >>> olderr = np.seterr(all='ignore')  # Set error handling to known state.
 
     >>> np.arange(3) / 0.
-    array([ NaN,  Inf,  Inf])
+    array([nan, inf, inf])
     >>> with np.errstate(divide='warn'):
     ...     np.arange(3) / 0.
-    ...
-    __main__:2: RuntimeWarning: divide by zero encountered in divide
-    array([ NaN,  Inf,  Inf])
+    array([nan, inf, inf])
 
     >>> np.sqrt(-1)
     nan
@@ -3043,9 +3045,8 @@
 
     Outside the context the error handling behavior has not changed:
 
-    >>> np.geterr()
-    {'over': 'warn', 'divide': 'warn', 'invalid': 'warn',
-    'under': 'ignore'}
+    >>> OrderedDict(sorted(np.geterr().items()))
+    OrderedDict([('divide', 'ignore'), ('invalid', 'ignore'), ('over', 'ignore'), ('under', 'ignore')])
 
     """
     # Note that we don't want to run the above doctests because they will fail
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index f00f922..5bc37b7 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -163,19 +163,19 @@
     Examples
     --------
     >>> np.maximum_sctype(int)
-    <type 'numpy.int64'>
+    <class 'numpy.int64'>
     >>> np.maximum_sctype(np.uint8)
-    <type 'numpy.uint64'>
+    <class 'numpy.uint64'>
     >>> np.maximum_sctype(complex)
-    <type 'numpy.complex192'>
+    <class 'numpy.complex256'> # may vary
 
     >>> np.maximum_sctype(str)
-    <type 'numpy.string_'>
+    <class 'numpy.str_'>
 
     >>> np.maximum_sctype('i2')
-    <type 'numpy.int64'>
+    <class 'numpy.int64'>
     >>> np.maximum_sctype('f4')
-    <type 'numpy.float96'>
+    <class 'numpy.float128'> # may vary
 
     """
     g = obj2sctype(t)
@@ -260,19 +260,18 @@
     Examples
     --------
     >>> np.obj2sctype(np.int32)
-    <type 'numpy.int32'>
+    <class 'numpy.int32'>
     >>> np.obj2sctype(np.array([1., 2.]))
-    <type 'numpy.float64'>
+    <class 'numpy.float64'>
     >>> np.obj2sctype(np.array([1.j]))
-    <type 'numpy.complex128'>
+    <class 'numpy.complex128'>
 
     >>> np.obj2sctype(dict)
-    <type 'numpy.object_'>
+    <class 'numpy.object_'>
     >>> np.obj2sctype('string')
-    <type 'numpy.string_'>
 
     >>> np.obj2sctype(1, default=list)
-    <type 'list'>
+    <class 'list'>
 
     """
     # prevent abtract classes being upcast
@@ -319,7 +318,7 @@
     Examples
     --------
     >>> np.issubclass_(np.int32, int)
-    True
+    False # True on Python 2.7
     >>> np.issubclass_(np.int32, float)
     False
 
@@ -352,7 +351,7 @@
     Examples
     --------
     >>> np.issubsctype('S8', str)
-    True
+    False
     >>> np.issubsctype(np.array([1]), int)
     True
     >>> np.issubsctype(np.array([1]), float)
@@ -485,9 +484,9 @@
 
     Examples
     --------
-    >>> for sctype in [np.int32, float, complex, np.string_, np.ndarray]:
+    >>> for sctype in [np.int32, np.double, np.complex, np.string_, np.ndarray]:
     ...     print(np.sctype2char(sctype))
-    l
+    l # may vary
     d
     D
     S
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 0979858..c55174e 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -1,73 +1,23 @@
-"""Preliminary implementation of NEP-18
-
-TODO: rewrite this in C for performance.
-"""
+"""Implementation of __array_function__ overrides from NEP-18."""
 import collections
 import functools
 import os
 
-from numpy.core._multiarray_umath import add_docstring, ndarray
+from numpy.core._multiarray_umath import (
+    add_docstring, implement_array_function, _get_implementing_args)
 from numpy.compat._inspect import getargspec
 
 
-_NDARRAY_ARRAY_FUNCTION = ndarray.__array_function__
-_NDARRAY_ONLY = [ndarray]
-
 ENABLE_ARRAY_FUNCTION = bool(
     int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 0)))
 
 
-def get_overloaded_types_and_args(relevant_args):
-    """Returns a list of arguments on which to call __array_function__.
-
-    Parameters
-    ----------
-    relevant_args : iterable of array-like
-        Iterable of array-like arguments to check for __array_function__
-        methods.
-
-    Returns
-    -------
-    overloaded_types : collection of types
-        Types of arguments from relevant_args with __array_function__ methods.
-    overloaded_args : list
-        Arguments from relevant_args on which to call __array_function__
-        methods, in the order in which they should be called.
+add_docstring(
+    implement_array_function,
     """
-    # Runtime is O(num_arguments * num_unique_types)
-    overloaded_types = []
-    overloaded_args = []
-    for arg in relevant_args:
-        arg_type = type(arg)
-        # We only collect arguments if they have a unique type, which ensures
-        # reasonable performance even with a long list of possibly overloaded
-        # arguments.
-        if (arg_type not in overloaded_types and
-                hasattr(arg_type, '__array_function__')):
+    Implement a function with checks for __array_function__ overrides.
 
-            # Create lists explicitly for the first type (usually the only one
-            # done) to avoid setting up the iterator for overloaded_args.
-            if overloaded_types:
-                overloaded_types.append(arg_type)
-                # By default, insert argument at the end, but if it is
-                # subclass of another argument, insert it before that argument.
-                # This ensures "subclasses before superclasses".
-                index = len(overloaded_args)
-                for i, old_arg in enumerate(overloaded_args):
-                    if issubclass(arg_type, type(old_arg)):
-                        index = i
-                        break
-                overloaded_args.insert(index, arg)
-            else:
-                overloaded_types = [arg_type]
-                overloaded_args = [arg]
-
-    return overloaded_types, overloaded_args
-
-
-def array_function_implementation_or_override(
-        implementation, public_api, relevant_args, args, kwargs):
-    """Implement a function with checks for __array_function__ overrides.
+    All arguments are required, and can only be passed by position.
 
     Arguments
     ---------
@@ -82,41 +32,37 @@
         Iterable of arguments to check for __array_function__ methods.
     args : tuple
         Arbitrary positional arguments originally passed into ``public_api``.
-    kwargs : tuple
+    kwargs : dict
         Arbitrary keyword arguments originally passed into ``public_api``.
 
     Returns
     -------
-    Result from calling `implementation()` or an `__array_function__`
+    Result from calling ``implementation()`` or an ``__array_function__``
     method, as appropriate.
 
     Raises
     ------
     TypeError : if no implementation is found.
+    """)
+
+
+# exposed for testing purposes; used internally by implement_array_function
+add_docstring(
+    _get_implementing_args,
     """
-    # Check for __array_function__ methods.
-    types, overloaded_args = get_overloaded_types_and_args(relevant_args)
-    # Short-cut for common cases: no overload or only ndarray overload
-    # (directly or with subclasses that do not override __array_function__).
-    if (not overloaded_args or types == _NDARRAY_ONLY or
-            all(type(arg).__array_function__ is _NDARRAY_ARRAY_FUNCTION
-                for arg in overloaded_args)):
-        return implementation(*args, **kwargs)
+    Collect arguments on which to call __array_function__.
 
-    # Call overrides
-    for overloaded_arg in overloaded_args:
-        # Use `public_api` instead of `implemenation` so __array_function__
-        # implementations can do equality/identity comparisons.
-        result = overloaded_arg.__array_function__(
-            public_api, types, args, kwargs)
+    Parameters
+    ----------
+    relevant_args : iterable of array-like
+        Iterable of possibly array-like arguments to check for
+        __array_function__ methods.
 
-        if result is not NotImplemented:
-            return result
-
-    func_name = '{}.{}'.format(public_api.__module__, public_api.__name__)
-    raise TypeError("no implementation found for '{}' on types that implement "
-                    '__array_function__: {}'
-                    .format(func_name, list(map(type, overloaded_args))))
+    Returns
+    -------
+    Sequence of arguments with __array_function__ methods, in the order in
+    which they should be called.
+    """)
 
 
 ArgSpec = collections.namedtuple('ArgSpec', 'args varargs keywords defaults')
@@ -215,7 +161,7 @@
         @functools.wraps(implementation)
         def public_api(*args, **kwargs):
             relevant_args = dispatcher(*args, **kwargs)
-            return array_function_implementation_or_override(
+            return implement_array_function(
                 implementation, public_api, relevant_args, args, kwargs)
 
         if module is not None:
diff --git a/numpy/core/records.py b/numpy/core/records.py
index 86a4330..42aca5b 100644
--- a/numpy/core/records.py
+++ b/numpy/core/records.py
@@ -7,10 +7,9 @@
 integers, bools etc.  However, it is possible for elements to be combinations
 of these using structured types, such as::
 
-  >>> a = np.array([(1, 2.0), (1, 2.0)], dtype=[('x', int), ('y', float)])
+  >>> a = np.array([(1, 2.0), (1, 2.0)], dtype=[('x', np.int64), ('y', np.float64)])
   >>> a
-  array([(1, 2.0), (1, 2.0)],
-        dtype=[('x', '<i4'), ('y', '<f8')])
+  array([(1, 2.), (1, 2.)], dtype=[('x', '<i8'), ('y', '<f8')])
 
 Here, each element consists of two fields: x (and int), and y (a float).
 This is known as a structured array.  The different fields are analogous
@@ -21,7 +20,7 @@
   array([1, 1])
 
   >>> a['y']
-  array([ 2.,  2.])
+  array([2., 2.])
 
 Record arrays allow us to access fields as properties::
 
@@ -31,7 +30,7 @@
   array([1, 1])
 
   >>> ar.y
-  array([ 2.,  2.])
+  array([2., 2.])
 
 """
 from __future__ import division, absolute_import, print_function
@@ -39,6 +38,7 @@
 import sys
 import os
 import warnings
+from collections import Counter, OrderedDict
 
 from . import numeric as sb
 from . import numerictypes as nt
@@ -74,14 +74,25 @@
 
 numfmt = nt.typeDict
 
+# taken from OrderedDict recipes in the Python documentation
+# https://docs.python.org/3.3/library/collections.html#ordereddict-examples-and-recipes
+class _OrderedCounter(Counter, OrderedDict):
+    """Counter that remembers the order elements are first encountered"""
+
+    def __repr__(self):
+        return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))
+
+    def __reduce__(self):
+        return self.__class__, (OrderedDict(self),)
+
+
 def find_duplicate(list):
     """Find duplication in a list, return a list of duplicated elements"""
-    dup = []
-    for i in range(len(list)):
-        if (list[i] in list[i + 1:]):
-            if (list[i] not in dup):
-                dup.append(list[i])
-    return dup
+    return [
+        item
+        for item, counts in _OrderedCounter(list).items()
+        if counts > 1
+    ]
 
 
 @set_module('numpy')
@@ -128,10 +139,9 @@
 
     Examples
     --------
-    >>> np.format_parser(['f8', 'i4', 'a5'], ['col1', 'col2', 'col3'],
+    >>> np.format_parser(['<f8', '<i4', '<a5'], ['col1', 'col2', 'col3'],
     ...                  ['T1', 'T2', 'T3']).dtype
-    dtype([(('T1', 'col1'), '<f8'), (('T2', 'col2'), '<i4'),
-           (('T3', 'col3'), '|S5')])
+    dtype([(('T1', 'col1'), '<f8'), (('T2', 'col2'), '<i4'), (('T3', 'col3'), 'S5')])
 
     `names` and/or `titles` can be empty lists. If `titles` is an empty list,
     titles will simply not appear. If `names` is empty, default field names
@@ -139,9 +149,9 @@
 
     >>> np.format_parser(['f8', 'i4', 'a5'], ['col1', 'col2', 'col3'],
     ...                  []).dtype
-    dtype([('col1', '<f8'), ('col2', '<i4'), ('col3', '|S5')])
-    >>> np.format_parser(['f8', 'i4', 'a5'], [], []).dtype
-    dtype([('f0', '<f8'), ('f1', '<i4'), ('f2', '|S5')])
+    dtype([('col1', '<f8'), ('col2', '<i4'), ('col3', '<S5')])
+    >>> np.format_parser(['<f8', '<i4', '<a5'], [], []).dtype
+    dtype([('f0', '<f8'), ('f1', '<i4'), ('f2', 'S5')])
 
     """
 
@@ -157,10 +167,12 @@
         if formats is None:
             raise ValueError("Need formats argument")
         if isinstance(formats, list):
-            if len(formats) < 2:
-                formats.append('')
-            formats = ','.join(formats)
-        dtype = sb.dtype(formats, aligned)
+            dtype = sb.dtype(
+                [('f{}'.format(i), format_) for i, format_ in enumerate(formats)],
+                aligned,
+            )
+        else:
+            dtype = sb.dtype(formats, aligned)
         fields = dtype.fields
         if fields is None:
             dtype = sb.dtype([('f1', dtype)], aligned)
@@ -380,20 +392,19 @@
     --------
     Create an array with two fields, ``x`` and ``y``:
 
-    >>> x = np.array([(1.0, 2), (3.0, 4)], dtype=[('x', float), ('y', int)])
+    >>> x = np.array([(1.0, 2), (3.0, 4)], dtype=[('x', '<f8'), ('y', '<i8')])
     >>> x
-    array([(1.0, 2), (3.0, 4)],
-          dtype=[('x', '<f8'), ('y', '<i4')])
+    array([(1., 2), (3., 4)], dtype=[('x', '<f8'), ('y', '<i8')])
 
     >>> x['x']
-    array([ 1.,  3.])
+    array([1., 3.])
 
     View the array as a record array:
 
     >>> x = x.view(np.recarray)
 
     >>> x.x
-    array([ 1.,  3.])
+    array([1., 3.])
 
     >>> x.y
     array([2, 4])
@@ -580,7 +591,7 @@
     >>> x3=np.array([1.1,2,3,4])
     >>> r = np.core.records.fromarrays([x1,x2,x3],names='a,b,c')
     >>> print(r[1])
-    (2, 'dd', 2.0)
+    (2, 'dd', 2.0) # may vary
     >>> x1[1]=34
     >>> r.a
     array([1, 2, 3, 4])
@@ -602,7 +613,6 @@
             if not isinstance(obj, ndarray):
                 raise ValueError("item in the array list must be an ndarray.")
             formats.append(obj.dtype.str)
-        formats = ','.join(formats)
 
     if dtype is not None:
         descr = sb.dtype(dtype)
@@ -659,11 +669,11 @@
     >>> r.col1
     array([456,   2])
     >>> r.col2
-    array(['dbe', 'de'],
-          dtype='|S3')
+    array(['dbe', 'de'], dtype='<U3')
     >>> import pickle
-    >>> print(pickle.loads(pickle.dumps(r)))
-    [(456, 'dbe', 1.2) (2, 'de', 1.3)]
+    >>> pickle.loads(pickle.dumps(r))
+    rec.array([(456, 'dbe', 1.2), (  2, 'de', 1.3)],
+              dtype=[('col1', '<i8'), ('col2', '<U3'), ('col3', '<f8')])
     """
 
     if formats is None and dtype is None:  # slower
@@ -711,7 +721,7 @@
     a string"""
 
     if dtype is None and formats is None:
-        raise ValueError("Must have dtype= or formats=")
+        raise TypeError("fromstring() needs a 'dtype' or 'formats' argument")
 
     if dtype is not None:
         descr = sb.dtype(dtype)
@@ -750,7 +760,7 @@
     >>> a = a.newbyteorder('<')
     >>> a.tofile(fd)
     >>>
-    >>> fd.seek(0)
+    >>> _ = fd.seek(0)
     >>> r=np.core.records.fromfile(fd, formats='f8,i4,a5', shape=10,
     ... byteorder='<')
     >>> print(r[5])
@@ -758,6 +768,9 @@
     >>> r.shape
     (10,)
     """
+
+    if dtype is None and formats is None:
+        raise TypeError("fromfile() needs a 'dtype' or 'formats' argument")
 
     if (shape is None or shape == 0):
         shape = (-1,)
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 467b590..9ccca62 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -775,6 +775,7 @@
     multiarray_deps = [
             join('src', 'multiarray', 'arrayobject.h'),
             join('src', 'multiarray', 'arraytypes.h'),
+            join('src', 'multiarray', 'arrayfunction_override.h'),
             join('src', 'multiarray', 'buffer.h'),
             join('src', 'multiarray', 'calculation.h'),
             join('src', 'multiarray', 'common.h'),
@@ -827,6 +828,7 @@
             join('src', 'multiarray', 'arraytypes.c.src'),
             join('src', 'multiarray', 'array_assign_scalar.c'),
             join('src', 'multiarray', 'array_assign_array.c'),
+            join('src', 'multiarray', 'arrayfunction_override.c'),
             join('src', 'multiarray', 'buffer.c'),
             join('src', 'multiarray', 'calculation.c'),
             join('src', 'multiarray', 'compiled_base.c'),
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index a529d2a..f8332c3 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -48,13 +48,13 @@
     Examples
     --------
     >>> np.atleast_1d(1.0)
-    array([ 1.])
+    array([1.])
 
     >>> x = np.arange(9.0).reshape(3,3)
     >>> np.atleast_1d(x)
-    array([[ 0.,  1.,  2.],
-           [ 3.,  4.,  5.],
-           [ 6.,  7.,  8.]])
+    array([[0., 1., 2.],
+           [3., 4., 5.],
+           [6., 7., 8.]])
     >>> np.atleast_1d(x) is x
     True
 
@@ -106,11 +106,11 @@
     Examples
     --------
     >>> np.atleast_2d(3.0)
-    array([[ 3.]])
+    array([[3.]])
 
     >>> x = np.arange(3.0)
     >>> np.atleast_2d(x)
-    array([[ 0.,  1.,  2.]])
+    array([[0., 1., 2.]])
     >>> np.atleast_2d(x).base is x
     True
 
@@ -166,7 +166,7 @@
     Examples
     --------
     >>> np.atleast_3d(3.0)
-    array([[[ 3.]]])
+    array([[[3.]]])
 
     >>> x = np.arange(3.0)
     >>> np.atleast_3d(x).shape
@@ -179,7 +179,7 @@
     True
 
     >>> for arr in np.atleast_3d([1, 2], [[1, 2]], [[[1, 2]]]):
-    ...     print(arr, arr.shape)
+    ...     print(arr, arr.shape) # doctest: +SKIP
     ...
     [[[1]
       [2]]] (1, 2, 1)
@@ -342,10 +342,11 @@
 
 def _stack_dispatcher(arrays, axis=None, out=None):
     arrays = _arrays_for_stack_dispatcher(arrays, stacklevel=6)
-    for a in arrays:
-        yield a
     if out is not None:
-        yield out
+        # optimize for the typical case where only arrays is provided
+        arrays = list(arrays)
+        arrays.append(out)
+    return arrays
 
 
 @array_function_dispatch(_stack_dispatcher)
@@ -760,11 +761,11 @@
     ...     [A,               np.zeros((2, 3))],
     ...     [np.ones((3, 2)), B               ]
     ... ])
-    array([[ 2.,  0.,  0.,  0.,  0.],
-           [ 0.,  2.,  0.,  0.,  0.],
-           [ 1.,  1.,  3.,  0.,  0.],
-           [ 1.,  1.,  0.,  3.,  0.],
-           [ 1.,  1.,  0.,  0.,  3.]])
+    array([[2., 0., 0., 0., 0.],
+           [0., 2., 0., 0., 0.],
+           [1., 1., 3., 0., 0.],
+           [1., 1., 0., 3., 0.],
+           [1., 1., 0., 0., 3.]])
 
     With a list of depth 1, `block` can be used as `hstack`
 
@@ -774,7 +775,7 @@
     >>> a = np.array([1, 2, 3])
     >>> b = np.array([2, 3, 4])
     >>> np.block([a, b, 10])             # hstack([a, b, 10])
-    array([1, 2, 3, 2, 3, 4, 10])
+    array([ 1,  2,  3,  2,  3,  4, 10])
 
     >>> A = np.ones((2, 2), int)
     >>> B = 2 * A
diff --git a/numpy/core/src/common/array_assign.c b/numpy/core/src/common/array_assign.c
index ac3fdbe..02a423e 100644
--- a/numpy/core/src/common/array_assign.c
+++ b/numpy/core/src/common/array_assign.c
@@ -125,9 +125,13 @@
 
         return npy_is_aligned((void *)align_check, alignment);
     }
-    else {
+    else if (alignment == 1) {
         return 1;
     }
+    else {
+        /* always return false for alignment == 0, which means cannot-be-aligned */
+        return 0;
+    }
 }
 
 NPY_NO_EXPORT int
diff --git a/numpy/core/src/common/array_assign.h b/numpy/core/src/common/array_assign.h
index 07438c5..69ef56b 100644
--- a/numpy/core/src/common/array_assign.h
+++ b/numpy/core/src/common/array_assign.h
@@ -87,8 +87,10 @@
 
 /*
  * Checks whether a data pointer + set of strides refers to a raw
- * array whose elements are all aligned to a given alignment.
- * alignment should be a power of two.
+ * array whose elements are all aligned to a given alignment. Returns
+ * 1 if data is aligned to alignment or 0 if not.
+ * alignment should be a power of two, or may be the sentinel value 0 to mean
+ * cannot-be-aligned, in which case 0 (false) is always returned.
  */
 NPY_NO_EXPORT int
 raw_array_is_aligned(int ndim, npy_intp *shape,
diff --git a/numpy/core/src/common/get_attr_string.h b/numpy/core/src/common/get_attr_string.h
index bec87c5..d458d95 100644
--- a/numpy/core/src/common/get_attr_string.h
+++ b/numpy/core/src/common/get_attr_string.h
@@ -103,7 +103,6 @@
     if (_is_basic_python_type(tp)) {
         return NULL;
     }
-
     return maybe_get_attr((PyObject *)tp, name);
 }
 
diff --git a/numpy/core/src/common/ufunc_override.c b/numpy/core/src/common/ufunc_override.c
index b674221..89f08a9 100644
--- a/numpy/core/src/common/ufunc_override.c
+++ b/numpy/core/src/common/ufunc_override.c
@@ -71,7 +71,7 @@
  * Get possible out argument from kwds, and returns the number of outputs
  * contained within it: if a tuple, the number of elements in it, 1 otherwise.
  * The out argument itself is returned in out_kwd_obj, and the outputs
- * in the out_obj array (all as borrowed references).
+ * in the out_obj array (borrowed references); out_kwd_obj is a new reference the caller must DECREF.
  *
  * Returns 0 if no outputs found, -1 if kwds is not a dict (with an error set).
  */
@@ -79,24 +79,42 @@
 PyUFuncOverride_GetOutObjects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs)
 {
     if (kwds == NULL) {
+        Py_INCREF(Py_None);
+        *out_kwd_obj = Py_None;
         return 0;
     }
     if (!PyDict_CheckExact(kwds)) {
         PyErr_SetString(PyExc_TypeError,
                         "Internal Numpy error: call to PyUFuncOverride_GetOutObjects "
                         "with non-dict kwds");
+        *out_kwd_obj = NULL;
         return -1;
     }
     /* borrowed reference */
     *out_kwd_obj = PyDict_GetItemString(kwds, "out");
     if (*out_kwd_obj == NULL) {
+        Py_INCREF(Py_None);
+        *out_kwd_obj = Py_None;
         return 0;
     }
     if (PyTuple_CheckExact(*out_kwd_obj)) {
-        *out_objs = PySequence_Fast_ITEMS(*out_kwd_obj);
-        return PySequence_Fast_GET_SIZE(*out_kwd_obj);
+        /*
+         * The C-API recommends calling PySequence_Fast before any of the other
+         * PySequence_Fast* functions. This is required for PyPy
+         */
+        PyObject *seq;
+        seq = PySequence_Fast(*out_kwd_obj,
+                              "Could not convert object to sequence");
+        if (seq == NULL) {
+            *out_kwd_obj = NULL;
+            return -1;
+        }
+        *out_objs = PySequence_Fast_ITEMS(seq);
+        *out_kwd_obj = seq;
+        return PySequence_Fast_GET_SIZE(seq);
     }
     else {
+        Py_INCREF(*out_kwd_obj);
         *out_objs = out_kwd_obj;
         return 1;
     }
diff --git a/numpy/core/src/common/ufunc_override.h b/numpy/core/src/common/ufunc_override.h
index cc39166..bf86865 100644
--- a/numpy/core/src/common/ufunc_override.h
+++ b/numpy/core/src/common/ufunc_override.h
@@ -28,7 +28,7 @@
  * Get possible out argument from kwds, and returns the number of outputs
  * contained within it: if a tuple, the number of elements in it, 1 otherwise.
  * The out argument itself is returned in out_kwd_obj, and the outputs
- * in the out_obj array (all as borrowed references).
+ * in the out_obj array (borrowed references); out_kwd_obj is a new reference the caller must DECREF.
  *
  * Returns 0 if no outputs found, -1 if kwds is not a dict (with an error set).
  */
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index 2a82755..c26bd16 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -1871,11 +1871,14 @@
 static PyObject *
 getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
 {
-    PyObject * ops = PyArray_GetNumericOps();
+    PyObject *ret;
+    PyObject *ops = PyArray_GetNumericOps();
     if (ops == NULL) {
         return NULL;
     }
-    return PyLong_FromLong(PyArray_SetNumericOps(ops));
+    ret = PyLong_FromLong(PyArray_SetNumericOps(ops));
+    Py_DECREF(ops);
+    return ret;
 }
 
 static PyMethodDef Multiarray_TestsMethods[] = {
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index f692e03..6e31fd3 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -25,6 +25,47 @@
 #include "array_assign.h"
 
 /*
+ * Check that array data is both uint-aligned and true-aligned for all array
+ * elements, as required by the copy/casting code in lowlevel_strided_loops.c
+ */
+NPY_NO_EXPORT int
+copycast_isaligned(int ndim, npy_intp *shape,
+        PyArray_Descr *dtype, char *data, npy_intp *strides)
+{
+    int aligned;
+    int big_aln, small_aln;
+
+    int uint_aln = npy_uint_alignment(dtype->elsize);
+    int true_aln = dtype->alignment;
+
+    /* uint alignment can be 0, meaning not uint alignable */
+    if (uint_aln == 0) {
+        return 0;
+    }
+
+    /*
+     * As an optimization, it is unnecessary to check the alignment to the
+     * smaller of (uint_aln, true_aln) if the data is aligned to the bigger of
+     * the two and the big is a multiple of the small aln. We check the bigger
+     * one first and only check the smaller if necessary.
+     */
+    if (true_aln >= uint_aln) {
+        big_aln = true_aln;
+        small_aln = uint_aln;
+    }
+    else {
+        big_aln = uint_aln;
+        small_aln = true_aln;
+    }
+
+    aligned = raw_array_is_aligned(ndim, shape, data, strides, big_aln);
+    if (aligned && big_aln % small_aln != 0) {
+        aligned = raw_array_is_aligned(ndim, shape, data, strides, small_aln);
+    }
+    return aligned;
+}
+
+/*
  * Assigns the array from 'src' to 'dst'. The strides must already have
  * been broadcast.
  *
@@ -48,11 +89,9 @@
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
-    aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
-                                   npy_uint_alignment(dst_dtype->elsize)) &&
-              raw_array_is_aligned(ndim, shape, src_data, src_strides,
-                                   npy_uint_alignment(src_dtype->elsize));
+    aligned =
+        copycast_isaligned(ndim, shape, dst_dtype, dst_data, dst_strides) &&
+        copycast_isaligned(ndim, shape, src_dtype, src_data, src_strides);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareTwoRawArrayIter(
@@ -133,11 +172,9 @@
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
-    aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
-                                   npy_uint_alignment(dst_dtype->elsize)) &&
-              raw_array_is_aligned(ndim, shape, src_data, src_strides,
-                                   npy_uint_alignment(src_dtype->elsize));
+    aligned =
+        copycast_isaligned(ndim, shape, dst_dtype, dst_data, dst_strides) &&
+        copycast_isaligned(ndim, shape, src_dtype, src_data, src_strides);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareThreeRawArrayIter(
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index 841a418..ecb5be4 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -45,10 +45,13 @@
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
-              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
+              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize)) &&
+              npy_is_aligned(src_data, src_dtype->alignment);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareOneRawArrayIter(
@@ -116,10 +119,13 @@
 
     NPY_BEGIN_THREADS_DEF;
 
-    /* Check alignment */
+    /* Check both uint and true alignment */
     aligned = raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
                                    npy_uint_alignment(dst_dtype->elsize)) &&
-              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize));
+              raw_array_is_aligned(ndim, shape, dst_data, dst_strides,
+                                   dst_dtype->alignment) &&
+              npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize)) &&
+              npy_is_aligned(src_data, src_dtype->alignment);
 
     /* Use raw iteration with no heap allocation */
     if (PyArray_PrepareTwoRawArrayIter(
@@ -220,7 +226,8 @@
      * we also skip this if 'dst' has an object dtype.
      */
     if ((!PyArray_EquivTypes(PyArray_DESCR(dst), src_dtype) ||
-            !npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize))) &&
+            !(npy_is_aligned(src_data, npy_uint_alignment(src_dtype->elsize)) &&
+              npy_is_aligned(src_data, src_dtype->alignment))) &&
                     PyArray_SIZE(dst) > 1 &&
                     !PyDataType_REFCHK(PyArray_DESCR(dst))) {
         char *tmp_src_data;
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
new file mode 100644
index 0000000..e62b32a
--- /dev/null
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -0,0 +1,376 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include "npy_pycompat.h"
+#include "get_attr_string.h"
+#include "npy_import.h"
+#include "multiarraymodule.h"
+
+
+/* Return the ndarray.__array_function__ method. */
+static PyObject *
+get_ndarray_array_function(void)
+{
+    PyObject* method = PyObject_GetAttrString((PyObject *)&PyArray_Type,
+                                              "__array_function__");
+    assert(method != NULL);
+    return method;
+}
+
+
+/*
+ * Get an object's __array_function__ method in the fastest way possible.
+ * Never raises an exception. Returns NULL if the method doesn't exist.
+ */
+static PyObject *
+get_array_function(PyObject *obj)
+{
+    static PyObject *ndarray_array_function = NULL;
+
+    if (ndarray_array_function == NULL) {
+        ndarray_array_function = get_ndarray_array_function();
+    }
+
+    /* Fast return for ndarray */
+    if (PyArray_CheckExact(obj)) {
+        Py_INCREF(ndarray_array_function);
+        return ndarray_array_function;
+    }
+
+    return PyArray_LookupSpecial(obj, "__array_function__");
+}
+
+
+/*
+ * Like list.insert(), but for C arrays of PyObject*. Skips error checking.
+ */
+static void
+pyobject_array_insert(PyObject **array, int length, int index, PyObject *item)
+{
+    int j;
+
+    for (j = length; j > index; j--) {
+        array[j] = array[j - 1];
+    }
+    array[index] = item;
+}
+
+
+/*
+ * Collects arguments with __array_function__ and their corresponding methods
+ * in the order in which they should be tried (i.e., skipping redundant types).
+ * `relevant_args` is expected to have been produced by PySequence_Fast.
+ * Returns the number of arguments, or -1 on failure.
+ */
+static int
+get_implementing_args_and_methods(PyObject *relevant_args,
+                                  PyObject **implementing_args,
+                                  PyObject **methods)
+{
+    int num_implementing_args = 0;
+    Py_ssize_t i;
+    int j;
+
+    PyObject **items = PySequence_Fast_ITEMS(relevant_args);
+    Py_ssize_t length = PySequence_Fast_GET_SIZE(relevant_args);
+
+    for (i = 0; i < length; i++) {
+        int new_class = 1;
+        PyObject *argument = items[i];
+
+        /* Have we seen this type before? */
+        for (j = 0; j < num_implementing_args; j++) {
+            if (Py_TYPE(argument) == Py_TYPE(implementing_args[j])) {
+                new_class = 0;
+                break;
+            }
+        }
+        if (new_class) {
+            PyObject *method = get_array_function(argument);
+
+            if (method != NULL) {
+                int arg_index;
+
+                if (num_implementing_args >= NPY_MAXARGS) {
+                    PyErr_Format(
+                        PyExc_TypeError,
+                        "maximum number (%d) of distinct argument types " \
+                        "implementing __array_function__ exceeded",
+                        NPY_MAXARGS);
+                    Py_DECREF(method);
+                    goto fail;
+                }
+
+                /* "subclasses before superclasses, otherwise left to right" */
+                arg_index = num_implementing_args;
+                for (j = 0; j < num_implementing_args; j++) {
+                    PyObject *other_type;
+                    other_type = (PyObject *)Py_TYPE(implementing_args[j]);
+                    if (PyObject_IsInstance(argument, other_type)) {
+                        arg_index = j;
+                        break;
+                    }
+                }
+                Py_INCREF(argument);
+                pyobject_array_insert(implementing_args, num_implementing_args,
+                                      arg_index, argument);
+                pyobject_array_insert(methods, num_implementing_args,
+                                      arg_index, method);
+                ++num_implementing_args;
+            }
+        }
+    }
+    return num_implementing_args;
+
+fail:
+    for (j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(methods[j]);
+    }
+    return -1;
+}
+
+
+/*
+ * Is this object ndarray.__array_function__?
+ */
+static int
+is_default_array_function(PyObject *obj)
+{
+    static PyObject *ndarray_array_function = NULL;
+
+    if (ndarray_array_function == NULL) {
+        ndarray_array_function = get_ndarray_array_function();
+    }
+    return obj == ndarray_array_function;
+}
+
+
+/*
+ * Core implementation of ndarray.__array_function__. This is exposed
+ * separately so we can avoid the overhead of a Python method call from
+ * within `implement_array_function`.
+ */
+NPY_NO_EXPORT PyObject *
+array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
+                           PyObject *kwargs)
+{
+    Py_ssize_t j;
+    PyObject *implementation, *result;
+
+    PyObject **items = PySequence_Fast_ITEMS(types);
+    Py_ssize_t length = PySequence_Fast_GET_SIZE(types);
+
+    for (j = 0; j < length; j++) {
+        int is_subclass = PyObject_IsSubclass(
+            items[j], (PyObject *)&PyArray_Type);
+        if (is_subclass == -1) {
+            return NULL;
+        }
+        if (!is_subclass) {
+            Py_INCREF(Py_NotImplemented);
+            return Py_NotImplemented;
+        }
+    }
+
+    implementation = PyObject_GetAttr(func, npy_ma_str_wrapped);
+    if (implementation == NULL) {
+        return NULL;
+    }
+    result = PyObject_Call(implementation, args, kwargs);
+    Py_DECREF(implementation);
+    return result;
+}
+
+
+/*
+ * Calls __array_function__ on the provided argument, with a fast-path for
+ * ndarray.
+ */
+static PyObject *
+call_array_function(PyObject* argument, PyObject* method,
+                    PyObject* public_api, PyObject* types,
+                    PyObject* args, PyObject* kwargs)
+{
+    if (is_default_array_function(method)) {
+        return array_function_method_impl(public_api, types, args, kwargs);
+    }
+    else {
+        return PyObject_CallFunctionObjArgs(
+            method, argument, public_api, types, args, kwargs, NULL);
+    }
+}
+
+
+/*
+ * Implements the __array_function__ protocol for a function, as described in
+ * in NEP-18. See numpy.core.overrides for a full docstring.
+ */
+NPY_NO_EXPORT PyObject *
+array_implement_array_function(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+{
+    PyObject *implementation, *public_api, *relevant_args, *args, *kwargs;
+
+    PyObject *types = NULL;
+    PyObject *implementing_args[NPY_MAXARGS];
+    PyObject *array_function_methods[NPY_MAXARGS];
+
+    int j, any_overrides;
+    int num_implementing_args = 0;
+    PyObject *result = NULL;
+
+    static PyObject *errmsg_formatter = NULL;
+
+    if (!PyArg_UnpackTuple(
+            positional_args, "implement_array_function", 5, 5,
+            &implementation, &public_api, &relevant_args, &args, &kwargs)) {
+        return NULL;
+    }
+
+    relevant_args = PySequence_Fast(
+        relevant_args,
+        "dispatcher for __array_function__ did not return an iterable");
+    if (relevant_args == NULL) {
+        return NULL;
+    }
+
+    /* Collect __array_function__ implementations */
+    num_implementing_args = get_implementing_args_and_methods(
+        relevant_args, implementing_args, array_function_methods);
+    if (num_implementing_args == -1) {
+        goto cleanup;
+    }
+
+    /*
+     * Handle the typical case of no overrides. This is merely an optimization
+     * if some arguments are ndarray objects, but is also necessary if no
+     * arguments implement __array_function__ at all (e.g., if they are all
+     * built-in types).
+     */
+    any_overrides = 0;
+    for (j = 0; j < num_implementing_args; j++) {
+        if (!is_default_array_function(array_function_methods[j])) {
+            any_overrides = 1;
+            break;
+        }
+    }
+    if (!any_overrides) {
+        result = PyObject_Call(implementation, args, kwargs);
+        goto cleanup;
+    }
+
+    /*
+     * Create a Python object for types.
+     * We use a tuple, because it's the fastest Python collection to create
+     * and has the bonus of being immutable.
+     */
+    types = PyTuple_New(num_implementing_args);
+    if (types == NULL) {
+        goto cleanup;
+    }
+    for (j = 0; j < num_implementing_args; j++) {
+        PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
+        Py_INCREF(arg_type);
+        PyTuple_SET_ITEM(types, j, arg_type);
+    }
+
+    /* Call __array_function__ methods */
+    for (j = 0; j < num_implementing_args; j++) {
+        PyObject *argument = implementing_args[j];
+        PyObject *method = array_function_methods[j];
+
+        /*
+         * We use `public_api` instead of `implementation` here so
+         * __array_function__ implementations can do equality/identity
+         * comparisons.
+         */
+        result = call_array_function(
+            argument, method, public_api, types, args, kwargs);
+
+        if (result == Py_NotImplemented) {
+            /* Try the next one */
+            Py_DECREF(result);
+            result = NULL;
+        }
+        else {
+            /* Either a good result, or an exception was raised. */
+            goto cleanup;
+        }
+    }
+
+    /* No acceptable override found, raise TypeError. */
+    npy_cache_import("numpy.core._internal",
+                     "array_function_errmsg_formatter",
+                     &errmsg_formatter);
+    if (errmsg_formatter != NULL) {
+        PyObject *errmsg = PyObject_CallFunctionObjArgs(
+            errmsg_formatter, public_api, types, NULL);
+        if (errmsg != NULL) {
+            PyErr_SetObject(PyExc_TypeError, errmsg);
+            Py_DECREF(errmsg);
+        }
+    }
+
+cleanup:
+    for (j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(array_function_methods[j]);
+    }
+    Py_XDECREF(types);
+    Py_DECREF(relevant_args);
+    return result;
+}
+
+
+/*
+ * Python wrapper for get_implementing_args_and_methods, for testing purposes.
+ */
+NPY_NO_EXPORT PyObject *
+array__get_implementing_args(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+{
+    PyObject *relevant_args;
+    int j;
+    int num_implementing_args = 0;
+    PyObject *implementing_args[NPY_MAXARGS];
+    PyObject *array_function_methods[NPY_MAXARGS];
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(positional_args, "O:array__get_implementing_args",
+                          &relevant_args)) {
+        return NULL;
+    }
+
+    relevant_args = PySequence_Fast(
+        relevant_args,
+        "dispatcher for __array_function__ did not return an iterable");
+    if (relevant_args == NULL) {
+        return NULL;
+    }
+
+    num_implementing_args = get_implementing_args_and_methods(
+        relevant_args, implementing_args, array_function_methods);
+    if (num_implementing_args == -1) {
+        goto cleanup;
+    }
+
+    /* create a Python object for implementing_args */
+    result = PyList_New(num_implementing_args);
+    if (result == NULL) {
+        goto cleanup;
+    }
+    for (j = 0; j < num_implementing_args; j++) {
+        PyObject *argument = implementing_args[j];
+        Py_INCREF(argument);
+        PyList_SET_ITEM(result, j, argument);
+    }
+
+cleanup:
+    for (j = 0; j < num_implementing_args; j++) {
+        Py_DECREF(implementing_args[j]);
+        Py_DECREF(array_function_methods[j]);
+    }
+    Py_DECREF(relevant_args);
+    return result;
+}
diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h
new file mode 100644
index 0000000..0d224e2
--- /dev/null
+++ b/numpy/core/src/multiarray/arrayfunction_override.h
@@ -0,0 +1,16 @@
+#ifndef _NPY_PRIVATE__ARRAYFUNCTION_OVERRIDE_H
+#define _NPY_PRIVATE__ARRAYFUNCTION_OVERRIDE_H
+
+NPY_NO_EXPORT PyObject *
+array_implement_array_function(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
+
+NPY_NO_EXPORT PyObject *
+array__get_implementing_args(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
+
+NPY_NO_EXPORT PyObject *
+array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
+                           PyObject *kwargs);
+
+#endif
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 823ee71..ca5f5a4 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -2205,15 +2205,19 @@
 VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
                 npy_intp n, int swap, PyArrayObject *arr)
 {
+    PyArray_Descr *descr;
+
     if (arr == NULL) {
         return;
     }
+
+    descr = PyArray_DESCR(arr);
+
     if (PyArray_HASFIELDS(arr)) {
         PyObject *key, *value;
-        PyArray_Descr *descr;
+
         Py_ssize_t pos = 0;
 
-        descr = PyArray_DESCR(arr);
         while (PyDict_Next(descr->fields, &pos, &key, &value)) {
             npy_intp offset;
             PyArray_Descr * new;
@@ -2236,14 +2240,28 @@
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-    if (swap && PyArray_DESCR(arr)->subarray != NULL) {
-        PyArray_Descr *descr, *new;
+    if (PyDataType_HASSUBARRAY(descr)) {
+        PyArray_Descr *new;
         npy_intp num;
         npy_intp i;
         int subitemsize;
         char *dstptr, *srcptr;
+        /*
+         * In certain cases subarray copy can be optimized. This is when
+         * swapping is unnecessary and the subarray's data type can certainly
+         * be simply copied (no object, fields, subarray, and not a user dtype).
+         */
+        npy_bool can_optimize_subarray = (!swap &&
+                !PyDataType_HASFIELDS(descr->subarray->base) &&
+                !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+                !PyDataType_REFCHK(descr->subarray->base) &&
+                (descr->subarray->base->type_num < NPY_NTYPES));
 
-        descr = PyArray_DESCR(arr);
+        if (can_optimize_subarray) {
+            _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
+            return;
+        }
+
         new = descr->subarray->base;
         /*
          * TODO: temporarily modifying the array like this
@@ -2253,6 +2271,10 @@
         dstptr = dst;
         srcptr = src;
         subitemsize = new->elsize;
+        if (subitemsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
         num = descr->elsize / subitemsize;
         for (i = 0; i < n; i++) {
             new->f->copyswapn(dstptr, subitemsize, srcptr,
@@ -2265,22 +2287,26 @@
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-    _basic_copyn(dst, dstride, src, sstride, n, PyArray_DESCR(arr)->elsize);
+    /* Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+    _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
     return;
 }
 
 static void
 VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
 {
+    PyArray_Descr *descr;
+
     if (arr == NULL) {
         return;
     }
+
+    descr = PyArray_DESCR(arr);
+
     if (PyArray_HASFIELDS(arr)) {
         PyObject *key, *value;
-        PyArray_Descr *descr;
         Py_ssize_t pos = 0;
 
-        descr = PyArray_DESCR(arr);
         while (PyDict_Next(descr->fields, &pos, &key, &value)) {
             npy_intp offset;
             PyArray_Descr * new;
@@ -2303,28 +2329,45 @@
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-    if (swap && PyArray_DESCR(arr)->subarray != NULL) {
-        PyArray_Descr *descr, *new;
+    if (PyDataType_HASSUBARRAY(descr)) {
+        PyArray_Descr *new;
         npy_intp num;
-        int itemsize;
+        int subitemsize;
+        /*
+         * In certain cases subarray copy can be optimized. This is when
+         * swapping is unnecessary and the subarray's data type can certainly
+         * be simply copied (no object, fields, subarray, and not a user dtype).
+         */
+        npy_bool can_optimize_subarray = (!swap &&
+                !PyDataType_HASFIELDS(descr->subarray->base) &&
+                !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+                !PyDataType_REFCHK(descr->subarray->base) &&
+                (descr->subarray->base->type_num < NPY_NTYPES));
 
-        descr = PyArray_DESCR(arr);
+        if (can_optimize_subarray) {
+            _basic_copy(dst, src, descr->elsize);
+            return;
+        }
+
         new = descr->subarray->base;
         /*
          * TODO: temporarily modifying the array like this
          *       is bad coding style, should be changed.
          */
         ((PyArrayObject_fields *)arr)->descr = new;
-        itemsize = new->elsize;
-        num = descr->elsize / itemsize;
-        new->f->copyswapn(dst, itemsize, src,
-                itemsize, num, swap, arr);
+        subitemsize = new->elsize;
+        if (subitemsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        num = descr->elsize / subitemsize;
+        new->f->copyswapn(dst, subitemsize, src,
+                subitemsize, num, swap, arr);
         ((PyArrayObject_fields *)arr)->descr = descr;
         return;
     }
-
-    /* copy first if needed */
-    _basic_copy(dst, src, PyArray_DESCR(arr)->elsize);
+    /* Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+    _basic_copy(dst, src, descr->elsize);
     return;
 }
 
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 2f66d7f..d8ad802 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -509,6 +509,10 @@
     PyArray_Descr *descr = NULL;
     int err = 0;
 
+    /*
+     * Note that the buffer info is cached as pyints making them appear like
+     * unreachable lost memory to valgrind.
+     */
     info = malloc(sizeof(_buffer_info_t));
     if (info == NULL) {
         PyErr_NoMemory();
@@ -579,9 +583,11 @@
     err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
     Py_DECREF(descr);
     if (err != 0) {
+        free(info->shape);
         goto fail;
     }
     if (_append_char(&fmt, '\0') < 0) {
+        free(info->shape);
         goto fail;
     }
     info->format = fmt.s;
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 3e5221a..addb677 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -164,7 +164,7 @@
 
             if (string_type == NPY_STRING) {
                 if ((temp = PyObject_Str(obj)) == NULL) {
-                    return -1;
+                    goto fail;
                 }
 #if defined(NPY_PY3K)
     #if PY_VERSION_HEX >= 0x03030000
@@ -182,7 +182,7 @@
 #else
                 if ((temp = PyObject_Unicode(obj)) == NULL) {
 #endif
-                    return -1;
+                    goto fail;
                 }
                 itemsize = PyUnicode_GET_DATA_SIZE(temp);
 #ifndef Py_UNICODE_WIDE
@@ -216,7 +216,7 @@
 
             if (string_type == NPY_STRING) {
                 if ((temp = PyObject_Str(obj)) == NULL) {
-                    return -1;
+                    goto fail;
                 }
 #if defined(NPY_PY3K)
     #if PY_VERSION_HEX >= 0x03030000
@@ -234,7 +234,7 @@
 #else
                 if ((temp = PyObject_Unicode(obj)) == NULL) {
 #endif
-                    return -1;
+                    goto fail;
                 }
                 itemsize = PyUnicode_GET_DATA_SIZE(temp);
 #ifndef Py_UNICODE_WIDE
@@ -440,12 +440,18 @@
         return 0;
     }
 
-    /* Recursive case, first check the sequence contains only one type */
+    /*
+     * The C-API recommends calling PySequence_Fast before any of the other
+     * PySequence_Fast* functions. This is required for PyPy
+     */
     seq = PySequence_Fast(obj, "Could not convert object to sequence");
     if (seq == NULL) {
         goto fail;
     }
+
+    /* Recursive case, first check the sequence contains only one type */
     size = PySequence_Fast_GET_SIZE(seq);
+    /* objects is borrowed, do not release seq */
     objects = PySequence_Fast_ITEMS(seq);
     common_type = size > 0 ? Py_TYPE(objects[0]) : NULL;
     for (i = 1; i < size; ++i) {
@@ -505,7 +511,7 @@
         PyArray_Descr *res_dtype = PyArray_PromoteTypes(dtype, *out_dtype);
         Py_DECREF(dtype);
         if (res_dtype == NULL) {
-            return -1;
+            goto fail;
         }
         if (!string_type &&
                 res_dtype->type_num == NPY_UNICODE &&
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 2b8d3d3..0e16290 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -182,6 +182,7 @@
 
 /* used for some alignment checks */
 #define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+#define _UINT_ALIGN(type) npy_uint_alignment(sizeof(type))
 /*
  * Disable harmless compiler warning "4116: unnamed type definition in
  * parentheses" which is caused by the _ALIGN macro.
@@ -201,6 +202,7 @@
      * Assumes cast from pointer to uintp gives a sensible representation we
      * can use bitwise & on (not required by C standard, but used by glibc).
      * This test is faster than a direct modulo.
+     * Note alignment value of 0 is allowed and returns False.
      */
     return ((npy_uintp)(p) & ((alignment) - 1)) == 0;
 }
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index b23e55b..625028b 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -1577,6 +1577,7 @@
     if (!PyArray_ISBOOL(inp) && !PyArray_ISINTEGER(inp)) {
         PyErr_SetString(PyExc_TypeError,
                 "Expected an input array of integer or boolean data type");
+        Py_DECREF(inp);
         goto fail;
     }
 
@@ -1684,6 +1685,7 @@
     if (PyArray_TYPE(inp) != NPY_UBYTE) {
         PyErr_SetString(PyExc_TypeError,
                 "Expected an input array of unsigned byte data type");
+        Py_DECREF(inp);
         goto fail;
     }
 
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 33a7064..c59979e 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -47,11 +47,10 @@
     PyObject *out;
 
     /* If the requested dtype is flexible, adapt it */
-    PyArray_AdaptFlexibleDType((PyObject *)arr, PyArray_DESCR(arr), &dtype);
+    dtype = PyArray_AdaptFlexibleDType((PyObject *)arr, PyArray_DESCR(arr), dtype);
     if (dtype == NULL) {
         return NULL;
     }
-
     out = PyArray_NewFromDescr(Py_TYPE(arr), dtype,
                                PyArray_NDIM(arr),
                                PyArray_DIMS(arr),
@@ -128,9 +127,9 @@
 }
 
 /*
- * This function calls Py_DECREF on flex_dtype, and replaces it with
- * a new dtype that has been adapted based on the values in data_dtype
- * and data_obj. If the flex_dtype is not flexible, it leaves it as is.
+ * This function returns a dtype based on flex_dtype and the values in
+ * data_dtype and data_obj. It also calls Py_DECREF on the flex_dtype. If the
+ * flex_dtype is not flexible, it returns it as-is.
  *
  * Usually, if data_obj is not an array, dtype should be the result
  * given by the PyArray_GetArrayParamsFromObject function.
@@ -138,40 +137,37 @@
  * The data_obj may be NULL if just a dtype is known for the source.
  *
  * If *flex_dtype is NULL, returns immediately, without setting an
- * exception. This basically assumes an error was already set previously.
+ * exception, leaving any previous error handling intact.
  *
  * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
  * and NPY_DATETIME with generic units.
  */
-NPY_NO_EXPORT void
+NPY_NO_EXPORT PyArray_Descr *
 PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
-                            PyArray_Descr **flex_dtype)
+                            PyArray_Descr *flex_dtype)
 {
     PyArray_DatetimeMetaData *meta;
+    PyArray_Descr *retval = NULL;
     int flex_type_num;
 
-    if (*flex_dtype == NULL) {
-        if (!PyErr_Occurred()) {
-            PyErr_SetString(PyExc_RuntimeError,
-                    "NumPy AdaptFlexibleDType was called with NULL flex_dtype "
-                    "but no error set");
-        }
-        return;
+    if (flex_dtype == NULL) {
+        return retval;
     }
 
-    flex_type_num = (*flex_dtype)->type_num;
+    flex_type_num = flex_dtype->type_num;
 
     /* Flexible types with expandable size */
-    if (PyDataType_ISUNSIZED(*flex_dtype)) {
+    if (PyDataType_ISUNSIZED(flex_dtype)) {
         /* First replace the flex_dtype */
-        PyArray_DESCR_REPLACE(*flex_dtype);
-        if (*flex_dtype == NULL) {
-            return;
+        retval = PyArray_DescrNew(flex_dtype);
+        Py_DECREF(flex_dtype);
+        if (retval == NULL) {
+            return retval;
         }
 
         if (data_dtype->type_num == flex_type_num ||
                                     flex_type_num == NPY_VOID) {
-            (*flex_dtype)->elsize = data_dtype->elsize;
+            (retval)->elsize = data_dtype->elsize;
         }
         else if (flex_type_num == NPY_STRING || flex_type_num == NPY_UNICODE) {
             npy_intp size = 8;
@@ -199,7 +195,7 @@
                     }
                     else if (data_dtype->elsize > 8 ||
                              data_dtype->elsize < 0) {
-                        /* 
+                        /*
                          * Element size should never be greater than 8 or
                          * less than 0 for integer type, but just in case...
                          */
@@ -237,9 +233,8 @@
                                 PyObject *s = PyObject_Str(list);
                                 if (s == NULL) {
                                     Py_DECREF(list);
-                                    Py_DECREF(*flex_dtype);
-                                    *flex_dtype = NULL;
-                                    return;
+                                    Py_DECREF(retval);
+                                    return NULL;
                                 }
                                 else {
                                     size = PyObject_Length(s);
@@ -262,9 +257,16 @@
                             list = PyArray_ToList((PyArrayObject *)data_obj);
                             result = PyArray_GetArrayParamsFromObject(
                                     list,
-                                    *flex_dtype,
+                                    retval,
                                     0, &dtype,
                                     &ndim, dims, &arr, NULL);
+                            Py_DECREF(list);
+                            Py_XDECREF(arr);
+                            if (result < 0) {
+                                Py_XDECREF(dtype);
+                                Py_DECREF(retval);
+                                return NULL;
+                            }
                             if (result == 0 && dtype != NULL) {
                                 if (flex_type_num == NPY_UNICODE) {
                                     size = dtype->elsize / 4;
@@ -274,15 +276,12 @@
                                 }
                             }
                             Py_XDECREF(dtype);
-                            Py_XDECREF(arr);
-                            Py_DECREF(list);
                         }
                         else if (PyArray_IsPythonScalar(data_obj)) {
                             PyObject *s = PyObject_Str(data_obj);
                             if (s == NULL) {
-                                Py_DECREF(*flex_dtype);
-                                *flex_dtype = NULL;
-                                return;
+                                Py_DECREF(retval);
+                                return NULL;
                             }
                             else {
                                 size = PyObject_Length(s);
@@ -301,9 +300,8 @@
                 case NPY_DATETIME:
                     meta = get_datetime_metadata_from_dtype(data_dtype);
                     if (meta == NULL) {
-                        Py_DECREF(*flex_dtype);
-                        *flex_dtype = NULL;
-                        return;
+                        Py_DECREF(retval);
+                        return NULL;
                     }
                     size = get_datetime_iso_8601_strlen(0, meta->base);
                     break;
@@ -313,10 +311,10 @@
             }
 
             if (flex_type_num == NPY_STRING) {
-                (*flex_dtype)->elsize = size;
+                retval->elsize = size;
             }
             else if (flex_type_num == NPY_UNICODE) {
-                (*flex_dtype)->elsize = size * 4;
+                retval->elsize = size * 4;
             }
         }
         else {
@@ -326,18 +324,17 @@
              */
             PyErr_SetString(PyExc_TypeError,
                     "don't know how to adapt flex dtype");
-            *flex_dtype = NULL;
-            return;
+            Py_DECREF(retval);
+            return NULL;
         }
     }
     /* Flexible type with generic time unit that adapts */
     else if (flex_type_num == NPY_DATETIME ||
                 flex_type_num == NPY_TIMEDELTA) {
-        meta = get_datetime_metadata_from_dtype(*flex_dtype);
+        meta = get_datetime_metadata_from_dtype(flex_dtype);
+        retval = flex_dtype;
         if (meta == NULL) {
-            Py_DECREF(*flex_dtype);
-            *flex_dtype = NULL;
-            return;
+            return NULL;
         }
 
         if (meta->base == NPY_FR_GENERIC) {
@@ -345,22 +342,24 @@
                     data_dtype->type_num == NPY_TIMEDELTA) {
                 meta = get_datetime_metadata_from_dtype(data_dtype);
                 if (meta == NULL) {
-                    Py_DECREF(*flex_dtype);
-                    *flex_dtype = NULL;
-                    return;
+                    return NULL;
                 }
 
-                Py_DECREF(*flex_dtype);
-                *flex_dtype = create_datetime_dtype(flex_type_num, meta);
+                retval = create_datetime_dtype(flex_type_num, meta);
+                Py_DECREF(flex_dtype);
             }
             else if (data_obj != NULL) {
                 /* Detect the unit from the input's data */
-                Py_DECREF(*flex_dtype);
-                *flex_dtype = find_object_datetime_type(data_obj,
+                retval = find_object_datetime_type(data_obj,
                                                     flex_type_num);
+                Py_DECREF(flex_dtype);
             }
         }
     }
+    else {
+        retval = flex_dtype;
+    }
+    return retval;
 }
 
 /*
@@ -518,7 +517,7 @@
          * stringified value of the object.
          */
         else if (to_type_num == NPY_STRING || to_type_num == NPY_UNICODE) {
-            /* 
+            /*
              * Boolean value cast to string type is 5 characters max
              * for string 'False'.
              */
@@ -531,7 +530,7 @@
             if (PyDataType_ISUNSIZED(to)) {
                 ret = 1;
             }
-            /* 
+            /*
              * Need at least 5 characters to convert from boolean
              * to 'True' or 'False'.
              */
@@ -1166,7 +1165,11 @@
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type1);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type2, &temp);
+
+                temp = PyArray_AdaptFlexibleDType(NULL, type2, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type1->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
@@ -1204,7 +1207,10 @@
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type1);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type2, &temp);
+                temp = PyArray_AdaptFlexibleDType(NULL, type2, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type1->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
@@ -1252,7 +1258,10 @@
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type2);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type1, &temp);
+                temp = PyArray_AdaptFlexibleDType(NULL, type1, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type2->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
@@ -1269,7 +1278,10 @@
                 PyArray_Descr *ret = NULL;
                 PyArray_Descr *temp = PyArray_DescrNew(type2);
                 PyDataType_MAKEUNSIZED(temp);
-                PyArray_AdaptFlexibleDType(NULL, type1, &temp);
+                temp = PyArray_AdaptFlexibleDType(NULL, type1, temp);
+                if (temp == NULL) {
+                    return NULL;
+                }
                 if (temp->elsize > type2->elsize) {
                     ret = ensure_dtype_nbo(temp);
                 }
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index bf77d69..6535571 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -21,13 +21,21 @@
 /*
  * This function calls Py_DECREF on flex_dtype, and replaces it with
  * a new dtype that has been adapted based on the values in data_dtype
- * and data_obj. If the flex_dtype is not flexible, it leaves it as is.
+ * and data_obj. If the flex_dtype is not flexible, it returns it as-is.
+ *
+ * Usually, if data_obj is not an array, dtype should be the result
+ * given by the PyArray_GetArrayParamsFromObject function.
+ *
+ * The data_obj may be NULL if just a dtype is known for the source.
+ *
+ * If flex_dtype is NULL, returns immediately, without setting an
+ * exception, leaving any previous error handling intact.
  *
  * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
  * and NPY_DATETIME with generic units.
  */
-NPY_NO_EXPORT void
+NPY_NO_EXPORT PyArray_Descr *
 PyArray_AdaptFlexibleDType(PyObject *data_obj, PyArray_Descr *data_dtype,
-                            PyArray_Descr **flex_dtype);
+                            PyArray_Descr *flex_dtype);
 
 #endif
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 23a8dce..b2e329d 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -1410,6 +1410,7 @@
          * dimensions, so the array is now 0d.
          */
         nd = 0;
+        Py_DECREF(descr);
         descr = (PyArray_Descr *)PyObject_CallFunctionObjArgs(
                 (PyObject *)&PyArrayDescr_Type, Py_TYPE(view->obj), NULL);
         if (descr == NULL) {
@@ -1811,9 +1812,12 @@
 
     /* If the requested dtype is flexible, adapt it */
     if (newtype != NULL) {
-        PyArray_AdaptFlexibleDType(op,
+        newtype = PyArray_AdaptFlexibleDType(op,
                     (dtype == NULL) ? PyArray_DESCR(arr) : dtype,
-                    &newtype);
+                    newtype);
+        if (newtype == NULL) {
+            return NULL;
+        }
     }
 
     /* If we got dimensions and dtype instead of an array */
@@ -2024,7 +2028,7 @@
         newtype = oldtype;
         Py_INCREF(oldtype);
     }
-    if (PyDataType_ISUNSIZED(newtype)) {
+    else if (PyDataType_ISUNSIZED(newtype)) {
         PyArray_DESCR_REPLACE(newtype);
         if (newtype == NULL) {
             return NULL;
@@ -2128,12 +2132,15 @@
              */
 
             /* 2017-Nov-10 1.14 */
-            if (DEPRECATE("NPY_ARRAY_UPDATEIFCOPY, NPY_ARRAY_INOUT_ARRAY, and "
-                "NPY_ARRAY_INOUT_FARRAY are deprecated, use NPY_WRITEBACKIFCOPY, "
-                "NPY_ARRAY_INOUT_ARRAY2, or NPY_ARRAY_INOUT_FARRAY2 respectively "
-                "instead, and call PyArray_ResolveWritebackIfCopy before the "
-                "array is deallocated, i.e. before the last call to Py_DECREF.") < 0)
+            if (DEPRECATE(
+                    "NPY_ARRAY_UPDATEIFCOPY, NPY_ARRAY_INOUT_ARRAY, and "
+                    "NPY_ARRAY_INOUT_FARRAY are deprecated, use NPY_WRITEBACKIFCOPY, "
+                    "NPY_ARRAY_INOUT_ARRAY2, or NPY_ARRAY_INOUT_FARRAY2 respectively "
+                    "instead, and call PyArray_ResolveWritebackIfCopy before the "
+                    "array is deallocated, i.e. before the last call to Py_DECREF.") < 0) {
+                Py_DECREF(ret);
                 return NULL;
+            }
             Py_INCREF(arr);
             if (PyArray_SetWritebackIfCopyBase(ret, arr) < 0) {
                 Py_DECREF(ret);
@@ -2160,14 +2167,12 @@
 
         Py_DECREF(newtype);
         if (needview) {
-            PyArray_Descr *dtype = PyArray_DESCR(arr);
             PyTypeObject *subtype = NULL;
 
             if (flags & NPY_ARRAY_ENSUREARRAY) {
                 subtype = &PyArray_Type;
             }
 
-            Py_INCREF(dtype);
             ret = (PyArrayObject *)PyArray_View(arr, NULL, subtype);
             if (ret == NULL) {
                 return NULL;
@@ -2495,6 +2500,11 @@
             &PyArray_Type, dtype,
             n, dims, NULL, data,
             dataflags, NULL, base);
+    /*
+     * Ref to dtype was stolen by PyArray_NewFromDescrAndBase
+     * Prevent DECREFing dtype in fail codepath by setting to NULL
+     */
+    dtype = NULL;
     if (ret == NULL) {
         goto fail;
     }
@@ -2827,7 +2837,8 @@
      * contiguous strides, etc.
      */
     if (PyArray_GetDTypeTransferFunction(
-                    IsUintAligned(src) && IsUintAligned(dst),
+                    IsUintAligned(src) && IsAligned(src) &&
+                    IsUintAligned(dst) && IsAligned(dst),
                     src_stride, dst_stride,
                     PyArray_DESCR(src), PyArray_DESCR(dst),
                     0,
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index a8550d9..54d19d9 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -3822,18 +3822,26 @@
                  * single object using [()], but not by using
                  * __getitem__(integer) approaches
                  */
-                PyObject *item, *meth, *args;
+                PyObject *item, *args;
 
-                meth = PyObject_GetAttrString(obj, "__getitem__");
-                args = Py_BuildValue("(())");
-                item = PyObject_CallObject(meth, args);
+                args = PyTuple_New(0);
+                if (args == NULL) {
+                    return 0;
+                }
+                item = PyObject_GetItem(obj, args);
+                Py_DECREF(args);
+                if (item == NULL) {
+                    return 0;
+                }
                 /*
                  * NOTE: may need other type checks here in the future
                  * for expanded 0 D datetime array conversions?
                  */
                 if (PyDelta_Check(item)) {
+                    Py_DECREF(item);
                     return delta_checker(meta);
                 }
+                Py_DECREF(item);
             }
         }
     }
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index b9be3c0..0471a2a 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -257,6 +257,9 @@
             return NULL;
         }
         PyArray_DESCR_REPLACE(type);
+        if (type == NULL) {
+            return NULL;
+        }
         if (type->type_num == NPY_UNICODE) {
             type->elsize = itemsize << 2;
         }
@@ -512,6 +515,7 @@
 #if defined(NPY_PY3K)
             Py_DECREF(name);
 #endif
+            Py_DECREF(conv);
             goto fail;
         }
         dtypeflags |= (conv->flags & NPY_FROM_FIELDS);
@@ -834,9 +838,11 @@
     else if (new->elsize != conv->elsize) {
         PyErr_SetString(PyExc_ValueError,
                 "mismatch in size of old and new data-descriptor");
+        Py_DECREF(new);
         goto fail;
     }
     else if (invalid_union_object_dtype(new, conv)) {
+        Py_DECREF(new);
         goto fail;
     }
 
@@ -1651,6 +1657,9 @@
 
     if (PyDataType_ISUNSIZED(*at) && (*at)->elsize != elsize) {
         PyArray_DESCR_REPLACE(*at);
+        if (*at == NULL) {
+            goto error;
+        }
         (*at)->elsize = elsize;
     }
     if (endian != '=' && PyArray_ISNBO(endian)) {
@@ -1659,6 +1668,9 @@
     if (endian != '=' && (*at)->byteorder != '|'
         && (*at)->byteorder != endian) {
         PyArray_DESCR_REPLACE(*at);
+        if (*at == NULL) {
+            goto error;
+        }
         (*at)->byteorder = endian;
     }
     return NPY_SUCCEED;
@@ -1719,6 +1731,7 @@
         newdescr->c_metadata = NPY_AUXDATA_CLONE(base->c_metadata);
         if (newdescr->c_metadata == NULL) {
             PyErr_NoMemory();
+            /* TODO: This seems wrong, as the old fields get decref'd? */
             Py_DECREF(newdescr);
             return NULL;
         }
@@ -3327,12 +3340,15 @@
 _subscript_by_index(PyArray_Descr *self, Py_ssize_t i)
 {
     PyObject *name = PySequence_GetItem(self->names, i);
+    PyObject *ret;
     if (name == NULL) {
         PyErr_Format(PyExc_IndexError,
                      "Field index %zd out of range.", i);
         return NULL;
     }
-    return _subscript_by_name(self, name);
+    ret = _subscript_by_name(self, name);
+    Py_DECREF(name);
+    return ret;
 }
 
 static PyObject *
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 2b29d4f..3ab07ad 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -26,6 +26,7 @@
 #include "_datetime.h"
 #include "datetime_strings.h"
 #include "descriptor.h"
+#include "array_assign.h"
 
 #include "shape.h"
 #include "lowlevel_strided_loops.h"
@@ -1126,7 +1127,7 @@
 
     /* Get an ASCII string data type, adapted to match the UNICODE one */
     str_dtype = PyArray_DescrFromType(NPY_STRING);
-    PyArray_AdaptFlexibleDType(NULL, dst_dtype, &str_dtype);
+    str_dtype = PyArray_AdaptFlexibleDType(NULL, dst_dtype, str_dtype);
     if (str_dtype == NULL) {
         return NPY_FAIL;
     }
@@ -1248,7 +1249,7 @@
 
     /* Get an ASCII string data type, adapted to match the UNICODE one */
     str_dtype = PyArray_DescrFromType(NPY_STRING);
-    PyArray_AdaptFlexibleDType(NULL, src_dtype, &str_dtype);
+    str_dtype = PyArray_AdaptFlexibleDType(NULL, src_dtype, str_dtype);
     if (str_dtype == NULL) {
         return NPY_FAIL;
     }
@@ -1571,12 +1572,30 @@
                                 src_dtype,
                                 &tobuffer, &todata);
 
+        if (!PyDataType_REFCHK(dst_dtype)) {
+            /* Copying from buffer is a simple copy/swap operation */
+            PyArray_GetDTypeCopySwapFn(aligned,
+                                    dst_itemsize, dst_stride,
+                                    dst_dtype,
+                                    &frombuffer, &fromdata);
+        }
+        else {
+            /*
+             * Since the buffer is initialized to NULL, need to move the
+             * references in order to DECREF the existing data.
+             */
+             /* Object types cannot be byte swapped */
+            assert(PyDataType_ISNOTSWAPPED(dst_dtype));
+            /* The loop already needs the python api if this is reached */
+            assert(*out_needs_api);
 
-        /* Get the copy/swap operation to dst */
-        PyArray_GetDTypeCopySwapFn(aligned,
-                                dst_itemsize, dst_stride,
-                                dst_dtype,
-                                &frombuffer, &fromdata);
+            if (PyArray_GetDTypeTransferFunction(
+                    aligned, dst_itemsize, dst_stride,
+                    dst_dtype, dst_dtype, 1,
+                    &frombuffer, &fromdata, out_needs_api) != NPY_SUCCEED) {
+                return NPY_FAIL;
+            }
+        }
 
         if (frombuffer == NULL || tobuffer == NULL) {
             NPY_AUXDATA_FREE(castdata);
@@ -2000,6 +2019,7 @@
     _subarray_broadcast_offsetrun offsetruns;
 } _subarray_broadcast_data;
 
+
 /* transfer data free function */
 static void _subarray_broadcast_data_free(NpyAuxData *data)
 {
@@ -3765,11 +3785,15 @@
         return NPY_SUCCEED;
     }
 
-    /* Check data alignment */
-    aligned = (((npy_intp)src | src_stride) &
-                                (src_dtype->alignment - 1)) == 0 &&
-              (((npy_intp)dst | dst_stride) &
-                                (dst_dtype->alignment - 1)) == 0;
+    /* Check data alignment, both uint and true */
+    aligned = raw_array_is_aligned(1, &count, dst, &dst_stride,
+                                   npy_uint_alignment(dst_dtype->elsize)) &&
+              raw_array_is_aligned(1, &count, dst, &dst_stride,
+                                   dst_dtype->alignment) &&
+              raw_array_is_aligned(1, &count, src, &src_stride,
+                                   npy_uint_alignment(src_dtype->elsize)) &&
+              raw_array_is_aligned(1, &count, src, &src_stride,
+                                   src_dtype->alignment);
 
     /* Get the function to do the casting */
     if (PyArray_GetDTypeTransferFunction(aligned,
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 1765982..eb2b338 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -1992,12 +1992,13 @@
 
 
 /*
- * When there's just one operand and no reduction, we
- * can return a view into op.  This calculates the view
- * if possible.
+ * When there's just one operand and no reduction we can return a view
+ * into 'op'.  This calculates the view and stores it in 'ret', if
+ * possible.  Returns -1 on error, 0 otherwise.  Note that a 0 return
+ * does not mean that a view was successfully created.
  */
 static int
-get_single_op_view(PyArrayObject *op, int  iop, char *labels,
+get_single_op_view(PyArrayObject *op, char *labels,
                    int ndim_output, char *output_labels,
                    PyArrayObject **ret)
 {
@@ -2052,13 +2053,11 @@
             }
             /* Update the dimensions and strides of the output */
             i = out_label - output_labels;
-            if (new_dims[i] != 0 &&
-                    new_dims[i] != PyArray_DIM(op, idim)) {
+            if (new_dims[i] != 0 && new_dims[i] != PyArray_DIM(op, idim)) {
                 PyErr_Format(PyExc_ValueError,
-                        "dimensions in operand %d for collapsing "
+                        "dimensions in single operand for collapsing "
                         "index '%c' don't match (%d != %d)",
-                        iop, label, (int)new_dims[i],
-                        (int)PyArray_DIM(op, idim));
+                        label, (int)new_dims[i], (int)PyArray_DIM(op, idim));
                 return -1;
             }
             new_dims[i] = PyArray_DIM(op, idim);
@@ -2086,80 +2085,107 @@
     return 0;
 }
 
+
+/*
+ * The char type may be either signed or unsigned, we need it to be
+ * signed here.
+ */
+static int
+_any_labels_are_negative(signed char *labels, int ndim)
+{
+    int idim;
+
+    for (idim = 0; idim < ndim; ++idim) {
+        if (labels[idim] < 0) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Given the labels for an operand array, returns a view of the array
+ * with all repeated labels collapsed into a single dimension along
+ * the corresponding diagonal. The labels are also updated to match
+ * the dimensions of the new array. If no label is repeated, the
+ * original array is reference increased and returned unchanged.
+ */
 static PyArrayObject *
 get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
 {
     npy_intp new_strides[NPY_MAXDIMS];
     npy_intp new_dims[NPY_MAXDIMS];
-    int idim, ndim, icombine, combineoffset;
+    int idim, icombine;
     int icombinemap[NPY_MAXDIMS];
-
+    int ndim = PyArray_NDIM(op);
     PyArrayObject *ret = NULL;
 
-    ndim = PyArray_NDIM(op);
+    /* A fast path to avoid unnecessary calculations. */
+    if (!_any_labels_are_negative((signed char *)labels, ndim)) {
+        Py_INCREF(op);
 
-    /* Initialize the dimensions and strides to zero */
-    for (idim = 0; idim < ndim; ++idim) {
-        new_dims[idim] = 0;
-        new_strides[idim] = 0;
+        return op;
     }
 
-    /* Copy the dimensions and strides, except when collapsing */
+    /* Combine repeated labels. */
     icombine = 0;
-    for (idim = 0; idim < ndim; ++idim) {
+    for(idim = 0; idim < ndim; ++idim) {
         /*
          * The char type may be either signed or unsigned, we
          * need it to be signed here.
          */
         int label = (signed char)labels[idim];
-        /* If this label says to merge axes, get the actual label */
-        if (label < 0) {
-            combineoffset = label;
-            label = labels[idim+label];
-        }
-        else {
-            combineoffset = 0;
-            if (icombine != idim) {
-                labels[icombine] = labels[idim];
-            }
+        npy_intp dim = PyArray_DIM(op, idim);
+        npy_intp stride = PyArray_STRIDE(op, idim);
+
+        /* A label seen for the first time, add it to the op view. */
+        if (label >= 0) {
+            /*
+             * icombinemap maps dimensions in the original array to
+             * their position in the combined dimensions view.
+             */
             icombinemap[idim] = icombine;
+            new_dims[icombine] = dim;
+            new_strides[icombine] = stride;
+            ++icombine;
         }
-        /* If the label is 0, it's an unlabeled broadcast dimension */
-        if (label == 0) {
-            new_dims[icombine] = PyArray_DIM(op, idim);
-            new_strides[icombine] = PyArray_STRIDE(op, idim);
-        }
+        /* A repeated label, find the original one and merge them. */
         else {
-            /* Update the combined axis dimensions and strides */
-            int i = icombinemap[idim + combineoffset];
-            if (combineoffset < 0 && new_dims[i] != 0 &&
-                        new_dims[i] != PyArray_DIM(op, idim)) {
+            int i = icombinemap[idim + label];
+
+            icombinemap[idim] = -1;
+            if (new_dims[i] != dim) {
                 PyErr_Format(PyExc_ValueError,
-                        "dimensions in operand %d for collapsing "
-                        "index '%c' don't match (%d != %d)",
-                        iop, label, (int)new_dims[i],
-                        (int)PyArray_DIM(op, idim));
+                             "dimensions in operand %d for collapsing "
+                             "index '%c' don't match (%d != %d)",
+                             iop, label, (int)new_dims[i], (int)dim);
                 return NULL;
             }
-            new_dims[i] = PyArray_DIM(op, idim);
-            new_strides[i] += PyArray_STRIDE(op, idim);
-        }
-
-        /* If the label didn't say to combine axes, increment dest i */
-        if (combineoffset == 0) {
-            icombine++;
+            new_strides[i] += stride;
         }
     }
 
-    /* The compressed number of dimensions */
+    /* Overwrite labels to match the new operand view. */
+    for (idim = 0; idim < ndim; ++idim) {
+        int i = icombinemap[idim];
+
+        if (i >= 0) {
+            labels[i] = labels[idim];
+        }
+    }
+
+    /* The number of dimensions of the combined view. */
     ndim = icombine;
 
+    /* Create a view of the operand with the compressed dimensions. */
     Py_INCREF(PyArray_DESCR(op));
     ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
             Py_TYPE(op), PyArray_DESCR(op),
             ndim, new_dims, new_strides, PyArray_DATA(op),
             PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
             (PyObject *)op, (PyObject *)op);
+
     return ret;
 }
 
@@ -2620,6 +2646,24 @@
         return NULL;
     }
 
+    /*
+     * If there's just one operand and no output parameter,
+     * first try remapping the axes to the output to return
+     * a view instead of a copy.
+     */
+    if (nop == 1 && out == NULL) {
+        ret = NULL;
+
+        if (get_single_op_view(op_in[0], op_labels[0], ndim_output,
+                               output_labels, &ret) < 0) {
+            return NULL;
+        }
+
+        if (ret != NULL) {
+            return ret;
+        }
+    }
+
     /* Set all the op references to NULL */
     for (iop = 0; iop < nop; ++iop) {
         op[iop] = NULL;
@@ -2631,53 +2675,10 @@
      */
     for (iop = 0; iop < nop; ++iop) {
         char *labels = op_labels[iop];
-        int combine, ndim;
 
-        ndim = PyArray_NDIM(op_in[iop]);
-
-        /*
-         * If there's just one operand and no output parameter,
-         * first try remapping the axes to the output to return
-         * a view instead of a copy.
-         */
-        if (iop == 0 && nop == 1 && out == NULL) {
-            ret = NULL;
-
-            if (get_single_op_view(op_in[iop], iop, labels,
-                                   ndim_output, output_labels,
-                                   &ret) < 0) {
-                return NULL;
-            }
-
-            if (ret != NULL) {
-                return ret;
-            }
-        }
-
-        /*
-         * Check whether any dimensions need to be combined
-         *
-         * The char type may be either signed or unsigned, we
-         * need it to be signed here.
-         */
-        combine = 0;
-        for (idim = 0; idim < ndim; ++idim) {
-            if ((signed char)labels[idim] < 0) {
-                combine = 1;
-            }
-        }
-
-        /* If any dimensions are combined, create a view which combines them */
-        if (combine) {
-            op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
-            if (op[iop] == NULL) {
-                goto fail;
-            }
-        }
-        /* No combining needed */
-        else {
-            Py_INCREF(op_in[iop]);
-            op[iop] = op_in[iop];
+        op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
+        if (op[iop] == NULL) {
+            goto fail;
         }
     }
 
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index 159bb41..16bacf1 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -82,7 +82,7 @@
 /**begin repeat
  * #elsize = 1, 2, 4, 8, 16#
  * #elsize_half = 0, 1, 2, 4, 8#
- * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint128#
+ * #type = npy_uint8, npy_uint16, npy_uint32, npy_uint64, npy_uint64#
  */
 /**begin repeat1
  * #oper = strided_to_strided, strided_to_contig,
@@ -119,10 +119,10 @@
                         npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
                         NpyAuxData *NPY_UNUSED(data))
 {
-#if @is_aligned@ && @elsize@ != 16
+#if @is_aligned@
     /* sanity check */
-    assert(npy_is_aligned(dst, _ALIGN(@type@)));
-    assert(npy_is_aligned(src, _ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
 #endif
     /*printf("fn @prefix@_@oper@_size@elsize@\n");*/
     while (N > 0) {
@@ -201,8 +201,8 @@
     }
 #if @is_aligned@ && @elsize@ != 16
     /* sanity check */
-    assert(npy_is_aligned(dst, _ALIGN(@type@)));
-    assert(npy_is_aligned(src, _ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(dst, _UINT_ALIGN(@type@)));
+    assert(N == 0 || npy_is_aligned(src, _UINT_ALIGN(@type@)));
 #endif
 #if @elsize@ == 1 && @dst_contig@
     memset(dst, *src, N);
@@ -808,12 +808,8 @@
 
 #if @aligned@
    /* sanity check */
-#  if !@is_complex1@
-    assert(npy_is_aligned(src, _ALIGN(_TYPE1)));
-#  endif
-#  if !@is_complex2@
-    assert(npy_is_aligned(dst, _ALIGN(_TYPE2)));
-#  endif
+    assert(N == 0 || npy_is_aligned(src, _ALIGN(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, _ALIGN(_TYPE2)));
 #endif
 
     /*printf("@prefix@_cast_@name1@_to_@name2@\n");*/
@@ -1425,7 +1421,7 @@
         while (itersize--) {
             char * self_ptr;
             npy_intp indval = *((npy_intp*)ind_ptr);
-            assert(npy_is_aligned(ind_ptr, _ALIGN(npy_intp)));
+            assert(npy_is_aligned(ind_ptr, _UINT_ALIGN(npy_intp)));
 #if @isget@
             if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
                 return -1;
@@ -1439,8 +1435,8 @@
 
 #if @isget@
 #if @elsize@
-            assert(npy_is_aligned(result_ptr, _ALIGN(@copytype@)));
-            assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+            assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+            assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
             *(@copytype@ *)result_ptr = *(@copytype@ *)self_ptr;
 #else
             copyswap(result_ptr, self_ptr, 0, self);
@@ -1448,8 +1444,8 @@
 
 #else /* !@isget@ */
 #if @elsize@
-            assert(npy_is_aligned(result_ptr, _ALIGN(@copytype@)));
-            assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+            assert(npy_is_aligned(result_ptr, _UINT_ALIGN(@copytype@)));
+            assert(npy_is_aligned(self_ptr, _UINT_ALIGN(@copytype@)));
             *(@copytype@ *)self_ptr = *(@copytype@ *)result_ptr;
 #else
             copyswap(self_ptr, result_ptr, 0, self);
@@ -1571,7 +1567,7 @@
                         for (i=0; i < @numiter@; i++) {
                             npy_intp indval = *((npy_intp*)outer_ptrs[i]);
                             assert(npy_is_aligned(outer_ptrs[i],
-                                                  _ALIGN(npy_intp)));
+                                                  _UINT_ALIGN(npy_intp)));
 
 #if @isget@ && @one_iter@
                             if (check_and_adjust_index(&indval, fancy_dims[i],
@@ -1591,16 +1587,20 @@
 
 #if @isget@
 #if @elsize@
-                        assert(npy_is_aligned(outer_ptrs[i], _ALIGN(@copytype@)));
-                        assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              _UINT_ALIGN(@copytype@)));
+                        assert(npy_is_aligned(self_ptr,
+                                              _UINT_ALIGN(@copytype@)));
                         *(@copytype@ *)(outer_ptrs[i]) = *(@copytype@ *)self_ptr;
 #else
                         copyswap(outer_ptrs[i], self_ptr, 0, array);
 #endif
 #else /* !@isget@ */
 #if @elsize@
-                        assert(npy_is_aligned(outer_ptrs[i], _ALIGN(@copytype@)));
-                        assert(npy_is_aligned(self_ptr, _ALIGN(@copytype@)));
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               _UINT_ALIGN(@copytype@)));
+                        assert(npy_is_aligned(self_ptr,
+                               _UINT_ALIGN(@copytype@)));
                         *(@copytype@ *)self_ptr = *(@copytype@ *)(outer_ptrs[i]);
 #else
                         copyswap(self_ptr, outer_ptrs[i], 0, array);
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 1b05fae..17edd2b 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1064,7 +1064,8 @@
 
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
-        if (PyArray_GetDTypeTransferFunction(IsUintAligned(self),
+        if (PyArray_GetDTypeTransferFunction(
+                        IsUintAligned(self) && IsAligned(self),
                         fixed_strides[0], itemsize,
                         dtype, dtype,
                         0,
@@ -1253,7 +1254,8 @@
         /* Get a dtype transfer function */
         NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
         if (PyArray_GetDTypeTransferFunction(
-                        IsUintAligned(self) && IsUintAligned(v),
+                        IsUintAligned(self) && IsAligned(self) &&
+                        IsUintAligned(v) && IsAligned(v),
                         v_stride, fixed_strides[0],
                         PyArray_DESCR(v), PyArray_DESCR(self),
                         0,
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 231bd86..6005b97 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -8,6 +8,7 @@
 #include "numpy/arrayobject.h"
 #include "numpy/arrayscalars.h"
 
+#include "arrayfunction_override.h"
 #include "npy_config.h"
 #include "npy_pycompat.h"
 #include "npy_import.h"
@@ -187,7 +188,7 @@
     }
 
     if (n <= 1) {
-        if (PyTuple_GET_ITEM(args, 0) == Py_None) {
+        if (n != 0 && PyTuple_GET_ITEM(args, 0) == Py_None) {
             return PyArray_View(self, NULL, NULL);
         }
         if (!PyArg_ParseTuple(args, "O&:reshape", PyArray_IntpConverter,
@@ -823,8 +824,8 @@
         PyArrayObject *ret;
 
         /* If the requested dtype is flexible, adapt it */
-        PyArray_AdaptFlexibleDType((PyObject *)self, PyArray_DESCR(self),
-                                                                    &dtype);
+        dtype = PyArray_AdaptFlexibleDType((PyObject *)self,
+                                           PyArray_DESCR(self), dtype);
         if (dtype == NULL) {
             return NULL;
         }
@@ -1003,6 +1004,7 @@
     int i;
     int nin, nout;
     PyObject *out_kwd_obj;
+    PyObject *fast;
     PyObject **in_objs, **out_objs;
 
     /* check inputs */
@@ -1010,12 +1012,18 @@
     if (nin < 0) {
         return -1;
     }
-    in_objs = PySequence_Fast_ITEMS(args);
+    fast = PySequence_Fast(args, "Could not convert object to sequence");
+    if (fast == NULL) {
+        return -1;
+    }
+    in_objs = PySequence_Fast_ITEMS(fast);
     for (i = 0; i < nin; ++i) {
         if (PyUFunc_HasOverride(in_objs[i])) {
+            Py_DECREF(fast);
             return 1;
         }
     }
+    Py_DECREF(fast);
     /* check outputs, if any */
     nout = PyUFuncOverride_GetOutObjects(kwds, &out_kwd_obj, &out_objs);
     if (nout < 0) {
@@ -1023,9 +1031,11 @@
     }
     for (i = 0; i < nout; i++) {
         if (PyUFunc_HasOverride(out_objs[i])) {
+            Py_DECREF(out_kwd_obj);
             return 1;
         }
     }
+    Py_DECREF(out_kwd_obj);
     return 0;
 }
 
@@ -1079,13 +1089,29 @@
     return result;
 }
 
-
 static PyObject *
-array_function(PyArrayObject *self, PyObject *args, PyObject *kwds)
+array_function(PyArrayObject *self, PyObject *c_args, PyObject *c_kwds)
 {
-    NPY_FORWARD_NDARRAY_METHOD("_array_function");
-}
+    PyObject *func, *types, *args, *kwargs, *result;
+    static char *kwlist[] = {"func", "types", "args", "kwargs", NULL};
 
+    if (!PyArg_ParseTupleAndKeywords(
+            c_args, c_kwds, "OOOO:__array_function__", kwlist,
+            &func, &types, &args, &kwargs)) {
+        return NULL;
+    }
+
+    types = PySequence_Fast(
+        types,
+        "types argument to ndarray.__array_function__ must be iterable");
+    if (types == NULL) {
+        return NULL;
+    }
+
+    result = array_function_method_impl(func, types, args, kwargs);
+    Py_DECREF(types);
+    return result;
+}
 
 static PyObject *
 array_copy(PyArrayObject *self, PyObject *args, PyObject *kwds)
@@ -1355,6 +1381,7 @@
             return NULL;
         }
         newd = PyArray_DescrNew(saved);
+        Py_DECREF(newd->names);
         newd->names = new_name;
         ((PyArrayObject_fields *)self)->descr = newd;
     }
@@ -1409,6 +1436,7 @@
             return NULL;
         }
         newd = PyArray_DescrNew(saved);
+        Py_DECREF(newd->names);
         newd->names = new_name;
         ((PyArrayObject_fields *)self)->descr = newd;
     }
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index ce8af43..ce6a387 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -34,6 +34,7 @@
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
 /* Internal APIs */
+#include "arrayfunction_override.h"
 #include "arraytypes.h"
 #include "arrayobject.h"
 #include "hashdescr.h"
@@ -408,9 +409,12 @@
         npy_intp *arr_shape;
 
         if (PyArray_NDIM(arrays[iarrays]) != ndim) {
-            PyErr_SetString(PyExc_ValueError,
-                            "all the input arrays must have same "
-                            "number of dimensions");
+            PyErr_Format(PyExc_ValueError,
+                         "all the input arrays must have same number of "
+                         "dimensions, but the array at index %d has %d "
+                         "dimension(s) and the array at index %d has %d "
+                         "dimension(s)",
+                         0, ndim, iarrays, PyArray_NDIM(arrays[iarrays]));
             return NULL;
         }
         arr_shape = PyArray_SHAPE(arrays[iarrays]);
@@ -422,10 +426,12 @@
             }
             /* Validate that the rest of the dimensions match */
             else if (shape[idim] != arr_shape[idim]) {
-                PyErr_SetString(PyExc_ValueError,
-                                "all the input array dimensions "
-                                "except for the concatenation axis "
-                                "must match exactly");
+                PyErr_Format(PyExc_ValueError,
+                             "all the input array dimensions for the "
+                             "concatenation axis must match exactly, but "
+                             "along dimension %d, the array at index %d has "
+                             "size %d and the array at index %d has size %d",
+                             idim, 0, shape[idim], iarrays, arr_shape[idim]);
                 return NULL;
             }
         }
@@ -982,7 +988,7 @@
     for (i = 0; i < PyArray_NDIM(ap2) - 2; i++) {
         dimensions[j++] = PyArray_DIMS(ap2)[i];
     }
-    if(PyArray_NDIM(ap2) > 1) {
+    if (PyArray_NDIM(ap2) > 1) {
         dimensions[j++] = PyArray_DIMS(ap2)[PyArray_NDIM(ap2)-1];
     }
 
@@ -1318,7 +1324,7 @@
      */
     if (inverted) {
         st = _pyarray_revert(ret);
-        if(st) {
+        if (st) {
             goto clean_ret;
         }
     }
@@ -1365,7 +1371,7 @@
     }
 
     ret = _pyarray_correlate(ap1, ap2, typenum, mode, &unused);
-    if(ret == NULL) {
+    if (ret == NULL) {
         goto fail;
     }
     Py_DECREF(ap1);
@@ -1654,7 +1660,7 @@
     }
 
 full_path:
-    if(!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i:array", kwd,
+    if (!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i:array", kwd,
                 &op,
                 PyArray_DescrConverter2, &type,
                 PyArray_BoolConverter, &copy,
@@ -2489,7 +2495,7 @@
                         "operand and a subscripts list to einsum");
         return -1;
     }
-    else if(nop >= NPY_MAXARGS) {
+    else if (nop >= NPY_MAXARGS) {
         PyErr_SetString(PyExc_ValueError, "too many operands");
         return -1;
     }
@@ -2724,7 +2730,7 @@
     static char *kwd[]= {"start", "stop", "step", "dtype", NULL};
     PyArray_Descr *typecode = NULL;
 
-    if(!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&:arange", kwd,
+    if (!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&:arange", kwd,
                 &o_start,
                 &o_stop,
                 &o_step,
@@ -2762,7 +2768,7 @@
 {
     static char *kwlist[] = {NULL};
 
-    if(!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) {
         return NULL;
     }
     return PyInt_FromLong( (long) PyArray_GetNDArrayCVersion() );
@@ -2835,7 +2841,7 @@
     int repr = 1;
     static char *kwlist[] = {"f", "repr", NULL};
 
-    if(!PyArg_ParseTupleAndKeywords(args, kwds, "|Oi:set_string_function", kwlist, &op, &repr)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oi:set_string_function", kwlist, &op, &repr)) {
         return NULL;
     }
     /* reset the array_repr function to built-in */
@@ -3145,7 +3151,7 @@
     PyArray_Descr *d1 = NULL;
     PyArray_Descr *d2 = NULL;
     PyObject *ret = NULL;
-    if(!PyArg_ParseTuple(args, "O&O&:promote_types",
+    if (!PyArg_ParseTuple(args, "O&O&:promote_types",
                 PyArray_DescrConverter2, &d1, PyArray_DescrConverter2, &d2)) {
         goto finish;
     }
@@ -3171,7 +3177,7 @@
     PyArrayObject *array;
     PyObject *ret = NULL;
 
-    if(!PyArg_ParseTuple(args, "O:min_scalar_type", &array_in)) {
+    if (!PyArg_ParseTuple(args, "O:min_scalar_type", &array_in)) {
         return NULL;
     }
 
@@ -3248,12 +3254,13 @@
     PyArray_Descr *dtype;
     PyArray_DatetimeMetaData *meta;
 
-    if(!PyArg_ParseTuple(args, "O&:datetime_data",
+    if (!PyArg_ParseTuple(args, "O&:datetime_data",
                 PyArray_DescrConverter, &dtype)) {
         return NULL;
     }
 
     meta = get_datetime_metadata_from_dtype(dtype);
+    Py_DECREF(dtype);
     if (meta == NULL) {
         return NULL;
     }
@@ -3267,7 +3274,7 @@
 {
     int size;
 
-    if(!PyArg_ParseTuple(args, "i:buffer", &size)) {
+    if (!PyArg_ParseTuple(args, "i:buffer", &size)) {
         return NULL;
     }
     return PyBuffer_New(size);
@@ -3618,6 +3625,7 @@
     if (nargs == -1 || nargs > NPY_MAXARGS) {
         PyErr_Format(PyExc_ValueError,
                 "len(args) must be < %d", NPY_MAXARGS - 1);
+        Py_DECREF(type);
         goto err;
     }
 
@@ -3625,6 +3633,7 @@
     for (i = 1; i < nargs; i++) {
         PyObject* item = PySequence_GetItem(args, i-1);
         if (item == NULL) {
+            Py_DECREF(type);
             goto err;
         }
         broadcast_args[i] = item;
@@ -3633,6 +3642,7 @@
     in_iter = (PyArrayMultiIterObject*)PyArray_MultiIterFromObjects
         (broadcast_args, nargs, 0);
     if (in_iter == NULL) {
+        Py_DECREF(type);
         goto err;
     }
     n = in_iter->numiter;
@@ -3713,6 +3723,7 @@
 
     in_iter = (PyArrayIterObject*)PyArray_IterNew((PyObject*)char_array);
     if (in_iter == NULL) {
+        Py_DECREF(type);
         goto err;
     }
 
@@ -3769,7 +3780,7 @@
 _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
 {
     PyArrayObject* char_array = NULL;
-    PyArray_Descr *type = NULL;
+    PyArray_Descr *type;
     PyObject* method_name;
     PyObject* args_seq = NULL;
 
@@ -3806,6 +3817,7 @@
         result = _vec_string_with_args(char_array, type, method, args_seq);
     }
     else {
+        Py_DECREF(type);
         PyErr_SetString(PyExc_TypeError,
                 "'args' must be a sequence of arguments");
         goto err;
@@ -4062,6 +4074,9 @@
 }
 
 static struct PyMethodDef array_module_methods[] = {
+    {"_get_implementing_args",
+        (PyCFunction)array__get_implementing_args,
+        METH_VARARGS, NULL},
     {"_get_ndarray_c_version",
         (PyCFunction)array__get_ndarray_c_version,
         METH_VARARGS|METH_KEYWORDS, NULL},
@@ -4224,6 +4239,9 @@
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"_monotonicity", (PyCFunction)arr__monotonicity,
         METH_VARARGS | METH_KEYWORDS, NULL},
+    {"implement_array_function",
+        (PyCFunction)array_implement_array_function,
+        METH_VARARGS, NULL},
     {"interp", (PyCFunction)arr_interp,
         METH_VARARGS | METH_KEYWORDS, NULL},
     {"interp_complex", (PyCFunction)arr_interp_complex,
@@ -4476,6 +4494,7 @@
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_array_finalize = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_buffer = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_ufunc = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_wrapped = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_order = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_copy = NULL;
 NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_dtype = NULL;
@@ -4492,6 +4511,7 @@
     npy_ma_str_array_finalize = PyUString_InternFromString("__array_finalize__");
     npy_ma_str_buffer = PyUString_InternFromString("__buffer__");
     npy_ma_str_ufunc = PyUString_InternFromString("__array_ufunc__");
+    npy_ma_str_wrapped = PyUString_InternFromString("__wrapped__");
     npy_ma_str_order = PyUString_InternFromString("order");
     npy_ma_str_copy = PyUString_InternFromString("copy");
     npy_ma_str_dtype = PyUString_InternFromString("dtype");
@@ -4501,7 +4521,7 @@
 
     return npy_ma_str_array && npy_ma_str_array_prepare &&
            npy_ma_str_array_wrap && npy_ma_str_array_finalize &&
-           npy_ma_str_buffer && npy_ma_str_ufunc &&
+           npy_ma_str_buffer && npy_ma_str_ufunc && npy_ma_str_wrapped &&
            npy_ma_str_order && npy_ma_str_copy && npy_ma_str_dtype &&
            npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
 }
@@ -4570,6 +4590,10 @@
      */
     PyArray_Type.tp_hash = PyObject_HashNotImplemented;
 
+    if (PyType_Ready(&PyUFunc_Type) < 0) {
+        goto err;
+    }
+
     /* Load the ufunc operators into the array module's namespace */
     if (InitOperators(d) < 0) {
         goto err;
@@ -4580,8 +4604,9 @@
     }
     initialize_casting_tables();
     initialize_numeric_types();
-    if(initscalarmath(m) < 0)
+    if (initscalarmath(m) < 0) {
         goto err;
+    }
 
     if (PyType_Ready(&PyArray_Type) < 0) {
         goto err;
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index 3de68c5..60a3965 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -7,6 +7,7 @@
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_finalize;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_buffer;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_ufunc;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_wrapped;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_order;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_copy;
 NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_dtype;
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index dbb24f2..dc58b3a 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -1101,8 +1101,8 @@
             /* We just have a borrowed reference to op_request_dtype */
             Py_INCREF(op_request_dtype);
             /* If the requested dtype is flexible, adapt it */
-            PyArray_AdaptFlexibleDType((PyObject *)(*op), PyArray_DESCR(*op),
-                                        &op_request_dtype);
+            op_request_dtype = PyArray_AdaptFlexibleDType((PyObject *)(*op), PyArray_DESCR(*op),
+                                                          op_request_dtype);
             if (op_request_dtype == NULL) {
                 return 0;
             }
@@ -1132,7 +1132,7 @@
         /* Check if the operand is aligned */
         if (op_flags & NPY_ITER_ALIGNED) {
             /* Check alignment */
-            if (!IsUintAligned(*op)) {
+            if (!IsAligned(*op)) {
                 NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST "
                                     "because of NPY_ITER_ALIGNED\n");
                 *op_itflags |= NPY_OP_ITFLAG_CAST;
@@ -1248,9 +1248,9 @@
     return 1;
 
   fail_nop:
-    iop = nop;
+    iop = nop - 1;
   fail_iop:
-    for (i = 0; i < iop; ++i) {
+    for (i = 0; i < iop+1; ++i) {
         Py_XDECREF(op[i]);
         Py_XDECREF(op_dtype[i]);
     }
@@ -2851,8 +2851,14 @@
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
                     PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
 
-            /* New arrays are aligned and need no cast */
-            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            /*
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(out)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /* New arrays need no cast */
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
         }
         /*
@@ -2888,11 +2894,17 @@
                     PyArray_DATA(op[iop]), NULL);
 
             /*
-             * New arrays are aligned need no cast, and in the case
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(temp)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /*
+             * New arrays need no cast, and in the case
              * of scalars, always have stride 0 so never need buffering
              */
-            op_itflags[iop] |= (NPY_OP_ITFLAG_ALIGNED |
-                                  NPY_OP_ITFLAG_BUFNEVER);
+            op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER;
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
             if (itflags & NPY_ITFLAG_BUFFER) {
                 NBF_STRIDES(bufferdata)[iop] = 0;
@@ -2953,8 +2965,14 @@
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
                     PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
 
-            /* The temporary copy is aligned and needs no cast */
-            op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            /*
+             * New arrays are guaranteed true-aligned, but copy/cast code
+             * needs uint-alignment in addition.
+             */
+            if (IsUintAligned(temp)) {
+                op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED;
+            }
+            /* The temporary copy needs no cast */
             op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST;
         }
         else {
@@ -3157,6 +3175,7 @@
                                         &stransfer,
                                         &transferdata,
                                         &needs_api) != NPY_SUCCEED) {
+                    iop -= 1;  /* This one cannot be cleaned up yet. */
                     goto fail;
                 }
                 readtransferfn[iop] = stransfer;
@@ -3250,7 +3269,7 @@
     return 1;
 
 fail:
-    for (i = 0; i < iop; ++i) {
+    for (i = 0; i < iop+1; ++i) {
         if (readtransferdata[iop] != NULL) {
             NPY_AUXDATA_FREE(readtransferdata[iop]);
             readtransferdata[iop] = NULL;
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 5a9f3c5..30a81e0 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -2355,6 +2355,8 @@
     }
     ret = NpyIter_Deallocate(iter);
     self->iter = NULL;
+    Py_XDECREF(self->nested_child);
+    self->nested_child = NULL;
     if (ret < 0) {
         return NULL;
     }
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index d153a8a..420501c 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -599,15 +599,16 @@
             PyErr_Restore(exc, val, tb);
             return NULL;
         }
+        Py_XDECREF(exc);
+        Py_XDECREF(val);
+        Py_XDECREF(tb);
+
         /* 2018-06-28, 1.16.0 */
         if (DEPRECATE("Applying '+' to a non-numerical array is "
                       "ill-defined. Returning a copy, but in the future "
                       "this will error.") < 0) {
             return NULL;
         }
-        Py_XDECREF(exc);
-        Py_XDECREF(val);
-        Py_XDECREF(tb);
         value = PyArray_Return((PyArrayObject *)PyArray_Copy(m1));
     }
     return value;
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index 4b018b0..b8230c8 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -19,8 +19,12 @@
 static void
 _fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype);
 
-/* Incref all objects found at this record */
+
 /*NUMPY_API
+ * XINCREF all objects in a single array item. This is complicated for
+ * structured datatypes where the position of objects needs to be extracted.
+ * The function is executed recursively for each nested field or subarray dtype
+ * such as `np.dtype([("field1", "O"), ("field2", "f,O", (3,2))])`
  */
 NPY_NO_EXPORT void
 PyArray_Item_INCREF(char *data, PyArray_Descr *descr)
@@ -51,11 +55,37 @@
             PyArray_Item_INCREF(data + offset, new);
         }
     }
+    else if (PyDataType_HASSUBARRAY(descr)) {
+        int size, i, inner_elsize;
+
+        inner_elsize = descr->subarray->base->elsize;
+        if (inner_elsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        /* Subarrays are always contiguous in memory */
+        size = descr->elsize / inner_elsize;
+
+        for (i = 0; i < size; i++){
+            /* Recursively increment the reference count of subarray elements */
+            PyArray_Item_INCREF(data + i * inner_elsize,
+                                descr->subarray->base);
+        }
+    }
+    else {
+        /* This path should not be reachable. */
+        assert(0);
+    }
     return;
 }
 
-/* XDECREF all objects found at this record */
+
 /*NUMPY_API
+ *
+ * XDECREF all objects in a single array item. This is complicated for
+ * structured datatypes where the position of objects needs to be extracted.
+ * The function is executed recursively for each nested field or subarray dtype
+ * such as `np.dtype([("field1", "O"), ("field2", "f,O", (3,2))])`
  */
 NPY_NO_EXPORT void
 PyArray_Item_XDECREF(char *data, PyArray_Descr *descr)
@@ -87,6 +117,27 @@
                 PyArray_Item_XDECREF(data + offset, new);
             }
         }
+    else if (PyDataType_HASSUBARRAY(descr)) {
+        int size, i, inner_elsize;
+
+        inner_elsize = descr->subarray->base->elsize;
+        if (inner_elsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
+        }
+        /* Subarrays are always contiguous in memory */
+        size = descr->elsize / inner_elsize;
+
+        for (i = 0; i < size; i++){
+            /* Recursively decrement the reference count of subarray elements */
+            PyArray_Item_XDECREF(data + i * inner_elsize,
+                                 descr->subarray->base);
+        }
+    }
+    else {
+        /* This path should not be reachable. */
+        assert(0);
+    }
     return;
 }
 
@@ -258,6 +309,10 @@
             Py_XDECREF(arr);
         }
     }
+    if (dtype->type_num == NPY_OBJECT) {
+        Py_XINCREF(obj);
+        NPY_COPY_PYOBJECT_PTR(optr, &obj);
+    }
     else if (PyDataType_HASFIELDS(dtype)) {
         PyObject *key, *value, *title = NULL;
         PyArray_Descr *new;
@@ -274,15 +329,26 @@
             _fillobject(optr + offset, obj, new);
         }
     }
-    else {
-        npy_intp i;
-        npy_intp nsize = dtype->elsize / sizeof(obj);
+    else if (PyDataType_HASSUBARRAY(dtype)) {
+        int size, i, inner_elsize;
 
-        for (i = 0; i < nsize; i++) {
-            Py_XINCREF(obj);
-            NPY_COPY_PYOBJECT_PTR(optr, &obj);
-            optr += sizeof(obj);
+        inner_elsize = dtype->subarray->base->elsize;
+        if (inner_elsize == 0) {
+            /* There cannot be any elements, so return */
+            return;
         }
-        return;
+        /* Subarrays are always contiguous in memory */
+        size = dtype->elsize / inner_elsize;
+
+        /* Call _fillobject on each item recursively. */
+        for (i = 0; i < size; i++){
+            _fillobject(optr, obj, dtype->subarray->base);
+            optr += inner_elsize;
+        }
     }
+    else {
+        /* This path should not be reachable. */
+        assert(0);
+    }
+    return;
 }
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 2f71c8a..52de312 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -2599,6 +2599,8 @@
 static void
 void_dealloc(PyVoidScalarObject *v)
 {
+    _dealloc_cached_buffer_info((PyObject *)v);
+
     if (v->flags & NPY_ARRAY_OWNDATA) {
         npy_free_cache(v->obval, Py_SIZE(v));
     }
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index 8e80900..2e8fb51 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -40,19 +40,27 @@
 
 NPY_NO_EXPORT PyArray_Descr **userdescrs=NULL;
 
-static int *
-_append_new(int *types, int insert)
+static int
+_append_new(int **p_types, int insert)
 {
     int n = 0;
     int *newtypes;
+    int *types = *p_types;
 
     while (types[n] != NPY_NOTYPE) {
         n++;
     }
     newtypes = (int *)realloc(types, (n + 2)*sizeof(int));
+    if (newtypes == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
     newtypes[n] = insert;
     newtypes[n + 1] = NPY_NOTYPE;
-    return newtypes;
+
+    /* Replace the passed-in pointer */
+    *p_types = newtypes;
+    return 0;
 }
 
 static npy_bool
@@ -247,10 +255,13 @@
          */
         if (descr->f->cancastto == NULL) {
             descr->f->cancastto = (int *)malloc(1*sizeof(int));
+            if (descr->f->cancastto == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             descr->f->cancastto[0] = NPY_NOTYPE;
         }
-        descr->f->cancastto = _append_new(descr->f->cancastto,
-                                          totype);
+        return _append_new(&descr->f->cancastto, totype);
     }
     else {
         /* register with cancastscalarkindto */
@@ -258,6 +269,10 @@
             int i;
             descr->f->cancastscalarkindto =
                 (int **)malloc(NPY_NSCALARKINDS* sizeof(int*));
+            if (descr->f->cancastscalarkindto == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             for (i = 0; i < NPY_NSCALARKINDS; i++) {
                 descr->f->cancastscalarkindto[i] = NULL;
             }
@@ -265,11 +280,13 @@
         if (descr->f->cancastscalarkindto[scalar] == NULL) {
             descr->f->cancastscalarkindto[scalar] =
                 (int *)malloc(1*sizeof(int));
+            if (descr->f->cancastscalarkindto[scalar] == NULL) {
+                PyErr_NoMemory();
+                return -1;
+            }
             descr->f->cancastscalarkindto[scalar][0] =
                 NPY_NOTYPE;
         }
-        descr->f->cancastscalarkindto[scalar] =
-            _append_new(descr->f->cancastscalarkindto[scalar], totype);
+        return _append_new(&descr->f->cancastscalarkindto[scalar], totype);
     }
-    return 0;
 }
diff --git a/numpy/core/src/umath/_struct_ufunc_tests.c.src b/numpy/core/src/umath/_struct_ufunc_tests.c.src
index b831d5c..5c6e235 100644
--- a/numpy/core/src/umath/_struct_ufunc_tests.c.src
+++ b/numpy/core/src/umath/_struct_ufunc_tests.c.src
@@ -114,6 +114,7 @@
                                 dtypes,
                                 NULL);
 
+    Py_DECREF(dtype);
     d = PyModule_GetDict(m);
 
     PyDict_SetItemString(d, "add_triplet", add_triplet);
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 8cb74f1..6c3bcce 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -564,7 +564,7 @@
         core_dim_sizes = Py_None;
     }
     Py_DECREF(f);
-    return Py_BuildValue("iOOOO", core_enabled, core_num_dims,
+    return Py_BuildValue("iNNNN", core_enabled, core_num_dims,
                          core_dim_ixs, core_dim_flags, core_dim_sizes);
 
 fail:
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index f96e621..6accf30 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -566,16 +566,36 @@
 PyUFunc_O_O_method(char **args, npy_intp *dimensions, npy_intp *steps, void *func)
 {
     char *meth = (char *)func;
+    PyObject *tup = PyTuple_New(0);
+    if (tup == NULL) {
+        return;
+    }
     UNARY_LOOP {
         PyObject *in1 = *(PyObject **)ip1;
         PyObject **out = (PyObject **)op1;
-        PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None, meth, NULL);
+        PyObject *ret, *func;
+        func = PyObject_GetAttrString(in1 ? in1 : Py_None, meth);
+        if (func == NULL || !PyCallable_Check(func)) {
+            PyObject *exc, *val, *tb;
+            PyTypeObject *type = in1 ? Py_TYPE(in1) : Py_TYPE(Py_None);
+            PyErr_Fetch(&exc, &val, &tb);
+            PyErr_Format(PyExc_TypeError,
+                         "loop of ufunc does not support argument %d of "
+                         "type %s which has no callable %s method",
+                         i, type->tp_name, meth);
+            npy_PyErr_ChainExceptionsCause(exc, val, tb);
+            Py_DECREF(tup);
+            return;
+        }
+        ret = PyObject_Call(func, tup, NULL);
         if (ret == NULL) {
+            Py_DECREF(tup);
             return;
         }
         Py_XDECREF(*out);
         *out = ret;
     }
+    Py_DECREF(tup);
 }
 
 /*UFUNC_API*/
@@ -1619,6 +1639,31 @@
     }
 }
 
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta in1 = *(npy_timedelta *)ip1;
+        const npy_timedelta in2 = *(npy_timedelta *)ip2;
+        if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+            npy_set_floatstatus_invalid();
+            *((npy_timedelta *)op1) = 0;
+        }
+        else if (in2 == 0) {
+            npy_set_floatstatus_divbyzero();
+            *((npy_timedelta *)op1) = 0;
+        }
+        else {
+            if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
+                *((npy_timedelta *)op1) = in1/in2 - 1;
+            }
+            else {
+                *((npy_timedelta *)op1) = in1/in2;
+            }
+        }
+    }
+}
+
 /*
  *****************************************************************************
  **                             FLOAT LOOPS                                 **
@@ -1861,7 +1906,8 @@
         if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
             BINARY_REDUCE_LOOP(@type@) {
                 const @type@ in2 = *(@type@ *)ip2;
-                io1 = (npy_isnan(io1) || io1 @OP@ in2) ? io1 : in2;
+                /* Order of operations important for MSVC 2015 */
+                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
             }
             *((@type@ *)iop1) = io1;
         }
@@ -1870,7 +1916,8 @@
         BINARY_LOOP {
             @type@ in1 = *(@type@ *)ip1;
             const @type@ in2 = *(@type@ *)ip2;
-            in1 = (npy_isnan(in1) || in1 @OP@ in2) ? in1 : in2;
+            /* Order of operations important for MSVC 2015 */
+            in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
             *((@type@ *)op1) = in1;
         }
     }
@@ -1889,7 +1936,8 @@
     if (IS_BINARY_REDUCE) {
         BINARY_REDUCE_LOOP(@type@) {
             const @type@ in2 = *(@type@ *)ip2;
-            io1 = (npy_isnan(in2) || io1 @OP@ in2) ? io1 : in2;
+            /* Order of operations important for MSVC 2015 */
+            io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2;
         }
         *((@type@ *)iop1) = io1;
     }
@@ -1897,7 +1945,8 @@
         BINARY_LOOP {
             const @type@ in1 = *(@type@ *)ip1;
             const @type@ in2 = *(@type@ *)ip2;
-            *((@type@ *)op1) = (npy_isnan(in2) || in1 @OP@ in2) ? in1 : in2;
+            /* Order of operations important for MSVC 2015 */
+            *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2;
         }
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 9b63273..3c90812 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -474,6 +474,9 @@
 TIMEDELTA_mm_d_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
 TIMEDELTA_mm_m_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
 
 /* Special case equivalents to above functions */
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index c56f43f..8d67f96 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -86,6 +86,7 @@
             ++num_override_args;
         }
     }
+    Py_DECREF(out_kwd_obj);
     return num_override_args;
 
 fail:
@@ -93,6 +94,7 @@
         Py_DECREF(with_override[i]);
         Py_DECREF(methods[i]);
     }
+    Py_DECREF(out_kwd_obj);
     return -1;
 }
 
@@ -224,14 +226,14 @@
     PyObject *obj;
     static PyObject *NoValue = NULL;
     static char *kwlist[] = {"array", "axis", "dtype", "out", "keepdims",
-        "initial"};
+                             "initial", "where"};
 
     npy_cache_import("numpy", "_NoValue", &NoValue);
     if (NoValue == NULL) return -1;
 
-    if (nargs < 1 || nargs > 6) {
+    if (nargs < 1 || nargs > 7) {
         PyErr_Format(PyExc_TypeError,
-                     "ufunc.reduce() takes from 1 to 6 positional "
+                     "ufunc.reduce() takes from 1 to 7 positional "
                      "arguments but %"NPY_INTP_FMT" were given", nargs);
         return -1;
     }
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 6d04ce3..4174e69 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -186,7 +186,6 @@
             return NULL;
         }
 
-        Py_INCREF(ret);
         if (PyArray_SetWritebackIfCopyBase(ret_copy, (PyArrayObject *)ret) < 0) {
             Py_DECREF(ret);
             Py_DECREF(ret_copy);
@@ -444,9 +443,9 @@
 
     /* Iterator parameters */
     NpyIter *iter = NULL;
-    PyArrayObject *op[2];
-    PyArray_Descr *op_dtypes[2];
-    npy_uint32 flags, op_flags[2];
+    PyArrayObject *op[3];
+    PyArray_Descr *op_dtypes[3];
+    npy_uint32 flags, op_flags[3];
 
     /* More than one axis means multiple orders are possible */
     if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
@@ -456,13 +455,12 @@
                      funcname);
         return NULL;
     }
-
-
-    /* Validate that the parameters for future expansion are NULL */
-    if (wheremask != NULL) {
-        PyErr_SetString(PyExc_RuntimeError,
-                "Reduce operations in NumPy do not yet support "
-                "a where mask");
+    /* Can only use where with an initial (from identity or argument) */
+    if (wheremask != NULL && identity == Py_None) {
+        PyErr_Format(PyExc_ValueError,
+                     "reduction operation '%s' does not have an identity, "
+                     "so to use a where mask one has to specify 'initial'",
+                     funcname);
         return NULL;
     }
 
@@ -524,8 +522,16 @@
                   NPY_ITER_NO_SUBTYPE;
     op_flags[1] = NPY_ITER_READONLY |
                   NPY_ITER_ALIGNED;
+    if (wheremask != NULL) {
+        op[2] = wheremask;
+        op_dtypes[2] = PyArray_DescrFromType(NPY_BOOL);
+        if (op_dtypes[2] == NULL) {
+            goto fail;
+        }
+        op_flags[2] = NPY_ITER_READONLY;
+    }
 
-    iter = NpyIter_AdvancedNew(2, op, flags,
+    iter = NpyIter_AdvancedNew(wheremask == NULL ? 2 : 3, op, flags,
                                NPY_KEEPORDER, casting,
                                op_flags,
                                op_dtypes,
@@ -568,7 +574,7 @@
             goto fail;
         }
     }
-    
+
     /* Check whether any errors occurred during the loop */
     if (PyErr_Occurred() ||
             _check_ufunc_fperr(errormask, NULL, "reduce") < 0) {
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index a3e00b5..4bb8569 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -32,13 +32,7 @@
 #include <float.h>
 #include <string.h> /* for memcpy */
 
-#if defined __AVX512F__
-#define VECTOR_SIZE_BYTES 64
-#elif defined __AVX2__
-#define VECTOR_SIZE_BYTES 32
-#else
 #define VECTOR_SIZE_BYTES 16
-#endif
 
 static NPY_INLINE npy_uintp
 abs_ptrdiff(char *a, char *b)
@@ -190,17 +184,24 @@
     @type@ * ip2 = (@type@ *)args[1];
     @type@ * op = (@type@ *)args[2];
     npy_intp n = dimensions[0];
+#if defined __AVX512F__
+    const npy_intp vector_size_bytes = 64;
+#elif defined __AVX2__
+    const npy_intp vector_size_bytes = 32;
+#else
+    const npy_intp vector_size_bytes = 32;
+#endif
     /* argument one scalar */
-    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
     /* argument two scalar */
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
-    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), VECTOR_SIZE_BYTES)) {
+    else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) {
         sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
         return 1;
     }
@@ -427,19 +428,20 @@
 sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef  __AVX512F__
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    const npy_intp vector_size_bytes = 64;
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[i];
     /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) && npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                 @vpre512@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
                 @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -447,16 +449,16 @@
             }
         }
     }
-    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
             @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
-    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
             @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -465,14 +467,14 @@
     }
     else {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
                 @vpre512@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
                 @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
                 @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
@@ -481,20 +483,21 @@
         }
     }
 #elif __AVX2__
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    const npy_intp vector_size_bytes = 32;
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[i];
     /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES) &&
-            npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes) &&
+            npy_is_aligned(&ip2[i], vector_size_bytes)) {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                 @vpre256@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
                 @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -502,16 +505,16 @@
             }
         }
     }
-    else if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
             @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
-    else if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    else if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
             @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -520,14 +523,14 @@
     }
     else {
         if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a);
                 @vpre256@_store_@vsuf@(&op[i], c);
             }
         }
         else {
-            LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+            LOOP_BLOCKED(@type@, vector_size_bytes) {
                 @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
                 @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
                 @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
@@ -601,18 +604,19 @@
 sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
+    const npy_intp vector_size_bytes = 64;
     const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
@@ -621,18 +625,19 @@
 
 
 #elif __AVX2__
+    const npy_intp vector_size_bytes = 32;
     const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[0] @OP@ ip2[i];
-    if (npy_is_aligned(&ip2[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip2[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
@@ -667,18 +672,19 @@
 sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
 #ifdef __AVX512F__
+    const npy_intp vector_size_bytes = 64;
     const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]);
             @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b);
             @vpre512@_store_@vsuf@(&op[i], c);
@@ -686,18 +692,19 @@
     }
 
 #elif __AVX2__
+    const npy_intp vector_size_bytes = 32;
     const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
+    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
         op[i] = ip1[i] @OP@ ip2[0];
-    if (npy_is_aligned(&ip1[i], VECTOR_SIZE_BYTES)) {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+    if (npy_is_aligned(&ip1[i], vector_size_bytes)) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
         }
     }
     else {
-        LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
+        LOOP_BLOCKED(@type@, vector_size_bytes) {
             @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]);
             @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b);
             @vpre256@_store_@vsuf@(&op[i], c);
@@ -1029,7 +1036,8 @@
 {
     const npy_intp stride = VECTOR_SIZE_BYTES / (npy_intp)sizeof(@type@);
     LOOP_BLOCK_ALIGN_VAR(ip, @type@, VECTOR_SIZE_BYTES) {
-        *op = (npy_isnan(*op) || *op @OP@ ip[i]) ? *op : ip[i];
+        /* Order of operations important for MSVC 2015 */
+        *op = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
     }
     assert(n < (stride) || npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES));
     if (i + 3 * stride <= n) {
@@ -1053,11 +1061,13 @@
         }
         else {
             @type@ tmp = sse2_horizontal_@VOP@_@vtype@(c1);
-            *op  = (npy_isnan(*op) || *op @OP@ tmp) ? *op : tmp;
+            /* Order of operations important for MSVC 2015 */
+            *op  = (*op @OP@ tmp || npy_isnan(*op)) ? *op : tmp;
         }
     }
     LOOP_BLOCKED_END {
-        *op  = (npy_isnan(*op) || *op @OP@ ip[i]) ? *op : ip[i];
+        /* Order of operations important for MSVC 2015 */
+        *op  = (*op @OP@ ip[i] || npy_isnan(*op)) ? *op : ip[i];
     }
     npy_clear_floatstatus_barrier((char*)op);
 }
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index ea0007a..ab986ca 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -3063,8 +3063,10 @@
     Py_XDECREF(axis);
     Py_XDECREF(full_args.in);
     Py_XDECREF(full_args.out);
+    PyArray_free(remap_axis_memory);
+    PyArray_free(remap_axis);
 
-    NPY_UF_DBG_PRINT1("Returning code %d\n", reval);
+    NPY_UF_DBG_PRINT1("Returning code %d\n", retval);
 
     return retval;
 
@@ -3466,12 +3468,15 @@
     PyUFuncObject *ufunc = (PyUFuncObject *)data;
     char *dataptrs_copy[3];
     npy_intp strides_copy[3];
+    npy_bool masked;
 
     /* The normal selected inner loop */
     PyUFuncGenericFunction innerloop = NULL;
     void *innerloopdata = NULL;
 
     NPY_BEGIN_THREADS_DEF;
+    /* Get the number of operands, to determine whether "where" is used */
+    masked = (NpyIter_GetNOp(iter) == 3);
 
     /* Get the inner loop */
     iter_dtypes = NpyIter_GetDescrArray(iter);
@@ -3531,8 +3536,36 @@
         strides_copy[0] = strides[0];
         strides_copy[1] = strides[1];
         strides_copy[2] = strides[0];
-        innerloop(dataptrs_copy, countptr,
-                    strides_copy, innerloopdata);
+
+        if (!masked) {
+            innerloop(dataptrs_copy, countptr,
+                      strides_copy, innerloopdata);
+        }
+        else {
+            npy_intp count = *countptr;
+            char *maskptr = dataptrs[2];
+            npy_intp mask_stride = strides[2];
+            /* Optimization for when the mask is broadcast */
+            npy_intp n = mask_stride == 0 ? count : 1;
+            while (count) {
+                char mask = *maskptr;
+                maskptr += mask_stride;
+                while (n < count && mask == *maskptr) {
+                    n++;
+                    maskptr += mask_stride;
+                }
+                /* If mask set, apply inner loop on this contiguous region */
+                if (mask) {
+                    innerloop(dataptrs_copy, &n,
+                              strides_copy, innerloopdata);
+                }
+                dataptrs_copy[0] += n * strides[0];
+                dataptrs_copy[1] += n * strides[1];
+                dataptrs_copy[2] = dataptrs_copy[0];
+                count -= n;
+                n = 1;
+            }
+        }
     } while (iternext(iter));
 
 finish_loop:
@@ -3561,7 +3594,7 @@
 static PyArrayObject *
 PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         int naxes, int *axes, PyArray_Descr *odtype, int keepdims,
-        PyObject *initial)
+        PyObject *initial, PyArrayObject *wheremask)
 {
     int iaxes, ndim;
     npy_bool reorderable;
@@ -3627,7 +3660,7 @@
         return NULL;
     }
 
-    result = PyUFunc_ReduceWrapper(arr, out, NULL, dtype, dtype,
+    result = PyUFunc_ReduceWrapper(arr, out, wheremask, dtype, dtype,
                                    NPY_UNSAFE_CASTING,
                                    axis_flags, reorderable,
                                    keepdims, 0,
@@ -4384,7 +4417,7 @@
     int i, naxes=0, ndim;
     int axes[NPY_MAXDIMS];
     PyObject *axes_in = NULL;
-    PyArrayObject *mp = NULL, *ret = NULL;
+    PyArrayObject *mp = NULL, *wheremask = NULL, *ret = NULL;
     PyObject *op;
     PyObject *obj_ind, *context;
     PyArrayObject *indices = NULL;
@@ -4393,7 +4426,7 @@
     int keepdims = 0;
     PyObject *initial = NULL;
     static char *reduce_kwlist[] = {
-            "array", "axis", "dtype", "out", "keepdims", "initial", NULL};
+        "array", "axis", "dtype", "out", "keepdims", "initial", "where", NULL};
     static char *accumulate_kwlist[] = {
             "array", "axis", "dtype", "out", NULL};
     static char *reduceat_kwlist[] = {
@@ -4456,22 +4489,23 @@
     }
     else if (operation == UFUNC_ACCUMULATE) {
         if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&:accumulate",
-                                        accumulate_kwlist,
-                                        &op,
-                                        &axes_in,
-                                        PyArray_DescrConverter2, &otype,
-                                        PyArray_OutputConverter, &out)) {
+                                         accumulate_kwlist,
+                                         &op,
+                                         &axes_in,
+                                         PyArray_DescrConverter2, &otype,
+                                         PyArray_OutputConverter, &out)) {
             goto fail;
         }
     }
     else {
-        if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&iO:reduce",
-                                        reduce_kwlist,
-                                        &op,
-                                        &axes_in,
-                                        PyArray_DescrConverter2, &otype,
-                                        PyArray_OutputConverter, &out,
-                                        &keepdims, &initial)) {
+        if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO&O&iOO&:reduce",
+                                         reduce_kwlist,
+                                         &op,
+                                         &axes_in,
+                                         PyArray_DescrConverter2, &otype,
+                                         PyArray_OutputConverter, &out,
+                                         &keepdims, &initial,
+                                         _wheremask_converter, &wheremask)) {
             goto fail;
         }
     }
@@ -4602,7 +4636,8 @@
     switch(operation) {
     case UFUNC_REDUCE:
         ret = PyUFunc_Reduce(ufunc, mp, out, naxes, axes,
-                                          otype, keepdims, initial);
+                             otype, keepdims, initial, wheremask);
+        Py_XDECREF(wheremask);
         break;
     case UFUNC_ACCUMULATE:
         if (naxes != 1) {
@@ -4660,6 +4695,7 @@
 fail:
     Py_XDECREF(otype);
     Py_XDECREF(mp);
+    Py_XDECREF(wheremask);
     return NULL;
 }
 
@@ -4892,12 +4928,15 @@
         return NULL;
     }
 
-    ufunc = PyArray_malloc(sizeof(PyUFuncObject));
+    ufunc = PyObject_GC_New(PyUFuncObject, &PyUFunc_Type);
+    /*
+     * We use GC_New here for ufunc->obj, but do not use GC_Track since
+     * ufunc->obj is still NULL at the end of this function.
+     * See ufunc_frompyfunc where ufunc->obj is set and GC_Track is called.
+     */
     if (ufunc == NULL) {
         return NULL;
     }
-    memset(ufunc, 0, sizeof(PyUFuncObject));
-    PyObject_Init((PyObject *)ufunc, &PyUFunc_Type);
 
     ufunc->nin = nin;
     ufunc->nout = nout;
@@ -4905,13 +4944,30 @@
     ufunc->identity = identity;
     if (ufunc->identity == PyUFunc_IdentityValue) {
         Py_INCREF(identity_value);
+        ufunc->identity_value = identity_value;
     }
-    ufunc->identity_value = identity_value;
+    else {
+        ufunc->identity_value = NULL;
+    }
 
     ufunc->functions = func;
     ufunc->data = data;
     ufunc->types = types;
     ufunc->ntypes = ntypes;
+    ufunc->core_signature = NULL;
+    ufunc->core_enabled = 0;
+    ufunc->obj = NULL;
+    ufunc->core_num_dims = NULL;
+    ufunc->core_num_dim_ix = 0;
+    ufunc->core_offsets = NULL;
+    ufunc->core_dim_ixs = NULL;
+    ufunc->core_dim_sizes = NULL;
+    ufunc->core_dim_flags = NULL;
+    ufunc->userloops = NULL;
+    ufunc->ptr = NULL;
+    ufunc->reserved2 = NULL;
+    ufunc->reserved1 = 0;
+    ufunc->iter_flags = 0;
 
     /* Type resolution and inner loop selection functions */
     ufunc->type_resolver = &PyUFunc_DefaultTypeResolver;
@@ -5277,18 +5333,23 @@
 static void
 ufunc_dealloc(PyUFuncObject *ufunc)
 {
+    PyObject_GC_UnTrack((PyObject *)ufunc);
     PyArray_free(ufunc->core_num_dims);
     PyArray_free(ufunc->core_dim_ixs);
+    PyArray_free(ufunc->core_dim_sizes);
+    PyArray_free(ufunc->core_dim_flags);
     PyArray_free(ufunc->core_offsets);
     PyArray_free(ufunc->core_signature);
     PyArray_free(ufunc->ptr);
     PyArray_free(ufunc->op_flags);
     Py_XDECREF(ufunc->userloops);
-    Py_XDECREF(ufunc->obj);
     if (ufunc->identity == PyUFunc_IdentityValue) {
         Py_DECREF(ufunc->identity_value);
     }
-    PyArray_free(ufunc);
+    if (ufunc->obj != NULL) {
+        Py_DECREF(ufunc->obj);
+    }
+    PyObject_GC_Del(ufunc);
 }
 
 static PyObject *
@@ -5297,6 +5358,15 @@
     return PyUString_FromFormat("<ufunc '%s'>", ufunc->name);
 }
 
+static int
+ufunc_traverse(PyUFuncObject *self, visitproc visit, void *arg)
+{
+    Py_VISIT(self->obj);
+    if (self->identity == PyUFunc_IdentityValue) {
+        Py_VISIT(self->identity_value);
+    }
+    return 0;
+}
 
 /******************************************************************************
  ***                          UFUNC METHODS                                 ***
@@ -6013,9 +6083,9 @@
     0,                                          /* tp_getattro */
     0,                                          /* tp_setattro */
     0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT,                         /* tp_flags */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,    /* tp_flags */
     0,                                          /* tp_doc */
-    0,                                          /* tp_traverse */
+    (traverseproc)ufunc_traverse,               /* tp_traverse */
     0,                                          /* tp_clear */
     0,                                          /* tp_richcompare */
     0,                                          /* tp_weaklistoffset */
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index ec60d9c..e2f4d80 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -16,6 +16,7 @@
 
 #include "npy_config.h"
 #include "npy_pycompat.h"
+#include "npy_import.h"
 
 #include "numpy/ufuncobject.h"
 #include "ufunc_type_resolution.h"
@@ -27,6 +28,26 @@
 #include "cblasfuncs.h"
 #endif
 
+static PyObject *
+npy_casting_to_py_object(NPY_CASTING casting)
+{
+    switch (casting) {
+        case NPY_NO_CASTING:
+            return PyUString_FromString("no");
+        case NPY_EQUIV_CASTING:
+            return PyUString_FromString("equiv");
+        case NPY_SAFE_CASTING:
+            return PyUString_FromString("safe");
+        case NPY_SAME_KIND_CASTING:
+            return PyUString_FromString("same_kind");
+        case NPY_UNSAFE_CASTING:
+            return PyUString_FromString("unsafe");
+        default:
+            return PyInt_FromLong(casting);
+    }
+}
+
+
 static const char *
 npy_casting_to_string(NPY_CASTING casting)
 {
@@ -46,6 +67,9 @@
     }
 }
 
+/**
+ * Always returns -1 to indicate the exception was raised, for convenience
+ */
 static int
 raise_binary_type_reso_error(PyUFuncObject *ufunc, PyArrayObject **operands) {
     PyObject *errmsg;
@@ -63,6 +87,126 @@
     return -1;
 }
 
+/** Helper function to raise UFuncNoLoopError
+ * Always returns -1 to indicate the exception was raised, for convenience
+ */
+static int
+raise_no_loop_found_error(
+        PyUFuncObject *ufunc, PyArray_Descr **dtypes, npy_intp n_dtypes)
+{
+    static PyObject *exc_type = NULL;
+    PyObject *exc_value;
+    PyObject *dtypes_tup;
+    npy_intp i;
+
+    npy_cache_import(
+        "numpy.core._exceptions", "_UFuncNoLoopError",
+        &exc_type);
+    if (exc_type == NULL) {
+        return -1;
+    }
+
+    /* convert dtypes to a tuple */
+    dtypes_tup = PyTuple_New(n_dtypes);
+    if (dtypes_tup == NULL) {
+        return -1;
+    }
+    for (i = 0; i < n_dtypes; ++i) {
+        Py_INCREF(dtypes[i]);
+        PyTuple_SET_ITEM(dtypes_tup, i, (PyObject *)dtypes[i]);
+    }
+
+    /* produce an error object */
+    exc_value = PyTuple_Pack(2, ufunc, dtypes_tup);
+    Py_DECREF(dtypes_tup);
+    if (exc_value == NULL){
+        return -1;
+    }
+    PyErr_SetObject(exc_type, exc_value);
+    Py_DECREF(exc_value);
+
+    return -1;
+}
+
+static int
+raise_casting_error(
+        PyObject *exc_type,
+        PyUFuncObject *ufunc,
+        NPY_CASTING casting,
+        PyArray_Descr *from,
+        PyArray_Descr *to,
+        npy_intp i)
+{
+    PyObject *exc_value;
+    PyObject *casting_value;
+
+    casting_value = npy_casting_to_py_object(casting);
+    if (casting_value == NULL) {
+        return -1;
+    }
+
+    exc_value = Py_BuildValue(
+        "ONOOi",
+        ufunc,
+        casting_value,
+        (PyObject *)from,
+        (PyObject *)to,
+        i
+    );
+    if (exc_value == NULL){
+        return -1;
+    }
+    PyErr_SetObject(exc_type, exc_value);
+    Py_DECREF(exc_value);
+
+    return -1;
+}
+
+/** Helper function to raise UFuncInputCastingError
+ * Always returns -1 to indicate the exception was raised, for convenience
+ */
+static int
+raise_input_casting_error(
+        PyUFuncObject *ufunc,
+        NPY_CASTING casting,
+        PyArray_Descr *from,
+        PyArray_Descr *to,
+        npy_intp i)
+{
+    static PyObject *exc_type = NULL;
+    npy_cache_import(
+        "numpy.core._exceptions", "_UFuncInputCastingError",
+        &exc_type);
+    if (exc_type == NULL) {
+        return -1;
+    }
+
+    return raise_casting_error(exc_type, ufunc, casting, from, to, i);
+}
+
+
+/** Helper function to raise UFuncOutputCastingError
+ * Always returns -1 to indicate the exception was raised, for convenience
+ */
+static int
+raise_output_casting_error(
+        PyUFuncObject *ufunc,
+        NPY_CASTING casting,
+        PyArray_Descr *from,
+        PyArray_Descr *to,
+        npy_intp i)
+{
+    static PyObject *exc_type = NULL;
+    npy_cache_import(
+        "numpy.core._exceptions", "_UFuncOutputCastingError",
+        &exc_type);
+    if (exc_type == NULL) {
+        return -1;
+    }
+
+    return raise_casting_error(exc_type, ufunc, casting, from, to, i);
+}
+
 
 /*UFUNC_API
  *
@@ -79,45 +223,18 @@
                             PyArray_Descr **dtypes)
 {
     int i, nin = ufunc->nin, nop = nin + ufunc->nout;
-    const char *ufunc_name = ufunc_get_name_cstr(ufunc);
 
     for (i = 0; i < nop; ++i) {
         if (i < nin) {
             if (!PyArray_CanCastArrayTo(operands[i], dtypes[i], casting)) {
-                PyObject *errmsg;
-                errmsg = PyUString_FromFormat("Cannot cast ufunc %s "
-                                "input from ", ufunc_name);
-                PyUString_ConcatAndDel(&errmsg,
-                        PyObject_Repr((PyObject *)PyArray_DESCR(operands[i])));
-                PyUString_ConcatAndDel(&errmsg,
-                        PyUString_FromString(" to "));
-                PyUString_ConcatAndDel(&errmsg,
-                        PyObject_Repr((PyObject *)dtypes[i]));
-                PyUString_ConcatAndDel(&errmsg,
-                        PyUString_FromFormat(" with casting rule %s",
-                                        npy_casting_to_string(casting)));
-                PyErr_SetObject(PyExc_TypeError, errmsg);
-                Py_DECREF(errmsg);
-                return -1;
+                return raise_input_casting_error(
+                    ufunc, casting, PyArray_DESCR(operands[i]), dtypes[i], i);
             }
         } else if (operands[i] != NULL) {
             if (!PyArray_CanCastTypeTo(dtypes[i],
                                     PyArray_DESCR(operands[i]), casting)) {
-                PyObject *errmsg;
-                errmsg = PyUString_FromFormat("Cannot cast ufunc %s "
-                                "output from ", ufunc_name);
-                PyUString_ConcatAndDel(&errmsg,
-                        PyObject_Repr((PyObject *)dtypes[i]));
-                PyUString_ConcatAndDel(&errmsg,
-                        PyUString_FromString(" to "));
-                PyUString_ConcatAndDel(&errmsg,
-                        PyObject_Repr((PyObject *)PyArray_DESCR(operands[i])));
-                PyUString_ConcatAndDel(&errmsg,
-                        PyUString_FromFormat(" with casting rule %s",
-                                        npy_casting_to_string(casting)));
-                PyErr_SetObject(PyExc_TypeError, errmsg);
-                Py_DECREF(errmsg);
-                return -1;
+                return raise_output_casting_error(
+                    ufunc, casting, dtypes[i], PyArray_DESCR(operands[i]), i);
             }
         }
     }
@@ -1114,7 +1231,16 @@
             }
             out_dtypes[1] = out_dtypes[0];
             Py_INCREF(out_dtypes[1]);
+
+            /*
+             * TODO: split function into truediv and floordiv resolvers
+             */
+            if (strcmp(ufunc->name, "floor_divide") == 0) {
+                out_dtypes[2] = PyArray_DescrFromType(NPY_LONGLONG);
+            }
+            else {
             out_dtypes[2] = PyArray_DescrFromType(NPY_DOUBLE);
+            }
             if (out_dtypes[2] == NULL) {
                 Py_DECREF(out_dtypes[0]);
                 out_dtypes[0] = NULL;
@@ -1373,12 +1499,8 @@
 {
     int nargs = ufunc->nargs;
     char *types;
-    const char *ufunc_name;
-    PyObject *errmsg;
     int i, j;
 
-    ufunc_name = ufunc_get_name_cstr(ufunc);
-
     /*
      * If there are user-loops search them first.
      * TODO: There needs to be a loop selection acceleration structure,
@@ -1413,19 +1535,7 @@
         types += nargs;
     }
 
-    errmsg = PyUString_FromFormat("ufunc '%s' did not contain a loop "
-                    "with signature matching types ", ufunc_name);
-    for (i = 0; i < nargs; ++i) {
-        PyUString_ConcatAndDel(&errmsg,
-                PyObject_Repr((PyObject *)dtypes[i]));
-        if (i < nargs - 1) {
-            PyUString_ConcatAndDel(&errmsg, PyUString_FromString(" "));
-        }
-    }
-    PyErr_SetObject(PyExc_TypeError, errmsg);
-    Py_DECREF(errmsg);
-
-    return -1;
+    return raise_no_loop_found_error(ufunc, dtypes, nargs);
 }
 
 typedef struct {
@@ -2242,7 +2352,7 @@
 
     /* If no function was found, throw an error */
     PyErr_Format(PyExc_TypeError,
-            "No loop matching the specified signature and casting\n"
+            "No loop matching the specified signature and casting "
             "was found for ufunc %s", ufunc_name);
 
     return -1;
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index 8277ad6..23e3ffc 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -161,6 +161,7 @@
 
     self->type_resolver = &object_ufunc_type_resolver;
     self->legacy_inner_loop_selector = &object_ufunc_loop_selector;
+    PyObject_GC_Track(self);
 
     return (PyObject *)self;
 }
@@ -170,7 +171,7 @@
 add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
     PyUFuncObject *ufunc;
-    PyObject *str;
+    PyObject *str, *tmp;
     char *docstr, *newdocstr;
 
 #if defined(NPY_PY3K)
@@ -178,7 +179,11 @@
                                         &PyUnicode_Type, &str)) {
         return NULL;
     }
-    docstr = PyBytes_AS_STRING(PyUnicode_AsUTF8String(str));
+    tmp = PyUnicode_AsUTF8String(str);
+    if (tmp == NULL) {
+        return NULL;
+    }
+    docstr = PyBytes_AS_STRING(tmp);
 #else
     if (!PyArg_ParseTuple(args, "O!O!:_add_newdoc_ufunc", &PyUFunc_Type, &ufunc,
                                          &PyString_Type, &str)) {
@@ -190,6 +195,9 @@
     if (NULL != ufunc->doc) {
         PyErr_SetString(PyExc_ValueError,
                 "Cannot change docstring of ufunc with non-NULL docstring");
+#if defined(NPY_PY3K)
+        Py_DECREF(tmp);
+#endif
         return NULL;
     }
 
@@ -203,6 +211,9 @@
     strcpy(newdocstr, docstr);
     ufunc->doc = newdocstr;
 
+#if defined(NPY_PY3K)
+    Py_DECREF(tmp);
+#endif
     Py_RETURN_NONE;
 }
 
@@ -268,10 +279,6 @@
     UFUNC_FLOATING_POINT_SUPPORT = 0;
 #endif
 
-    /* Initialize the types */
-    if (PyType_Ready(&PyUFunc_Type) < 0)
-        return -1;
-
     /* Add some symbolic constants to the module */
     d = PyModule_GetDict(m);
 
diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py
index 7a858d2..f2b8fdc 100644
--- a/numpy/core/tests/test_arrayprint.py
+++ b/numpy/core/tests/test_arrayprint.py
@@ -90,6 +90,7 @@
         assert_equal(repr(x),
             'sub(sub(sub(..., dtype=object), dtype=object), dtype=object)')
         assert_equal(str(x), '...')
+        x[()] = 0  # resolve circular references for garbage collector
 
         # nested 0d-subclass-object
         x = sub(None)
@@ -124,11 +125,13 @@
         arr0d[()] = arr0d
         assert_equal(repr(arr0d),
             'array(array(..., dtype=object), dtype=object)')
+        arr0d[()] = 0  # resolve recursion for garbage collector
 
         arr1d = np.array([None, None])
         arr1d[1] = arr1d
         assert_equal(repr(arr1d),
             'array([None, array(..., dtype=object)], dtype=object)')
+        arr1d[1] = 0  # resolve recursion for garbage collector
 
         first = np.array(None)
         second = np.array(None)
@@ -136,6 +139,7 @@
         second[()] = first
         assert_equal(repr(first),
             'array(array(array(..., dtype=object), dtype=object), dtype=object)')
+        first[()] = 0  # resolve circular references for garbage collector
 
     def test_containing_list(self):
         # printing square brackets directly would be ambiguuous
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index b2ce040..cb7555a 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -1081,6 +1081,86 @@
                 check(np.timedelta64(0), f, nat)
                 check(nat, f, nat)
 
+    @pytest.mark.parametrize("op1, op2, exp", [
+        # m8 same units round down
+        (np.timedelta64(7, 's'),
+         np.timedelta64(4, 's'),
+         1),
+        # m8 same units round down with negative
+        (np.timedelta64(7, 's'),
+         np.timedelta64(-4, 's'),
+         -2),
+        # m8 same units negative no round down
+        (np.timedelta64(8, 's'),
+         np.timedelta64(-4, 's'),
+         -2),
+        # m8 different units
+        (np.timedelta64(1, 'm'),
+         np.timedelta64(31, 's'),
+         1),
+        # m8 generic units
+        (np.timedelta64(1890),
+         np.timedelta64(31),
+         60),
+        # Y // M works
+        (np.timedelta64(2, 'Y'),
+         np.timedelta64('13', 'M'),
+         1),
+        # handle 1D arrays
+        (np.array([1, 2, 3], dtype='m8'),
+         np.array([2], dtype='m8'),
+         np.array([0, 1, 1], dtype=np.int64)),
+        ])
+    def test_timedelta_floor_divide(self, op1, op2, exp):
+        assert_equal(op1 // op2, exp)
+
+    @pytest.mark.parametrize("op1, op2", [
+        # div by 0
+        (np.timedelta64(10, 'us'),
+         np.timedelta64(0, 'us')),
+        # div with NaT
+        (np.timedelta64('NaT'),
+         np.timedelta64(50, 'us')),
+        # special case for int64 min
+        # in integer floor division
+        (np.timedelta64(np.iinfo(np.int64).min),
+         np.timedelta64(-1)),
+        ])
+    def test_timedelta_floor_div_warnings(self, op1, op2):
+        with assert_warns(RuntimeWarning):
+            actual = op1 // op2
+            assert_equal(actual, 0)
+            assert_equal(actual.dtype, np.int64)
+
+    @pytest.mark.parametrize("val1, val2", [
+        # the smallest integer that can't be represented
+        # exactly in a double should be preserved if we avoid
+        # casting to double in floordiv operation
+        (9007199254740993, 1),
+        # stress the alternate floordiv code path where
+        # operand signs don't match and remainder isn't 0
+        (9007199254740999, -2),
+        ])
+    def test_timedelta_floor_div_precision(self, val1, val2):
+        op1 = np.timedelta64(val1)
+        op2 = np.timedelta64(val2)
+        actual = op1 // op2
+        # Python reference integer floor
+        expected = val1 // val2
+        assert_equal(actual, expected)
+
+    @pytest.mark.parametrize("val1, val2", [
+        # years and months sometimes can't be unambiguously
+        # divided for floor division operation
+        (np.timedelta64(7, 'Y'),
+         np.timedelta64(3, 's')),
+        (np.timedelta64(7, 'M'),
+         np.timedelta64(1, 'D')),
+        ])
+    def test_timedelta_floor_div_error(self, val1, val2):
+        with assert_raises_regex(TypeError, "common metadata divisor"):
+            val1 // val2
+
     def test_datetime_divide(self):
         for dta, tda, tdb, tdc, tdd in \
                     [
@@ -1111,8 +1191,6 @@
             assert_equal(tda / tdd, 60.0)
             assert_equal(tdd / tda, 1.0 / 60.0)
 
-            # m8 // m8
-            assert_raises(TypeError, np.floor_divide, tda, tdb)
             # int / m8
             assert_raises(TypeError, np.divide, 2, tdb)
             # float / m8
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index c55751e..8f37119 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -4,10 +4,12 @@
 import operator
 import pytest
 import ctypes
+import gc
 
 import numpy as np
 from numpy.core._rational_tests import rational
-from numpy.testing import assert_, assert_equal, assert_raises
+from numpy.testing import (
+    assert_, assert_equal, assert_array_equal, assert_raises, HAS_REFCOUNT)
 from numpy.core.numeric import pickle
 
 def assert_dtype_equal(a, b):
@@ -446,6 +448,173 @@
         assert_equal(t1.alignment, t2.alignment)
 
 
+def iter_struct_object_dtypes():
+    """
+    Iterates over a few complex dtypes and object patterns which
+    fill the array with a given object (defaults to a singleton).
+
+    Yields
+    ------
+    dtype : dtype
+    pattern : tuple
+        Structured tuple for use with `np.array`.
+    count : int
+        Number of objects stored in the dtype.
+    singleton : object
+        A singleton object. The returned pattern is constructed so that
+        all objects inside the datatype are set to the singleton.
+    """
+    obj = object()
+
+    dt = np.dtype([('b', 'O', (2, 3))])
+    p = ([[obj] * 3] * 2,)
+    yield pytest.param(dt, p, 6, obj, id="<subarray>")
+
+    dt = np.dtype([('a', 'i4'), ('b', 'O', (2, 3))])
+    p = (0, [[obj] * 3] * 2)
+    yield pytest.param(dt, p, 6, obj, id="<subarray in field>")
+
+    dt = np.dtype([('a', 'i4'),
+                   ('b', [('ba', 'O'), ('bb', 'i1')], (2, 3))])
+    p = (0, [[(obj, 0)] * 3] * 2)
+    yield pytest.param(dt, p, 6, obj, id="<structured subarray 1>")
+
+    dt = np.dtype([('a', 'i4'),
+                   ('b', [('ba', 'O'), ('bb', 'O')], (2, 3))])
+    p = (0, [[(obj, obj)] * 3] * 2)
+    yield pytest.param(dt, p, 12, obj, id="<structured subarray 2>")
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+class TestStructuredObjectRefcounting:
+    """These tests cover various uses of complicated structured types which
+    include objects and thus require reference counting.
+    """
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    @pytest.mark.parametrize(["creation_func", "creation_obj"], [
+        pytest.param(np.empty, None,
+             # None is probably used for too many things
+             marks=pytest.mark.skip("unreliable due to python's behaviour")),
+        (np.ones, 1),
+        (np.zeros, 0)])
+    def test_structured_object_create_delete(self, dt, pat, count, singleton,
+                                             creation_func, creation_obj):
+        """Structured object reference counting in creation and deletion"""
+        # The test assumes that 0, 1, and None are singletons.
+        gc.collect()
+        before = sys.getrefcount(creation_obj)
+        arr = creation_func(3, dt)
+
+        now = sys.getrefcount(creation_obj)
+        assert now - before == count * 3
+        del arr
+        now = sys.getrefcount(creation_obj)
+        assert now == before
+
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    def test_structured_object_item_setting(self, dt, pat, count, singleton):
+        """Structured object reference counting for simple item setting"""
+        one = 1
+
+        gc.collect()
+        before = sys.getrefcount(singleton)
+        arr = np.array([pat] * 3, dt)
+        assert sys.getrefcount(singleton) - before == count * 3
+        # Fill with `1` and check that it was replaced correctly:
+        before2 = sys.getrefcount(one)
+        arr[...] = one
+        after2 = sys.getrefcount(one)
+        assert after2 - before2 == count * 3
+        del arr
+        gc.collect()
+        assert sys.getrefcount(one) == before2
+        assert sys.getrefcount(singleton) == before
+
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    @pytest.mark.parametrize(
+        ['shape', 'index', 'items_changed'],
+        [((3,), ([0, 2],), 2),
+         ((3, 2), ([0, 2], slice(None)), 4),
+         ((3, 2), ([0, 2], [1]), 2),
+         ((3,), ([True, False, True]), 2)])
+    def test_structured_object_indexing(self, shape, index, items_changed,
+                                        dt, pat, count, singleton):
+        """Structured object reference counting for advanced indexing."""
+        zero = 0
+        one = 1
+
+        arr = np.zeros(shape, dt)
+
+        gc.collect()
+        before_zero = sys.getrefcount(zero)
+        before_one = sys.getrefcount(one)
+        # Test item getting:
+        part = arr[index]
+        after_zero = sys.getrefcount(zero)
+        assert after_zero - before_zero == count * items_changed
+        del part
+        # Test item setting:
+        arr[index] = one
+        gc.collect()
+        after_zero = sys.getrefcount(zero)
+        after_one = sys.getrefcount(one)
+        assert before_zero - after_zero == count * items_changed
+        assert after_one - before_one == count * items_changed
+
+    @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
+                             iter_struct_object_dtypes())
+    def test_structured_object_take_and_repeat(self, dt, pat, count, singleton):
+        """Structured object reference counting for specialized functions.
+        The older functions such as take and repeat use different code paths
+        than item setting (when writing this).
+        """
+        indices = [0, 1]
+
+        arr = np.array([pat] * 3, dt)
+        gc.collect()
+        before = sys.getrefcount(singleton)
+        res = arr.take(indices)
+        after = sys.getrefcount(singleton)
+        assert after - before == count * 2
+        new = res.repeat(10)
+        gc.collect()
+        after_repeat = sys.getrefcount(singleton)
+        assert after_repeat - after == count * 2 * 10
+
+
+class TestStructuredDtypeSparseFields(object):
+    """Tests subarray fields which contain sparse dtypes so that
+    not all memory is used by the dtype work. Such dtype's should
+    leave the underlying memory unchanged.
+    """
+    dtype = np.dtype([('a', {'names':['aa', 'ab'], 'formats':['f', 'f'],
+                             'offsets':[0, 4]}, (2, 3))])
+    sparse_dtype = np.dtype([('a', {'names':['ab'], 'formats':['f'],
+                                    'offsets':[4]}, (2, 3))])
+
+    @pytest.mark.xfail(reason="inaccessible data is changed see gh-12686.")
+    @pytest.mark.valgrind_error(reason="reads from unitialized buffers.")
+    def test_sparse_field_assignment(self):
+        arr = np.zeros(3, self.dtype)
+        sparse_arr = arr.view(self.sparse_dtype)
+
+        sparse_arr[...] = np.finfo(np.float32).max
+        # dtype is reduced when accessing the field, so shape is (3, 2, 3):
+        assert_array_equal(arr["a"]["aa"], np.zeros((3, 2, 3)))
+
+    def test_sparse_field_assignment_fancy(self):
+        # Fancy assignment goes to the copyswap function for complex types:
+        arr = np.zeros(3, self.dtype)
+        sparse_arr = arr.view(self.sparse_dtype)
+
+        sparse_arr[[0, 1, 2]] = np.finfo(np.float32).max
+        # dtype is reduced when accessing the field, so shape is (3, 2, 3):
+        assert_array_equal(arr["a"]["aa"], np.zeros((3, 2, 3)))
+
+
 class TestMonsterType(object):
     """Test deeply nested subtypes."""
 
diff --git a/numpy/core/tests/test_errstate.py b/numpy/core/tests/test_errstate.py
index 670d485..0008c4c 100644
--- a/numpy/core/tests/test_errstate.py
+++ b/numpy/core/tests/test_errstate.py
@@ -39,3 +39,11 @@
             with np.errstate(call=None):
                 assert_(np.geterrcall() is None, 'call is not None')
         assert_(np.geterrcall() is olderrcall, 'call is not olderrcall')
+
+    def test_errstate_decorator(self):
+        @np.errstate(all='ignore')
+        def foo():
+            a = -np.arange(3)
+            a // 0
+
+        foo()
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index a7517aa..241f8e4 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -54,7 +54,12 @@
 
 
 def _aligned_zeros(shape, dtype=float, order="C", align=None):
-    """Allocate a new ndarray with aligned memory."""
+    """
+    Allocate a new ndarray with aligned memory.
+
+    The ndarray is guaranteed *not* aligned to twice the requested alignment.
+    Eg, if align=4, guarantees it is not aligned to 8. If align=None uses
+    dtype.alignment."""
     dtype = np.dtype(dtype)
     if dtype == np.dtype(object):
         # Can't do this, fall back to standard allocation (which
@@ -67,10 +72,15 @@
     if not hasattr(shape, '__len__'):
         shape = (shape,)
     size = functools.reduce(operator.mul, shape) * dtype.itemsize
-    buf = np.empty(size + align + 1, np.uint8)
-    offset = buf.__array_interface__['data'][0] % align
+    buf = np.empty(size + 2*align + 1, np.uint8)
+
+    ptr = buf.__array_interface__['data'][0]
+    offset = ptr % align
     if offset != 0:
         offset = align - offset
+    if (ptr % (2*align)) == 0:
+        offset += align
+
     # Note: slices producing 0-size arrays do not necessarily change
     # data pointer --- so we use and allocate size+1
     buf = buf[offset:offset+size+1][:-1]
@@ -3124,8 +3134,8 @@
         assert_equal(ac, np.conjugate(a))
 
         a = np.array([1-1j, 1, 2.0, 'f'], object)
-        assert_raises(AttributeError, lambda: a.conj())
-        assert_raises(AttributeError, lambda: a.conjugate())
+        assert_raises(TypeError, lambda: a.conj())
+        assert_raises(TypeError, lambda: a.conjugate())
 
     def test__complex__(self):
         dtypes = ['i1', 'i2', 'i4', 'i8',
@@ -5910,7 +5920,7 @@
         assert_array_equal(out, tgt, err_msg=msg)
 
         # test out with not allowed type cast (safe casting)
-        msg = "Cannot cast ufunc matmul output"
+        msg = "Cannot cast ufunc .* output"
         out = np.zeros((5, 2), dtype=np.int32)
         assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out)
 
@@ -6999,12 +7009,11 @@
             assert_raises(AttributeError, delattr, a, s)
 
 
-def test_array_interface():
-    # Test scalar coercion within the array interface
+class TestArrayInterface():
     class Foo(object):
         def __init__(self, value):
             self.value = value
-            self.iface = {'typestr': '=f8'}
+            self.iface = {'typestr': 'f8'}
 
         def __float__(self):
             return float(self.value)
@@ -7013,22 +7022,39 @@
         def __array_interface__(self):
             return self.iface
 
-    f = Foo(0.5)
-    assert_equal(np.array(f), 0.5)
-    assert_equal(np.array([f]), [0.5])
-    assert_equal(np.array([f, f]), [0.5, 0.5])
-    assert_equal(np.array(f).dtype, np.dtype('=f8'))
-    # Test various shape definitions
-    f.iface['shape'] = ()
-    assert_equal(np.array(f), 0.5)
-    f.iface['shape'] = None
-    assert_raises(TypeError, np.array, f)
-    f.iface['shape'] = (1, 1)
-    assert_equal(np.array(f), [[0.5]])
-    f.iface['shape'] = (2,)
-    assert_raises(ValueError, np.array, f)
 
-    # test scalar with no shape
+    f = Foo(0.5)
+
+    @pytest.mark.parametrize('val, iface, expected', [
+        (f, {}, 0.5),
+        ([f], {}, [0.5]),
+        ([f, f], {}, [0.5, 0.5]),
+        (f, {'shape': ()}, 0.5),
+        (f, {'shape': None}, TypeError),
+        (f, {'shape': (1, 1)}, [[0.5]]),
+        (f, {'shape': (2,)}, ValueError),
+        (f, {'strides': ()}, 0.5),
+        (f, {'strides': (2,)}, ValueError),
+        (f, {'strides': 16}, TypeError),
+        ])
+    def test_scalar_interface(self, val, iface, expected):
+        # Test scalar coercion within the array interface
+        self.f.iface = {'typestr': 'f8'}
+        self.f.iface.update(iface)
+        if HAS_REFCOUNT:
+            pre_cnt = sys.getrefcount(np.dtype('f8'))
+        if isinstance(expected, type):
+            assert_raises(expected, np.array, val)
+        else:
+            result = np.array(val)
+            assert_equal(np.array(val), expected)
+            assert result.dtype == 'f8'
+            del result
+        if HAS_REFCOUNT:
+            post_cnt = sys.getrefcount(np.dtype('f8'))
+            assert_equal(pre_cnt, post_cnt)
+
+def test_interface_no_shape():
     class ArrayLike(object):
         array = np.array(1)
         __array_interface__ = array.__array_interface__
@@ -7201,6 +7227,7 @@
         except NameError:
             Error = RuntimeError  # python < 3.5
         assert_raises(Error, bool, self_containing)  # previously stack overflow
+        self_containing[0] = None  # resolve circular reference
 
     def test_to_int_scalar(self):
         # gh-9972 means that these aren't always the same
@@ -7626,6 +7653,55 @@
         finally:
             _internal.ctypes = ctypes
 
+    def _make_readonly(x):
+        x.flags.writeable = False
+        return x
+
+    @pytest.mark.parametrize('arr', [
+        np.array([1, 2, 3]),
+        np.array([['one', 'two'], ['three', 'four']]),
+        np.array((1, 2), dtype='i4,i4'),
+        np.zeros((2,), dtype=
+            np.dtype(dict(
+                formats=['<i4', '<i4'],
+                names=['a', 'b'],
+                offsets=[0, 2],
+                itemsize=6
+            ))
+        ),
+        np.array([None], dtype=object),
+        np.array([]),
+        np.empty((0, 0)),
+        _make_readonly(np.array([1, 2, 3])),
+    ], ids=[
+        '1d',
+        '2d',
+        'structured',
+        'overlapping',
+        'object',
+        'empty',
+        'empty-2d',
+        'readonly'
+    ])
+    def test_ctypes_data_as_holds_reference(self, arr):
+        # gh-9647
+        # create a copy to ensure that pytest does not mess with the refcounts
+        arr = arr.copy()
+
+        arr_ref = weakref.ref(arr)
+
+        ctypes_ptr = arr.ctypes.data_as(ctypes.c_void_p)
+
+        # `ctypes_ptr` should hold onto `arr`
+        del arr
+        gc.collect()
+        assert_(arr_ref() is not None, "ctypes pointer did not hold onto a reference")
+
+        # but when the `ctypes_ptr` object dies, so should `arr`
+        del ctypes_ptr
+        gc.collect()
+        assert_(arr_ref() is None, "unknowable whether ctypes pointer holds a reference")
+
 
 class TestWritebackIfCopy(object):
     # all these tests use the WRITEBACKIFCOPY mechanism
@@ -7925,6 +8001,77 @@
     dst = np.zeros((2,2), dtype='c8')
     dst[:,1] = src[:,1]  # assert in lowlevel_strided_loops fails?
 
+class TestAlignment(object):
+    # adapted from scipy._lib.tests.test__util.test__aligned_zeros
+    # Checks that unusual memory alignments don't trip up numpy.
+    # In particular, check RELAXED_STRIDES don't trip alignment assertions in
+    # NDEBUG mode for size-0 arrays (gh-12503)
+
+    def check(self, shape, dtype, order, align):
+        err_msg = repr((shape, dtype, order, align))
+        x = _aligned_zeros(shape, dtype, order, align=align)
+        if align is None:
+            align = np.dtype(dtype).alignment
+        assert_equal(x.__array_interface__['data'][0] % align, 0)
+        if hasattr(shape, '__len__'):
+            assert_equal(x.shape, shape, err_msg)
+        else:
+            assert_equal(x.shape, (shape,), err_msg)
+        assert_equal(x.dtype, dtype)
+        if order == "C":
+            assert_(x.flags.c_contiguous, err_msg)
+        elif order == "F":
+            if x.size > 0:
+                assert_(x.flags.f_contiguous, err_msg)
+        elif order is None:
+            assert_(x.flags.c_contiguous, err_msg)
+        else:
+            raise ValueError()
+
+    def test_various_alignments(self):
+        for align in [1, 2, 3, 4, 8, 12, 16, 32, 64, None]:
+            for n in [0, 1, 3, 11]:
+                for order in ["C", "F", None]:
+                    for dtype in list(np.typecodes["All"]) + ['i4,i4,i4']:
+                        if dtype == 'O':
+                            # object dtype can't be misaligned
+                            continue
+                        for shape in [n, (1, 2, 3, n)]:
+                            self.check(shape, np.dtype(dtype), order, align)
+
+    def test_strided_loop_alignments(self):
+        # particularly test that complex64 and float128 use right alignment
+        # code-paths, since these are particularly problematic. It is useful to
+        # turn on USE_DEBUG for this test, so lowlevel-loop asserts are run.
+        for align in [1, 2, 4, 8, 12, 16, None]:
+            xf64 = _aligned_zeros(3, np.float64)
+
+            xc64 = _aligned_zeros(3, np.complex64, align=align)
+            xf128 = _aligned_zeros(3, np.longdouble, align=align)
+
+            # test casting, both to and from misaligned
+            with suppress_warnings() as sup:
+                sup.filter(np.ComplexWarning, "Casting complex values")
+                xc64.astype('f8')
+            xf64.astype(np.complex64)
+            test = xc64 + xf64
+
+            xf128.astype('f8')
+            xf64.astype(np.longdouble)
+            test = xf128 + xf64
+
+            test = xf128 + xc64
+
+            # test copy, both to and from misaligned
+            # contig copy
+            xf64[:] = xf64.copy()
+            xc64[:] = xc64.copy()
+            xf128[:] = xf128.copy()
+            # strided copy
+            xf64[::2] = xf64[::2].copy()
+            xc64[::2] = xc64[::2].copy()
+            xf128[::2] = xf128[::2].copy()
+
 def test_getfield():
     a = np.arange(32, dtype='uint16')
     if sys.byteorder == 'little':
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 62b2a3e..8f1c165 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -7,7 +7,7 @@
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex)
 from numpy.core.overrides import (
-    get_overloaded_types_and_args, array_function_dispatch,
+    _get_implementing_args, array_function_dispatch,
     verify_matching_signatures, ENABLE_ARRAY_FUNCTION)
 from numpy.core.numeric import pickle
 import pytest
@@ -18,11 +18,6 @@
     reason="__array_function__ dispatch not enabled.")
 
 
-def _get_overloaded_args(relevant_args):
-    types, args = get_overloaded_types_and_args(relevant_args)
-    return args
-
-
 def _return_not_implemented(self, *args, **kwargs):
     return NotImplemented
 
@@ -41,26 +36,21 @@
 
 
 @requires_array_function
-class TestGetOverloadedTypesAndArgs(object):
+class TestGetImplementingArgs(object):
 
     def test_ndarray(self):
         array = np.array(1)
 
-        types, args = get_overloaded_types_and_args([array])
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([array])
         assert_equal(list(args), [array])
 
-        types, args = get_overloaded_types_and_args([array, array])
-        assert_equal(len(types), 1)
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([array, array])
         assert_equal(list(args), [array])
 
-        types, args = get_overloaded_types_and_args([array, 1])
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([array, 1])
         assert_equal(list(args), [array])
 
-        types, args = get_overloaded_types_and_args([1, array])
-        assert_equal(set(types), {np.ndarray})
+        args = _get_implementing_args([1, array])
         assert_equal(list(args), [array])
 
     def test_ndarray_subclasses(self):
@@ -75,17 +65,14 @@
         override_sub = np.array(1).view(OverrideSub)
         no_override_sub = np.array(1).view(NoOverrideSub)
 
-        types, args = get_overloaded_types_and_args([array, override_sub])
-        assert_equal(set(types), {np.ndarray, OverrideSub})
+        args = _get_implementing_args([array, override_sub])
         assert_equal(list(args), [override_sub, array])
 
-        types, args = get_overloaded_types_and_args([array, no_override_sub])
-        assert_equal(set(types), {np.ndarray, NoOverrideSub})
+        args = _get_implementing_args([array, no_override_sub])
         assert_equal(list(args), [no_override_sub, array])
 
-        types, args = get_overloaded_types_and_args(
+        args = _get_implementing_args(
             [override_sub, no_override_sub])
-        assert_equal(set(types), {OverrideSub, NoOverrideSub})
         assert_equal(list(args), [override_sub, no_override_sub])
 
     def test_ndarray_and_duck_array(self):
@@ -96,12 +83,10 @@
         array = np.array(1)
         other = Other()
 
-        types, args = get_overloaded_types_and_args([other, array])
-        assert_equal(set(types), {np.ndarray, Other})
+        args = _get_implementing_args([other, array])
         assert_equal(list(args), [other, array])
 
-        types, args = get_overloaded_types_and_args([array, other])
-        assert_equal(set(types), {np.ndarray, Other})
+        args = _get_implementing_args([array, other])
         assert_equal(list(args), [array, other])
 
     def test_ndarray_subclass_and_duck_array(self):
@@ -116,9 +101,9 @@
         subarray = np.array(1).view(OverrideSub)
         other = Other()
 
-        assert_equal(_get_overloaded_args([array, subarray, other]),
+        assert_equal(_get_implementing_args([array, subarray, other]),
                      [subarray, array, other])
-        assert_equal(_get_overloaded_args([array, other, subarray]),
+        assert_equal(_get_implementing_args([array, other, subarray]),
                      [subarray, array, other])
 
     def test_many_duck_arrays(self):
@@ -140,15 +125,26 @@
         c = C()
         d = D()
 
-        assert_equal(_get_overloaded_args([1]), [])
-        assert_equal(_get_overloaded_args([a]), [a])
-        assert_equal(_get_overloaded_args([a, 1]), [a])
-        assert_equal(_get_overloaded_args([a, a, a]), [a])
-        assert_equal(_get_overloaded_args([a, d, a]), [a, d])
-        assert_equal(_get_overloaded_args([a, b]), [b, a])
-        assert_equal(_get_overloaded_args([b, a]), [b, a])
-        assert_equal(_get_overloaded_args([a, b, c]), [b, c, a])
-        assert_equal(_get_overloaded_args([a, c, b]), [c, b, a])
+        assert_equal(_get_implementing_args([1]), [])
+        assert_equal(_get_implementing_args([a]), [a])
+        assert_equal(_get_implementing_args([a, 1]), [a])
+        assert_equal(_get_implementing_args([a, a, a]), [a])
+        assert_equal(_get_implementing_args([a, d, a]), [a, d])
+        assert_equal(_get_implementing_args([a, b]), [b, a])
+        assert_equal(_get_implementing_args([b, a]), [b, a])
+        assert_equal(_get_implementing_args([a, b, c]), [b, c, a])
+        assert_equal(_get_implementing_args([a, c, b]), [c, b, a])
+
+    def test_too_many_duck_arrays(self):
+        namespace = dict(__array_function__=_return_not_implemented)
+        types = [type('A' + str(i), (object,), namespace) for i in range(33)]
+        relevant_args = [t() for t in types]
+
+        actual = _get_implementing_args(relevant_args[:32])
+        assert_equal(actual, relevant_args[:32])
+
+        with assert_raises_regex(TypeError, 'distinct argument types'):
+            _get_implementing_args(relevant_args)
 
 
 @requires_array_function
@@ -201,6 +197,14 @@
         result = np.concatenate((array, override_sub))
         assert_equal(result, expected.view(OverrideSub))
 
+    def test_no_wrapper(self):
+        array = np.array(1)
+        func = dispatched_one_arg.__wrapped__
+        with assert_raises_regex(AttributeError, '__wrapped__'):
+            array.__array_function__(func=func,
+                                     types=(np.ndarray,),
+                                     args=(array,), kwargs={})
+
 
 @requires_array_function
 class TestArrayFunctionDispatch(object):
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 2421a11..17c4898 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -46,7 +46,7 @@
             assert_array_equal(a, b)
 
     def test_typeNA(self):
-        # Issue gh-515 
+        # Issue gh-515
         with suppress_warnings() as sup:
             sup.filter(np.VisibleDeprecationWarning)
             assert_equal(np.typeNA[np.int64], 'Int64')
@@ -2415,3 +2415,11 @@
         # gh-11993
         arr = np.array(['AAAAA', 18465886.0, 18465886.0], dtype=object)
         assert_raises(TypeError, arr.astype, 'c8')
+
+    def test_eff1d_casting(self):
+        # gh-12711
+        x = np.array([1, 2, 4, 7, 0], dtype=np.int16)
+        res = np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
+        assert_equal(res, [-99,   1,   2,   3,  -7,  88,  99])
+        assert_raises(ValueError, np.ediff1d, x, to_begin=(1<<20))
+        assert_raises(ValueError, np.ediff1d, x, to_end=(1<<20))
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index ef5c118..53d272f 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -224,13 +224,27 @@
         assert_raises(ValueError, concatenate, (0,))
         assert_raises(ValueError, concatenate, (np.array(0),))
 
+        # dimensionality must match
+        assert_raises_regex(
+            ValueError,
+            r"all the input arrays must have same number of dimensions, but "
+            r"the array at index 0 has 1 dimension\(s\) and the array at "
+            r"index 1 has 2 dimension\(s\)",
+            np.concatenate, (np.zeros(1), np.zeros((1, 1))))
+
         # test shapes must match except for concatenation axis
         a = np.ones((1, 2, 3))
         b = np.ones((2, 2, 3))
         axis = list(range(3))
         for i in range(3):
             np.concatenate((a, b), axis=axis[0])  # OK
-            assert_raises(ValueError, np.concatenate, (a, b), axis=axis[1])
+            assert_raises_regex(
+                ValueError,
+                "all the input array dimensions for the concatenation axis "
+                "must match exactly, but along dimension {}, the array at "
+                "index 0 has size 1 and the array at index 1 has size 2"
+                .format(i),
+                np.concatenate, (a, b), axis=axis[1])
             assert_raises(ValueError, np.concatenate, (a, b), axis=axis[2])
             a = np.moveaxis(a, -1, 0)
             b = np.moveaxis(b, -1, 0)
@@ -373,6 +387,10 @@
     # empty arrays
     assert_(stack([[], [], []]).shape == (3, 0))
     assert_(stack([[], [], []], axis=1).shape == (0, 3))
+    # out
+    out = np.zeros_like(r1)
+    np.stack((a, b), out=out)
+    assert_array_equal(out, r1)
     # edge cases
     assert_raises_regex(ValueError, 'need at least one array', stack, [])
     assert_raises_regex(ValueError, 'must have the same shape',
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index b83b8cc..fa62767 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -3,6 +3,8 @@
 import warnings
 import itertools
 
+import pytest
+
 import numpy as np
 import numpy.core._umath_tests as umt
 import numpy.linalg._umath_linalg as uml
@@ -596,6 +598,12 @@
         assert_equal(np.sum(np.ones((2, 3, 5), dtype=np.int64), axis=(0, 2), initial=2),
                      [12, 12, 12])
 
+    def test_sum_where(self):
+        # More extensive tests done in test_reduction_with_where.
+        assert_equal(np.sum([[1., 2.], [3., 4.]], where=[True, False]), 4.)
+        assert_equal(np.sum([[1., 2.], [3., 4.]], axis=0, initial=5.,
+                            where=[True, False]), [9., 5.])
+
     def test_inner1d(self):
         a = np.arange(6).reshape((2, 3))
         assert_array_equal(umt.inner1d(a, a), np.sum(a*a, axis=-1))
@@ -1162,6 +1170,8 @@
         assert_equal(np.array([[1]], dtype=object).sum(), 1)
         assert_equal(np.array([[[1, 2]]], dtype=object).sum((0, 1)), [1, 2])
         assert_equal(np.array([1], dtype=object).sum(initial=1), 2)
+        assert_equal(np.array([[1], [2, 3]], dtype=object)
+                     .sum(initial=[0], where=[False, True]), [0, 2, 3])
 
     def test_object_array_accumulate_inplace(self):
         # Checks that in-place accumulates work, see also gh-7402
@@ -1396,6 +1406,44 @@
         res = np.add.reduce(a, initial=5)
         assert_equal(res, 15)
 
+    @pytest.mark.parametrize('axis', (0, 1, None))
+    @pytest.mark.parametrize('where', (np.array([False, True, True]),
+                                       np.array([[True], [False], [True]]),
+                                       np.array([[True, False, False],
+                                                 [False, True, False],
+                                                 [False, True, True]])))
+    def test_reduction_with_where(self, axis, where):
+        a = np.arange(9.).reshape(3, 3)
+        a_copy = a.copy()
+        a_check = np.zeros_like(a)
+        np.positive(a, out=a_check, where=where)
+
+        res = np.add.reduce(a, axis=axis, where=where)
+        check = a_check.sum(axis)
+        assert_equal(res, check)
+        # Check we do not overwrite elements of a internally.
+        assert_array_equal(a, a_copy)
+
+    @pytest.mark.parametrize(('axis', 'where'),
+                             ((0, np.array([True, False, True])),
+                              (1, [True, True, False]),
+                              (None, True)))
+    @pytest.mark.parametrize('initial', (-np.inf, 5.))
+    def test_reduction_with_where_and_initial(self, axis, where, initial):
+        a = np.arange(9.).reshape(3, 3)
+        a_copy = a.copy()
+        a_check = np.full(a.shape, -np.inf)
+        np.positive(a, out=a_check, where=where)
+
+        res = np.maximum.reduce(a, axis=axis, where=where, initial=initial)
+        check = a_check.max(axis, initial=initial)
+        assert_equal(res, check)
+
+    def test_reduction_where_initial_needed(self):
+        a = np.arange(9.).reshape(3, 3)
+        m = [False, True, False]
+        assert_raises(ValueError, np.maximum.reduce, a, where=m)
+
     def test_identityless_reduction_nonreorderable(self):
         a = np.array([[8.0, 2.0, 2.0], [1.0, 0.5, 0.25]])
 
@@ -1749,16 +1797,19 @@
         assert_equal(f(d, 0, None, None, True), r.reshape((1,) + r.shape))
         assert_equal(f(d, 0, None, None, False, 0), r)
         assert_equal(f(d, 0, None, None, False, initial=0), r)
+        assert_equal(f(d, 0, None, None, False, 0, True), r)
+        assert_equal(f(d, 0, None, None, False, 0, where=True), r)
         # multiple keywords
         assert_equal(f(d, axis=0, dtype=None, out=None, keepdims=False), r)
         assert_equal(f(d, 0, dtype=None, out=None, keepdims=False), r)
         assert_equal(f(d, 0, None, out=None, keepdims=False), r)
-        assert_equal(f(d, 0, None, out=None, keepdims=False, initial=0), r)
+        assert_equal(f(d, 0, None, out=None, keepdims=False, initial=0,
+                       where=True), r)
 
         # too little
         assert_raises(TypeError, f)
         # too much
-        assert_raises(TypeError, f, d, 0, None, None, False, 0, 1)
+        assert_raises(TypeError, f, d, 0, None, None, False, 0, True, 1)
         # invalid axis
         assert_raises(TypeError, f, d, "invalid")
         assert_raises(TypeError, f, d, axis="invalid")
@@ -1857,3 +1908,9 @@
     def test_no_doc_string(self):
         # gh-9337
         assert_('\n' not in umt.inner1d_no_doc.__doc__)
+
+    def test_invalid_args(self):
+        # gh-7961
+        exc = pytest.raises(TypeError, np.sqrt, None)
+        # minimally check the exception text
+        assert 'loop of ufunc does not support' in str(exc)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2f8edeb..2109724 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1894,7 +1894,8 @@
 
         # reduce, kwargs
         res = np.multiply.reduce(a, axis='axis0', dtype='dtype0', out='out0',
-                                 keepdims='keep0', initial='init0')
+                                 keepdims='keep0', initial='init0',
+                                 where='where0')
         assert_equal(res[0], a)
         assert_equal(res[1], np.multiply)
         assert_equal(res[2], 'reduce')
@@ -1903,7 +1904,8 @@
                               'out': ('out0',),
                               'keepdims': 'keep0',
                               'axis': 'axis0',
-                              'initial': 'init0'})
+                              'initial': 'init0',
+                              'where': 'where0'})
 
         # reduce, output equal to None removed, but not other explicit ones,
         # even if they are at their default value.
@@ -1913,14 +1915,18 @@
         assert_equal(res[4], {'axis': 0, 'keepdims': True})
         res = np.multiply.reduce(a, None, out=(None,), dtype=None)
         assert_equal(res[4], {'axis': None, 'dtype': None})
-        res = np.multiply.reduce(a, 0, None, None, False, 2)
-        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': 2})
-        # np._NoValue ignored for initial.
-        res = np.multiply.reduce(a, 0, None, None, False, np._NoValue)
-        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False})
-        # None kept for initial.
-        res = np.multiply.reduce(a, 0, None, None, False, None)
-        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False, 'initial': None})
+        res = np.multiply.reduce(a, 0, None, None, False, 2, True)
+        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False,
+                              'initial': 2, 'where': True})
+        # np._NoValue ignored for initial
+        res = np.multiply.reduce(a, 0, None, None, False,
+                                 np._NoValue, True)
+        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False,
+                              'where': True})
+        # None kept for initial, True for where.
+        res = np.multiply.reduce(a, 0, None, None, False, None, True)
+        assert_equal(res[4], {'axis': 0, 'dtype': None, 'keepdims': False,
+                              'initial': None, 'where': True})
 
         # reduce, wrong args
         assert_raises(ValueError, np.multiply.reduce, a, out=())
diff --git a/numpy/core/tests/test_umath_complex.py b/numpy/core/tests/test_umath_complex.py
index 785ae8c..1f5b407 100644
--- a/numpy/core/tests/test_umath_complex.py
+++ b/numpy/core/tests/test_umath_complex.py
@@ -5,7 +5,8 @@
 import pytest
 
 import numpy as np
-import numpy.core.umath as ncu
+# import the c-extension module directly since _arg is not exported via umath
+import numpy.core._multiarray_umath as ncu
 from numpy.testing import (
     assert_raises, assert_equal, assert_array_equal, assert_almost_equal
     )
diff --git a/numpy/core/umath.py b/numpy/core/umath.py
index a0e8ad4..f3b26ab 100644
--- a/numpy/core/umath.py
+++ b/numpy/core/umath.py
@@ -9,7 +9,7 @@
 from . import _multiarray_umath
 from numpy.core._multiarray_umath import *
 from numpy.core._multiarray_umath import (
-    _UFUNC_API, _add_newdoc_ufunc, _arg, _ones_like
+    _UFUNC_API, _add_newdoc_ufunc, _ones_like
     )
 
 __all__ = [
@@ -18,7 +18,7 @@
     'FPE_DIVIDEBYZERO', 'FPE_INVALID', 'FPE_OVERFLOW', 'FPE_UNDERFLOW', 'NAN',
     'NINF', 'NZERO', 'PINF', 'PZERO', 'SHIFT_DIVIDEBYZERO', 'SHIFT_INVALID',
     'SHIFT_OVERFLOW', 'SHIFT_UNDERFLOW', 'UFUNC_BUFSIZE_DEFAULT',
-    'UFUNC_PYVALS_NAME', '_add_newdoc_ufunc', '_arg', 'absolute', 'add',
+    'UFUNC_PYVALS_NAME', '_add_newdoc_ufunc', 'absolute', 'add',
     'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctan2', 'arctanh',
     'bitwise_and', 'bitwise_or', 'bitwise_xor', 'cbrt', 'ceil', 'conj',
     'conjugate', 'copysign', 'cos', 'cosh', 'deg2rad', 'degrees', 'divide',
diff --git a/numpy/ctypeslib.py b/numpy/ctypeslib.py
index 1136858..02c3bd2 100644
--- a/numpy/ctypeslib.py
+++ b/numpy/ctypeslib.py
@@ -93,7 +93,7 @@
     def load_library(libname, loader_path):
         """
         It is possible to load a library using 
-        >>> lib = ctypes.cdll[<full_path_name>]
+        >>> lib = ctypes.cdll[<full_path_name>] # doctest: +SKIP
 
         But there are cross-platform considerations, such as library file extensions,
         plus the fact Windows will just load the first library it finds with that name.  
@@ -346,27 +346,157 @@
     return klass
 
 
-def _get_typecodes():
-    """ Return a dictionary mapping __array_interface__ formats to ctypes types """
-    ct = ctypes
-    simple_types = [
-        ct.c_byte, ct.c_short, ct.c_int, ct.c_long, ct.c_longlong,
-        ct.c_ubyte, ct.c_ushort, ct.c_uint, ct.c_ulong, ct.c_ulonglong,
-        ct.c_float, ct.c_double,
-    ]
-
-    return {_dtype(ctype).str: ctype for ctype in simple_types}
-
-
-def _ctype_ndarray(element_type, shape):
-    """ Create an ndarray of the given element type and shape """
-    for dim in shape[::-1]:
-        element_type = element_type * dim
-    return element_type
-
-
 if ctypes is not None:
-    _typecodes = _get_typecodes()
+    def _ctype_ndarray(element_type, shape):
+        """ Create an ndarray of the given element type and shape """
+        for dim in shape[::-1]:
+            element_type = dim * element_type
+            # prevent the type name include np.ctypeslib
+            element_type.__module__ = None
+        return element_type
+
+
+    def _get_scalar_type_map():
+        """
+        Return a dictionary mapping native endian scalar dtype to ctypes types
+        """
+        ct = ctypes
+        simple_types = [
+            ct.c_byte, ct.c_short, ct.c_int, ct.c_long, ct.c_longlong,
+            ct.c_ubyte, ct.c_ushort, ct.c_uint, ct.c_ulong, ct.c_ulonglong,
+            ct.c_float, ct.c_double,
+            ct.c_bool,
+        ]
+        return {_dtype(ctype): ctype for ctype in simple_types}
+
+
+    _scalar_type_map = _get_scalar_type_map()
+
+
+    def _ctype_from_dtype_scalar(dtype):
+        # swapping twice ensure that `=` is promoted to <, >, or |
+        dtype_with_endian = dtype.newbyteorder('S').newbyteorder('S')
+        dtype_native = dtype.newbyteorder('=')
+        try:
+            ctype = _scalar_type_map[dtype_native]
+        except KeyError:
+            raise NotImplementedError(
+                "Converting {!r} to a ctypes type".format(dtype)
+            )
+
+        if dtype_with_endian.byteorder == '>':
+            ctype = ctype.__ctype_be__
+        elif dtype_with_endian.byteorder == '<':
+            ctype = ctype.__ctype_le__
+
+        return ctype
+
+
+    def _ctype_from_dtype_subarray(dtype):
+        element_dtype, shape = dtype.subdtype
+        ctype = _ctype_from_dtype(element_dtype)
+        return _ctype_ndarray(ctype, shape)
+
+
+    def _ctype_from_dtype_structured(dtype):
+        # extract offsets of each field
+        field_data = []
+        for name in dtype.names:
+            field_dtype, offset = dtype.fields[name][:2]
+            field_data.append((offset, name, _ctype_from_dtype(field_dtype)))
+
+        # ctypes doesn't care about field order
+        field_data = sorted(field_data, key=lambda f: f[0])
+
+        if len(field_data) > 1 and all(offset == 0 for offset, name, ctype in field_data):
+            # union, if multiple fields all at address 0
+            size = 0
+            _fields_ = []
+            for offset, name, ctype in field_data:
+                _fields_.append((name, ctype))
+                size = max(size, ctypes.sizeof(ctype))
+
+            # pad to the right size
+            if dtype.itemsize != size:
+                _fields_.append(('', ctypes.c_char * dtype.itemsize))
+
+            # we inserted manual padding, so always `_pack_`
+            return type('union', (ctypes.Union,), dict(
+                _fields_=_fields_,
+                _pack_=1,
+                __module__=None,
+            ))
+        else:
+            last_offset = 0
+            _fields_ = []
+            for offset, name, ctype in field_data:
+                padding = offset - last_offset
+                if padding < 0:
+                    raise NotImplementedError("Overlapping fields")
+                if padding > 0:
+                    _fields_.append(('', ctypes.c_char * padding))
+
+                _fields_.append((name, ctype))
+                last_offset = offset + ctypes.sizeof(ctype)
+
+
+            padding = dtype.itemsize - last_offset
+            if padding > 0:
+                _fields_.append(('', ctypes.c_char * padding))
+
+            # we inserted manual padding, so always `_pack_`
+            return type('struct', (ctypes.Structure,), dict(
+                _fields_=_fields_,
+                _pack_=1,
+                __module__=None,
+            ))
+
+
+    def _ctype_from_dtype(dtype):
+        if dtype.fields is not None:
+            return _ctype_from_dtype_structured(dtype)
+        elif dtype.subdtype is not None:
+            return _ctype_from_dtype_subarray(dtype)
+        else:
+            return _ctype_from_dtype_scalar(dtype)
+
+
+    def as_ctypes_type(dtype):
+        """
+        Convert a dtype into a ctypes type.
+
+        Parameters
+        ----------
+        dtype : dtype
+            The dtype to convert
+
+        Returns
+        -------
+        ctypes
+            A ctype scalar, union, array, or struct
+
+        Raises
+        ------
+        NotImplementedError
+            If the conversion is not possible
+
+        Notes
+        -----
+        This function does not losslessly round-trip in either direction.
+
+        ``np.dtype(as_ctypes_type(dt))`` will:
+         - insert padding fields
+         - reorder fields to be sorted by offset
+         - discard field titles
+
+        ``as_ctypes_type(np.dtype(ctype))`` will:
+         - discard the class names of ``Structure``s and ``Union``s
+         - convert single-element ``Union``s into single-element ``Structure``s
+         - insert padding fields
+
+        """
+        return _ctype_from_dtype(_dtype(dtype))
+
 
     def as_array(obj, shape=None):
         """
@@ -388,6 +518,7 @@
 
         return array(obj, copy=False)
 
+
     def as_ctypes(obj):
         """Create and return a ctypes object from a numpy array.  Actually
         anything that exposes the __array_interface__ is accepted."""
@@ -399,7 +530,8 @@
         addr, readonly = ai["data"]
         if readonly:
             raise TypeError("readonly arrays unsupported")
-        tp = _ctype_ndarray(_typecodes[ai["typestr"]], ai["shape"])
-        result = tp.from_address(addr)
-        result.__keep = ai
+
+        dtype = _dtype((ai["typestr"], ai["shape"]))
+        result = as_ctypes_type(dtype).from_address(addr)
+        result.__keep = obj
         return result
diff --git a/numpy/distutils/ccompiler.py b/numpy/distutils/ccompiler.py
index 5b7cb3f..100d0d0 100644
--- a/numpy/distutils/ccompiler.py
+++ b/numpy/distutils/ccompiler.py
@@ -17,7 +17,9 @@
 
 from numpy.distutils import log
 from numpy.distutils.compat import get_exception
-from numpy.distutils.exec_command import filepath_from_subprocess_output
+from numpy.distutils.exec_command import (
+    filepath_from_subprocess_output, forward_bytes_to_stdout
+)
 from numpy.distutils.misc_util import cyg2win32, is_sequence, mingw32, \
                                       get_num_build_jobs, \
                                       _commandline_dep_string
@@ -159,11 +161,9 @@
 
     if is_sequence(cmd):
         cmd = ' '.join(list(cmd))
-    try:
-        print(o)
-    except UnicodeError:
-        # When installing through pip, `o` can contain non-ascii chars
-        pass
+
+    forward_bytes_to_stdout(o)
+
     if re.search(b'Too many open files', o):
         msg = '\nTry rerunning setup command until build succeeds.'
     else:
diff --git a/numpy/distutils/exec_command.py b/numpy/distutils/exec_command.py
index aaeca99..ede347b 100644
--- a/numpy/distutils/exec_command.py
+++ b/numpy/distutils/exec_command.py
@@ -81,6 +81,29 @@
         output = output.encode('ascii', errors='replace')
     return output
 
+
+def forward_bytes_to_stdout(val):
+    """
+    Forward bytes from a subprocess call to the console, without attempting to
+    decode them.
+
+    The assumption is that the subprocess call already returned bytes in
+    a suitable encoding.
+    """
+    if sys.version_info.major < 3:
+        # python 2 has binary output anyway
+        sys.stdout.write(val)
+    elif hasattr(sys.stdout, 'buffer'):
+        # use the underlying binary output if there is one
+        sys.stdout.buffer.write(val)
+    elif hasattr(sys.stdout, 'encoding'):
+        # round-trip the encoding if necessary
+        sys.stdout.write(val.decode(sys.stdout.encoding))
+    else:
+        # make a best-guess at the encoding
+        sys.stdout.write(val.decode('utf8', errors='replace'))
+
+
 def temp_file_name():
     fo, name = make_temp_file()
     fo.close()
diff --git a/numpy/distutils/fcompiler/environment.py b/numpy/distutils/fcompiler/environment.py
index 4897845..4238f35 100644
--- a/numpy/distutils/fcompiler/environment.py
+++ b/numpy/distutils/fcompiler/environment.py
@@ -1,6 +1,7 @@
 from __future__ import division, absolute_import, print_function
 
 import os
+import warnings
 from distutils.dist import Distribution
 
 __metaclass__ = type
@@ -54,8 +55,18 @@
         if envvar is not None:
             envvar_contents = os.environ.get(envvar)
             if envvar_contents is not None:
-                if var and append and os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1':
-                    var = var + [envvar_contents]
+                if var and append:
+                    if os.environ.get('NPY_DISTUTILS_APPEND_FLAGS', '0') == '1':
+                        var = var + [envvar_contents]
+                    else:
+                        var = envvar_contents
+                        if 'NPY_DISTUTILS_APPEND_FLAGS' not in os.environ.keys():
+                            msg = "{} is used as is, not appended ".format(envvar) + \
+                                  "to flags already defined " + \
+                                  "by numpy.distutils! Use NPY_DISTUTILS_APPEND_FLAGS=1 " + \
+                                  "to obtain appending behavior instead (this " + \
+                                  "behavior will become default in a future release)."
+                            warnings.warn(msg, UserWarning, stacklevel=3)
                 else:
                     var = envvar_contents
         if confvar is not None and self._conf:
diff --git a/numpy/distutils/fcompiler/pg.py b/numpy/distutils/fcompiler/pg.py
index 9907180..cdba0e3 100644
--- a/numpy/distutils/fcompiler/pg.py
+++ b/numpy/distutils/fcompiler/pg.py
@@ -33,7 +33,7 @@
             'compiler_f77': ["pgfortran"],
             'compiler_fix': ["pgfortran", "-Mfixed"],
             'compiler_f90': ["pgfortran"],
-            'linker_so': ["pgfortran", "-shared", "-fpic"],
+            'linker_so': ["pgfortran"],
             'archiver': ["ar", "-cr"],
             'ranlib': ["ranlib"]
         }
@@ -56,6 +56,10 @@
         def get_flags_linker_so(self):
             return ["-dynamic", '-undefined', 'dynamic_lookup']
 
+    else:
+        def get_flags_linker_so(self):
+            return ["-shared", '-fpic']
+
     def runtime_library_dir_option(self, dir):
         return '-R"%s"' % dir
 
diff --git a/numpy/distutils/tests/test_fcompiler.py b/numpy/distutils/tests/test_fcompiler.py
index 95e44b0..ba19a97 100644
--- a/numpy/distutils/tests/test_fcompiler.py
+++ b/numpy/distutils/tests/test_fcompiler.py
@@ -1,6 +1,8 @@
 from __future__ import division, absolute_import, print_function
 
-from numpy.testing import assert_
+import pytest
+
+from numpy.testing import assert_, suppress_warnings
 import numpy.distutils.fcompiler
 
 customizable_flags = [
@@ -25,6 +27,7 @@
 
         monkeypatch.setenv(envvar, new_flag)
         new_flags = getattr(flag_vars, opt)
+
         monkeypatch.delenv(envvar)
         assert_(new_flags == [new_flag])
 
@@ -33,12 +36,46 @@
     for opt, envvar in customizable_flags:
         new_flag = '-dummy-{}-flag'.format(opt)
         prev_flags = getattr(flag_vars, opt)
-
         monkeypatch.setenv(envvar, new_flag)
         new_flags = getattr(flag_vars, opt)
+
         monkeypatch.delenv(envvar)
         if prev_flags is None:
             assert_(new_flags == [new_flag])
         else:
             assert_(new_flags == prev_flags + [new_flag])
 
+
+def test_fcompiler_flags_append_warning(monkeypatch):
+    # Test to check that the warning for append behavior changing in future
+    # is triggered.  Need to use a real compiler instance so that we have
+    # non-empty flags to start with (otherwise the "if var and append" check
+    # will always be false).
+    try:
+        with suppress_warnings() as sup:
+            sup.record()
+            fc = numpy.distutils.fcompiler.new_fcompiler(compiler='gnu95')
+            fc.customize()
+    except numpy.distutils.fcompiler.CompilerNotFound:
+        pytest.skip("gfortran not found, so can't execute this test")
+
+    # Ensure NPY_DISTUTILS_APPEND_FLAGS not defined
+    monkeypatch.delenv('NPY_DISTUTILS_APPEND_FLAGS', raising=False)
+
+    for opt, envvar in customizable_flags:
+        new_flag = '-dummy-{}-flag'.format(opt)
+        with suppress_warnings() as sup:
+            sup.record()
+            prev_flags = getattr(fc.flag_vars, opt)
+
+        monkeypatch.setenv(envvar, new_flag)
+        with suppress_warnings() as sup:
+            sup.record()
+            new_flags = getattr(fc.flag_vars, opt)
+            if prev_flags:
+                # Check that warning was issued
+                assert len(sup.log) == 1
+
+        monkeypatch.delenv(envvar)
+        assert_(new_flags == [new_flag])
+
diff --git a/numpy/doc/glossary.py b/numpy/doc/glossary.py
index a3b9423..a370734 100644
--- a/numpy/doc/glossary.py
+++ b/numpy/doc/glossary.py
@@ -270,13 +270,11 @@
          masked_array(data = [-- 2.0 --],
                       mask = [ True False  True],
                 fill_value = 1e+20)
-         <BLANKLINE>
 
          >>> x + [1, 2, 3]
          masked_array(data = [-- 4.0 --],
                       mask = [ True False  True],
                 fill_value = 1e+20)
-         <BLANKLINE>
 
 
        Masked arrays are often used when operating on arrays containing
diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py
index 0fcdecf..da3a74b 100644
--- a/numpy/doc/structured_arrays.py
+++ b/numpy/doc/structured_arrays.py
@@ -13,8 +13,8 @@
  >>> x = np.array([('Rex', 9, 81.0), ('Fido', 3, 27.0)],
  ...              dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')])
  >>> x
- array([('Rex', 9, 81.0), ('Fido', 3, 27.0)],
-       dtype=[('name', 'S10'), ('age', '<i4'), ('weight', '<f4')])
+ array([('Rex', 9, 81.), ('Fido', 3, 27.)],
+       dtype=[('name', 'U10'), ('age', '<i4'), ('weight', '<f4')])
 
 Here ``x`` is a one-dimensional array of length two whose datatype is a
 structure with three fields: 1. A string of length 10 or less named 'name', 2.
@@ -32,8 +32,8 @@
  array([9, 3], dtype=int32)
  >>> x['age'] = 5
  >>> x
- array([('Rex', 5, 81.0), ('Fido', 5, 27.0)],
-       dtype=[('name', 'S10'), ('age', '<i4'), ('weight', '<f4')])
+ array([('Rex', 5, 81.), ('Fido', 5, 27.)],
+       dtype=[('name', 'U10'), ('age', '<i4'), ('weight', '<f4')])
 
 Structured datatypes are designed to be able to mimic 'structs' in the C
 language, and share a similar memory layout. They are meant for interfacing with
@@ -79,14 +79,14 @@
      convertible to a datatype, and ``shape`` is a tuple of integers specifying
      subarray shape.
 
-      >>> np.dtype([('x', 'f4'), ('y', np.float32), ('z', 'f4', (2,2))])
-      dtype=[('x', '<f4'), ('y', '<f4'), ('z', '<f4', (2, 2))])
+      >>> np.dtype([('x', 'f4'), ('y', np.float32), ('z', 'f4', (2, 2))])
+      dtype([('x', '<f4'), ('y', '<f4'), ('z', '<f4', (2, 2))])
 
      If ``fieldname`` is the empty string ``''``, the field will be given a
      default name of the form ``f#``, where ``#`` is the integer index of the
      field, counting from 0 from the left::
 
-      >>> np.dtype([('x', 'f4'),('', 'i4'),('z', 'i8')])
+      >>> np.dtype([('x', 'f4'), ('', 'i4'), ('z', 'i8')])
       dtype([('x', '<f4'), ('f1', '<i4'), ('z', '<i8')])
 
      The byte offsets of the fields within the structure and the total
@@ -100,10 +100,10 @@
      automatically, and the field names are given the default names ``f0``,
      ``f1``, etc. ::
 
-      >>> np.dtype('i8,f4,S3')
+      >>> np.dtype('i8, f4, S3')
       dtype([('f0', '<i8'), ('f1', '<f4'), ('f2', 'S3')])
-      >>> np.dtype('3int8, float32, (2,3)float64')
-      dtype([('f0', 'i1', 3), ('f1', '<f4'), ('f2', '<f8', (2, 3))])
+      >>> np.dtype('3int8, float32, (2, 3)float64')
+      dtype([('f0', 'i1', (3,)), ('f1', '<f4'), ('f2', '<f8', (2, 3))])
 
 3.   A dictionary of field parameter arrays
 
@@ -121,10 +121,10 @@
      enough to contain all the fields.
      ::
 
-      >>> np.dtype({'names': ['col1', 'col2'], 'formats': ['i4','f4']})
+      >>> np.dtype({'names': ['col1', 'col2'], 'formats': ['i4', 'f4']})
       dtype([('col1', '<i4'), ('col2', '<f4')])
       >>> np.dtype({'names': ['col1', 'col2'],
-      ...           'formats': ['i4','f4'],
+      ...           'formats': ['i4', 'f4'],
       ...           'offsets': [0, 4],
       ...           'itemsize': 12})
       dtype({'names':['col1','col2'], 'formats':['<i4','<f4'], 'offsets':[0,4], 'itemsize':12})
@@ -149,8 +149,8 @@
      because older numpy code may use it. The keys of the dictionary are the
      field names and the values are tuples specifying type and offset::
 
-      >>> np.dtype=({'col1': ('i1',0), 'col2': ('f4',1)})
-      dtype([(('col1'), 'i1'), (('col2'), '>f4')])
+      >>> np.dtype({'col1': ('i1', 0), 'col2': ('f4', 1)})
+      dtype([('col1', 'i1'), ('col2', '<f4')])
 
      This form is discouraged because Python dictionaries do not preserve order
      in Python versions before Python 3.6, and the order of the fields in a
@@ -202,7 +202,7 @@
  >>> def print_offsets(d):
  ...     print("offsets:", [d.fields[name][1] for name in d.names])
  ...     print("itemsize:", d.itemsize)
- >>> print_offsets(np.dtype('u1,u1,i4,u1,i8,u2'))
+ >>> print_offsets(np.dtype('u1, u1, i4, u1, i8, u2'))
  offsets: [0, 1, 2, 6, 7, 15]
  itemsize: 17
 
@@ -215,7 +215,7 @@
 structure will also have trailing padding added so that its itemsize is a
 multiple of the largest field's alignment. ::
 
- >>> print_offsets(np.dtype('u1,u1,i4,u1,i8,u2', align=True))
+ >>> print_offsets(np.dtype('u1, u1, i4, u1, i8, u2', align=True))
  offsets: [0, 1, 4, 8, 16, 24]
  itemsize: 32
 
@@ -255,6 +255,7 @@
 example::
 
  >>> np.dtype([(('my title', 'name'), 'f4')])
+ dtype([(('my title', 'name'), '<f4')])
 
 When using the first form of dictionary-based specification, the titles may be
 supplied as an extra ``'titles'`` key as described above. When using the second
@@ -263,6 +264,7 @@
 2-element tuple::
 
  >>> np.dtype({'name': ('i4', 0, 'my title')})
+ dtype([(('my title', 'name'), '<i4')])
 
 The ``dtype.fields`` dictionary will contain :term:`titles` as keys, if any
 titles are used.  This means effectively that a field with a title will be
@@ -275,6 +277,8 @@
 
  >>> for name in d.names:
  ...     print(d.fields[name][:2])
+ (dtype('int64'), 0)
+ (dtype('float32'), 8)
 
 Union types
 -----------
@@ -305,8 +309,8 @@
 broadcasting rules. The tuple's elements are assigned to the successive fields
 of the array, from left to right::
 
- >>> x = np.array([(1,2,3),(4,5,6)], dtype='i8,f4,f8')
- >>> x[1] = (7,8,9)
+ >>> x = np.array([(1, 2, 3), (4, 5, 6)], dtype='i8, f4, f8')
+ >>> x[1] = (7, 8, 9)
  >>> x
  array([(1, 2., 3.), (7, 8., 9.)],
       dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '<f8')])
@@ -318,14 +322,14 @@
 happens when a scalar is assigned to a structured array, or when an
 unstructured array is assigned to a structured array::
 
- >>> x = np.zeros(2, dtype='i8,f4,?,S1')
+ >>> x = np.zeros(2, dtype='i8, f4, ?, S1')
  >>> x[:] = 3
  >>> x
- array([(3, 3.0, True, b'3'), (3, 3.0, True, b'3')],
+ array([(3, 3., True, b'3'), (3, 3., True, b'3')],
        dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '?'), ('f3', 'S1')])
  >>> x[:] = np.arange(2)
  >>> x
- array([(0, 0.0, False, b'0'), (1, 1.0, True, b'1')],
+ array([(0, 0., False, b'0'), (1, 1., True, b'1')],
        dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '?'), ('f3', 'S1')])
 
 Structured arrays can also be assigned to unstructured arrays, but only if the
@@ -335,6 +339,8 @@
  >>> onefield = np.zeros(2, dtype=[('A', 'i4')])
  >>> nostruct = np.zeros(2, dtype='i4')
  >>> nostruct[:] = twofield
+ Traceback (most recent call last):
+    File "<stdin>", line 1, in <module>
  ValueError: Can't cast from structure to non-structure, except if the structure only has a single field.
  >>> nostruct[:] = onefield
  >>> nostruct
@@ -355,7 +361,7 @@
  >>> b = np.ones(3, dtype=[('x', 'f4'), ('y', 'S3'), ('z', 'O')])
  >>> b[:] = a
  >>> b
- array([(0.0, b'0.0', b''), (0.0, b'0.0', b''), (0.0, b'0.0', b'')],
+ array([(0., b'0.0', b''), (0., b'0.0', b''), (0., b'0.0', b'')],
        dtype=[('x', '<f4'), ('y', 'S3'), ('z', 'O')])
 
 
@@ -374,7 +380,7 @@
 Individual fields of a structured array may be accessed and modified by indexing
 the array with the field name. ::
 
- >>> x = np.array([(1,2),(3,4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
+ >>> x = np.array([(1, 2), (3, 4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
  >>> x['foo']
  array([1, 3])
  >>> x['foo'] = 10
@@ -386,9 +392,9 @@
 memory locations and writing to the view will modify the original array. ::
 
  >>> y = x['bar']
- >>> y[:] = 10
+ >>> y[:] = 11
  >>> x
- array([(10, 5.), (10, 5.)],
+ array([(10, 11.), (10, 11.)],
        dtype=[('foo', '<i8'), ('bar', '<f4')])
 
 This view has the same dtype and itemsize as the indexed field, so it is
@@ -397,6 +403,15 @@
  >>> y.dtype, y.shape, y.strides
  (dtype('float32'), (2,), (12,))
 
+If the accessed field is a subarray, the dimensions of the subarray
+are appended to the shape of the result::
+
+   >>> x = np.zeros((2, 2), dtype=[('a', np.int32), ('b', np.float64, (3, 3))])
+   >>> x['a'].shape
+   (2, 2)
+   >>> x['b'].shape
+   (2, 2, 3, 3)
+
 Accessing Multiple Fields
 ```````````````````````````
 
@@ -429,8 +444,9 @@
     code which depends on the data having a "packed" layout. For instance code
     such as::
 
-     >>> a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'i4'), ('c', 'f4')])
-     >>> a[['a','c']].view('i8')  # Fails in Numpy 1.16
+     >>> a[['a', 'c']].view('i8')  # Fails in Numpy 1.16
+     Traceback (most recent call last):
+        File "<stdin>", line 1, in <module>
      ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
 
     will need to be changed. This code has raised a ``FutureWarning`` since
@@ -450,7 +466,8 @@
     used to reproduce the old behavior, as it will return a packed copy of the
     structured array. The code above, for example, can be replaced with:
 
-     >>> repack_fields(a[['a','c']]).view('i8')  # supported in 1.16
+     >>> from numpy.lib.recfunctions import repack_fields
+     >>> repack_fields(a[['a', 'c']]).view('i8')  # supported in 1.16
      array([0, 0, 0])
 
     Furthermore, numpy now provides a new function
@@ -461,12 +478,14 @@
     account padding, often avoids a copy, and also casts the datatypes
     as needed, unlike the view. Code such as:
 
-     >>> a = np.zeros(3, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
-     >>> a[['x', 'z']].view('f4')
+     >>> b = np.zeros(3, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
+     >>> b[['x', 'z']].view('f4')
+     array([0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)
 
     can be made safer by replacing with:
 
-     >>> structured_to_unstructured(a[['x', 'z']])
+     >>> from numpy.lib.recfunctions import structured_to_unstructured
+     >>> structured_to_unstructured(b[['x', 'z']])
      array([0, 0, 0])
 
 
@@ -474,8 +493,8 @@
 
  >>> a[['a', 'c']] = (2, 3)
  >>> a
- array([(2, 0, 3.0), (2, 0, 3.0), (2, 0, 3.0)],
-       dtype=[('a', '<i8'), ('b', '<i4'), ('c', '<f8')])
+ array([(2, 0, 3.), (2, 0, 3.), (2, 0, 3.)],
+       dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<f4')])
 
 This obeys the structured array assignment rules described above. For example,
 this means that one can swap the values of two fields using appropriate
@@ -489,19 +508,19 @@
 Indexing a single element of a structured array (with an integer index) returns
 a structured scalar::
 
- >>> x = np.array([(1, 2., 3.)], dtype='i,f,f')
+ >>> x = np.array([(1, 2., 3.)], dtype='i, f, f')
  >>> scalar = x[0]
  >>> scalar
  (1, 2., 3.)
  >>> type(scalar)
- numpy.void
+ <class 'numpy.void'>
 
 Unlike other numpy scalars, structured scalars are mutable and act like views
 into the original array, such that modifying the scalar will modify the
 original array. Structured scalars also support access and assignment by field
 name::
 
- >>> x = np.array([(1,2),(3,4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
+ >>> x = np.array([(1, 2), (3, 4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
  >>> s = x[0]
  >>> s['bar'] = 100
  >>> x
@@ -510,7 +529,7 @@
 
 Similarly to tuples, structured scalars can also be indexed with an integer::
 
- >>> scalar = np.array([(1, 2., 3.)], dtype='i,f,f')[0]
+ >>> scalar = np.array([(1, 2., 3.)], dtype='i, f, f')[0]
  >>> scalar[0]
  1
  >>> scalar[1] = 4
@@ -521,7 +540,7 @@
 calling :func:`ndarray.item`::
 
  >>> scalar.item(), type(scalar.item())
- ((1, 2.0, 3.0), tuple)
+ ((1, 4.0, 3.0), <class 'tuple'>)
 
 Viewing Structured Arrays Containing Objects
 --------------------------------------------
@@ -565,24 +584,24 @@
 
 The simplest way to create a record array is with :func:`numpy.rec.array`::
 
- >>> recordarr = np.rec.array([(1,2.,'Hello'),(2,3.,"World")],
+ >>> recordarr = np.rec.array([(1, 2., 'Hello'), (2, 3., "World")],
  ...                    dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'S10')])
  >>> recordarr.bar
  array([ 2.,  3.], dtype=float32)
  >>> recordarr[1:2]
- rec.array([(2, 3.0, 'World')],
+ rec.array([(2, 3., b'World')],
        dtype=[('foo', '<i4'), ('bar', '<f4'), ('baz', 'S10')])
  >>> recordarr[1:2].foo
  array([2], dtype=int32)
  >>> recordarr.foo[1:2]
  array([2], dtype=int32)
  >>> recordarr[1].baz
- 'World'
+ b'World'
 
 :func:`numpy.rec.array` can convert a wide variety of arguments into record
 arrays, including structured arrays::
 
- >>> arr = array([(1,2.,'Hello'),(2,3.,"World")],
+ >>> arr = np.array([(1, 2., 'Hello'), (2, 3., "World")],
  ...             dtype=[('foo', 'i4'), ('bar', 'f4'), ('baz', 'S10')])
  >>> recordarr = np.rec.array(arr)
 
@@ -593,9 +612,9 @@
 A record array representation of a structured array can be obtained using the
 appropriate :ref:`view`::
 
- >>> arr = np.array([(1,2.,'Hello'),(2,3.,"World")],
+ >>> arr = np.array([(1, 2., 'Hello'), (2, 3., "World")],
  ...                dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'a10')])
- >>> recordarr = arr.view(dtype=dtype((np.record, arr.dtype)),
+ >>> recordarr = arr.view(dtype=np.dtype((np.record, arr.dtype)),
  ...                      type=np.recarray)
 
 For convenience, viewing an ndarray as type :class:`np.recarray` will
@@ -615,12 +634,12 @@
 Record array fields accessed by index or by attribute are returned as a record
 array if the field has a structured type but as a plain ndarray otherwise. ::
 
- >>> recordarr = np.rec.array([('Hello', (1,2)),("World", (3,4))],
+ >>> recordarr = np.rec.array([('Hello', (1, 2)), ("World", (3, 4))],
  ...                 dtype=[('foo', 'S6'),('bar', [('A', int), ('B', int)])])
  >>> type(recordarr.foo)
- <type 'numpy.ndarray'>
+ <class 'numpy.ndarray'>
  >>> type(recordarr.bar)
- <class 'numpy.core.records.recarray'>
+ <class 'numpy.recarray'>
 
 Note that if a field has the same name as an ndarray attribute, the ndarray
 attribute takes precedence. Such fields will be inaccessible by attribute but
diff --git a/numpy/dual.py b/numpy/dual.py
index 3a16a8e..651e845 100644
--- a/numpy/dual.py
+++ b/numpy/dual.py
@@ -51,14 +51,14 @@
 
 def register_func(name, func):
     if name not in __all__:
-        raise ValueError("%s not a dual function." % name)
+        raise ValueError("{} not a dual function.".format(name))
     f = sys._getframe(0).f_globals
     _restore_dict[name] = f[name]
     f[name] = func
 
 def restore_func(name):
     if name not in __all__:
-        raise ValueError("%s not a dual function." % name)
+        raise ValueError("{} not a dual function.".format(name))
     try:
         val = _restore_dict[name]
     except KeyError:
diff --git a/numpy/f2py/__init__.py b/numpy/f2py/__init__.py
index 23a4b7c..d146739 100644
--- a/numpy/f2py/__init__.py
+++ b/numpy/f2py/__init__.py
@@ -28,12 +28,16 @@
             extension='.f'
            ):
     """
-    Build extension module from processing source with f2py.
+    Build extension module from a Fortran 77 source string with f2py.
 
     Parameters
     ----------
-    source : str
+    source : str or bytes
         Fortran source of module / subroutine to compile
+
+        .. versionchanged:: 1.16.0
+           Accept str as well as bytes
+
     modulename : str, optional
         The name of the compiled python module
     extra_args : str or list, optional
@@ -55,6 +59,16 @@
 
         .. versionadded:: 1.11.0
 
+    Returns
+    -------
+    result : int
+        0 on success
+
+    Examples
+    --------
+    .. include:: compile_session.dat
+        :literal:
+
     """
     import tempfile
     import shlex
@@ -67,9 +81,11 @@
     else:
         fname = source_fn
 
+    if not isinstance(source, str):
+        source = str(source, 'utf-8')
     try:
         with open(fname, 'w') as f:
-            f.write(str(source))
+            f.write(source)
 
         args = ['-c', '-m', modulename, f.name]
 
diff --git a/numpy/f2py/__main__.py b/numpy/f2py/__main__.py
index 6eff410..708f7f3 100644
--- a/numpy/f2py/__main__.py
+++ b/numpy/f2py/__main__.py
@@ -1,6 +1,6 @@
 # See http://cens.ioc.ee/projects/f2py2e/
 from __future__ import division, print_function
 
-from f2py2e import main
+from numpy.f2py.f2py2e import main
 
 main()
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 2620fc9..c4a6505 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -2399,7 +2399,7 @@
     if p < 16:
         return 8
     machine = platform.machine().lower()
-    if machine.startswith(('aarch64', 'power', 'ppc64', 's390x')):
+    if machine.startswith(('aarch64', 'power', 'ppc64', 's390x', 'sparc')):
         if p <= 20:
             return 16
     else:
diff --git a/numpy/f2py/f2py2e.py b/numpy/f2py/f2py2e.py
index 8750ed0..4722315 100755
--- a/numpy/f2py/f2py2e.py
+++ b/numpy/f2py/f2py2e.py
@@ -396,8 +396,25 @@
 
 
 def run_main(comline_list):
-    """Run f2py as if string.join(comline_list,' ') is used as a command line.
-    In case of using -h flag, return None.
+    """
+    Equivalent to running::
+
+        f2py <args>
+
+    where ``<args>=string.join(<list>,' ')``, but in Python.  Unless
+    ``-h`` is used, this function returns a dictionary containing
+    information on generated modules and their dependencies on source
+    files.  For example, the command ``f2py -m scalar scalar.f`` can be
+    executed from Python as follows
+
+    You cannot build extension modules with this function, that is,
+    using ``-c`` is not allowed. Use ``compile`` command instead
+
+    Examples
+    --------
+    .. include:: run_main_session.dat
+        :literal:
+
     """
     crackfortran.reset_global_f2py_vars()
     f2pydir = os.path.dirname(os.path.abspath(cfuncs.__file__))
diff --git a/numpy/f2py/tests/test_compile_function.py b/numpy/f2py/tests/test_compile_function.py
index 74e0804..36abf05 100644
--- a/numpy/f2py/tests/test_compile_function.py
+++ b/numpy/f2py/tests/test_compile_function.py
@@ -106,3 +106,20 @@
         assert_equal(ret_val, 127)
     finally:
         sys.executable = temp
+
+
+@pytest.mark.parametrize('fsource',
+        ['program test_f2py\nend program test_f2py',
+         b'program test_f2py\nend program test_f2py',])
+def test_compile_from_strings(tmpdir, fsource):
+    # Make sure we can compile str and bytes gh-12796
+    cwd = os.getcwd()
+    try:
+        os.chdir(str(tmpdir))
+        ret_val = numpy.f2py.compile(
+                fsource,
+                modulename='test_compile_from_strings',
+                extension='.f90')
+        assert_equal(ret_val, 0)
+    finally:
+        os.chdir(cwd)
diff --git a/numpy/fft/README.md b/numpy/fft/README.md
new file mode 100644
index 0000000..7040a2e
--- /dev/null
+++ b/numpy/fft/README.md
@@ -0,0 +1,53 @@
+PocketFFT
+---------
+
+This is a heavily modified implementation of FFTPack [1,2], with the following
+advantages:
+
+- strictly C99 compliant
+- more accurate twiddle factor computation
+- very fast plan generation
+- worst case complexity for transform sizes with large prime factors is
+  `N*log(N)`, because Bluestein's algorithm [3] is used for these cases.
+
+License
+-------
+
+3-clause BSD (see LICENSE.md)
+
+
+Some code details
+-----------------
+
+Twiddle factor computation:
+
+- making use of symmetries to reduce number of sin/cos evaluations
+- all angles are reduced to the range `[0; pi/4]` for higher accuracy
+- an adapted implementation of `sincospi()` is used, which actually computes
+  `sin(x)` and `(cos(x)-1)`.
+- if `n` sin/cos pairs are required, the adjusted `sincospi()` is only called
+  `2*sqrt(n)` times; the remaining values are obtained by evaluating the
+  angle addition theorems in a numerically accurate way.
+
+Parallel invocation:
+
+- Plans only contain read-only data; all temporary arrays are allocated and
+  deallocated during an individual FFT execution. This means that a single plan
+  can be used in several threads at the same time.
+
+Efficient codelets are available for the factors:
+
+- 2, 3, 4, 5, 7, 11 for complex-valued FFTs
+- 2, 3, 4, 5 for real-valued FFTs
+
+Larger prime factors are handled by somewhat less efficient, generic routines.
+
+For lengths with very large prime factors, Bluestein's algorithm is used, and
+instead of an FFT of length `n`, a convolution of length `n2 >= 2*n-1`
+is performed, where `n2` is chosen to be highly composite.
+
+
+[1] Swarztrauber, P. 1982, Vectorizing the Fast Fourier Transforms
+    (New York: Academic Press), 51
+[2] https://www.netlib.org/fftpack/
+[3] https://en.wikipedia.org/wiki/Chirp_Z-transform
diff --git a/numpy/fft/__init__.py b/numpy/fft/__init__.py
index 44243b4..64b35bc 100644
--- a/numpy/fft/__init__.py
+++ b/numpy/fft/__init__.py
@@ -3,7 +3,7 @@
 # To get sub-modules
 from .info import __doc__
 
-from .fftpack import *
+from .pocketfft import *
 from .helper import *
 
 from numpy._pytesttester import PytestTester
diff --git a/numpy/fft/fftpack.c b/numpy/fft/fftpack.c
deleted file mode 100644
index 07fa2bf..0000000
--- a/numpy/fft/fftpack.c
+++ /dev/null
@@ -1,1536 +0,0 @@
-/*
- * fftpack.c : A set of FFT routines in C.
- * Algorithmically based on Fortran-77 FFTPACK by Paul N. Swarztrauber (Version 4, 1985).
-*/
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include <Python.h>
-#include <math.h>
-#include <stdio.h>
-#include <numpy/ndarraytypes.h>
-
-#define DOUBLE
-#ifdef DOUBLE
-#define Treal double
-#else
-#define Treal float
-#endif
-
-#define ref(u,a) u[a]
-
-/* Macros for accurate calculation of the twiddle factors. */
-#define TWOPI 6.283185307179586476925286766559005768391
-#define cos2pi(m, n) cos((TWOPI * (m)) / (n))
-#define sin2pi(m, n) sin((TWOPI * (m)) / (n))
-
-#define MAXFAC 13    /* maximum number of factors in factorization of n */
-#define NSPECIAL 4   /* number of factors for which we have special-case routines */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static void sincos2pi(int m, int n, Treal* si, Treal* co)
-/* Calculates sin(2pi * m/n) and cos(2pi * m/n). It is more accurate
- * than the naive calculation as the fraction m/n is reduced to [0, 1/8) first.
- * Due to the symmetry of sin(x) and cos(x) the values for all x can be
- * determined from the function values of the reduced argument in the first
- * octant.
- */
-    {
-        int n8, m8, octant;
-        n8 = 8 * n;
-        m8 = (8 * m) % n8;
-        octant = m8 / n;
-        m8 = m8 % n;
-        switch(octant) {
-            case 0:
-                *co = cos2pi(m8, n8);
-                *si = sin2pi(m8, n8);
-                break;
-            case 1:
-                *co = sin2pi(n-m8, n8);
-                *si = cos2pi(n-m8, n8);
-                break;
-            case 2:
-                *co = -sin2pi(m8, n8);
-                *si = cos2pi(m8, n8);
-                break;
-            case 3:
-                *co = -cos2pi(n-m8, n8);
-                *si = sin2pi(n-m8, n8);
-                break;
-            case 4:
-                *co = -cos2pi(m8, n8);
-                *si = -sin2pi(m8, n8);
-                break;
-            case 5:
-                *co = -sin2pi(n-m8, n8);
-                *si = -cos2pi(n-m8, n8);
-                break;
-            case 6:
-                *co = sin2pi(m8, n8);
-                *si = -cos2pi(m8, n8);
-                break;
-            case 7:
-                *co = cos2pi(n-m8, n8);
-                *si = -sin2pi(n-m8, n8);
-                break;
-        }
-    }
-
-/* ----------------------------------------------------------------------
-   passf2, passf3, passf4, passf5, passf. Complex FFT passes fwd and bwd.
------------------------------------------------------------------------ */
-
-static void passf2(int ido, int l1, const Treal cc[], Treal ch[], const Treal wa1[], int isign)
-  /* isign==+1 for backward transform */
-  {
-    int i, k, ah, ac;
-    Treal ti2, tr2;
-    if (ido <= 2) {
-      for (k=0; k<l1; k++) {
-        ah = k*ido;
-        ac = 2*k*ido;
-        ch[ah]              = ref(cc,ac) + ref(cc,ac + ido);
-        ch[ah + ido*l1]     = ref(cc,ac) - ref(cc,ac + ido);
-        ch[ah+1]            = ref(cc,ac+1) + ref(cc,ac + ido + 1);
-        ch[ah + ido*l1 + 1] = ref(cc,ac+1) - ref(cc,ac + ido + 1);
-      }
-    } else {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ah = i + k*ido;
-          ac = i + 2*k*ido;
-          ch[ah]   = ref(cc,ac) + ref(cc,ac + ido);
-          tr2      = ref(cc,ac) - ref(cc,ac + ido);
-          ch[ah+1] = ref(cc,ac+1) + ref(cc,ac + 1 + ido);
-          ti2      = ref(cc,ac+1) - ref(cc,ac + 1 + ido);
-          ch[ah+l1*ido+1] = wa1[i]*ti2 + isign*wa1[i+1]*tr2;
-          ch[ah+l1*ido]   = wa1[i]*tr2 - isign*wa1[i+1]*ti2;
-        }
-      }
-    }
-  } /* passf2 */
-
-
-static void passf3(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], int isign)
-  /* isign==+1 for backward transform */
-  {
-    static const Treal taur = -0.5;
-    static const Treal taui = 0.86602540378443864676;
-    int i, k, ac, ah;
-    Treal ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-    if (ido == 2) {
-      for (k=1; k<=l1; k++) {
-        ac = (3*k - 2)*ido;
-        tr2 = ref(cc,ac) + ref(cc,ac + ido);
-        cr2 = ref(cc,ac - ido) + taur*tr2;
-        ah = (k - 1)*ido;
-        ch[ah] = ref(cc,ac - ido) + tr2;
-
-        ti2 = ref(cc,ac + 1) + ref(cc,ac + ido + 1);
-        ci2 = ref(cc,ac - ido + 1) + taur*ti2;
-        ch[ah + 1] = ref(cc,ac - ido + 1) + ti2;
-
-        cr3 = isign*taui*(ref(cc,ac) - ref(cc,ac + ido));
-        ci3 = isign*taui*(ref(cc,ac + 1) - ref(cc,ac + ido + 1));
-        ch[ah + l1*ido] = cr2 - ci3;
-        ch[ah + 2*l1*ido] = cr2 + ci3;
-        ch[ah + l1*ido + 1] = ci2 + cr3;
-        ch[ah + 2*l1*ido + 1] = ci2 - cr3;
-      }
-    } else {
-      for (k=1; k<=l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ac = i + (3*k - 2)*ido;
-          tr2 = ref(cc,ac) + ref(cc,ac + ido);
-          cr2 = ref(cc,ac - ido) + taur*tr2;
-          ah = i + (k-1)*ido;
-          ch[ah] = ref(cc,ac - ido) + tr2;
-          ti2 = ref(cc,ac + 1) + ref(cc,ac + ido + 1);
-          ci2 = ref(cc,ac - ido + 1) + taur*ti2;
-          ch[ah + 1] = ref(cc,ac - ido + 1) + ti2;
-          cr3 = isign*taui*(ref(cc,ac) - ref(cc,ac + ido));
-          ci3 = isign*taui*(ref(cc,ac + 1) - ref(cc,ac + ido + 1));
-          dr2 = cr2 - ci3;
-          dr3 = cr2 + ci3;
-          di2 = ci2 + cr3;
-          di3 = ci2 - cr3;
-          ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
-          ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
-          ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
-          ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
-        }
-      }
-    }
-  } /* passf3 */
-
-
-static void passf4(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], int isign)
-  /* isign == -1 for forward transform and +1 for backward transform */
-  {
-    int i, k, ac, ah;
-    Treal ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    if (ido == 2) {
-      for (k=0; k<l1; k++) {
-        ac = 4*k*ido + 1;
-        ti1 = ref(cc,ac) - ref(cc,ac + 2*ido);
-        ti2 = ref(cc,ac) + ref(cc,ac + 2*ido);
-        tr4 = ref(cc,ac + 3*ido) - ref(cc,ac + ido);
-        ti3 = ref(cc,ac + ido) + ref(cc,ac + 3*ido);
-        tr1 = ref(cc,ac - 1) - ref(cc,ac + 2*ido - 1);
-        tr2 = ref(cc,ac - 1) + ref(cc,ac + 2*ido - 1);
-        ti4 = ref(cc,ac + ido - 1) - ref(cc,ac + 3*ido - 1);
-        tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 3*ido - 1);
-        ah = k*ido;
-        ch[ah] = tr2 + tr3;
-        ch[ah + 2*l1*ido] = tr2 - tr3;
-        ch[ah + 1] = ti2 + ti3;
-        ch[ah + 2*l1*ido + 1] = ti2 - ti3;
-        ch[ah + l1*ido] = tr1 + isign*tr4;
-        ch[ah + 3*l1*ido] = tr1 - isign*tr4;
-        ch[ah + l1*ido + 1] = ti1 + isign*ti4;
-        ch[ah + 3*l1*ido + 1] = ti1 - isign*ti4;
-      }
-    } else {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ac = i + 1 + 4*k*ido;
-          ti1 = ref(cc,ac) - ref(cc,ac + 2*ido);
-          ti2 = ref(cc,ac) + ref(cc,ac + 2*ido);
-          ti3 = ref(cc,ac + ido) + ref(cc,ac + 3*ido);
-          tr4 = ref(cc,ac + 3*ido) - ref(cc,ac + ido);
-          tr1 = ref(cc,ac - 1) - ref(cc,ac + 2*ido - 1);
-          tr2 = ref(cc,ac - 1) + ref(cc,ac + 2*ido - 1);
-          ti4 = ref(cc,ac + ido - 1) - ref(cc,ac + 3*ido - 1);
-          tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 3*ido - 1);
-          ah = i + k*ido;
-          ch[ah] = tr2 + tr3;
-          cr3 = tr2 - tr3;
-          ch[ah + 1] = ti2 + ti3;
-          ci3 = ti2 - ti3;
-          cr2 = tr1 + isign*tr4;
-          cr4 = tr1 - isign*tr4;
-          ci2 = ti1 + isign*ti4;
-          ci4 = ti1 - isign*ti4;
-          ch[ah + l1*ido] = wa1[i]*cr2 - isign*wa1[i + 1]*ci2;
-          ch[ah + l1*ido + 1] = wa1[i]*ci2 + isign*wa1[i + 1]*cr2;
-          ch[ah + 2*l1*ido] = wa2[i]*cr3 - isign*wa2[i + 1]*ci3;
-          ch[ah + 2*l1*ido + 1] = wa2[i]*ci3 + isign*wa2[i + 1]*cr3;
-          ch[ah + 3*l1*ido] = wa3[i]*cr4 -isign*wa3[i + 1]*ci4;
-          ch[ah + 3*l1*ido + 1] = wa3[i]*ci4 + isign*wa3[i + 1]*cr4;
-        }
-      }
-    }
-  } /* passf4 */
-
-
-static void passf5(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], const Treal wa4[], int isign)
-  /* isign == -1 for forward transform and +1 for backward transform */
-  {
-    static const Treal tr11 = 0.3090169943749474241;
-    static const Treal ti11 = 0.95105651629515357212;
-    static const Treal tr12 = -0.8090169943749474241;
-    static const Treal ti12 = 0.58778525229247312917;
-    int i, k, ac, ah;
-    Treal ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-        ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-    if (ido == 2) {
-      for (k = 1; k <= l1; ++k) {
-        ac = (5*k - 4)*ido + 1;
-        ti5 = ref(cc,ac) - ref(cc,ac + 3*ido);
-        ti2 = ref(cc,ac) + ref(cc,ac + 3*ido);
-        ti4 = ref(cc,ac + ido) - ref(cc,ac + 2*ido);
-        ti3 = ref(cc,ac + ido) + ref(cc,ac + 2*ido);
-        tr5 = ref(cc,ac - 1) - ref(cc,ac + 3*ido - 1);
-        tr2 = ref(cc,ac - 1) + ref(cc,ac + 3*ido - 1);
-        tr4 = ref(cc,ac + ido - 1) - ref(cc,ac + 2*ido - 1);
-        tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 2*ido - 1);
-        ah = (k - 1)*ido;
-        ch[ah] = ref(cc,ac - ido - 1) + tr2 + tr3;
-        ch[ah + 1] = ref(cc,ac - ido) + ti2 + ti3;
-        cr2 = ref(cc,ac - ido - 1) + tr11*tr2 + tr12*tr3;
-        ci2 = ref(cc,ac - ido) + tr11*ti2 + tr12*ti3;
-        cr3 = ref(cc,ac - ido - 1) + tr12*tr2 + tr11*tr3;
-        ci3 = ref(cc,ac - ido) + tr12*ti2 + tr11*ti3;
-        cr5 = isign*(ti11*tr5 + ti12*tr4);
-        ci5 = isign*(ti11*ti5 + ti12*ti4);
-        cr4 = isign*(ti12*tr5 - ti11*tr4);
-        ci4 = isign*(ti12*ti5 - ti11*ti4);
-        ch[ah + l1*ido] = cr2 - ci5;
-        ch[ah + 4*l1*ido] = cr2 + ci5;
-        ch[ah + l1*ido + 1] = ci2 + cr5;
-        ch[ah + 2*l1*ido + 1] = ci3 + cr4;
-        ch[ah + 2*l1*ido] = cr3 - ci4;
-        ch[ah + 3*l1*ido] = cr3 + ci4;
-        ch[ah + 3*l1*ido + 1] = ci3 - cr4;
-        ch[ah + 4*l1*ido + 1] = ci2 - cr5;
-      }
-    } else {
-      for (k=1; k<=l1; k++) {
-        for (i=0; i<ido-1; i+=2) {
-          ac = i + 1 + (k*5 - 4)*ido;
-          ti5 = ref(cc,ac) - ref(cc,ac + 3*ido);
-          ti2 = ref(cc,ac) + ref(cc,ac + 3*ido);
-          ti4 = ref(cc,ac + ido) - ref(cc,ac + 2*ido);
-          ti3 = ref(cc,ac + ido) + ref(cc,ac + 2*ido);
-          tr5 = ref(cc,ac - 1) - ref(cc,ac + 3*ido - 1);
-          tr2 = ref(cc,ac - 1) + ref(cc,ac + 3*ido - 1);
-          tr4 = ref(cc,ac + ido - 1) - ref(cc,ac + 2*ido - 1);
-          tr3 = ref(cc,ac + ido - 1) + ref(cc,ac + 2*ido - 1);
-          ah = i + (k - 1)*ido;
-          ch[ah] = ref(cc,ac - ido - 1) + tr2 + tr3;
-          ch[ah + 1] = ref(cc,ac - ido) + ti2 + ti3;
-          cr2 = ref(cc,ac - ido - 1) + tr11*tr2 + tr12*tr3;
-
-          ci2 = ref(cc,ac - ido) + tr11*ti2 + tr12*ti3;
-          cr3 = ref(cc,ac - ido - 1) + tr12*tr2 + tr11*tr3;
-
-          ci3 = ref(cc,ac - ido) + tr12*ti2 + tr11*ti3;
-          cr5 = isign*(ti11*tr5 + ti12*tr4);
-          ci5 = isign*(ti11*ti5 + ti12*ti4);
-          cr4 = isign*(ti12*tr5 - ti11*tr4);
-          ci4 = isign*(ti12*ti5 - ti11*ti4);
-          dr3 = cr3 - ci4;
-          dr4 = cr3 + ci4;
-          di3 = ci3 + cr4;
-          di4 = ci3 - cr4;
-          dr5 = cr2 + ci5;
-          dr2 = cr2 - ci5;
-          di5 = ci2 - cr5;
-          di2 = ci2 + cr5;
-          ch[ah + l1*ido] = wa1[i]*dr2 - isign*wa1[i+1]*di2;
-          ch[ah + l1*ido + 1] = wa1[i]*di2 + isign*wa1[i+1]*dr2;
-          ch[ah + 2*l1*ido] = wa2[i]*dr3 - isign*wa2[i+1]*di3;
-          ch[ah + 2*l1*ido + 1] = wa2[i]*di3 + isign*wa2[i+1]*dr3;
-          ch[ah + 3*l1*ido] = wa3[i]*dr4 - isign*wa3[i+1]*di4;
-          ch[ah + 3*l1*ido + 1] = wa3[i]*di4 + isign*wa3[i+1]*dr4;
-          ch[ah + 4*l1*ido] = wa4[i]*dr5 - isign*wa4[i+1]*di5;
-          ch[ah + 4*l1*ido + 1] = wa4[i]*di5 + isign*wa4[i+1]*dr5;
-        }
-      }
-    }
-  } /* passf5 */
-
-
-static void passf(int *nac, int ido, int ip, int l1, int idl1,
-      Treal cc[], Treal ch[],
-      const Treal wa[], int isign)
-  /* isign is -1 for forward transform and +1 for backward transform */
-  {
-    int idij, idlj, idot, ipph, i, j, k, l, jc, lc, ik, idj, idl, inc,idp;
-    Treal wai, war;
-
-    idot = ido / 2;
-    /* nt = ip*idl1;*/
-    ipph = (ip + 1) / 2;
-    idp = ip*ido;
-    if (ido >= l1) {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (k=0; k<l1; k++) {
-          for (i=0; i<ido; i++) {
-            ch[i + (k + j*l1)*ido] =
-                ref(cc,i + (j + k*ip)*ido) + ref(cc,i + (jc + k*ip)*ido);
-            ch[i + (k + jc*l1)*ido] =
-                ref(cc,i + (j + k*ip)*ido) - ref(cc,i + (jc + k*ip)*ido);
-          }
-        }
-      }
-      for (k=0; k<l1; k++)
-        for (i=0; i<ido; i++)
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-    } else {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (i=0; i<ido; i++) {
-          for (k=0; k<l1; k++) {
-            ch[i + (k + j*l1)*ido] = ref(cc,i + (j + k*ip)*ido) + ref(cc,i + (jc + k*
-                ip)*ido);
-            ch[i + (k + jc*l1)*ido] = ref(cc,i + (j + k*ip)*ido) - ref(cc,i + (jc + k*
-                ip)*ido);
-          }
-        }
-      }
-      for (i=0; i<ido; i++)
-        for (k=0; k<l1; k++)
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-    }
-
-    idl = 2 - ido;
-    inc = 0;
-    for (l=1; l<ipph; l++) {
-      lc = ip - l;
-      idl += ido;
-      for (ik=0; ik<idl1; ik++) {
-        cc[ik + l*idl1] = ch[ik] + wa[idl - 2]*ch[ik + idl1];
-        cc[ik + lc*idl1] = isign*wa[idl-1]*ch[ik + (ip-1)*idl1];
-      }
-      idlj = idl;
-      inc += ido;
-      for (j=2; j<ipph; j++) {
-        jc = ip - j;
-        idlj += inc;
-        if (idlj > idp) idlj -= idp;
-        war = wa[idlj - 2];
-        wai = wa[idlj-1];
-        for (ik=0; ik<idl1; ik++) {
-          cc[ik + l*idl1] += war*ch[ik + j*idl1];
-          cc[ik + lc*idl1] += isign*wai*ch[ik + jc*idl1];
-        }
-      }
-    }
-    for (j=1; j<ipph; j++)
-      for (ik=0; ik<idl1; ik++)
-        ch[ik] += ch[ik + j*idl1];
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      for (ik=1; ik<idl1; ik+=2) {
-        ch[ik - 1 + j*idl1] = cc[ik - 1 + j*idl1] - cc[ik + jc*idl1];
-        ch[ik - 1 + jc*idl1] = cc[ik - 1 + j*idl1] + cc[ik + jc*idl1];
-        ch[ik + j*idl1] = cc[ik + j*idl1] + cc[ik - 1 + jc*idl1];
-        ch[ik + jc*idl1] = cc[ik + j*idl1] - cc[ik - 1 + jc*idl1];
-      }
-    }
-    *nac = 1;
-    if (ido == 2) return;
-    *nac = 0;
-    for (ik=0; ik<idl1; ik++)
-      cc[ik] = ch[ik];
-    for (j=1; j<ip; j++) {
-      for (k=0; k<l1; k++) {
-        cc[(k + j*l1)*ido + 0] = ch[(k + j*l1)*ido + 0];
-        cc[(k + j*l1)*ido + 1] = ch[(k + j*l1)*ido + 1];
-      }
-    }
-    if (idot <= l1) {
-      idij = 0;
-      for (j=1; j<ip; j++) {
-        idij += 2;
-        for (i=3; i<ido; i+=2) {
-          idij += 2;
-          for (k=0; k<l1; k++) {
-            cc[i - 1 + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
-                isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i + (k + j*l1)*ido] +
-                isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    } else {
-      idj = 2 - ido;
-      for (j=1; j<ip; j++) {
-        idj += ido;
-        for (k = 0; k < l1; k++) {
-          idij = idj;
-          for (i=3; i<ido; i+=2) {
-            idij += 2;
-            cc[i - 1 + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i - 1 + (k + j*l1)*ido] -
-                isign*wa[idij-1]*ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] =
-                wa[idij - 2]*ch[i + (k + j*l1)*ido] +
-                isign*wa[idij-1]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    }
-  } /* passf */
-
-
-  /* ----------------------------------------------------------------------
-radf2,radb2, radf3,radb3, radf4,radb4, radf5,radb5, radfg,radbg.
-Treal FFT passes fwd and bwd.
----------------------------------------------------------------------- */
-
-static void radf2(int ido, int l1, const Treal cc[], Treal ch[], const Treal wa1[])
-  {
-    int i, k, ic;
-    Treal ti2, tr2;
-    for (k=0; k<l1; k++) {
-      ch[2*k*ido] =
-          ref(cc,k*ido) + ref(cc,(k + l1)*ido);
-      ch[(2*k+1)*ido + ido-1] =
-          ref(cc,k*ido) - ref(cc,(k + l1)*ido);
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k=0; k<l1; k++) {
-        for (i=2; i<ido; i+=2) {
-          ic = ido - i;
-          tr2 = wa1[i - 2]*ref(cc, i-1 + (k + l1)*ido) + wa1[i - 1]*ref(cc, i + (k + l1)*ido);
-          ti2 = wa1[i - 2]*ref(cc, i + (k + l1)*ido) - wa1[i - 1]*ref(cc, i-1 + (k + l1)*ido);
-          ch[i + 2*k*ido] = ref(cc,i + k*ido) + ti2;
-          ch[ic + (2*k+1)*ido] = ti2 - ref(cc,i + k*ido);
-          ch[i - 1 + 2*k*ido] = ref(cc,i - 1 + k*ido) + tr2;
-          ch[ic - 1 + (2*k+1)*ido] = ref(cc,i - 1 + k*ido) - tr2;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k=0; k<l1; k++) {
-      ch[(2*k+1)*ido] = -ref(cc,ido-1 + (k + l1)*ido);
-      ch[ido-1 + 2*k*ido] = ref(cc,ido-1 + k*ido);
-    }
-  } /* radf2 */
-
-
-static void radb2(int ido, int l1, const Treal cc[], Treal ch[], const Treal wa1[])
-  {
-    int i, k, ic;
-    Treal ti2, tr2;
-    for (k=0; k<l1; k++) {
-      ch[k*ido] =
-          ref(cc,2*k*ido) + ref(cc,ido-1 + (2*k+1)*ido);
-      ch[(k + l1)*ido] =
-          ref(cc,2*k*ido) - ref(cc,ido-1 + (2*k+1)*ido);
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k = 0; k < l1; ++k) {
-        for (i = 2; i < ido; i += 2) {
-          ic = ido - i;
-          ch[i-1 + k*ido] =
-              ref(cc,i-1 + 2*k*ido) + ref(cc,ic-1 + (2*k+1)*ido);
-          tr2 = ref(cc,i-1 + 2*k*ido) - ref(cc,ic-1 + (2*k+1)*ido);
-          ch[i + k*ido] =
-              ref(cc,i + 2*k*ido) - ref(cc,ic + (2*k+1)*ido);
-          ti2 = ref(cc,i + (2*k)*ido) + ref(cc,ic + (2*k+1)*ido);
-          ch[i-1 + (k + l1)*ido] =
-              wa1[i - 2]*tr2 - wa1[i - 1]*ti2;
-          ch[i + (k + l1)*ido] =
-              wa1[i - 2]*ti2 + wa1[i - 1]*tr2;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k = 0; k < l1; k++) {
-      ch[ido-1 + k*ido] = 2*ref(cc,ido-1 + 2*k*ido);
-      ch[ido-1 + (k + l1)*ido] = -2*ref(cc,(2*k+1)*ido);
-    }
-  } /* radb2 */
-
-
-static void radf3(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[])
-  {
-    static const Treal taur = -0.5;
-    static const Treal taui = 0.86602540378443864676;
-    int i, k, ic;
-    Treal ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
-    for (k=0; k<l1; k++) {
-      cr2 = ref(cc,(k + l1)*ido) + ref(cc,(k + 2*l1)*ido);
-      ch[3*k*ido] = ref(cc,k*ido) + cr2;
-      ch[(3*k+2)*ido] = taui*(ref(cc,(k + l1*2)*ido) - ref(cc,(k + l1)*ido));
-      ch[ido-1 + (3*k + 1)*ido] = ref(cc,k*ido) + taur*cr2;
-    }
-    if (ido == 1) return;
-    for (k=0; k<l1; k++) {
-      for (i=2; i<ido; i+=2) {
-        ic = ido - i;
-        dr2 = wa1[i - 2]*ref(cc,i - 1 + (k + l1)*ido) +
-            wa1[i - 1]*ref(cc,i + (k + l1)*ido);
-        di2 = wa1[i - 2]*ref(cc,i + (k + l1)*ido) - wa1[i - 1]*ref(cc,i - 1 + (k + l1)*ido);
-        dr3 = wa2[i - 2]*ref(cc,i - 1 + (k + l1*2)*ido) + wa2[i - 1]*ref(cc,i + (k + l1*2)*ido);
-        di3 = wa2[i - 2]*ref(cc,i + (k + l1*2)*ido) - wa2[i - 1]*ref(cc,i - 1 + (k + l1*2)*ido);
-        cr2 = dr2 + dr3;
-        ci2 = di2 + di3;
-        ch[i - 1 + 3*k*ido] = ref(cc,i - 1 + k*ido) + cr2;
-        ch[i + 3*k*ido] = ref(cc,i + k*ido) + ci2;
-        tr2 = ref(cc,i - 1 + k*ido) + taur*cr2;
-        ti2 = ref(cc,i + k*ido) + taur*ci2;
-        tr3 = taui*(di2 - di3);
-        ti3 = taui*(dr3 - dr2);
-        ch[i - 1 + (3*k + 2)*ido] = tr2 + tr3;
-        ch[ic - 1 + (3*k + 1)*ido] = tr2 - tr3;
-        ch[i + (3*k + 2)*ido] = ti2 + ti3;
-        ch[ic + (3*k + 1)*ido] = ti3 - ti2;
-      }
-    }
-  } /* radf3 */
-
-
-static void radb3(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[])
-  {
-    static const Treal taur = -0.5;
-    static const Treal taui = 0.86602540378443864676;
-    int i, k, ic;
-    Treal ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-    for (k=0; k<l1; k++) {
-      tr2 = 2*ref(cc,ido-1 + (3*k + 1)*ido);
-      cr2 = ref(cc,3*k*ido) + taur*tr2;
-      ch[k*ido] = ref(cc,3*k*ido) + tr2;
-      ci3 = 2*taui*ref(cc,(3*k + 2)*ido);
-      ch[(k + l1)*ido] = cr2 - ci3;
-      ch[(k + 2*l1)*ido] = cr2 + ci3;
-    }
-    if (ido == 1) return;
-    for (k=0; k<l1; k++) {
-      for (i=2; i<ido; i+=2) {
-        ic = ido - i;
-        tr2 = ref(cc,i - 1 + (3*k + 2)*ido) + ref(cc,ic - 1 + (3*k + 1)*ido);
-        cr2 = ref(cc,i - 1 + 3*k*ido) + taur*tr2;
-        ch[i - 1 + k*ido] = ref(cc,i - 1 + 3*k*ido) + tr2;
-        ti2 = ref(cc,i + (3*k + 2)*ido) - ref(cc,ic + (3*k + 1)*ido);
-        ci2 = ref(cc,i + 3*k*ido) + taur*ti2;
-        ch[i + k*ido] = ref(cc,i + 3*k*ido) + ti2;
-        cr3 = taui*(ref(cc,i - 1 + (3*k + 2)*ido) - ref(cc,ic - 1 + (3*k + 1)*ido));
-        ci3 = taui*(ref(cc,i + (3*k + 2)*ido) + ref(cc,ic + (3*k + 1)*ido));
-        dr2 = cr2 - ci3;
-        dr3 = cr2 + ci3;
-        di2 = ci2 + cr3;
-        di3 = ci2 - cr3;
-        ch[i - 1 + (k + l1)*ido] = wa1[i - 2]*dr2 - wa1[i - 1]*di2;
-        ch[i + (k + l1)*ido] = wa1[i - 2]*di2 + wa1[i - 1]*dr2;
-        ch[i - 1 + (k + 2*l1)*ido] = wa2[i - 2]*dr3 - wa2[i - 1]*di3;
-        ch[i + (k + 2*l1)*ido] = wa2[i - 2]*di3 + wa2[i - 1]*dr3;
-      }
-    }
-  } /* radb3 */
-
-
-static void radf4(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[])
-  {
-    static const Treal hsqt2 = 0.70710678118654752440;
-    int i, k, ic;
-    Treal ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    for (k=0; k<l1; k++) {
-      tr1 = ref(cc,(k + l1)*ido) + ref(cc,(k + 3*l1)*ido);
-      tr2 = ref(cc,k*ido) + ref(cc,(k + 2*l1)*ido);
-      ch[4*k*ido] = tr1 + tr2;
-      ch[ido-1 + (4*k + 3)*ido] = tr2 - tr1;
-      ch[ido-1 + (4*k + 1)*ido] = ref(cc,k*ido) - ref(cc,(k + 2*l1)*ido);
-      ch[(4*k + 2)*ido] = ref(cc,(k + 3*l1)*ido) - ref(cc,(k + l1)*ido);
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k=0; k<l1; k++) {
-        for (i=2; i<ido; i += 2) {
-          ic = ido - i;
-          cr2 = wa1[i - 2]*ref(cc,i - 1 + (k + l1)*ido) + wa1[i - 1]*ref(cc,i + (k + l1)*ido);
-          ci2 = wa1[i - 2]*ref(cc,i + (k + l1)*ido) - wa1[i - 1]*ref(cc,i - 1 + (k + l1)*ido);
-          cr3 = wa2[i - 2]*ref(cc,i - 1 + (k + 2*l1)*ido) + wa2[i - 1]*ref(cc,i + (k + 2*l1)*
-              ido);
-          ci3 = wa2[i - 2]*ref(cc,i + (k + 2*l1)*ido) - wa2[i - 1]*ref(cc,i - 1 + (k + 2*l1)*
-              ido);
-          cr4 = wa3[i - 2]*ref(cc,i - 1 + (k + 3*l1)*ido) + wa3[i - 1]*ref(cc,i + (k + 3*l1)*
-              ido);
-          ci4 = wa3[i - 2]*ref(cc,i + (k + 3*l1)*ido) - wa3[i - 1]*ref(cc,i - 1 + (k + 3*l1)*
-              ido);
-          tr1 = cr2 + cr4;
-          tr4 = cr4 - cr2;
-          ti1 = ci2 + ci4;
-          ti4 = ci2 - ci4;
-          ti2 = ref(cc,i + k*ido) + ci3;
-          ti3 = ref(cc,i + k*ido) - ci3;
-          tr2 = ref(cc,i - 1 + k*ido) + cr3;
-          tr3 = ref(cc,i - 1 + k*ido) - cr3;
-          ch[i - 1 + 4*k*ido] = tr1 + tr2;
-          ch[ic - 1 + (4*k + 3)*ido] = tr2 - tr1;
-          ch[i + 4*k*ido] = ti1 + ti2;
-          ch[ic + (4*k + 3)*ido] = ti1 - ti2;
-          ch[i - 1 + (4*k + 2)*ido] = ti4 + tr3;
-          ch[ic - 1 + (4*k + 1)*ido] = tr3 - ti4;
-          ch[i + (4*k + 2)*ido] = tr4 + ti3;
-          ch[ic + (4*k + 1)*ido] = tr4 - ti3;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k=0; k<l1; k++) {
-      ti1 = -hsqt2*(ref(cc,ido-1 + (k + l1)*ido) + ref(cc,ido-1 + (k + 3*l1)*ido));
-      tr1 = hsqt2*(ref(cc,ido-1 + (k + l1)*ido) - ref(cc,ido-1 + (k + 3*l1)*ido));
-      ch[ido-1 + 4*k*ido] = tr1 + ref(cc,ido-1 + k*ido);
-      ch[ido-1 + (4*k + 2)*ido] = ref(cc,ido-1 + k*ido) - tr1;
-      ch[(4*k + 1)*ido] = ti1 - ref(cc,ido-1 + (k + 2*l1)*ido);
-      ch[(4*k + 3)*ido] = ti1 + ref(cc,ido-1 + (k + 2*l1)*ido);
-    }
-  } /* radf4 */
-
-
-static void radb4(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[])
-  {
-    static const Treal sqrt2 = 1.41421356237309504880;
-    int i, k, ic;
-    Treal ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-    for (k = 0; k < l1; k++) {
-      tr1 = ref(cc,4*k*ido) - ref(cc,ido-1 + (4*k + 3)*ido);
-      tr2 = ref(cc,4*k*ido) + ref(cc,ido-1 + (4*k + 3)*ido);
-      tr3 = ref(cc,ido-1 + (4*k + 1)*ido) + ref(cc,ido-1 + (4*k + 1)*ido);
-      tr4 = ref(cc,(4*k + 2)*ido) + ref(cc,(4*k + 2)*ido);
-      ch[k*ido] = tr2 + tr3;
-      ch[(k + l1)*ido] = tr1 - tr4;
-      ch[(k + 2*l1)*ido] = tr2 - tr3;
-      ch[(k + 3*l1)*ido] = tr1 + tr4;
-    }
-    if (ido < 2) return;
-    if (ido != 2) {
-      for (k = 0; k < l1; ++k) {
-        for (i = 2; i < ido; i += 2) {
-          ic = ido - i;
-          ti1 = ref(cc,i + 4*k*ido) + ref(cc,ic + (4*k + 3)*ido);
-          ti2 = ref(cc,i + 4*k*ido) - ref(cc,ic + (4*k + 3)*ido);
-          ti3 = ref(cc,i + (4*k + 2)*ido) - ref(cc,ic + (4*k + 1)*ido);
-          tr4 = ref(cc,i + (4*k + 2)*ido) + ref(cc,ic + (4*k + 1)*ido);
-          tr1 = ref(cc,i - 1 + 4*k*ido) - ref(cc,ic - 1 + (4*k + 3)*ido);
-          tr2 = ref(cc,i - 1 + 4*k*ido) + ref(cc,ic - 1 + (4*k + 3)*ido);
-          ti4 = ref(cc,i - 1 + (4*k + 2)*ido) - ref(cc,ic - 1 + (4*k + 1)*ido);
-          tr3 = ref(cc,i - 1 + (4*k + 2)*ido) + ref(cc,ic - 1 + (4*k + 1)*ido);
-          ch[i - 1 + k*ido] = tr2 + tr3;
-          cr3 = tr2 - tr3;
-          ch[i + k*ido] = ti2 + ti3;
-          ci3 = ti2 - ti3;
-          cr2 = tr1 - tr4;
-          cr4 = tr1 + tr4;
-          ci2 = ti1 + ti4;
-          ci4 = ti1 - ti4;
-          ch[i - 1 + (k + l1)*ido] = wa1[i - 2]*cr2 - wa1[i - 1]*ci2;
-          ch[i + (k + l1)*ido] = wa1[i - 2]*ci2 + wa1[i - 1]*cr2;
-          ch[i - 1 + (k + 2*l1)*ido] = wa2[i - 2]*cr3 - wa2[i - 1]*ci3;
-          ch[i + (k + 2*l1)*ido] = wa2[i - 2]*ci3 + wa2[i - 1]*cr3;
-          ch[i - 1 + (k + 3*l1)*ido] = wa3[i - 2]*cr4 - wa3[i - 1]*ci4;
-          ch[i + (k + 3*l1)*ido] = wa3[i - 2]*ci4 + wa3[i - 1]*cr4;
-        }
-      }
-      if (ido % 2 == 1) return;
-    }
-    for (k = 0; k < l1; k++) {
-      ti1 = ref(cc,(4*k + 1)*ido) + ref(cc,(4*k + 3)*ido);
-      ti2 = ref(cc,(4*k + 3)*ido) - ref(cc,(4*k + 1)*ido);
-      tr1 = ref(cc,ido-1 + 4*k*ido) - ref(cc,ido-1 + (4*k + 2)*ido);
-      tr2 = ref(cc,ido-1 + 4*k*ido) + ref(cc,ido-1 + (4*k + 2)*ido);
-      ch[ido-1 + k*ido] = tr2 + tr2;
-      ch[ido-1 + (k + l1)*ido] = sqrt2*(tr1 - ti1);
-      ch[ido-1 + (k + 2*l1)*ido] = ti2 + ti2;
-      ch[ido-1 + (k + 3*l1)*ido] = -sqrt2*(tr1 + ti1);
-    }
-  } /* radb4 */
-
-
-static void radf5(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], const Treal wa4[])
-  {
-    static const Treal tr11 = 0.3090169943749474241;
-    static const Treal ti11 = 0.95105651629515357212;
-    static const Treal tr12 = -0.8090169943749474241;
-    static const Treal ti12 = 0.58778525229247312917;
-    int i, k, ic;
-    Treal ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
-        cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-    for (k = 0; k < l1; k++) {
-      cr2 = ref(cc,(k + 4*l1)*ido) + ref(cc,(k + l1)*ido);
-      ci5 = ref(cc,(k + 4*l1)*ido) - ref(cc,(k + l1)*ido);
-      cr3 = ref(cc,(k + 3*l1)*ido) + ref(cc,(k + 2*l1)*ido);
-      ci4 = ref(cc,(k + 3*l1)*ido) - ref(cc,(k + 2*l1)*ido);
-      ch[5*k*ido] = ref(cc,k*ido) + cr2 + cr3;
-      ch[ido-1 + (5*k + 1)*ido] = ref(cc,k*ido) + tr11*cr2 + tr12*cr3;
-      ch[(5*k + 2)*ido] = ti11*ci5 + ti12*ci4;
-      ch[ido-1 + (5*k + 3)*ido] = ref(cc,k*ido) + tr12*cr2 + tr11*cr3;
-      ch[(5*k + 4)*ido] = ti12*ci5 - ti11*ci4;
-    }
-    if (ido == 1) return;
-    for (k = 0; k < l1; ++k) {
-      for (i = 2; i < ido; i += 2) {
-        ic = ido - i;
-        dr2 = wa1[i - 2]*ref(cc,i - 1 + (k + l1)*ido) + wa1[i - 1]*ref(cc,i + (k + l1)*ido);
-        di2 = wa1[i - 2]*ref(cc,i + (k + l1)*ido) - wa1[i - 1]*ref(cc,i - 1 + (k + l1)*ido);
-        dr3 = wa2[i - 2]*ref(cc,i - 1 + (k + 2*l1)*ido) + wa2[i - 1]*ref(cc,i + (k + 2*l1)*ido);
-        di3 = wa2[i - 2]*ref(cc,i + (k + 2*l1)*ido) - wa2[i - 1]*ref(cc,i - 1 + (k + 2*l1)*ido);
-        dr4 = wa3[i - 2]*ref(cc,i - 1 + (k + 3*l1)*ido) + wa3[i - 1]*ref(cc,i + (k + 3*l1)*ido);
-        di4 = wa3[i - 2]*ref(cc,i + (k + 3*l1)*ido) - wa3[i - 1]*ref(cc,i - 1 + (k + 3*l1)*ido);
-        dr5 = wa4[i - 2]*ref(cc,i - 1 + (k + 4*l1)*ido) + wa4[i - 1]*ref(cc,i + (k + 4*l1)*ido);
-        di5 = wa4[i - 2]*ref(cc,i + (k + 4*l1)*ido) - wa4[i - 1]*ref(cc,i - 1 + (k + 4*l1)*ido);
-        cr2 = dr2 + dr5;
-        ci5 = dr5 - dr2;
-        cr5 = di2 - di5;
-        ci2 = di2 + di5;
-        cr3 = dr3 + dr4;
-        ci4 = dr4 - dr3;
-        cr4 = di3 - di4;
-        ci3 = di3 + di4;
-        ch[i - 1 + 5*k*ido] = ref(cc,i - 1 + k*ido) + cr2 + cr3;
-        ch[i + 5*k*ido] = ref(cc,i + k*ido) + ci2 + ci3;
-        tr2 = ref(cc,i - 1 + k*ido) + tr11*cr2 + tr12*cr3;
-        ti2 = ref(cc,i + k*ido) + tr11*ci2 + tr12*ci3;
-        tr3 = ref(cc,i - 1 + k*ido) + tr12*cr2 + tr11*cr3;
-        ti3 = ref(cc,i + k*ido) + tr12*ci2 + tr11*ci3;
-        tr5 = ti11*cr5 + ti12*cr4;
-        ti5 = ti11*ci5 + ti12*ci4;
-        tr4 = ti12*cr5 - ti11*cr4;
-        ti4 = ti12*ci5 - ti11*ci4;
-        ch[i - 1 + (5*k + 2)*ido] = tr2 + tr5;
-        ch[ic - 1 + (5*k + 1)*ido] = tr2 - tr5;
-        ch[i + (5*k + 2)*ido] = ti2 + ti5;
-        ch[ic + (5*k + 1)*ido] = ti5 - ti2;
-        ch[i - 1 + (5*k + 4)*ido] = tr3 + tr4;
-        ch[ic - 1 + (5*k + 3)*ido] = tr3 - tr4;
-        ch[i + (5*k + 4)*ido] = ti3 + ti4;
-        ch[ic + (5*k + 3)*ido] = ti4 - ti3;
-      }
-    }
-  } /* radf5 */
-
-
-static void radb5(int ido, int l1, const Treal cc[], Treal ch[],
-      const Treal wa1[], const Treal wa2[], const Treal wa3[], const Treal wa4[])
-  {
-    static const Treal tr11 = 0.3090169943749474241;
-    static const Treal ti11 = 0.95105651629515357212;
-    static const Treal tr12 = -0.8090169943749474241;
-    static const Treal ti12 = 0.58778525229247312917;
-    int i, k, ic;
-    Treal ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-        ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-    for (k = 0; k < l1; k++) {
-      ti5 = 2*ref(cc,(5*k + 2)*ido);
-      ti4 = 2*ref(cc,(5*k + 4)*ido);
-      tr2 = 2*ref(cc,ido-1 + (5*k + 1)*ido);
-      tr3 = 2*ref(cc,ido-1 + (5*k + 3)*ido);
-      ch[k*ido] = ref(cc,5*k*ido) + tr2 + tr3;
-      cr2 = ref(cc,5*k*ido) + tr11*tr2 + tr12*tr3;
-      cr3 = ref(cc,5*k*ido) + tr12*tr2 + tr11*tr3;
-      ci5 = ti11*ti5 + ti12*ti4;
-      ci4 = ti12*ti5 - ti11*ti4;
-      ch[(k + l1)*ido] = cr2 - ci5;
-      ch[(k + 2*l1)*ido] = cr3 - ci4;
-      ch[(k + 3*l1)*ido] = cr3 + ci4;
-      ch[(k + 4*l1)*ido] = cr2 + ci5;
-    }
-    if (ido == 1) return;
-    for (k = 0; k < l1; ++k) {
-      for (i = 2; i < ido; i += 2) {
-        ic = ido - i;
-        ti5 = ref(cc,i + (5*k + 2)*ido) + ref(cc,ic + (5*k + 1)*ido);
-        ti2 = ref(cc,i + (5*k + 2)*ido) - ref(cc,ic + (5*k + 1)*ido);
-        ti4 = ref(cc,i + (5*k + 4)*ido) + ref(cc,ic + (5*k + 3)*ido);
-        ti3 = ref(cc,i + (5*k + 4)*ido) - ref(cc,ic + (5*k + 3)*ido);
-        tr5 = ref(cc,i - 1 + (5*k + 2)*ido) - ref(cc,ic - 1 + (5*k + 1)*ido);
-        tr2 = ref(cc,i - 1 + (5*k + 2)*ido) + ref(cc,ic - 1 + (5*k + 1)*ido);
-        tr4 = ref(cc,i - 1 + (5*k + 4)*ido) - ref(cc,ic - 1 + (5*k + 3)*ido);
-        tr3 = ref(cc,i - 1 + (5*k + 4)*ido) + ref(cc,ic - 1 + (5*k + 3)*ido);
-        ch[i - 1 + k*ido] = ref(cc,i - 1 + 5*k*ido) + tr2 + tr3;
-        ch[i + k*ido] = ref(cc,i + 5*k*ido) + ti2 + ti3;
-        cr2 = ref(cc,i - 1 + 5*k*ido) + tr11*tr2 + tr12*tr3;
-
-        ci2 = ref(cc,i + 5*k*ido) + tr11*ti2 + tr12*ti3;
-        cr3 = ref(cc,i - 1 + 5*k*ido) + tr12*tr2 + tr11*tr3;
-
-        ci3 = ref(cc,i + 5*k*ido) + tr12*ti2 + tr11*ti3;
-        cr5 = ti11*tr5 + ti12*tr4;
-        ci5 = ti11*ti5 + ti12*ti4;
-        cr4 = ti12*tr5 - ti11*tr4;
-        ci4 = ti12*ti5 - ti11*ti4;
-        dr3 = cr3 - ci4;
-        dr4 = cr3 + ci4;
-        di3 = ci3 + cr4;
-        di4 = ci3 - cr4;
-        dr5 = cr2 + ci5;
-        dr2 = cr2 - ci5;
-        di5 = ci2 - cr5;
-        di2 = ci2 + cr5;
-        ch[i - 1 + (k + l1)*ido] = wa1[i - 2]*dr2 - wa1[i - 1]*di2;
-        ch[i + (k + l1)*ido] = wa1[i - 2]*di2 + wa1[i - 1]*dr2;
-        ch[i - 1 + (k + 2*l1)*ido] = wa2[i - 2]*dr3 - wa2[i - 1]*di3;
-        ch[i + (k + 2*l1)*ido] = wa2[i - 2]*di3 + wa2[i - 1]*dr3;
-        ch[i - 1 + (k + 3*l1)*ido] = wa3[i - 2]*dr4 - wa3[i - 1]*di4;
-        ch[i + (k + 3*l1)*ido] = wa3[i - 2]*di4 + wa3[i - 1]*dr4;
-        ch[i - 1 + (k + 4*l1)*ido] = wa4[i - 2]*dr5 - wa4[i - 1]*di5;
-        ch[i + (k + 4*l1)*ido] = wa4[i - 2]*di5 + wa4[i - 1]*dr5;
-      }
-    }
-  } /* radb5 */
-
-
-static void radfg(int ido, int ip, int l1, int idl1,
-      Treal cc[], Treal ch[], const Treal wa[])
-  {
-    int idij, ipph, i, j, k, l, j2, ic, jc, lc, ik, is, nbd;    
-    Treal dc2, ai1, ai2, ar1, ar2, ds2, dcp, dsp, ar1h, ar2h;
-    sincos2pi(1, ip, &dsp, &dcp);
-    ipph = (ip + 1) / 2;
-    nbd = (ido - 1) / 2;
-    if (ido != 1) {
-      for (ik=0; ik<idl1; ik++) ch[ik] = cc[ik];
-      for (j=1; j<ip; j++)
-        for (k=0; k<l1; k++)
-          ch[(k + j*l1)*ido] = cc[(k + j*l1)*ido];
-      if (nbd <= l1) {
-        is = -ido;
-        for (j=1; j<ip; j++) {
-          is += ido;
-          idij = is-1;
-          for (i=2; i<ido; i+=2) {
-            idij += 2;
-            for (k=0; k<l1; k++) {
-              ch[i - 1 + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i - 1 + (k + j*l1)*ido] + wa[idij]*cc[i + (k + j*l1)*ido];
-              ch[i + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i + (k + j*l1)*ido] - wa[idij]*cc[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      } else {
-        is = -ido;
-        for (j=1; j<ip; j++) {
-          is += ido;
-          for (k=0; k<l1; k++) {
-            idij = is-1;
-            for (i=2; i<ido; i+=2) {
-              idij += 2;
-              ch[i - 1 + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i - 1 + (k + j*l1)*ido] + wa[idij]*cc[i + (k + j*l1)*ido];
-              ch[i + (k + j*l1)*ido] =
-                  wa[idij - 1]*cc[i + (k + j*l1)*ido] - wa[idij]*cc[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      }
-      if (nbd >= l1) {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (k=0; k<l1; k++) {
-            for (i=2; i<ido; i+=2) {
-              cc[i - 1 + (k + j*l1)*ido] = ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-              cc[i - 1 + (k + jc*l1)*ido] = ch[i + (k + j*l1)*ido] - ch[i + (k + jc*l1)*ido];
-              cc[i + (k + j*l1)*ido] = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-              cc[i + (k + jc*l1)*ido] = ch[i - 1 + (k + jc*l1)*ido] - ch[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      } else {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (i=2; i<ido; i+=2) {
-            for (k=0; k<l1; k++) {
-              cc[i - 1 + (k + j*l1)*ido] =
-                  ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-              cc[i - 1 + (k + jc*l1)*ido] = ch[i + (k + j*l1)*ido] - ch[i + (k + jc*l1)*ido];
-              cc[i + (k + j*l1)*ido] = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-              cc[i + (k + jc*l1)*ido] = ch[i - 1 + (k + jc*l1)*ido] - ch[i - 1 + (k + j*l1)*ido];
-            }
-          }
-        }
-      }
-    } else {  /* now ido == 1 */
-      for (ik=0; ik<idl1; ik++) cc[ik] = ch[ik];
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      for (k=0; k<l1; k++) {
-        cc[(k + j*l1)*ido] = ch[(k + j*l1)*ido] + ch[(k + jc*l1)*ido];
-        cc[(k + jc*l1)*ido] = ch[(k + jc*l1)*ido] - ch[(k + j*l1)*ido];
-      }
-    }
-
-    ar1 = 1;
-    ai1 = 0;    
-    for (l=1; l<ipph; l++) {
-      lc = ip - l;
-      ar1h = dcp*ar1 - dsp*ai1;
-      ai1 = dcp*ai1 + dsp*ar1;
-      ar1 = ar1h;
-      for (ik=0; ik<idl1; ik++) {
-        ch[ik + l*idl1] = cc[ik] + ar1*cc[ik + idl1];
-        ch[ik + lc*idl1] = ai1*cc[ik + (ip-1)*idl1];
-      }
-      dc2 = ar1;
-      ds2 = ai1;
-      ar2 = ar1;
-      ai2 = ai1;
-      for (j=2; j<ipph; j++) {
-        jc = ip - j;
-        ar2h = dc2*ar2 - ds2*ai2;
-        ai2 = dc2*ai2 + ds2*ar2;
-        ar2 = ar2h;
-        for (ik=0; ik<idl1; ik++) {
-          ch[ik + l*idl1] += ar2*cc[ik + j*idl1];
-          ch[ik + lc*idl1] += ai2*cc[ik + jc*idl1];
-        }
-      }
-    }
-    
-    for (j=1; j<ipph; j++)
-      for (ik=0; ik<idl1; ik++)
-        ch[ik] += cc[ik + j*idl1];
-
-    if (ido >= l1) {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido; i++) {
-          ref(cc,i + k*ip*ido) = ch[i + k*ido];
-        }
-      }
-    } else {
-      for (i=0; i<ido; i++) {
-        for (k=0; k<l1; k++) {
-          ref(cc,i + k*ip*ido) = ch[i + k*ido];
-        }
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      j2 = 2*j;
-      for (k=0; k<l1; k++) {
-        ref(cc,ido-1 + (j2 - 1 + k*ip)*ido) =
-            ch[(k + j*l1)*ido];
-        ref(cc,(j2 + k*ip)*ido) =
-            ch[(k + jc*l1)*ido];
-      }
-    }
-    if (ido == 1) return;
-    if (nbd >= l1) {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        j2 = 2*j;
-        for (k=0; k<l1; k++) {
-          for (i=2; i<ido; i+=2) {
-            ic = ido - i;
-            ref(cc,i - 1 + (j2 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,ic - 1 + (j2 - 1 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] - ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,i + (j2 + k*ip)*ido) = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-            ref(cc,ic + (j2 - 1 + k*ip)*ido) = ch[i + (k + jc*l1)*ido] - ch[i + (k + j*l1)*ido];
-          }
-        }
-      }
-    } else {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        j2 = 2*j;
-        for (i=2; i<ido; i+=2) {
-          ic = ido - i;
-          for (k=0; k<l1; k++) {
-            ref(cc,i - 1 + (j2 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] + ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,ic - 1 + (j2 - 1 + k*ip)*ido) = ch[i - 1 + (k + j*l1)*ido] - ch[i - 1 + (k + jc*l1)*ido];
-            ref(cc,i + (j2 + k*ip)*ido) = ch[i + (k + j*l1)*ido] + ch[i + (k + jc*l1)*ido];
-            ref(cc,ic + (j2 - 1 + k*ip)*ido) = ch[i + (k + jc*l1)*ido] - ch[i + (k + j*l1)*ido];
-          }
-        }
-      }
-    }
-  } /* radfg */
-
-
-static void radbg(int ido, int ip, int l1, int idl1,
-      Treal cc[], Treal ch[], const Treal wa[])
-  {
-    int idij, ipph, i, j, k, l, j2, ic, jc, lc, ik, is;
-    Treal dc2, ai1, ai2, ar1, ar2, ds2;
-    int nbd;
-    Treal dcp, dsp, ar1h, ar2h;
-    sincos2pi(1, ip, &dsp, &dcp);
-    nbd = (ido - 1) / 2;
-    ipph = (ip + 1) / 2;
-    if (ido >= l1) {
-      for (k=0; k<l1; k++) {
-        for (i=0; i<ido; i++) {
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-        }
-      }
-    } else {
-      for (i=0; i<ido; i++) {
-        for (k=0; k<l1; k++) {
-          ch[i + k*ido] = ref(cc,i + k*ip*ido);
-        }
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      j2 = 2*j;
-      for (k=0; k<l1; k++) {
-        ch[(k + j*l1)*ido] = ref(cc,ido-1 + (j2 - 1 + k*ip)*ido) + ref(cc,ido-1 + (j2 - 1 + k*ip)*
-            ido);
-        ch[(k + jc*l1)*ido] = ref(cc,(j2 + k*ip)*ido) + ref(cc,(j2 + k*ip)*ido);
-      }
-    }
-
-    if (ido != 1) {
-      if (nbd >= l1) {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (k=0; k<l1; k++) {
-            for (i=2; i<ido; i+=2) {
-              ic = ido - i;
-              ch[i - 1 + (k + j*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) + ref(cc,
-                  ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i - 1 + (k + jc*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) -
-                  ref(cc,ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + j*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) - ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + jc*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) + ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-            }
-          }
-        }
-      } else {
-        for (j=1; j<ipph; j++) {
-          jc = ip - j;
-          for (i=2; i<ido; i+=2) {
-            ic = ido - i;
-            for (k=0; k<l1; k++) {
-              ch[i - 1 + (k + j*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) + ref(cc,
-                  ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i - 1 + (k + jc*l1)*ido] = ref(cc,i - 1 + (2*j + k*ip)*ido) -
-                  ref(cc,ic - 1 + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + j*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) - ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-              ch[i + (k + jc*l1)*ido] = ref(cc,i + (2*j + k*ip)*ido) + ref(cc,ic
-                  + (2*j - 1 + k*ip)*ido);
-            }
-          }
-        }
-      }
-    }
-
-    ar1 = 1;
-    ai1 = 0;
-    for (l=1; l<ipph; l++) {
-      lc = ip - l;
-      ar1h = dcp*ar1 - dsp*ai1;
-      ai1 = dcp*ai1 + dsp*ar1;
-      ar1 = ar1h;
-      for (ik=0; ik<idl1; ik++) {
-        cc[ik + l*idl1] = ch[ik] + ar1*ch[ik + idl1];
-        cc[ik + lc*idl1] = ai1*ch[ik + (ip-1)*idl1];
-      }
-      dc2 = ar1;
-      ds2 = ai1;
-      ar2 = ar1;
-      ai2 = ai1;
-      for (j=2; j<ipph; j++) {
-        jc = ip - j;
-        ar2h = dc2*ar2 - ds2*ai2;
-        ai2 = dc2*ai2 + ds2*ar2;
-        ar2 = ar2h;
-        for (ik=0; ik<idl1; ik++) {
-          cc[ik + l*idl1] += ar2*ch[ik + j*idl1];
-          cc[ik + lc*idl1] += ai2*ch[ik + jc*idl1];
-        }
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      for (ik=0; ik<idl1; ik++) {
-        ch[ik] += ch[ik + j*idl1];
-      }
-    }
-    for (j=1; j<ipph; j++) {
-      jc = ip - j;
-      for (k=0; k<l1; k++) {
-        ch[(k + j*l1)*ido] = cc[(k + j*l1)*ido] - cc[(k + jc*l1)*ido];
-        ch[(k + jc*l1)*ido] = cc[(k + j*l1)*ido] + cc[(k + jc*l1)*ido];
-      }
-    }
-
-    if (ido == 1) return;
-    if (nbd >= l1) {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (k=0; k<l1; k++) {
-          for (i=2; i<ido; i+=2) {
-            ch[i - 1 + (k + j*l1)*ido] = cc[i - 1 + (k + j*l1)*ido] - cc[i + (k + jc*l1)*ido];
-            ch[i - 1 + (k + jc*l1)*ido] = cc[i - 1 + (k + j*l1)*ido] + cc[i + (k + jc*l1)*ido];
-            ch[i + (k + j*l1)*ido] = cc[i + (k + j*l1)*ido] + cc[i - 1 + (k + jc*l1)*ido];
-            ch[i + (k + jc*l1)*ido] = cc[i + (k + j*l1)*ido] - cc[i - 1 + (k + jc*l1)*ido];
-          }
-        }
-      }
-    } else {
-      for (j=1; j<ipph; j++) {
-        jc = ip - j;
-        for (i=2; i<ido; i+=2) {
-          for (k=0; k<l1; k++) {
-            ch[i - 1 + (k + j*l1)*ido] = cc[i - 1 + (k + j*l1)*ido] - cc[i + (k + jc*l1)*ido];
-            ch[i - 1 + (k + jc*l1)*ido] = cc[i - 1 + (k + j *l1)*ido] + cc[i + (k + jc*l1)*ido];
-            ch[i + (k + j*l1)*ido] = cc[i + (k + j*l1)*ido] + cc[i - 1 + (k + jc*l1)*ido];
-            ch[i + (k + jc*l1)*ido] = cc[i + (k + j*l1)*ido] - cc[i - 1 + (k + jc*l1)*ido];
-          }
-        }
-      }
-    }
-    for (ik=0; ik<idl1; ik++) cc[ik] = ch[ik];
-    for (j=1; j<ip; j++)
-      for (k=0; k<l1; k++)
-        cc[(k + j*l1)*ido] = ch[(k + j*l1)*ido];
-    if (nbd <= l1) {
-      is = -ido;
-      for (j=1; j<ip; j++) {
-        is += ido;
-        idij = is-1;
-        for (i=2; i<ido; i+=2) {
-          idij += 2;
-          for (k=0; k<l1; k++) {
-            cc[i - 1 + (k + j*l1)*ido] = wa[idij - 1]*ch[i - 1 + (k + j*l1)*ido] - wa[idij]*
-                ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] = wa[idij - 1]*ch[i + (k + j*l1)*ido] + wa[idij]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    } else {
-      is = -ido;
-      for (j=1; j<ip; j++) {
-        is += ido;
-        for (k=0; k<l1; k++) {
-          idij = is - 1;
-          for (i=2; i<ido; i+=2) {
-            idij += 2;
-            cc[i - 1 + (k + j*l1)*ido] = wa[idij-1]*ch[i - 1 + (k + j*l1)*ido] - wa[idij]*
-                ch[i + (k + j*l1)*ido];
-            cc[i + (k + j*l1)*ido] = wa[idij-1]*ch[i + (k + j*l1)*ido] + wa[idij]*ch[i - 1 + (k + j*l1)*ido];
-          }
-        }
-      }
-    }
-  } /* radbg */
-
-  /* ------------------------------------------------------------
-cfftf1, npy_cfftf, npy_cfftb, cffti1, npy_cffti. Complex FFTs.
---------------------------------------------------------------- */
-
-static void cfftf1(int n, Treal c[], Treal ch[], const Treal wa[], const int ifac[MAXFAC+2], int isign)
-  {
-    int idot, i;
-    int k1, l1, l2;
-    int na, nf, ip, iw, ix2, ix3, ix4, nac, ido, idl1;
-    Treal *cinput, *coutput;
-    nf = ifac[1];
-    na = 0;
-    l1 = 1;
-    iw = 0;
-    for (k1=2; k1<=nf+1; k1++) {
-      ip = ifac[k1];
-      l2 = ip*l1;
-      ido = n / l2;
-      idot = ido + ido;
-      idl1 = idot*l1;
-      if (na) {
-        cinput = ch;
-        coutput = c;
-      } else {
-        cinput = c;
-        coutput = ch;
-      }
-      switch (ip) {
-      case 4:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        passf4(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], isign);
-        na = !na;
-        break;
-      case 2:
-        passf2(idot, l1, cinput, coutput, &wa[iw], isign);
-        na = !na;
-        break;
-      case 3:
-        ix2 = iw + idot;
-        passf3(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], isign);
-        na = !na;
-        break;
-      case 5:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        ix4 = ix3 + idot;
-        passf5(idot, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
-        na = !na;
-        break;
-      default:
-        passf(&nac, idot, ip, l1, idl1, cinput, coutput, &wa[iw], isign);
-        if (nac != 0) na = !na;
-      }
-      l1 = l2;
-      iw += (ip - 1)*idot;
-    }
-    if (na == 0) return;
-    for (i=0; i<2*n; i++) c[i] = ch[i];
-  } /* cfftf1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_cfftf(int n, Treal c[], Treal wsave[])
-  {
-    int iw1, iw2;
-    if (n == 1) return;
-    iw1 = 2*n;
-    iw2 = iw1 + 2*n;
-    cfftf1(n, c, wsave, wsave+iw1, (int*)(wsave+iw2), -1);
-  } /* npy_cfftf */
-
-
-NPY_VISIBILITY_HIDDEN void npy_cfftb(int n, Treal c[], Treal wsave[])
-  {
-    int iw1, iw2;
-    if (n == 1) return;
-    iw1 = 2*n;
-    iw2 = iw1 + 2*n;
-    cfftf1(n, c, wsave, wsave+iw1, (int*)(wsave+iw2), +1);
-  } /* npy_cfftb */
-
-
-static void factorize(int n, int ifac[MAXFAC+2], const int ntryh[NSPECIAL])
-  /* Factorize n in factors in ntryh and rest. On exit,
-ifac[0] contains n and ifac[1] contains number of factors,
-the factors start from ifac[2]. */
-  {
-    int ntry=3, i, j=0, ib, nf=0, nl=n, nq, nr;
-startloop:
-    if (j < NSPECIAL)
-      ntry = ntryh[j];
-    else
-      ntry+= 2;
-    j++;
-    do {
-      nq = nl / ntry;
-      nr = nl - ntry*nq;
-      if (nr != 0) goto startloop;
-      nf++;
-      ifac[nf + 1] = ntry;
-      nl = nq;
-      if (ntry == 2 && nf != 1) {
-        for (i=2; i<=nf; i++) {
-          ib = nf - i + 2;
-          ifac[ib + 1] = ifac[ib];
-        }
-        ifac[2] = 2;
-      }
-    } while (nl != 1);
-    ifac[0] = n;
-    ifac[1] = nf;
-  }
-
-
-static void cffti1(int n, Treal wa[], int ifac[MAXFAC+2])
-  {
-    int fi, idot, i, j;
-    int i1, k1, l1, l2;
-    int ld, ii, nf, ip;
-    int ido, ipm;
-
-    static const int ntryh[NSPECIAL] = {
-      3,4,2,5    }; /* Do not change the order of these. */
-
-    factorize(n,ifac,ntryh);
-    nf = ifac[1];
-    i = 1;
-    l1 = 1;
-    for (k1=1; k1<=nf; k1++) {
-      ip = ifac[k1+1];
-      ld = 0;
-      l2 = l1*ip;
-      ido = n / l2;
-      idot = ido + ido + 2;
-      ipm = ip - 1;
-      for (j=1; j<=ipm; j++) {
-        i1 = i;
-        wa[i-1] = 1;
-        wa[i] = 0;
-        ld += l1;
-        fi = 0;
-        for (ii=4; ii<=idot; ii+=2) {
-          i+= 2;
-          fi+= 1;
-          sincos2pi(fi*ld, n, wa+i, wa+i-1);
-        }
-        if (ip > 5) {
-          wa[i1-1] = wa[i-1];
-          wa[i1] = wa[i];
-        }
-      }
-      l1 = l2;
-    }
-  } /* cffti1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_cffti(int n, Treal wsave[])
- {
-    int iw1, iw2;
-    if (n == 1) return;
-    iw1 = 2*n;
-    iw2 = iw1 + 2*n;
-    cffti1(n, wsave+iw1, (int*)(wsave+iw2));
-  } /* npy_cffti */
-
-  /* -------------------------------------------------------------------
-rfftf1, rfftb1, npy_rfftf, npy_rfftb, rffti1, npy_rffti. Treal FFTs.
----------------------------------------------------------------------- */
-
-static void rfftf1(int n, Treal c[], Treal ch[], const Treal wa[], const int ifac[MAXFAC+2])
-  {
-    int i;
-    int k1, l1, l2, na, kh, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
-    Treal *cinput, *coutput;
-    nf = ifac[1];
-    na = 1;
-    l2 = n;
-    iw = n-1;
-    for (k1 = 1; k1 <= nf; ++k1) {
-      kh = nf - k1;
-      ip = ifac[kh + 2];
-      l1 = l2 / ip;
-      ido = n / l2;
-      idl1 = ido*l1;
-      iw -= (ip - 1)*ido;
-      na = !na;
-      if (na) {
-        cinput = ch;
-        coutput = c;
-      } else {
-        cinput = c;
-        coutput = ch;
-      }
-      switch (ip) {
-      case 4:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        radf4(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3]);
-        break;
-      case 2:
-        radf2(ido, l1, cinput, coutput, &wa[iw]);
-        break;
-      case 3:
-        ix2 = iw + ido;
-        radf3(ido, l1, cinput, coutput, &wa[iw], &wa[ix2]);
-        break;
-      case 5:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        ix4 = ix3 + ido;
-        radf5(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-        break;
-      default:
-        if (ido == 1)
-          na = !na;
-        if (na == 0) {
-          radfg(ido, ip, l1, idl1, c, ch, &wa[iw]);
-          na = 1;
-        } else {
-          radfg(ido, ip, l1, idl1, ch, c, &wa[iw]);
-          na = 0;
-        }
-      }
-      l2 = l1;
-    }
-    if (na == 1) return;
-    for (i = 0; i < n; i++) c[i] = ch[i];
-  } /* rfftf1 */
-
-
-static void rfftb1(int n, Treal c[], Treal ch[], const Treal wa[], const int ifac[MAXFAC+2])
-  {
-    int i;
-    int k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
-    Treal *cinput, *coutput;
-    nf = ifac[1];
-    na = 0;
-    l1 = 1;
-    iw = 0;
-    for (k1=1; k1<=nf; k1++) {
-      ip = ifac[k1 + 1];
-      l2 = ip*l1;
-      ido = n / l2;
-      idl1 = ido*l1;
-      if (na) {
-        cinput = ch;
-        coutput = c;
-      } else {
-        cinput = c;
-        coutput = ch;
-      }
-      switch (ip) {
-      case 4:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        radb4(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3]);
-        na = !na;
-        break;
-      case 2:
-        radb2(ido, l1, cinput, coutput, &wa[iw]);
-        na = !na;
-        break;
-      case 3:
-        ix2 = iw + ido;
-        radb3(ido, l1, cinput, coutput, &wa[iw], &wa[ix2]);
-        na = !na;
-        break;
-      case 5:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        ix4 = ix3 + ido;
-        radb5(ido, l1, cinput, coutput, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-        na = !na;
-        break;
-      default:
-        radbg(ido, ip, l1, idl1, cinput, coutput, &wa[iw]);
-        if (ido == 1) na = !na;
-      }
-      l1 = l2;
-      iw += (ip - 1)*ido;
-    }
-    if (na == 0) return;
-    for (i=0; i<n; i++) c[i] = ch[i];
-  } /* rfftb1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_rfftf(int n, Treal r[], Treal wsave[])
-  {
-    if (n == 1) return;
-    rfftf1(n, r, wsave, wsave+n, (int*)(wsave+2*n));
-  } /* npy_rfftf */
-
-
-NPY_VISIBILITY_HIDDEN void npy_rfftb(int n, Treal r[], Treal wsave[])
-  {
-    if (n == 1) return;
-    rfftb1(n, r, wsave, wsave+n, (int*)(wsave+2*n));
-  } /* npy_rfftb */
-
-
-static void rffti1(int n, Treal wa[], int ifac[MAXFAC+2])
-  {
-    int fi, i, j;
-    int k1, l1, l2;
-    int ld, ii, nf, ip, is;
-    int ido, ipm, nfm1;
-    static const int ntryh[NSPECIAL] = {
-      4,2,3,5    }; /* Do not change the order of these. */
-    factorize(n,ifac,ntryh);
-    nf = ifac[1];
-    is = 0;
-    nfm1 = nf - 1;
-    l1 = 1;
-    if (nfm1 == 0) return;
-    for (k1 = 1; k1 <= nfm1; k1++) {
-      ip = ifac[k1 + 1];
-      ld = 0;
-      l2 = l1*ip;
-      ido = n / l2;
-      ipm = ip - 1;
-      for (j = 1; j <= ipm; ++j) {
-        ld += l1;
-        i = is;
-        fi = 0;
-        for (ii = 3; ii <= ido; ii += 2) {
-          i += 2;
-          fi += 1;
-          sincos2pi(fi*ld, n, wa+i-1, wa+i-2);
-        }
-        is += ido;
-      }
-      l1 = l2;
-    }
-  } /* rffti1 */
-
-
-NPY_VISIBILITY_HIDDEN void npy_rffti(int n, Treal wsave[])
-  {
-    if (n == 1) return;
-    rffti1(n, wsave+n, (int*)(wsave+2*n));
-  } /* npy_rffti */
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/fft/fftpack.h b/numpy/fft/fftpack.h
deleted file mode 100644
index 5e8f463..0000000
--- a/numpy/fft/fftpack.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * This file is part of tela the Tensor Language.
- * Copyright (c) 1994-1995 Pekka Janhunen
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DOUBLE
-
-#ifdef DOUBLE
-#define Treal double
-#else
-#define Treal float
-#endif
-
-extern NPY_VISIBILITY_HIDDEN void npy_cfftf(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_cfftb(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_cffti(int N, Treal wrk[]);
-
-extern NPY_VISIBILITY_HIDDEN void npy_rfftf(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_rfftb(int N, Treal data[], const Treal wrk[]);
-extern NPY_VISIBILITY_HIDDEN void npy_rffti(int N, Treal wrk[]);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/numpy/fft/fftpack_litemodule.c b/numpy/fft/fftpack_litemodule.c
deleted file mode 100644
index bd6cfc1..0000000
--- a/numpy/fft/fftpack_litemodule.c
+++ /dev/null
@@ -1,366 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "Python.h"
-#include "numpy/arrayobject.h"
-#include "fftpack.h"
-
-static PyObject *ErrorObject;
-
-static const char fftpack_cfftf__doc__[] = "";
-
-static PyObject *
-fftpack_cfftf(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data;
-    PyArray_Descr *descr;
-    double *wsave, *dptr;
-    npy_intp nsave;
-    int npts, nrepeats, i;
-
-    if(!PyArg_ParseTuple(args, "OO:cfftf", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_CopyFromObject(op1,
-            NPY_CDOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL) {
-        goto fail;
-    }
-
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    if (nsave != npts*4 + 15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(data)/npts;
-    dptr = (double *)PyArray_DATA(data);
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        npy_cfftf(npts, dptr, wsave);
-        dptr += npts*2;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    return (PyObject *)data;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return NULL;
-}
-
-static const char fftpack_cfftb__doc__[] = "";
-
-static PyObject *
-fftpack_cfftb(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data;
-    PyArray_Descr *descr;
-    double *wsave, *dptr;
-    npy_intp nsave;
-    int npts, nrepeats, i;
-
-    if(!PyArg_ParseTuple(args, "OO:cfftb", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_CopyFromObject(op1,
-            NPY_CDOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL) {
-        goto fail;
-    }
-
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    if (nsave != npts*4 + 15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(data)/npts;
-    dptr = (double *)PyArray_DATA(data);
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        npy_cfftb(npts, dptr, wsave);
-        dptr += npts*2;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    return (PyObject *)data;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return NULL;
-}
-
-static const char fftpack_cffti__doc__[] = "";
-
-static PyObject *
-fftpack_cffti(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyArrayObject *op;
-    npy_intp dim;
-    long n;
-
-    if (!PyArg_ParseTuple(args, "l:cffti", &n)) {
-        return NULL;
-    }
-    /*Magic size needed by npy_cffti*/
-    dim = 4*n + 15;
-    /*Create a 1 dimensional array of dimensions of type double*/
-    op = (PyArrayObject *)PyArray_SimpleNew(1, &dim, NPY_DOUBLE);
-    if (op == NULL) {
-        return NULL;
-    }
-
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    npy_cffti(n, (double *)PyArray_DATA((PyArrayObject*)op));
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-
-    return (PyObject *)op;
-}
-
-static const char fftpack_rfftf__doc__[] = "";
-
-static PyObject *
-fftpack_rfftf(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data, *ret;
-    PyArray_Descr *descr;
-    double *wsave = NULL, *dptr, *rptr;
-    npy_intp nsave;
-    int npts, nrepeats, i, rstep;
-
-    if(!PyArg_ParseTuple(args, "OO:rfftf", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_ContiguousFromObject(op1,
-            NPY_DOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    /* FIXME, direct access changing contents of data->dimensions */
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    PyArray_DIMS(data)[PyArray_NDIM(data) - 1] = npts/2 + 1;
-    ret = (PyArrayObject *)PyArray_Zeros(PyArray_NDIM(data),
-            PyArray_DIMS(data), PyArray_DescrFromType(NPY_CDOUBLE), 0);
-    if (ret == NULL) {
-        goto fail;
-    }
-    PyArray_DIMS(data)[PyArray_NDIM(data) - 1] = npts;
-    rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
-
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL || ret == NULL) {
-        goto fail;
-    }
-    if (nsave != npts*2+15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(data)/npts;
-    rptr = (double *)PyArray_DATA(ret);
-    dptr = (double *)PyArray_DATA(data);
-
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
-        npy_rfftf(npts, rptr+1, wsave);
-        rptr[0] = rptr[1];
-        rptr[1] = 0.0;
-        rptr += rstep;
-        dptr += npts;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return (PyObject *)ret;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_XDECREF(data);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-static const char fftpack_rfftb__doc__[] = "";
-
-static PyObject *
-fftpack_rfftb(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *op1, *op2;
-    PyArrayObject *data, *ret;
-    PyArray_Descr *descr;
-    double *wsave, *dptr, *rptr;
-    npy_intp nsave;
-    int npts, nrepeats, i;
-
-    if(!PyArg_ParseTuple(args, "OO:rfftb", &op1, &op2)) {
-        return NULL;
-    }
-    data = (PyArrayObject *)PyArray_ContiguousFromObject(op1,
-            NPY_CDOUBLE, 1, 0);
-    if (data == NULL) {
-        return NULL;
-    }
-    npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    ret = (PyArrayObject *)PyArray_Zeros(PyArray_NDIM(data), PyArray_DIMS(data),
-            PyArray_DescrFromType(NPY_DOUBLE), 0);
-
-    descr = PyArray_DescrFromType(NPY_DOUBLE);
-    if (PyArray_AsCArray(&op2, (void *)&wsave, &nsave, 1, descr) == -1) {
-        goto fail;
-    }
-    if (data == NULL || ret == NULL) {
-        goto fail;
-    }
-    if (nsave != npts*2 + 15) {
-        PyErr_SetString(ErrorObject, "invalid work array for fft size");
-        goto fail;
-    }
-
-    nrepeats = PyArray_SIZE(ret)/npts;
-    rptr = (double *)PyArray_DATA(ret);
-    dptr = (double *)PyArray_DATA(data);
-
-    Py_BEGIN_ALLOW_THREADS;
-    NPY_SIGINT_ON;
-    for (i = 0; i < nrepeats; i++) {
-        memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
-        rptr[0] = dptr[0];
-        npy_rfftb(npts, rptr, wsave);
-        rptr += npts;
-        dptr += npts*2;
-    }
-    NPY_SIGINT_OFF;
-    Py_END_ALLOW_THREADS;
-    PyArray_Free(op2, (char *)wsave);
-    Py_DECREF(data);
-    return (PyObject *)ret;
-
-fail:
-    PyArray_Free(op2, (char *)wsave);
-    Py_XDECREF(data);
-    Py_XDECREF(ret);
-    return NULL;
-}
-
-static const char fftpack_rffti__doc__[] = "";
-
-static PyObject *
-fftpack_rffti(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-  PyArrayObject *op;
-  npy_intp dim;
-  long n;
-
-  if (!PyArg_ParseTuple(args, "l:rffti", &n)) {
-      return NULL;
-  }
-  /*Magic size needed by npy_rffti*/
-  dim = 2*n + 15;
-  /*Create a 1 dimensional array of dimensions of type double*/
-  op = (PyArrayObject *)PyArray_SimpleNew(1, &dim, NPY_DOUBLE);
-  if (op == NULL) {
-      return NULL;
-  }
-  Py_BEGIN_ALLOW_THREADS;
-  NPY_SIGINT_ON;
-  npy_rffti(n, (double *)PyArray_DATA((PyArrayObject*)op));
-  NPY_SIGINT_OFF;
-  Py_END_ALLOW_THREADS;
-
-  return (PyObject *)op;
-}
-
-
-/* List of methods defined in the module */
-
-static struct PyMethodDef fftpack_methods[] = {
-    {"cfftf",   fftpack_cfftf,  1,      fftpack_cfftf__doc__},
-    {"cfftb",   fftpack_cfftb,  1,      fftpack_cfftb__doc__},
-    {"cffti",   fftpack_cffti,  1,      fftpack_cffti__doc__},
-    {"rfftf",   fftpack_rfftf,  1,      fftpack_rfftf__doc__},
-    {"rfftb",   fftpack_rfftb,  1,      fftpack_rfftb__doc__},
-    {"rffti",   fftpack_rffti,  1,      fftpack_rffti__doc__},
-    {NULL, NULL, 0, NULL}          /* sentinel */
-};
-
-#if PY_MAJOR_VERSION >= 3
-static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        "fftpack_lite",
-        NULL,
-        -1,
-        fftpack_methods,
-        NULL,
-        NULL,
-        NULL,
-        NULL
-};
-#endif
-
-/* Initialization function for the module */
-#if PY_MAJOR_VERSION >= 3
-#define RETVAL(x) x
-PyMODINIT_FUNC PyInit_fftpack_lite(void)
-#else
-#define RETVAL(x)
-PyMODINIT_FUNC
-initfftpack_lite(void)
-#endif
-{
-    PyObject *m,*d;
-#if PY_MAJOR_VERSION >= 3
-    m = PyModule_Create(&moduledef);
-#else
-    static const char fftpack_module_documentation[] = "";
-
-    m = Py_InitModule4("fftpack_lite", fftpack_methods,
-            fftpack_module_documentation,
-            (PyObject*)NULL,PYTHON_API_VERSION);
-#endif
-    if (m == NULL) {
-        return RETVAL(NULL);
-    }
-
-    /* Import the array object */
-    import_array();
-
-    /* Add some symbolic constants to the module */
-    d = PyModule_GetDict(m);
-    ErrorObject = PyErr_NewException("fftpack.error", NULL, NULL);
-    PyDict_SetItemString(d, "error", ErrorObject);
-
-    /* XXXX Add constants here */
-
-    return RETVAL(m);
-}
diff --git a/numpy/fft/helper.py b/numpy/fft/helper.py
index 864768d..a920a4a 100644
--- a/numpy/fft/helper.py
+++ b/numpy/fft/helper.py
@@ -4,11 +4,6 @@
 """
 from __future__ import division, absolute_import, print_function
 
-import collections
-try:
-    import threading
-except ImportError:
-    import dummy_threading as threading
 from numpy.compat import integer_types
 from numpy.core import integer, empty, arange, asarray, roll
 from numpy.core.overrides import array_function_dispatch, set_module
@@ -52,7 +47,7 @@
     --------
     >>> freqs = np.fft.fftfreq(10, 0.1)
     >>> freqs
-    array([ 0.,  1.,  2.,  3.,  4., -5., -4., -3., -2., -1.])
+    array([ 0.,  1.,  2., ..., -3., -2., -1.])
     >>> np.fft.fftshift(freqs)
     array([-5., -4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.])
 
@@ -162,7 +157,7 @@
     >>> timestep = 0.1
     >>> freq = np.fft.fftfreq(n, d=timestep)
     >>> freq
-    array([ 0.  ,  1.25,  2.5 ,  3.75, -5.  , -3.75, -2.5 , -1.25])
+    array([ 0.  ,  1.25,  2.5 , ..., -3.75, -2.5 , -1.25])
 
     """
     if not isinstance(n, integer_types):
@@ -215,7 +210,7 @@
     >>> sample_rate = 100
     >>> freq = np.fft.fftfreq(n, d=1./sample_rate)
     >>> freq
-    array([  0.,  10.,  20.,  30.,  40., -50., -40., -30., -20., -10.])
+    array([  0.,  10.,  20., ..., -30., -20., -10.])
     >>> freq = np.fft.rfftfreq(n, d=1./sample_rate)
     >>> freq
     array([  0.,  10.,  20.,  30.,  40.,  50.])
@@ -227,99 +222,3 @@
     N = n//2 + 1
     results = arange(0, N, dtype=int)
     return results * val
-
-
-class _FFTCache(object):
-    """
-    Cache for the FFT twiddle factors as an LRU (least recently used) cache.
-
-    Parameters
-    ----------
-    max_size_in_mb : int
-        Maximum memory usage of the cache before items are being evicted.
-    max_item_count : int
-        Maximum item count of the cache before items are being evicted.
-
-    Notes
-    -----
-    Items will be evicted if either limit has been reached upon getting and
-    setting. The maximum memory usages is not strictly the given
-    ``max_size_in_mb`` but rather
-    ``max(max_size_in_mb, 1.5 * size_of_largest_item)``. Thus the cache will
-    never be completely cleared - at least one item will remain and a single
-    large item can cause the cache to retain several smaller items even if the
-    given maximum cache size has been exceeded.
-    """
-    def __init__(self, max_size_in_mb, max_item_count):
-        self._max_size_in_bytes = max_size_in_mb * 1024 ** 2
-        self._max_item_count = max_item_count
-        self._dict = collections.OrderedDict()
-        self._lock = threading.Lock()
-
-    def put_twiddle_factors(self, n, factors):
-        """
-        Store twiddle factors for an FFT of length n in the cache.
-
-        Putting multiple twiddle factors for a certain n will store it multiple
-        times.
-
-        Parameters
-        ----------
-        n : int
-            Data length for the FFT.
-        factors : ndarray
-            The actual twiddle values.
-        """
-        with self._lock:
-            # Pop + later add to move it to the end for LRU behavior.
-            # Internally everything is stored in a dictionary whose values are
-            # lists.
-            try:
-                value = self._dict.pop(n)
-            except KeyError:
-                value = []
-            value.append(factors)
-            self._dict[n] = value
-            self._prune_cache()
-
-    def pop_twiddle_factors(self, n):
-        """
-        Pop twiddle factors for an FFT of length n from the cache.
-
-        Will return None if the requested twiddle factors are not available in
-        the cache.
-
-        Parameters
-        ----------
-        n : int
-            Data length for the FFT.
-
-        Returns
-        -------
-        out : ndarray or None
-            The retrieved twiddle factors if available, else None.
-        """
-        with self._lock:
-            if n not in self._dict or not self._dict[n]:
-                return None
-            # Pop + later add to move it to the end for LRU behavior.
-            all_values = self._dict.pop(n)
-            value = all_values.pop()
-            # Only put pack if there are still some arrays left in the list.
-            if all_values:
-                self._dict[n] = all_values
-            return value
-
-    def _prune_cache(self):
-        # Always keep at least one item.
-        while len(self._dict) > 1 and (
-                len(self._dict) > self._max_item_count or self._check_size()):
-            self._dict.popitem(last=False)
-
-    def _check_size(self):
-        item_sizes = [sum(_j.nbytes for _j in _i)
-                      for _i in self._dict.values() if _i]
-        if not item_sizes:
-            return False
-        max_size = max(self._max_size_in_bytes, 1.5 * max(item_sizes))
-        return sum(item_sizes) > max_size
diff --git a/numpy/fft/pocketfft.c b/numpy/fft/pocketfft.c
new file mode 100644
index 0000000..9d1218e
--- /dev/null
+++ b/numpy/fft/pocketfft.c
@@ -0,0 +1,2406 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*
+ *  Main implementation file.
+ *
+ *  Copyright (C) 2004-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "npy_config.h"
+#define restrict NPY_RESTRICT
+
+#define RALLOC(type,num) \
+  ((type *)malloc((num)*sizeof(type)))
+#define DEALLOC(ptr) \
+  do { free(ptr); (ptr)=NULL; } while(0)
+
+#define SWAP(a,b,type) \
+  do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
+
+#ifdef __GNUC__
+#define NOINLINE __attribute__((noinline))
+#define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
+#else
+#define NOINLINE
+#define WARN_UNUSED_RESULT
+#endif
+
+struct cfft_plan_i;
+typedef struct cfft_plan_i * cfft_plan;
+struct rfft_plan_i;
+typedef struct rfft_plan_i * rfft_plan;
+
+// adapted from https://stackoverflow.com/questions/42792939/
+// CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
+static void my_sincosm1pi (double a, double *restrict res)
+  {
+  double s = a * a;
+  /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
+  double r =     -1.0369917389758117e-4;
+  r = fma (r, s,  1.9294935641298806e-3);
+  r = fma (r, s, -2.5806887942825395e-2);
+  r = fma (r, s,  2.3533063028328211e-1);
+  r = fma (r, s, -1.3352627688538006e+0);
+  r = fma (r, s,  4.0587121264167623e+0);
+  r = fma (r, s, -4.9348022005446790e+0);
+  double c = r*s;
+  /* Approximate sin(pi*x) for x in [-0.25,0.25] */
+  r =             4.6151442520157035e-4;
+  r = fma (r, s, -7.3700183130883555e-3);
+  r = fma (r, s,  8.2145868949323936e-2);
+  r = fma (r, s, -5.9926452893214921e-1);
+  r = fma (r, s,  2.5501640398732688e+0);
+  r = fma (r, s, -5.1677127800499516e+0);
+  s = s * a;
+  r = r * s;
+  s = fma (a, 3.1415926535897931e+0, r);
+  res[0] = c;
+  res[1] = s;
+  }
+
+NOINLINE static void calc_first_octant(size_t den, double * restrict res)
+  {
+  size_t n = (den+4)>>3;
+  if (n==0) return;
+  res[0]=1.; res[1]=0.;
+  if (n==1) return;
+  size_t l1=(size_t)sqrt(n);
+  for (size_t i=1; i<l1; ++i)
+    my_sincosm1pi((2.*i)/den,&res[2*i]);
+  size_t start=l1;
+  while(start<n)
+    {
+    double cs[2];
+    my_sincosm1pi((2.*start)/den,cs);
+    res[2*start] = cs[0]+1.;
+    res[2*start+1] = cs[1];
+    size_t end = l1;
+    if (start+end>n) end = n-start;
+    for (size_t i=1; i<end; ++i)
+      {
+      double csx[2]={res[2*i], res[2*i+1]};
+      res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
+      res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
+      }
+    start += l1;
+    }
+  for (size_t i=1; i<l1; ++i)
+    res[2*i] += 1.;
+  }
+
+NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
+  {
+  double * restrict p = res+n;
+  calc_first_octant(n<<1, p);
+  size_t ndone=(n+2)>>2;
+  size_t i=0, idx1=0, idx2=2*ndone-2;
+  for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
+    {
+    res[idx1]   = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    res[idx2]   = p[2*i+3];
+    res[idx2+1] = p[2*i+2];
+    }
+  if (i!=ndone)
+    {
+    res[idx1  ] = p[2*i];
+    res[idx1+1] = p[2*i+1];
+    }
+  }
+
+NOINLINE static void calc_first_half(size_t n, double * restrict res)
+  {
+  int ndone=(n+1)>>1;
+  double * p = res+n-1;
+  calc_first_octant(n<<2, p);
+  int i4=0, in=n, i=0;
+  for (; i4<=in-i4; ++i, i4+=4) // octant 0
+    {
+    res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
+    }
+  for (; i4-in <= 0; ++i, i4+=4) // octant 1
+    {
+    int xm = in-i4;
+    res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
+    }
+  for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
+    {
+    int xm = i4-in;
+    res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
+    }
+  for (; i<ndone; ++i, i4+=4) // octant 3
+    {
+    int xm = 2*in-i4;
+    res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
+    }
+  }
+
+NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
+  {
+  const double hsqt2 = 0.707106781186547524400844362104849;
+  size_t quart = n>>2;
+  if ((n&7)==0)
+    res[quart] = res[quart+1] = hsqt2;
+  for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
+    {
+    res[j  ] = res[i+1];
+    res[j+1] = res[i  ];
+    }
+  }
+
+NOINLINE static void fill_first_half(size_t n, double * restrict res)
+  {
+  size_t half = n>>1;
+  if ((n&3)==0)
+    for (size_t i=0; i<half; i+=2)
+      {
+      res[i+half]   = -res[i+1];
+      res[i+half+1] =  res[i  ];
+      }
+  else
+    for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
+      {
+      res[j  ] = -res[i  ];
+      res[j+1] =  res[i+1];
+      }
+  }
+
+NOINLINE static void fill_second_half(size_t n, double * restrict res)
+  {
+  if ((n&1)==0)
+    for (size_t i=0; i<n; ++i)
+      res[i+n] = -res[i];
+  else
+    for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
+      {
+      res[j  ] =  res[i  ];
+      res[j+1] = -res[i+1];
+      }
+  }
+
+NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
+  {
+  if ((n&3)==0)
+    {
+    calc_first_octant(n, res);
+    fill_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else if ((n&1)==0)
+    {
+    calc_first_quadrant(n, res);
+    fill_first_half(n, res);
+    }
+  else
+    calc_first_half(n, res);
+  }
+
+NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
+  {
+  sincos_2pibyn_half(n, res);
+  fill_second_half(n, res);
+  }
+
+NOINLINE static size_t largest_prime_factor (size_t n)
+  {
+  size_t res=1;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n)
+    { res=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while (((tmp=(n/x))*x)==n)
+    {
+    res=x;
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) res=n;
+
+  return res;
+  }
+
+NOINLINE static double cost_guess (size_t n)
+  {
+  const double lfp=1.1; // penalty for non-hardcoded larger factors
+  size_t ni=n;
+  double result=0.;
+  size_t tmp;
+  while (((tmp=(n>>1))<<1)==n)
+    { result+=2; n=tmp; }
+
+  size_t limit=(size_t)sqrt(n+0.01);
+  for (size_t x=3; x<=limit; x+=2)
+  while ((tmp=(n/x))*x==n)
+    {
+    result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
+    n=tmp;
+    limit=(size_t)sqrt(n+0.01);
+    }
+  if (n>1) result+=(n<=5) ? n : lfp*n;
+
+  return result*ni;
+  }
+
+/* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
+NOINLINE static size_t good_size(size_t n)
+  {
+  if (n<=6) return n;
+
+  size_t bestfac=2*n;
+  for (size_t f2=1; f2<bestfac; f2*=2)
+    for (size_t f23=f2; f23<bestfac; f23*=3)
+      for (size_t f235=f23; f235<bestfac; f235*=5)
+        for (size_t f2357=f235; f2357<bestfac; f2357*=7)
+          for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
+            if (f235711>=n) bestfac=f235711;
+  return bestfac;
+  }
+
+typedef struct cmplx {
+  double r,i;
+} cmplx;
+
+#define NFCT 25
+typedef struct cfftp_fctdata
+  {
+  size_t fct;
+  cmplx *tw, *tws;
+  } cfftp_fctdata;
+
+typedef struct cfftp_plan_i
+  {
+  size_t length, nfct;
+  cmplx *mem;
+  cfftp_fctdata fct[NFCT];
+  } cfftp_plan_i;
+typedef struct cfftp_plan_i * cfftp_plan;
+
+#define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
+#define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
+#define SCALEC(a,b) { a.r*=b; a.i*=b; }
+#define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
+#define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define WA(x,i) wa[(i)-1+(x)*(ido-1)]
+/* a = b*c */
+#define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
+/* a = conj(b)*c*/
+#define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
+
+#define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
+/* a = b*c */
+#define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
+/* a *= b */
+#define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
+
+NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=2;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx t;
+        PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
+        A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
+        }
+      }
+  }
+
+#define PREP3(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2; \
+        PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
+        CH(idx,k,0).r=t0.r+t1.r; \
+        CH(idx,k,0).i=t0.i+t1.i;
+#define PARTSTEP3a(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+
+#define PARTSTEP3b(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= 0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3b(1,2,tw1r,tw1i)
+        }
+      }
+  }
+#define PARTSTEP3f(u1,u2,twr,twi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twr*t1.r; \
+        ca.i=t0.i+twr*t1.i; \
+        cb.i=twi*t2.r; \
+        cb.r=-(twi*t2.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=3;
+  const double tw1r=-0.5, tw1i= -0.86602540378443864676;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP3(0)
+      PARTSTEP3a(1,2,tw1r,tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP3(i)
+        PARTSTEP3f(1,2,tw1r,tw1i)
+        }
+      }
+  }
+
+NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROT90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROT90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=4;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC(CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      cmplx t1, t2, t3, t4;
+      PMC(t2,t1,CC(0,0,k),CC(0,2,k))
+      PMC(t3,t4,CC(0,1,k),CC(0,3,k))
+      ROTM90(t4)
+      PMC(CH(0,k,0),CH(0,k,2),t2,t3)
+      PMC (CH(0,k,1),CH(0,k,3),t1,t4)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        cmplx c2, c3, c4, t1, t2, t3, t4;
+        cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
+        PMC(t2,t1,cc0,cc2)
+        PMC(t3,t4,cc1,cc3)
+        ROTM90(t4)
+        cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
+        PMC(CH(i,k,0),c3,t2,t3)
+        PMC(c2,c4,t1,t4)
+        A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
+        A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
+        A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
+        }
+      }
+  }
+
+#define PREP5(idx) \
+        cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
+        PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
+        PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
+        CH(idx,k,0).r=t0.r+t1.r+t2.r; \
+        CH(idx,k,0).i=t0.i+t1.i+t2.i;
+
+#define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
+        }
+
+#define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= 0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= 0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+#define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
+        { \
+        cmplx ca,cb,da,db; \
+        ca.r=t0.r+twar*t1.r+twbr*t2.r; \
+        ca.i=t0.i+twar*t1.i+twbr*t2.i; \
+        cb.i=twai*t4.r twbi*t3.r; \
+        cb.r=-(twai*t4.i twbi*t3.i); \
+        PMC(da,db,ca,cb) \
+        A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
+        A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa)
+  {
+  const size_t cdim=5;
+  const double tw1r= 0.3090169943749474241,
+               tw1i= -0.95105651629515357212,
+               tw2r= -0.8090169943749474241,
+               tw2i= -0.58778525229247312917;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP5(0)
+      PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
+      PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP5(i)
+        PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
+        PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
+        }
+      }
+  }
+
+#define PREP7(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
+        PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
+        PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
+        PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
+
+#define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
+        cb.i=y1*t7.r y2*t6.r y3*t5.r; \
+        cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
+        { \
+        cmplx da,db; \
+        PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  {
+  const size_t cdim=7;
+  const double tw1r= 0.623489801858733530525,
+               tw1i= sign * 0.7818314824680298087084,
+               tw2r= -0.222520933956314404289,
+               tw2i= sign * 0.9749279121818236070181,
+               tw3r= -0.9009688679024191262361,
+               tw3i= sign * 0.4338837391175581204758;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP7(0)
+      PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+      PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+      PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP7(i)
+        PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
+        PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
+        PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
+        }
+      }
+  }
+
+#define PREP11(idx) \
+        cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
+        PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
+        PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
+        PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
+        PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
+        PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
+        CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
+        CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
+
+#define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
+        { \
+        cmplx ca,cb; \
+        ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
+        ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
+        cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
+        cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
+        PMC(out1,out2,ca,cb) \
+        }
+#define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
+#define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
+        { \
+        cmplx da,db; \
+        PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
+        MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
+        MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
+        }
+
+NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
+  cmplx * restrict ch, const cmplx * restrict wa, const int sign)
+  {
+  const size_t cdim=11;
+  const double tw1r =        0.8412535328311811688618,
+               tw1i = sign * 0.5406408174555975821076,
+               tw2r =        0.4154150130018864255293,
+               tw2i = sign * 0.9096319953545183714117,
+               tw3r =       -0.1423148382732851404438,
+               tw3i = sign * 0.9898214418809327323761,
+               tw4r =       -0.6548607339452850640569,
+               tw4i = sign * 0.755749574354258283774,
+               tw5r =       -0.9594929736144973898904,
+               tw5i = sign * 0.2817325568414296977114;
+
+  if (ido==1)
+    for (size_t k=0; k<l1; ++k)
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+  else
+    for (size_t k=0; k<l1; ++k)
+      {
+      {
+      PREP11(0)
+      PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+      PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+      PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+      PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+      PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+      }
+      for (size_t i=1; i<ido; ++i)
+        {
+        PREP11(i)
+        PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
+        PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
+        PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
+        PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
+        PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
+        }
+      }
+  }
+
+#define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CX2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
+  cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
+  const cmplx * restrict csarr, const int sign)
+  {
+  const size_t cdim=ip;
+  size_t ipph = (ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  cmplx * restrict wal=RALLOC(cmplx,ip);
+  if (!wal) return -1;
+  wal[0]=(cmplx){1.,0.};
+  for (size_t i=1; i<ip; ++i)
+    wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
+
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=0; i<ido; ++i)
+        PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
+  for (size_t k=0; k<l1; ++k)
+    for (size_t i=0; i<ido; ++i)
+      {
+      cmplx tmp = CH(i,k,0);
+      for (size_t j=1; j<ipph; ++j)
+        ADDC(tmp,tmp,CH(i,k,j))
+      CX(i,k,0) = tmp;
+      }
+  for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
+    {
+    // j=0
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
+      CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
+      CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
+      CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
+      }
+
+    size_t iwal=2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-1; j+=2, jc-=2)
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal2=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
+        }
+      }
+    for (; j<ipph; ++j, --jc)
+      {
+      iwal+=l; if (iwal>ip) iwal-=ip;
+      cmplx xwal=wal[iwal];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        CX2(ik,l).r += CH2(ik,j).r*xwal.r;
+        CX2(ik,l).i += CH2(ik,j).i*xwal.r;
+        CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
+        CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
+        }
+      }
+    }
+  DEALLOC(wal);
+
+  // shuffling and twiddling
+  if (ido==1)
+    for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
+        PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
+        }
+  else
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
+      for (size_t k=0; k<l1; ++k)
+        {
+        cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
+        PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
+        for (size_t i=1; i<ido; ++i)
+          {
+          cmplx x1, x2;
+          PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
+          size_t idij=(j-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,j),wa[idij],x1)
+          idij=(jc-1)*(ido-1)+i-1;
+          MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
+          }
+        }
+    }
+  return 0;
+  }
+
+#undef CH2
+#undef CX2
+#undef CX
+
+/* Run the complete multi-pass complex FFT described by 'plan' on c
+   (length plan->length), in direction 'sign' (>0 backward, otherwise
+   forward), then scale the result by fct.
+   Returns 0 on success, -1 on allocation or passg failure. */
+NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
+  const int sign)
+  {
+  if (plan->length==1) return 0;
+  size_t len=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  /* scratch buffer: the individual passes ping-pong between c and ch */
+  cmplx *ch = RALLOC(cmplx, len);
+  if (!ch) return -1;
+  cmplx *p1=c, *p2=ch;
+
+  for(size_t k1=0; k1<nf; k1++)
+    {
+    size_t ip=plan->fct[k1].fct;
+    size_t l2=ip*l1;
+    size_t ido = len/l2;
+    if     (ip==4)
+      sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==2)
+      sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==3)
+      sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==5)
+      sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
+             : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
+    else if(ip==7)  pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
+    else
+      {
+      if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
+        { DEALLOC(ch); return -1; }
+      /* NOTE(review): this extra swap cancels the unconditional one below —
+         presumably passg leaves its result in its first buffer; confirm
+         against passg before touching */
+      SWAP(p1,p2,cmplx *);
+      }
+    SWAP(p1,p2,cmplx *);
+    l1=l2;
+    }
+  /* normalise the result back into the caller's buffer c */
+  if (p1!=c)
+    {
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r = ch[i].r*fct;   /* p1!=c implies p1==ch here */
+        c[i].i = ch[i].i*fct;
+        }
+    else
+      memcpy (c,p1,len*sizeof(cmplx));
+    }
+  else
+    if (fct!=1.)
+      for (size_t i=0; i<len; ++i)
+        {
+        c[i].r *= fct;
+        c[i].i *= fct;
+        }
+  DEALLOC(ch);
+  return 0;
+  }
+
+#undef PMSIGNC
+#undef A_EQ_B_MUL_C
+#undef A_EQ_CB_MUL_C
+#undef MULPMSIGNC
+#undef MULPMSIGNCEQ
+
+#undef WA
+#undef CC
+#undef CH
+#undef ROT90
+#undef SCALEC
+#undef ADDC
+#undef PMC
+
+/* Forward (sign=-1) complex transform of c, scaled by fct.  0 on success. */
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_forward(cfftp_plan plan, double c[], double fct)
+  { return pass_all(plan,(cmplx *)c, fct, -1); }
+
+/* Backward (sign=+1) complex transform of c, scaled by fct.  0 on success. */
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_backward(cfftp_plan plan, double c[], double fct)
+  { return pass_all(plan,(cmplx *)c, fct, 1); }
+
+/* Factor plan->length into the radix list plan->fct[] used by pass_all:
+   factors of 4 first (with a single 2 swapped to the front if present),
+   then odd prime factors in increasing order, then any prime remainder.
+   Returns 0 on success, -1 if more than NFCT factors would be needed. */
+NOINLINE WARN_UNUSED_RESULT
+static int cfftp_factorize (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1)
+    {
+    // remaining factor is prime; guard the slot like every write above
+    // (previously this store was unchecked and could overflow fct[NFCT])
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=length;
+    }
+  plan->nfct=nfct;
+  return 0;
+  }
+
+/* Number of cmplx twiddle entries the plan's factors require altogether:
+   (ip-1)*(ido-1) per pass, plus ip extra entries for factors above 11,
+   which take the generic passg path. */
+NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
+  {
+  size_t total=0;
+  size_t stride=1;
+  for (size_t idx=0; idx<plan->nfct; ++idx)
+    {
+    size_t fac=plan->fct[idx].fct;
+    size_t blk=plan->length/(stride*fac);
+    total += (fac-1)*(blk-1);
+    if (fac>11)
+      total += fac;   /* extra table consumed by passg */
+    stride *= fac;
+    }
+  return total;
+  }
+
+/* Fill plan->mem with the twiddle factors for every pass, pointing each
+   fct[k].tw (and, for factors > 11, fct[k].tws) into that storage.
+   Layout must stay in sync with cfftp_twsize.  Returns 0 on success,
+   -1 if the temporary sin/cos table cannot be allocated. */
+NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
+  {
+  size_t length=plan->length;
+  /* twid holds interleaved (cos,sin) pairs of 2*pi*k/length */
+  double *twid = RALLOC(double, 2*length);
+  if (!twid) return -1;
+  sincos_2pibyn(length, twid);
+  size_t l1=1;
+  size_t memofs=0;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
+    plan->fct[k].tw=plan->mem+memofs;
+    memofs+=(ip-1)*(ido-1);
+    for (size_t j=1; j<ip; ++j)
+      for (size_t i=1; i<ido; ++i)
+        {
+        plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
+        plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
+        }
+    if (ip>11)
+      {
+      /* generic passg additionally needs the ip roots of unity themselves */
+      plan->fct[k].tws=plan->mem+memofs;
+      memofs+=ip;
+      for (size_t j=0; j<ip; ++j)
+        {
+        plan->fct[k].tws[j].r = twid[2*j*l1*ido];
+        plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(twid);
+  return 0;
+  }
+
+/* Allocate and fully initialise a complex-FFT plan for the given length.
+   Returns NULL on length==0 or on any allocation/factorization failure;
+   all partial allocations are released on the error paths. */
+static cfftp_plan make_cfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  cfftp_plan plan = RALLOC(cfftp_plan_i,1);
+  if (!plan) return NULL;
+  plan->length=length;
+  plan->nfct=0;
+  for (size_t i=0; i<NFCT; ++i)
+    plan->fct[i]=(cfftp_fctdata){0,0,0};
+  plan->mem=0;
+  if (length==1) return plan;  /* trivial transform needs no twiddles */
+  if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
+  size_t tws=cfftp_twsize(plan);
+  plan->mem=RALLOC(cmplx,tws);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (cfftp_comp_twiddle(plan)!=0)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  return plan;
+  }
+
+/* Release a plan created by make_cfftp_plan (twiddle storage, then plan). */
+static void destroy_cfftp_plan (cfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+/* Per-factor data for the real-valued FFT: the factor itself plus pointers
+   into the plan's twiddle storage (tws is only set for factors > 5, which
+   take the generic radfg/radbg path). */
+typedef struct rfftp_fctdata
+  {
+  size_t fct;
+  double *tw, *tws;
+  } rfftp_fctdata;
+
+/* Complete real-FFT plan: transform length, its factorisation, and the
+   shared twiddle-factor storage that the fct[].tw/tws pointers reference. */
+typedef struct rfftp_plan_i
+  {
+  size_t length, nfct;
+  double *mem;
+  rfftp_fctdata fct[NFCT];
+  } rfftp_plan_i;
+typedef struct rfftp_plan_i * rfftp_plan;
+
+/* Twiddle access and butterfly helpers for the real-FFT passes below. */
+#define WA(x,i) wa[(i)+(x)*(ido-1)]
+/* (a,b) = (c+d, c-d) */
+#define PM(a,b,c,d) { a=c+d; b=c-d; }
+/* (a+ib) = conj(c+id) * (e+if) */
+#define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
+
+/* input/output indexing for the forward (analysis) passes */
+#define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
+
+/* Forward real-FFT pass for factor 2: cc is the input block, ch the output,
+   wa the twiddle table.  The three sections handle the i==0 column, the
+   extra column present when ido is even, and the general twiddled columns. */
+NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(    0,1,k) = -CC(ido-1,k,1);
+      CH(ido-1,0,k) =  CC(ido-1,k,0);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, ti2;
+      MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
+      PM (CH(i  ,0,k),CH(ic  ,1,k),ti2,CC(i  ,k,0))
+      }
+  }
+
+/* Forward real-FFT pass for factor 3.  taur/taui are the real and imaginary
+   parts of exp(2*pi*i/3). */
+NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2=CC(0,k,1)+CC(0,k,2);
+    CH(0,0,k) = CC(0,k,0)+cr2;
+    CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
+    CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double di2, di3, dr2, dr3;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
+      double cr2=dr2+dr3; // c add
+      double ci2=di2+di3;
+      CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
+      CH(i  ,0,k) = CC(i  ,k,0)+ci2;
+      double tr2 = CC(i-1,k,0)+taur*cr2; // c add
+      double ti2 = CC(i  ,k,0)+taur*ci2;
+      double tr3 = taui*(di2-di3);  // t3 = taui*i*(d3-d2)?
+      double ti3 = taui*(dr3-dr2);
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
+      }
+  }
+
+/* Forward real-FFT pass for factor 4.  hsqt2 = sqrt(2)/2, used in the
+   extra even-ido column. */
+NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double hsqt2=0.70710678118654752440;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1,tr2;
+    PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
+    PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
+    PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
+    }
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
+      double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
+      PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
+      PM (CH(    0,3,k),CH(    0,1,k),ti1,CC(ido-1,k,2))
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      PM(tr1,tr4,cr4,cr2)
+      PM(ti1,ti4,ci2,ci4)
+      PM(tr2,tr3,CC(i-1,k,0),cr3)
+      PM(ti2,ti3,CC(i  ,k,0),ci3)
+      PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
+      PM(CH(i  ,0,k),CH(ic  ,3,k),ti1,ti2)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),tr4,ti3)
+      }
+  }
+
+/* Forward real-FFT pass for factor 5.  tr11/ti11 and tr12/ti12 are the real
+   and imaginary parts of the first and second fifth roots of unity. */
+NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double cr2, cr3, ci4, ci5;
+    PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
+    PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
+    CH(0,0,k)=CC(0,k,0)+cr2+cr3;
+    CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
+    CH(0,2,k)=ti11*ci5+ti12*ci4;
+    CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
+    CH(0,4,k)=ti12*ci5-ti11*ci4;
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
+         dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+      size_t ic=ido-i;
+      MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
+      MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
+      MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
+      MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
+      PM(cr2,ci5,dr5,dr2)
+      PM(ci2,cr5,di2,di5)
+      PM(cr3,ci4,dr4,dr3)
+      PM(ci3,cr4,di3,di4)
+      CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
+      CH(i  ,0,k)=CC(i  ,k,0)+ci2+ci3;
+      tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
+      ti2=CC(i  ,k,0)+tr11*ci2+tr12*ci3;
+      tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
+      ti3=CC(i  ,k,0)+tr12*ci2+tr11*ci3;
+      MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
+      MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
+      PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
+      PM(CH(i  ,2,k),CH(ic  ,1,k),ti5,ti2)
+      PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
+      PM(CH(i  ,4,k),CH(ic  ,3,k),ti4,ti3)
+      }
+  }
+
+#undef CC
+#undef CH
+/* index helpers for the generic forward pass (radfg): C1/C2 give two views
+   of the in/out buffer cc, CH2 a flat view of ch; cdim==ip rows per block */
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+/* Generic forward real-FFT pass for an arbitrary factor ip (used for factors
+   not handled by the specialised radf2..radf5 routines; per the factorizer
+   these are primes > 5).  wa is the per-column twiddle table, csarr the
+   interleaved (cos,sin) table of the ip-th roots of unity.  Both cc and ch
+   are used as scratch; the final result lands in cc.  The numbered comments
+   reference loop labels of the original FFTPACK source. */
+NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/2;
+  size_t idl1 = ido*l1;
+
+  if (ido>1)
+    {
+    /* apply twiddle factors to column pairs (j, ip-j) in place */
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)              // 114
+      {
+      size_t is=(j-1)*(ido-1),
+             is2=(jc-1)*(ido-1);
+      for (size_t k=0; k<l1; ++k)                            // 113
+        {
+        size_t idij=is;
+        size_t idij2=is2;
+        for (size_t i=1; i<=ido-2; i+=2)                      // 112
+          {
+          double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
+                 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
+          double x1=wa[idij]*t1 + wa[idij+1]*t2,
+                 x2=wa[idij]*t2 - wa[idij+1]*t1,
+                 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
+                 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
+          C1(i  ,k,j ) = x1+x3;
+          C1(i  ,k,jc) = x2-x4;
+          C1(i+1,k,j ) = x2+x4;
+          C1(i+1,k,jc) = x3-x1;
+          idij+=2;
+          idij2+=2;
+          }
+        }
+      }
+    }
+
+  /* fold symmetric column pairs into sum/difference form */
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 123
+    for (size_t k=0; k<l1; ++k)                              // 122
+      {
+      double t1=C1(0,k,j), t2=C1(0,k,jc);
+      C1(0,k,j ) = t1+t2;
+      C1(0,k,jc) = t2-t1;
+      }
+
+//everything in C
+//memset(ch,0,ip*l1*ido*sizeof(double));
+
+  /* accumulate the DFT rows into ch, walking the roots-of-unity table with
+     the running index iang; the loop is 4-way unrolled with 2-way and
+     1-way tails */
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)                 // 127
+    {
+    for (size_t ik=0; ik<idl1; ++ik)                         // 124
+      {
+      CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
+      CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
+      }
+    size_t iang = 2*l;
+    size_t j=3, jc=ip-3;
+    for (; j<ipph-3; j+=4,jc-=4)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
+                     +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
+                     +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
+        }
+      }
+    for (; j<ipph-1; j+=2,jc-=2)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
+        CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
+        }
+      }
+    for (; j<ipph; ++j,--jc)              // 126
+      {
+      iang+=l; if (iang>=ip) iang-=ip;
+      double ar=csarr[2*iang], ai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)                       // 125
+        {
+        CH2(ik,l ) += ar*C2(ik,j );
+        CH2(ik,lc) += ai*C2(ik,jc);
+        }
+      }
+    }
+  /* DC row: plain sum of all columns */
+  for (size_t ik=0; ik<idl1; ++ik)                         // 101
+    CH2(ik,0) = C2(ik,0);
+  for (size_t j=1; j<ipph; ++j)                              // 129
+    for (size_t ik=0; ik<idl1; ++ik)                         // 128
+      CH2(ik,0) += C2(ik,j);
+
+// everything in CH at this point!
+//memset(cc,0,ip*l1*ido*sizeof(double));
+
+  /* repack from ch into cc's half-complex output layout */
+  for (size_t k=0; k<l1; ++k)                                // 131
+    for (size_t i=0; i<ido; ++i)                             // 130
+      CC(i,0,k) = CH(i,k,0);
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 137
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)                              // 136
+      {
+      CC(ido-1,j2,k) = CH(0,k,j);
+      CC(0,j2+1,k) = CH(0,k,jc);
+      }
+    }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)                // 140
+    {
+    size_t j2=2*j-1;
+    for(size_t k=0; k<l1; ++k)                               // 139
+      for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 138
+        {
+        CC(i   ,j2+1,k) = CH(i  ,k,j )+CH(i  ,k,jc);
+        CC(ic  ,j2  ,k) = CH(i  ,k,j )-CH(i  ,k,jc);
+        CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
+        CC(ic+1,j2  ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
+        }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CH
+#undef CC
+/* index layouts for the backward (synthesis) passes radb2..radb5 */
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+
+/* Backward real-FFT pass for factor 2: inverse of radf2, reading the packed
+   half-complex input cc and writing the expanded result to ch. */
+NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=2;
+
+  for (size_t k=0; k<l1; k++)
+    PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
+      CH(ido-1,k,1) =-2.*CC(0    ,1,k);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double ti2, tr2;
+      PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
+      PM (ti2,CH(i  ,k,0),CC(i  ,0,k),CC(ic  ,1,k))
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
+      }
+  }
+
+/* Backward real-FFT pass for factor 3 (inverse of radf3). */
+NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=3;
+  static const double taur=-0.5, taui=0.86602540378443864676;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr2=2.*CC(ido-1,1,k);
+    double cr2=CC(0,0,k)+taur*tr2;
+    CH(0,k,0)=CC(0,0,k)+tr2;
+    double ci3=2.*taui*CC(0,2,k);
+    PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1; k++)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
+      double ti2=CC(i  ,2,k)-CC(ic  ,1,k);
+      double cr2=CC(i-1,0,k)+taur*tr2;     // c2=CC +taur*t2
+      double ci2=CC(i  ,0,k)+taur*ti2;
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2;         // CH=CC+t2
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2;
+      double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
+      double ci3=taui*(CC(i  ,2,k)+CC(ic  ,1,k));
+      double di2, di3, dr2, dr3;
+      PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
+      PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      }
+  }
+
+/* Backward real-FFT pass for factor 4 (inverse of radf4). */
+NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=4;
+  static const double sqrt2=1.41421356237309504880;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double tr1, tr2;
+    PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
+    double tr3=2.*CC(ido-1,1,k);
+    double tr4=2.*CC(0,2,k);
+    PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
+    PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
+    }
+  if ((ido&1)==0)
+    for (size_t k=0; k<l1; k++)
+      {
+      double tr1,tr2,ti1,ti2;
+      PM (ti1,ti2,CC(0    ,3,k),CC(0    ,1,k))
+      PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
+      CH(ido-1,k,0)=tr2+tr2;
+      CH(ido-1,k,1)=sqrt2*(tr1-ti1);
+      CH(ido-1,k,2)=ti2+ti2;
+      CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
+      }
+  if (ido<=2) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+      size_t ic=ido-i;
+      PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
+      PM (ti1,ti2,CC(i  ,0,k),CC(ic  ,3,k))
+      PM (tr4,ti3,CC(i  ,2,k),CC(ic  ,1,k))
+      PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
+      PM (CH(i-1,k,0),cr3,tr2,tr3)
+      PM (CH(i  ,k,0),ci3,ti2,ti3)
+      PM (cr4,cr2,tr1,tr4)
+      PM (ci2,ci4,ti1,ti4)
+      MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
+      MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
+      MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
+      }
+  }
+
+/* Backward real-FFT pass for factor 5 (inverse of radf5). */
+NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
+  double * restrict ch, const double * restrict wa)
+  {
+  const size_t cdim=5;
+  static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
+                      tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
+
+  for (size_t k=0; k<l1; k++)
+    {
+    double ti5=CC(0,2,k)+CC(0,2,k);
+    double ti4=CC(0,4,k)+CC(0,4,k);
+    double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
+    double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
+    CH(0,k,0)=CC(0,0,k)+tr2+tr3;
+    double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
+    double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
+    double ci4, ci5;
+    MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+    PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
+    PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
+    }
+  if (ido==1) return;
+  for (size_t k=0; k<l1;++k)
+    for (size_t i=2; i<ido; i+=2)
+      {
+      size_t ic=ido-i;
+      double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
+      PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
+      PM(ti5,ti2,CC(i  ,2,k),CC(ic  ,1,k))
+      PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
+      PM(ti4,ti3,CC(i  ,4,k),CC(ic  ,3,k))
+      CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
+      CH(i  ,k,0)=CC(i  ,0,k)+ti2+ti3;
+      double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
+      double ci2=CC(i  ,0,k)+tr11*ti2+tr12*ti3;
+      double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
+      double ci3=CC(i  ,0,k)+tr12*ti2+tr11*ti3;
+      double ci4, ci5, cr5, cr4;
+      MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
+      MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
+      double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
+      PM(dr4,dr3,cr3,ci4)
+      PM(di3,di4,ci3,cr4)
+      PM(dr5,dr2,cr2,ci5)
+      PM(di2,di5,ci2,cr5)
+      MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
+      MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
+      MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
+      MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
+      }
+  }
+
+#undef CC
+#undef CH
+/* index helpers for the generic backward pass (radbg); same scheme as the
+   radfg helpers, with the roles of cc and ch reversed in the final stage */
+#define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
+#define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
+#define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
+#define C2(a,b) cc[(a)+idl1*(b)]
+#define CH2(a,b) ch[(a)+idl1*(b)]
+
+/* Generic backward real-FFT pass for an arbitrary factor ip (the inverse of
+   radfg; per the factorizer the factors reaching this path are primes > 5).
+   wa is the twiddle table, csarr the interleaved (cos,sin) roots-of-unity
+   table.  cc and ch are both used as scratch; the final result lands in ch.
+   The numbered comments reference loop labels of the original FFTPACK. */
+NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
+  double * restrict cc, double * restrict ch, const double * restrict wa,
+  const double * restrict csarr)
+  {
+  const size_t cdim=ip;
+  size_t ipph=(ip+1)/ 2;
+  size_t idl1 = ido*l1;
+
+  /* unpack the half-complex input layout into ch */
+  for (size_t k=0; k<l1; ++k)        // 102
+    for (size_t i=0; i<ido; ++i)     // 101
+      CH(i,k,0) = CC(i,0,k);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)   // 108
+    {
+    size_t j2=2*j-1;
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = 2*CC(ido-1,j2,k);
+      CH(0,k,jc) = 2*CC(0,j2+1,k);
+      }
+    }
+
+  if (ido!=1)
+    {
+    for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 111
+      {
+      size_t j2=2*j-1;
+      for (size_t k=0; k<l1; ++k)
+        for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)      // 109
+          {
+          CH(i  ,k,j ) = CC(i  ,j2+1,k)+CC(ic  ,j2,k);
+          CH(i  ,k,jc) = CC(i  ,j2+1,k)-CC(ic  ,j2,k);
+          CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
+          CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
+          }
+      }
+    }
+  /* accumulate the inverse DFT rows into cc; 4/2/1-way unrolled walk over
+     the roots-of-unity table, mirroring radfg.
+     NOTE(review): the wrap test here is `iang>ip` while radfg uses
+     `iang>=ip`.  Both appear safe because iang never lands exactly on ip
+     for the prime factors this path receives, but confirm before changing
+     either form. */
+  for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
+    {
+    for (size_t ik=0; ik<idl1; ++ik)
+      {
+      C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
+      C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
+      }
+    size_t iang=2*l;
+    size_t j=3,jc=ip-3;
+    for(; j<ipph-3; j+=4,jc-=4)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
+                    +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
+                    +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
+        }
+      }
+    for(; j<ipph-1; j+=2,jc-=2)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
+      iang+=l; if(iang>ip) iang-=ip;
+      double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
+        C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
+        }
+      }
+    for(; j<ipph; ++j,--jc)
+      {
+      iang+=l; if(iang>ip) iang-=ip;
+      double war=csarr[2*iang], wai=csarr[2*iang+1];
+      for (size_t ik=0; ik<idl1; ++ik)
+        {
+        C2(ik,l ) += war*CH2(ik,j );
+        C2(ik,lc) += wai*CH2(ik,jc);
+        }
+      }
+    }
+  /* DC row: plain sum of all columns */
+  for (size_t j=1; j<ipph; ++j)
+    for (size_t ik=0; ik<idl1; ++ik)
+      CH2(ik,0) += CH2(ik,j);
+  for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)   // 124
+    for (size_t k=0; k<l1; ++k)
+      {
+      CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
+      CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
+      }
+
+  if (ido==1) return;
+
+  for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)  // 127
+    for (size_t k=0; k<l1; ++k)
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        CH(i  ,k,j ) = C1(i  ,k,j)-C1(i+1,k,jc);
+        CH(i  ,k,jc) = C1(i  ,k,j)+C1(i+1,k,jc);
+        CH(i+1,k,j ) = C1(i+1,k,j)+C1(i  ,k,jc);
+        CH(i+1,k,jc) = C1(i+1,k,j)-C1(i  ,k,jc);
+        }
+
+// All in CH
+
+  /* final twiddle multiplication, in place in ch */
+  for (size_t j=1; j<ip; ++j)
+    {
+    size_t is = (j-1)*(ido-1);
+    for (size_t k=0; k<l1; ++k)
+      {
+      size_t idij = is;
+      for (size_t i=1; i<=ido-2; i+=2)
+        {
+        double t1=CH(i,k,j), t2=CH(i+1,k,j);
+        CH(i  ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
+        CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
+        idij+=2;
+        }
+      }
+    }
+  }
+#undef C1
+#undef C2
+#undef CH2
+
+#undef CC
+#undef CH
+#undef PM
+#undef MULPM
+#undef WA
+
+/* Move the transform result p1 into the caller's buffer c, applying the
+   normalisation factor fct.  Handles the aliased (p1==c) and the
+   unnormalised (fct==1.) cases without redundant work. */
+static void copy_and_norm(double *c, double *p1, size_t n, double fct)
+  {
+  const int scaled = (fct!=1.);
+  if (p1==c)
+    {
+    if (!scaled) return;          /* result already in place, unscaled */
+    for (size_t idx=0; idx<n; ++idx)
+      c[idx] *= fct;
+    return;
+    }
+  if (!scaled)
+    {
+    memcpy (c,p1,n*sizeof(double));
+    return;
+    }
+  for (size_t idx=0; idx<n; ++idx)
+    c[idx] = fct*p1[idx];
+  }
+
+/* Forward real FFT of c (length plan->length), scaled by fct.
+   Factors are processed in reverse order (k=nf-k1-1), with l1 shrinking
+   from n down to 1.  Returns 0 on success, -1 on allocation failure. */
+WARN_UNUSED_RESULT
+static int rfftp_forward(rfftp_plan plan, double c[], double fct)
+  {
+  if (plan->length==1) return 0;
+  size_t n=plan->length;
+  size_t l1=n, nf=plan->nfct;
+  /* scratch buffer: the passes ping-pong between c and ch */
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+  double *p1=c, *p2=ch;
+
+  for(size_t k1=0; k1<nf;++k1)
+    {
+    size_t k=nf-k1-1;
+    size_t ip=plan->fct[k].fct;
+    size_t ido=n / l1;
+    l1 /= ip;
+    if(ip==4)
+      radf4(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==2)
+      radf2(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==3)
+      radf3(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==5)
+      radf5(ido, l1, p1, p2, plan->fct[k].tw);
+    else
+      {
+      /* radfg leaves its result in its cc argument (p1), so this extra
+         swap cancels the unconditional swap below */
+      radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
+      SWAP (p1,p2,double *);
+      }
+    SWAP (p1,p2,double *);
+    }
+  copy_and_norm(c,p1,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+/* Backward real FFT of c (length plan->length), scaled by fct.
+   Factors are processed in forward order with l1 growing from 1 to n.
+   Returns 0 on success, -1 on allocation failure. */
+WARN_UNUSED_RESULT
+static int rfftp_backward(rfftp_plan plan, double c[], double fct)
+  {
+  if (plan->length==1) return 0;
+  size_t n=plan->length;
+  size_t l1=1, nf=plan->nfct;
+  /* scratch buffer: the passes ping-pong between c and ch */
+  double *ch = RALLOC(double, n);
+  if (!ch) return -1;
+  double *p1=c, *p2=ch;
+
+  for(size_t k=0; k<nf; k++)
+    {
+    size_t ip = plan->fct[k].fct,
+           ido= n/(ip*l1);
+    if(ip==4)
+      radb4(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==2)
+      radb2(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==3)
+      radb3(ido, l1, p1, p2, plan->fct[k].tw);
+    else if(ip==5)
+      radb5(ido, l1, p1, p2, plan->fct[k].tw);
+    else
+      radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
+    SWAP (p1,p2,double *);
+    l1*=ip;
+    }
+  copy_and_norm(c,p1,n,fct);
+  DEALLOC(ch);
+  return 0;
+  }
+
+/* Factor plan->length into the radix list plan->fct[] used by the real
+   transforms: factors of 4 first (with a single 2 swapped to the front if
+   present), then odd prime factors, then any prime remainder.
+   Returns 0 on success, -1 if more than NFCT factors would be needed. */
+WARN_UNUSED_RESULT
+static int rfftp_factorize (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  size_t nfct=0;
+  while ((length%4)==0)
+    { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
+  if ((length%2)==0)
+    {
+    length>>=1;
+    // factor 2 should be at the front of the factor list
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=2;
+    SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
+    }
+  size_t maxl=(size_t)(sqrt((double)length))+1;
+  for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
+    if ((length%divisor)==0)
+      {
+      while ((length%divisor)==0)
+        {
+        if (nfct>=NFCT) return -1;
+        plan->fct[nfct++].fct=divisor;
+        length/=divisor;
+        }
+      maxl=(size_t)(sqrt((double)length))+1;
+      }
+  if (length>1)
+    {
+    // remaining factor is prime; guard the slot like every write above
+    // (previously this store was unchecked and could overflow fct[NFCT])
+    if (nfct>=NFCT) return -1;
+    plan->fct[nfct++].fct=length;
+    }
+  plan->nfct=nfct;
+  return 0;
+  }
+
+/* Number of doubles of twiddle storage the plan's factors require:
+   (ip-1)*(ido-1) per pass, plus 2*ip extra doubles for factors above 5,
+   which take the generic radfg/radbg path.  Must stay in sync with
+   rfftp_comp_twiddle. */
+static size_t rfftp_twsize(rfftp_plan plan)
+  {
+  size_t twsize=0, l1=1;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
+    twsize+=(ip-1)*(ido-1);
+    if (ip>5) twsize+=2*ip;   /* extra roots-of-unity table for radfg/radbg */
+    l1*=ip;
+    }
+  return twsize;
+  /* unreachable 'return 0;' after the return above removed (dead code) */
+  }
+
+/* Fill plan->mem with the twiddle factors for every real-FFT pass, pointing
+   each fct[k].tw (and, for factors > 5, fct[k].tws) into that storage.
+   Layout must stay in sync with rfftp_twsize.  Returns 0 on success,
+   -1 if the temporary sin/cos table cannot be allocated. */
+WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
+  {
+  size_t length=plan->length;
+  /* twid holds interleaved (cos,sin) pairs */
+  double *twid = RALLOC(double, 2*length);
+  if (!twid) return -1;
+  sincos_2pibyn_half(length, twid);
+  size_t l1=1;
+  double *ptr=plan->mem;
+  for (size_t k=0; k<plan->nfct; ++k)
+    {
+    size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
+    if (k<plan->nfct-1) // last factor doesn't need twiddles
+      {
+      plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
+      for (size_t j=1; j<ip; ++j)
+        for (size_t i=1; i<=(ido-1)/2; ++i)
+          {
+          plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
+          plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
+          }
+      }
+    if (ip>5) // special factors required by *g functions
+      {
+      /* tws holds the ip-th roots of unity, stored as (cos,sin) pairs and
+         mirrored across the half-circle using conjugate symmetry */
+      plan->fct[k].tws=ptr; ptr+=2*ip;
+      plan->fct[k].tws[0] = 1.;
+      plan->fct[k].tws[1] = 0.;
+      for (size_t i=1; i<=(ip>>1); ++i)
+        {
+        plan->fct[k].tws[2*i  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
+        plan->fct[k].tws[2*(ip-i)  ] = twid[2*i*(length/ip)];
+        plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
+        }
+      }
+    l1*=ip;
+    }
+  DEALLOC(twid);
+  return 0;
+  }
+
+/* Allocate and fully initialise a real-FFT plan for the given length.
+   Returns NULL on length==0 or on any allocation/factorization failure;
+   all partial allocations are released on the error paths. */
+NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  rfftp_plan plan = RALLOC(rfftp_plan_i,1);
+  if (!plan) return NULL;
+  plan->length=length;
+  plan->nfct=0;
+  plan->mem=NULL;
+  for (size_t i=0; i<NFCT; ++i)
+    plan->fct[i]=(rfftp_fctdata){0,0,0};
+  if (length==1) return plan;  /* trivial transform needs no twiddles */
+  if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
+  size_t tws=rfftp_twsize(plan);
+  plan->mem=RALLOC(double,tws);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  if (rfftp_comp_twiddle(plan)!=0)
+    { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  return plan;
+  }
+
+/* Release a plan created by make_rfftp_plan (twiddle storage, then plan). */
+NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
+  {
+  DEALLOC(plan->mem);
+  DEALLOC(plan);
+  }
+
+/* Plan for Bluestein's (chirp-z) algorithm: n is the requested arbitrary
+   transform length, n2 a fast FFT length >= 2*n-1 chosen via good_size,
+   bk the chirp sequence and bkf its zero-padded forward FFT — both point
+   into the single allocation mem. */
+typedef struct fftblue_plan_i
+  {
+  size_t n, n2;
+  cfftp_plan plan;
+  double *mem;
+  double *bk, *bkf;
+  } fftblue_plan_i;
+typedef struct fftblue_plan_i * fftblue_plan;
+
+/* Build a Bluestein plan for the given length: compute the chirp sequence
+   b_k, its zero-padded FFT b_kf (pre-normalised by 1/n2), and the inner
+   complex plan of length n2.  Returns NULL on any failure, releasing all
+   partial allocations. */
+NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
+  {
+  fftblue_plan plan = RALLOC(fftblue_plan_i,1);
+  if (!plan) return NULL;
+  plan->n = length;
+  plan->n2 = good_size(plan->n*2-1);
+  plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
+  if (!plan->mem) { DEALLOC(plan); return NULL; }
+  plan->bk  = plan->mem;
+  plan->bkf = plan->bk+2*plan->n;
+
+/* initialize b_k */
+  double *tmp = RALLOC(double,4*plan->n);
+  if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  sincos_2pibyn(2*plan->n,tmp);
+  plan->bk[0] = 1;
+  plan->bk[1] = 0;
+
+  /* coeff accumulates m^2 mod 2n via the identity m^2 = sum of odd numbers,
+     avoiding overflow of m*m for large m */
+  size_t coeff=0;
+  for (size_t m=1; m<plan->n; ++m)
+    {
+    coeff+=2*m-1;
+    if (coeff>=2*plan->n) coeff-=2*plan->n;
+    plan->bk[2*m  ] = tmp[2*coeff  ];
+    plan->bk[2*m+1] = tmp[2*coeff+1];
+    }
+
+  /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
+  double xn2 = 1./plan->n2;
+  plan->bkf[0] = plan->bk[0]*xn2;
+  plan->bkf[1] = plan->bk[1]*xn2;
+  for (size_t m=2; m<2*plan->n; m+=2)
+    {
+    plan->bkf[m]   = plan->bkf[2*plan->n2-m]   = plan->bk[m]   *xn2;
+    plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
+    }
+  for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
+    plan->bkf[m]=0.;
+  plan->plan=make_cfftp_plan(plan->n2);
+  if (!plan->plan)
+    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
+    { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
+  DEALLOC(tmp);
+
+  return plan;
+  }
+
+/* Release a Bluestein plan: chirp storage, inner complex plan, then the
+   plan struct itself. */
+NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
+  {
+  DEALLOC(plan->mem);
+  destroy_cfftp_plan(plan->plan);
+  DEALLOC(plan);
+  }
+
+/* Complex FFT of arbitrary length n via Bluestein's algorithm:
+   multiply by the chirp b_k, zero-pad to n2, convolve with b_kf through a
+   forward/backward FFT pair of length n2, and multiply by b_k again.
+   isign selects the transform direction; fct is applied inside the first
+   FFT.  Returns 0 on success, -1 on allocation or inner-FFT failure. */
+NOINLINE WARN_UNUSED_RESULT
+static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
+  {
+  size_t n=plan->n;
+  size_t n2=plan->n2;
+  double *bk  = plan->bk;
+  double *bkf = plan->bkf;
+  double *akf = RALLOC(double, 2*n2);
+  if (!akf) return -1;
+
+/* initialize a_k and FFT it */
+  if (isign>0)
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      akf[m]   = c[m]*bk[m]   - c[m+1]*bk[m+1];
+      akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
+      }
+  else
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      akf[m]   = c[m]*bk[m]   + c[m+1]*bk[m+1];
+      akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
+      }
+  for (size_t m=2*n; m<2*n2; ++m)
+    akf[m]=0;
+
+  if (cfftp_forward (plan->plan,akf,fct)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* do the convolution */
+  if (isign>0)
+    for (size_t m=0; m<2*n2; m+=2)
+      {
+      /* pointwise complex multiply by conj/non-conj b_kf per direction */
+      double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  =  akf[m]*bkf[m]   + akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+  else
+    for (size_t m=0; m<2*n2; m+=2)
+      {
+      double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
+      akf[m  ]  = akf[m]*bkf[m]   - akf[m+1]*bkf[m+1];
+      akf[m+1]  = im;
+      }
+
+/* inverse FFT */
+  if (cfftp_backward (plan->plan,akf,1.)!=0)
+    { DEALLOC(akf); return -1; }
+
+/* multiply by b_k */
+  if (isign>0)
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      c[m]   = bk[m]  *akf[m] - bk[m+1]*akf[m+1];
+      c[m+1] = bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  else
+    for (size_t m=0; m<2*n; m+=2)
+      {
+      c[m]   = bk[m]  *akf[m] + bk[m+1]*akf[m+1];
+      c[m+1] =-bk[m+1]*akf[m] + bk[m]  *akf[m+1];
+      }
+  DEALLOC(akf);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,1,fct); }
+
+WARN_UNUSED_RESULT
+static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
+  { return fftblue_fft(plan,c,-1,fct); }
+
+WARN_UNUSED_RESULT
+static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
+  {
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  tmp[0]=c[0];
+  tmp[1]=0.;
+  memcpy (tmp+2,c+1, (n-1)*sizeof(double));
+  if ((n&1)==0) tmp[n+1]=0.;
+  for (size_t m=2; m<n; m+=2)
+    {
+    tmp[2*n-m]=tmp[m];
+    tmp[2*n-m+1]=-tmp[m+1];
+    }
+  if (fftblue_fft(plan,tmp,1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  for (size_t m=0; m<n; ++m)
+    c[m] = tmp[2*m];
+  DEALLOC(tmp);
+  return 0;
+  }
+
+WARN_UNUSED_RESULT
+static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
+  {
+  size_t n=plan->n;
+  double *tmp = RALLOC(double,2*n);
+  if (!tmp) return -1;
+  for (size_t m=0; m<n; ++m)
+    {
+    tmp[2*m] = c[m];
+    tmp[2*m+1] = 0.;
+    }
+  if (fftblue_fft(plan,tmp,-1,fct)!=0)
+    { DEALLOC(tmp); return -1; }
+  c[0] = tmp[0];
+  memcpy (c+1, tmp+2, (n-1)*sizeof(double));
+  DEALLOC(tmp);
+  return 0;
+  }
+
+typedef struct cfft_plan_i
+  {
+  cfftp_plan packplan;
+  fftblue_plan blueplan;
+  } cfft_plan_i;
+
+static cfft_plan make_cfft_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  cfft_plan plan = RALLOC(cfft_plan_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = cost_guess(length);
+  double comp2 = 2*cost_guess(good_size(2*length-1));
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_cfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+static void destroy_cfft_plan (cfft_plan plan)
+  {
+  if (plan->blueplan)
+    destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan)
+    destroy_cfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return cfftp_backward(plan->packplan,c,fct);
+  // if (plan->blueplan)
+  return cfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT static int cfft_forward(cfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return cfftp_forward(plan->packplan,c,fct);
+  // if (plan->blueplan)
+  return cfftblue_forward(plan->blueplan,c,fct);
+  }
+
+typedef struct rfft_plan_i
+  {
+  rfftp_plan packplan;
+  fftblue_plan blueplan;
+  } rfft_plan_i;
+
+static rfft_plan make_rfft_plan (size_t length)
+  {
+  if (length==0) return NULL;
+  rfft_plan plan = RALLOC(rfft_plan_i,1);
+  if (!plan) return NULL;
+  plan->blueplan=0;
+  plan->packplan=0;
+  if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    return plan;
+    }
+  double comp1 = 0.5*cost_guess(length);
+  double comp2 = 2*cost_guess(good_size(2*length-1));
+  comp2*=1.5; /* fudge factor that appears to give good overall performance */
+  if (comp2<comp1) // use Bluestein
+    {
+    plan->blueplan=make_fftblue_plan(length);
+    if (!plan->blueplan) { DEALLOC(plan); return NULL; }
+    }
+  else
+    {
+    plan->packplan=make_rfftp_plan(length);
+    if (!plan->packplan) { DEALLOC(plan); return NULL; }
+    }
+  return plan;
+  }
+
+static void destroy_rfft_plan (rfft_plan plan)
+  {
+  if (plan->blueplan)
+    destroy_fftblue_plan(plan->blueplan);
+  if (plan->packplan)
+    destroy_rfftp_plan(plan->packplan);
+  DEALLOC(plan);
+  }
+
+WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return rfftp_backward(plan->packplan,c,fct);
+  else // if (plan->blueplan)
+    return rfftblue_backward(plan->blueplan,c,fct);
+  }
+
+WARN_UNUSED_RESULT static int rfft_forward(rfft_plan plan, double c[], double fct)
+  {
+  if (plan->packplan)
+    return rfftp_forward(plan->packplan,c,fct);
+  else // if (plan->blueplan)
+    return rfftblue_forward(plan->blueplan,c,fct);
+  }
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+static PyObject *
+execute_complex(PyObject *a1, int is_forward, double fct)
+{
+    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
+            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
+            NPY_ARRAY_ENSURECOPY | NPY_ARRAY_DEFAULT |
+            NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
+            NULL);
+    if (!data) return NULL;
+
+    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
+    cfft_plan plan=NULL;
+
+    int nrepeats = PyArray_SIZE(data)/npts;
+    double *dptr = (double *)PyArray_DATA(data);
+    int fail=0;
+    Py_BEGIN_ALLOW_THREADS;
+    NPY_SIGINT_ON;
+    plan = make_cfft_plan(npts);
+    if (!plan) fail=1;
+    if (!fail)
+      for (int i = 0; i < nrepeats; i++) {
+          int res = is_forward ?
+            cfft_forward(plan, dptr, fct) : cfft_backward(plan, dptr, fct);
+          if (res!=0) { fail=1; break; }
+          dptr += npts*2;
+      }
+    if (plan) destroy_cfft_plan(plan);
+    NPY_SIGINT_OFF;
+    Py_END_ALLOW_THREADS;
+    if (fail) {
+      Py_XDECREF(data);
+      return PyErr_NoMemory();
+    }
+    return (PyObject *)data;
+}
+
+static PyObject *
+execute_real_forward(PyObject *a1, double fct)
+{
+    rfft_plan plan=NULL;
+    int fail = 0;
+    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
+            PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
+            NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
+            NULL);
+    if (!data) return NULL;
+
+    int ndim = PyArray_NDIM(data);
+    const npy_intp *odim = PyArray_DIMS(data);
+    int npts = odim[ndim - 1];
+    npy_intp *tdim=(npy_intp *)malloc(ndim*sizeof(npy_intp));
+    if (!tdim)
+      { Py_XDECREF(data); return NULL; }
+    for (int d=0; d<ndim-1; ++d)
+      tdim[d] = odim[d];
+    tdim[ndim-1] = npts/2 + 1;
+    PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
+            tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
+    free(tdim);
+    if (!ret) fail=1;
+    if (!fail) {
+      int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
+
+      int nrepeats = PyArray_SIZE(data)/npts;
+      double *rptr = (double *)PyArray_DATA(ret),
+             *dptr = (double *)PyArray_DATA(data);
+
+      Py_BEGIN_ALLOW_THREADS;
+      NPY_SIGINT_ON;
+      plan = make_rfft_plan(npts);
+      if (!plan) fail=1;
+      if (!fail)
+        for (int i = 0; i < nrepeats; i++) {
+            rptr[rstep-1] = 0.0;
+            memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
+            if (rfft_forward(plan, rptr+1, fct)!=0) {fail=1; break;}
+            rptr[0] = rptr[1];
+            rptr[1] = 0.0;
+            rptr += rstep;
+            dptr += npts;
+      }
+      if (plan) destroy_rfft_plan(plan);
+      NPY_SIGINT_OFF;
+      Py_END_ALLOW_THREADS;
+    }
+    if (fail) {
+      Py_XDECREF(data);
+      Py_XDECREF(ret);
+      return PyErr_NoMemory();
+    }
+    Py_DECREF(data);
+    return (PyObject *)ret;
+}
+static PyObject *
+execute_real_backward(PyObject *a1, double fct)
+{
+    rfft_plan plan=NULL;
+    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
+            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
+            NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
+            NULL);
+    if (!data) return NULL;
+    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
+    PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(PyArray_NDIM(data),
+            PyArray_DIMS(data), PyArray_DescrFromType(NPY_DOUBLE), 0);
+    int fail = 0;
+    if (!ret) fail=1;
+    if (!fail) {
+      int nrepeats = PyArray_SIZE(ret)/npts;
+      double *rptr = (double *)PyArray_DATA(ret),
+             *dptr = (double *)PyArray_DATA(data);
+
+      Py_BEGIN_ALLOW_THREADS;
+      NPY_SIGINT_ON;
+      plan = make_rfft_plan(npts);
+      if (!plan) fail=1;
+      if (!fail) {
+        for (int i = 0; i < nrepeats; i++) {
+          memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
+          rptr[0] = dptr[0];
+          if (rfft_backward(plan, rptr, fct)!=0) {fail=1; break;}
+          rptr += npts;
+          dptr += npts*2;
+        }
+      }
+      if (plan) destroy_rfft_plan(plan);
+      NPY_SIGINT_OFF;
+      Py_END_ALLOW_THREADS;
+    }
+    if (fail) {
+      Py_XDECREF(data);
+      Py_XDECREF(ret);
+      return PyErr_NoMemory();
+    }
+    Py_DECREF(data);
+    return (PyObject *)ret;
+}
+
+static PyObject *
+execute_real(PyObject *a1, int is_forward, double fct)
+{
+    return is_forward ? execute_real_forward(a1, fct)
+                      : execute_real_backward(a1, fct);
+}
+
+static const char execute__doc__[] = "";
+
+static PyObject *
+execute(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *a1;
+    int is_real, is_forward;
+    double fct;
+
+    if(!PyArg_ParseTuple(args, "Oiid:execute", &a1, &is_real, &is_forward, &fct)) {
+        return NULL;
+    }
+
+    return is_real ? execute_real(a1, is_forward, fct)
+                   : execute_complex(a1, is_forward, fct);
+}
+
+/* List of methods defined in the module */
+
+static struct PyMethodDef methods[] = {
+    {"execute",   execute,   1, execute__doc__},
+    {NULL, NULL, 0, NULL}          /* sentinel */
+};
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef moduledef = {
+        PyModuleDef_HEAD_INIT,
+        "pocketfft_internal",
+        NULL,
+        -1,
+        methods,
+        NULL,
+        NULL,
+        NULL,
+        NULL
+};
+#endif
+
+/* Initialization function for the module */
+#if PY_MAJOR_VERSION >= 3
+#define RETVAL(x) x
+PyMODINIT_FUNC PyInit_pocketfft_internal(void)
+#else
+#define RETVAL(x)
+PyMODINIT_FUNC
+initpocketfft_internal(void)
+#endif
+{
+    PyObject *m;
+#if PY_MAJOR_VERSION >= 3
+    m = PyModule_Create(&moduledef);
+#else
+    static const char module_documentation[] = "";
+
+    m = Py_InitModule4("pocketfft_internal", methods,
+            module_documentation,
+            (PyObject*)NULL,PYTHON_API_VERSION);
+#endif
+    if (m == NULL) {
+        return RETVAL(NULL);
+    }
+
+    /* Import the array object */
+    import_array();
+
+    /* XXXX Add constants here */
+
+    return RETVAL(m);
+}
diff --git a/numpy/fft/fftpack.py b/numpy/fft/pocketfft.py
similarity index 87%
rename from numpy/fft/fftpack.py
rename to numpy/fft/pocketfft.py
index de67593..45dc162 100644
--- a/numpy/fft/fftpack.py
+++ b/numpy/fft/pocketfft.py
@@ -26,9 +26,6 @@
 (Note: 2D routines are just nD routines with different default
 behavior.)
 
-The underlying code for these functions is an f2c-translated and modified
-version of the FFTPACK routines.
-
 """
 from __future__ import division, absolute_import, print_function
 
@@ -37,26 +34,18 @@
 
 import functools
 
-from numpy.core import (array, asarray, zeros, swapaxes, shape, conjugate,
-                        take, sqrt)
+from numpy.core import asarray, zeros, swapaxes, conjugate, take, sqrt
+from . import pocketfft_internal as pfi
 from numpy.core.multiarray import normalize_axis_index
 from numpy.core import overrides
-from . import fftpack_lite as fftpack
-from .helper import _FFTCache
-
-_fft_cache = _FFTCache(max_size_in_mb=100, max_item_count=32)
-_real_fft_cache = _FFTCache(max_size_in_mb=100, max_item_count=32)
 
 
 array_function_dispatch = functools.partial(
     overrides.array_function_dispatch, module='numpy.fft')
 
 
-def _raw_fft(a, n=None, axis=-1, init_function=fftpack.cffti,
-             work_function=fftpack.cfftf, fft_cache=_fft_cache):
-    a = asarray(a)
+def _raw_fft(a, n, axis, is_real, is_forward, fct):
     axis = normalize_axis_index(axis, a.ndim)
-
     if n is None:
         n = a.shape[axis]
 
@@ -64,15 +53,6 @@
         raise ValueError("Invalid number of FFT data points (%d) specified."
                          % n)
 
-    # We have to ensure that only a single thread can access a wsave array
-    # at any given time. Thus we remove it from the cache and insert it
-    # again after it has been used. Multiple threads might create multiple
-    # copies of the wsave array. This is intentional and a limitation of
-    # the current C code.
-    wsave = fft_cache.pop_twiddle_factors(n)
-    if wsave is None:
-        wsave = init_function(n)
-
     if a.shape[axis] != n:
         s = list(a.shape)
         if s[axis] > n:
@@ -87,25 +67,22 @@
             z[tuple(index)] = a
             a = z
 
-    if axis != a.ndim - 1:
+    if axis == a.ndim-1:
+        r = pfi.execute(a, is_real, is_forward, fct)
+    else:
         a = swapaxes(a, axis, -1)
-    r = work_function(a, wsave)
-    if axis != a.ndim - 1:
+        r = pfi.execute(a, is_real, is_forward, fct)
         r = swapaxes(r, axis, -1)
-
-    # As soon as we put wsave back into the cache, another thread could pick it
-    # up and start using it, so we must not do this until after we're
-    # completely done using it ourselves.
-    fft_cache.put_twiddle_factors(n, wsave)
-
     return r
 
 
 def _unitary(norm):
-    if norm not in (None, "ortho"):
-        raise ValueError("Invalid norm value %s, should be None or \"ortho\"."
-                         % norm)
-    return norm is not None
+    if norm is None:
+        return False
+    if norm=="ortho":
+        return True
+    raise ValueError("Invalid norm value %s, should be None or \"ortho\"."
+                     % norm)
 
 
 def _fft_dispatcher(a, n=None, axis=None, norm=None):
@@ -177,14 +154,10 @@
     Examples
     --------
     >>> np.fft.fft(np.exp(2j * np.pi * np.arange(8) / 8))
-    array([ -3.44505240e-16 +1.14383329e-17j,
-             8.00000000e+00 -5.71092652e-15j,
-             2.33482938e-16 +1.22460635e-16j,
-             1.64863782e-15 +1.77635684e-15j,
-             9.95839695e-17 +2.33482938e-16j,
-             0.00000000e+00 +1.66837030e-15j,
-             1.14383329e-17 +1.22460635e-16j,
-             -1.64863782e-15 +1.77635684e-15j])
+    array([-2.33486982e-16+1.14423775e-17j,  8.00000000e+00-1.25557246e-15j,
+            2.33486982e-16+2.33486982e-16j,  0.00000000e+00+1.22464680e-16j,
+           -1.14423775e-17+2.33486982e-16j,  0.00000000e+00+5.20784380e-16j,
+            1.14423775e-17+1.14423775e-17j,  0.00000000e+00+1.22464680e-16j])
 
     In this example, real input has an FFT which is Hermitian, i.e., symmetric
     in the real part and anti-symmetric in the imaginary part, as described in
@@ -200,12 +173,13 @@
 
     """
 
-    a = asarray(a).astype(complex, copy=False)
+    a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    output = _raw_fft(a, n, axis, fftpack.cffti, fftpack.cfftf, _fft_cache)
-    if _unitary(norm):
-        output *= 1 / sqrt(n)
+    fct = 1
+    if norm is not None and _unitary(norm):
+        fct = 1 / sqrt(n)
+    output = _raw_fft(a, n, axis, False, True, fct)
     return output
 
 
@@ -278,7 +252,7 @@
     Examples
     --------
     >>> np.fft.ifft([0, 4, 0, 0])
-    array([ 1.+0.j,  0.+1.j, -1.+0.j,  0.-1.j])
+    array([ 1.+0.j,  0.+1.j, -1.+0.j,  0.-1.j]) # may vary
 
     Create and plot a band-limited signal with random phases:
 
@@ -288,19 +262,20 @@
     >>> n[40:60] = np.exp(1j*np.random.uniform(0, 2*np.pi, (20,)))
     >>> s = np.fft.ifft(n)
     >>> plt.plot(t, s.real, 'b-', t, s.imag, 'r--')
-    ...
+    [<matplotlib.lines.Line2D object at ...>, <matplotlib.lines.Line2D object at ...>]
     >>> plt.legend(('real', 'imaginary'))
-    ...
+    <matplotlib.legend.Legend object at ...>
     >>> plt.show()
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     if n is None:
         n = a.shape[axis]
-    unitary = _unitary(norm)
-    output = _raw_fft(a, n, axis, fftpack.cffti, fftpack.cfftb, _fft_cache)
-    return output * (1 / (sqrt(n) if unitary else n))
+    fct = 1/n
+    if norm is not None and _unitary(norm):
+        fct = 1/sqrt(n)
+    output = _raw_fft(a, n, axis, False, False, fct)
+    return output
 
 
 
@@ -374,23 +349,22 @@
     Examples
     --------
     >>> np.fft.fft([0, 1, 0, 0])
-    array([ 1.+0.j,  0.-1.j, -1.+0.j,  0.+1.j])
+    array([ 1.+0.j,  0.-1.j, -1.+0.j,  0.+1.j]) # may vary
     >>> np.fft.rfft([0, 1, 0, 0])
-    array([ 1.+0.j,  0.-1.j, -1.+0.j])
+    array([ 1.+0.j,  0.-1.j, -1.+0.j]) # may vary
 
     Notice how the final element of the `fft` output is the complex conjugate
     of the second element, for real input. For `rfft`, this symmetry is
     exploited to compute only the non-negative frequency terms.
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=float)
-    output = _raw_fft(a, n, axis, fftpack.rffti, fftpack.rfftf,
-                      _real_fft_cache)
-    if _unitary(norm):
+    a = asarray(a)
+    fct = 1
+    if norm is not None and _unitary(norm):
         if n is None:
             n = a.shape[axis]
-        output *= 1 / sqrt(n)
+        fct = 1/sqrt(n)
+    output = _raw_fft(a, n, axis, True, True, fct)
     return output
 
 
@@ -465,9 +439,9 @@
     Examples
     --------
     >>> np.fft.ifft([1, -1j, -1, 1j])
-    array([ 0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j])
+    array([0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j]) # may vary
     >>> np.fft.irfft([1, -1j, -1])
-    array([ 0.,  1.,  0.,  0.])
+    array([0.,  1.,  0.,  0.])
 
     Notice how the last term in the input to the ordinary `ifft` is the
     complex conjugate of the second term, and the output has zero imaginary
@@ -475,14 +449,14 @@
     specified, and the output array is purely real.
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     if n is None:
         n = (a.shape[axis] - 1) * 2
-    unitary = _unitary(norm)
-    output = _raw_fft(a, n, axis, fftpack.rffti, fftpack.rfftb,
-                      _real_fft_cache)
-    return output * (1 / (sqrt(n) if unitary else n))
+    fct = 1/n
+    if norm is not None and _unitary(norm):
+        fct = 1/sqrt(n)
+    output = _raw_fft(a, n, axis, True, False, fct)
+    return output
 
 
 @array_function_dispatch(_fft_dispatcher)
@@ -543,16 +517,16 @@
     --------
     >>> signal = np.array([1, 2, 3, 4, 3, 2])
     >>> np.fft.fft(signal)
-    array([ 15.+0.j,  -4.+0.j,   0.+0.j,  -1.-0.j,   0.+0.j,  -4.+0.j])
+    array([15.+0.j,  -4.+0.j,   0.+0.j,  -1.-0.j,   0.+0.j,  -4.+0.j]) # may vary
     >>> np.fft.hfft(signal[:4]) # Input first half of signal
-    array([ 15.,  -4.,   0.,  -1.,   0.,  -4.])
+    array([15.,  -4.,   0.,  -1.,   0.,  -4.])
     >>> np.fft.hfft(signal, 6)  # Input entire signal and truncate
-    array([ 15.,  -4.,   0.,  -1.,   0.,  -4.])
+    array([15.,  -4.,   0.,  -1.,   0.,  -4.])
 
 
     >>> signal = np.array([[1, 1.j], [-1.j, 2]])
     >>> np.conj(signal.T) - signal   # check Hermitian symmetry
-    array([[ 0.-0.j,  0.+0.j],
+    array([[ 0.-0.j,  -0.+0.j], # may vary
            [ 0.+0.j,  0.-0.j]])
     >>> freq_spectrum = np.fft.hfft(signal)
     >>> freq_spectrum
@@ -560,8 +534,7 @@
            [ 2., -2.]])
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     if n is None:
         n = (a.shape[axis] - 1) * 2
     unitary = _unitary(norm)
@@ -616,13 +589,12 @@
     --------
     >>> spectrum = np.array([ 15, -4, 0, -1, 0, -4])
     >>> np.fft.ifft(spectrum)
-    array([ 1.+0.j,  2.-0.j,  3.+0.j,  4.+0.j,  3.+0.j,  2.-0.j])
+    array([1.+0.j,  2.+0.j,  3.+0.j,  4.+0.j,  3.+0.j,  2.+0.j]) # may vary
     >>> np.fft.ihfft(spectrum)
-    array([ 1.-0.j,  2.-0.j,  3.-0.j,  4.-0.j])
+    array([ 1.-0.j,  2.-0.j,  3.-0.j,  4.-0.j]) # may vary
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=float)
+    a = asarray(a)
     if n is None:
         n = a.shape[axis]
     unitary = _unitary(norm)
@@ -732,17 +704,17 @@
     --------
     >>> a = np.mgrid[:3, :3, :3][0]
     >>> np.fft.fftn(a, axes=(1, 2))
-    array([[[  0.+0.j,   0.+0.j,   0.+0.j],
-            [  0.+0.j,   0.+0.j,   0.+0.j],
-            [  0.+0.j,   0.+0.j,   0.+0.j]],
-           [[  9.+0.j,   0.+0.j,   0.+0.j],
-            [  0.+0.j,   0.+0.j,   0.+0.j],
-            [  0.+0.j,   0.+0.j,   0.+0.j]],
-           [[ 18.+0.j,   0.+0.j,   0.+0.j],
-            [  0.+0.j,   0.+0.j,   0.+0.j],
-            [  0.+0.j,   0.+0.j,   0.+0.j]]])
+    array([[[ 0.+0.j,   0.+0.j,   0.+0.j], # may vary
+            [ 0.+0.j,   0.+0.j,   0.+0.j],
+            [ 0.+0.j,   0.+0.j,   0.+0.j]],
+           [[ 9.+0.j,   0.+0.j,   0.+0.j],
+            [ 0.+0.j,   0.+0.j,   0.+0.j],
+            [ 0.+0.j,   0.+0.j,   0.+0.j]],
+           [[18.+0.j,   0.+0.j,   0.+0.j],
+            [ 0.+0.j,   0.+0.j,   0.+0.j],
+            [ 0.+0.j,   0.+0.j,   0.+0.j]]])
     >>> np.fft.fftn(a, (2, 2), axes=(0, 1))
-    array([[[ 2.+0.j,  2.+0.j,  2.+0.j],
+    array([[[ 2.+0.j,  2.+0.j,  2.+0.j], # may vary
             [ 0.+0.j,  0.+0.j,  0.+0.j]],
            [[-2.+0.j, -2.+0.j, -2.+0.j],
             [ 0.+0.j,  0.+0.j,  0.+0.j]]])
@@ -838,10 +810,10 @@
     --------
     >>> a = np.eye(4)
     >>> np.fft.ifftn(np.fft.fftn(a, axes=(0,)), axes=(1,))
-    array([[ 1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j],
-           [ 0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j],
-           [ 0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j],
-           [ 0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j]])
+    array([[1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j], # may vary
+           [0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j],
+           [0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j],
+           [0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j]])
 
 
     Create and plot an image with band-limited frequency content:
@@ -934,16 +906,16 @@
     --------
     >>> a = np.mgrid[:5, :5][0]
     >>> np.fft.fft2(a)
-    array([[ 50.0 +0.j        ,   0.0 +0.j        ,   0.0 +0.j        ,
-              0.0 +0.j        ,   0.0 +0.j        ],
-           [-12.5+17.20477401j,   0.0 +0.j        ,   0.0 +0.j        ,
-              0.0 +0.j        ,   0.0 +0.j        ],
-           [-12.5 +4.0614962j ,   0.0 +0.j        ,   0.0 +0.j        ,
-              0.0 +0.j        ,   0.0 +0.j        ],
-           [-12.5 -4.0614962j ,   0.0 +0.j        ,   0.0 +0.j        ,
-                0.0 +0.j        ,   0.0 +0.j        ],
-           [-12.5-17.20477401j,   0.0 +0.j        ,   0.0 +0.j        ,
-              0.0 +0.j        ,   0.0 +0.j        ]])
+    array([[ 50.  +0.j        ,   0.  +0.j        ,   0.  +0.j        , # may vary
+              0.  +0.j        ,   0.  +0.j        ],
+           [-12.5+17.20477401j,   0.  +0.j        ,   0.  +0.j        ,
+              0.  +0.j        ,   0.  +0.j        ],
+           [-12.5 +4.0614962j ,   0.  +0.j        ,   0.  +0.j        ,
+              0.  +0.j        ,   0.  +0.j        ],
+           [-12.5 -4.0614962j ,   0.  +0.j        ,   0.  +0.j        ,
+              0.  +0.j        ,   0.  +0.j        ],
+           [-12.5-17.20477401j,   0.  +0.j        ,   0.  +0.j        ,
+              0.  +0.j        ,   0.  +0.j        ]])
 
     """
 
@@ -1028,10 +1000,10 @@
     --------
     >>> a = 4 * np.eye(4)
     >>> np.fft.ifft2(a)
-    array([[ 1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j],
-           [ 0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j],
-           [ 0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j],
-           [ 0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j]])
+    array([[1.+0.j,  0.+0.j,  0.+0.j,  0.+0.j], # may vary
+           [0.+0.j,  0.+0.j,  0.+0.j,  1.+0.j],
+           [0.+0.j,  0.+0.j,  1.+0.j,  0.+0.j],
+           [0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j]])
 
     """
 
@@ -1110,20 +1082,19 @@
     --------
     >>> a = np.ones((2, 2, 2))
     >>> np.fft.rfftn(a)
-    array([[[ 8.+0.j,  0.+0.j],
-            [ 0.+0.j,  0.+0.j]],
-           [[ 0.+0.j,  0.+0.j],
-            [ 0.+0.j,  0.+0.j]]])
+    array([[[8.+0.j,  0.+0.j], # may vary
+            [0.+0.j,  0.+0.j]],
+           [[0.+0.j,  0.+0.j],
+            [0.+0.j,  0.+0.j]]])
 
     >>> np.fft.rfftn(a, axes=(2, 0))
-    array([[[ 4.+0.j,  0.+0.j],
-            [ 4.+0.j,  0.+0.j]],
-           [[ 0.+0.j,  0.+0.j],
-            [ 0.+0.j,  0.+0.j]]])
+    array([[[4.+0.j,  0.+0.j], # may vary
+            [4.+0.j,  0.+0.j]],
+           [[0.+0.j,  0.+0.j],
+            [0.+0.j,  0.+0.j]]])
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=float)
+    a = asarray(a)
     s, axes = _cook_nd_args(a, s, axes)
     a = rfft(a, s[-1], axes[-1], norm)
     for ii in range(len(axes)-1):
@@ -1247,16 +1218,15 @@
     >>> a = np.zeros((3, 2, 2))
     >>> a[0, 0, 0] = 3 * 2 * 2
     >>> np.fft.irfftn(a)
-    array([[[ 1.,  1.],
-            [ 1.,  1.]],
-           [[ 1.,  1.],
-            [ 1.,  1.]],
-           [[ 1.,  1.],
-            [ 1.,  1.]]])
+    array([[[1.,  1.],
+            [1.,  1.]],
+           [[1.,  1.],
+            [1.,  1.]],
+           [[1.,  1.],
+            [1.,  1.]]])
 
     """
-    # The copy may be required for multithreading.
-    a = array(a, copy=True, dtype=complex)
+    a = asarray(a)
     s, axes = _cook_nd_args(a, s, axes, invreal=1)
     for ii in range(len(axes)-1):
         a = ifft(a, s[ii], axes[ii], norm)
diff --git a/numpy/fft/setup.py b/numpy/fft/setup.py
index cd99a82..6c3548b 100644
--- a/numpy/fft/setup.py
+++ b/numpy/fft/setup.py
@@ -7,9 +7,9 @@
 
     config.add_data_dir('tests')
 
-    # Configure fftpack_lite
-    config.add_extension('fftpack_lite',
-                         sources=['fftpack_litemodule.c', 'fftpack.c']
+    # Configure pocketfft_internal
+    config.add_extension('pocketfft_internal',
+                         sources=['pocketfft.c']
                          )
 
     return config
diff --git a/numpy/fft/tests/test_helper.py b/numpy/fft/tests/test_helper.py
index 8d315fa..6613c80 100644
--- a/numpy/fft/tests/test_helper.py
+++ b/numpy/fft/tests/test_helper.py
@@ -7,7 +7,6 @@
 import numpy as np
 from numpy.testing import assert_array_almost_equal, assert_equal
 from numpy import fft, pi
-from numpy.fft.helper import _FFTCache
 
 
 class TestFFTShift(object):
@@ -168,81 +167,3 @@
 
         # Should not raise error
         fft.irfftn(a, axes=axes)
-
-
-class TestFFTCache(object):
-
-    def test_basic_behaviour(self):
-        c = _FFTCache(max_size_in_mb=1, max_item_count=4)
-
-        # Put
-        c.put_twiddle_factors(1, np.ones(2, dtype=np.float32))
-        c.put_twiddle_factors(2, np.zeros(2, dtype=np.float32))
-
-        # Get
-        assert_array_almost_equal(c.pop_twiddle_factors(1),
-                                  np.ones(2, dtype=np.float32))
-        assert_array_almost_equal(c.pop_twiddle_factors(2),
-                                  np.zeros(2, dtype=np.float32))
-
-        # Nothing should be left.
-        assert_equal(len(c._dict), 0)
-
-        # Now put everything in twice so it can be retrieved once and each will
-        # still have one item left.
-        for _ in range(2):
-            c.put_twiddle_factors(1, np.ones(2, dtype=np.float32))
-            c.put_twiddle_factors(2, np.zeros(2, dtype=np.float32))
-        assert_array_almost_equal(c.pop_twiddle_factors(1),
-                                  np.ones(2, dtype=np.float32))
-        assert_array_almost_equal(c.pop_twiddle_factors(2),
-                                  np.zeros(2, dtype=np.float32))
-        assert_equal(len(c._dict), 2)
-
-    def test_automatic_pruning(self):
-        # That's around 2600 single precision samples.
-        c = _FFTCache(max_size_in_mb=0.01, max_item_count=4)
-
-        c.put_twiddle_factors(1, np.ones(200, dtype=np.float32))
-        c.put_twiddle_factors(2, np.ones(200, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [1, 2])
-
-        # This is larger than the limit but should still be kept.
-        c.put_twiddle_factors(3, np.ones(3000, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [1, 2, 3])
-        # Add one more.
-        c.put_twiddle_factors(4, np.ones(3000, dtype=np.float32))
-        # The other three should no longer exist.
-        assert_equal(list(c._dict.keys()), [4])
-
-        # Now test the max item count pruning.
-        c = _FFTCache(max_size_in_mb=0.01, max_item_count=2)
-        c.put_twiddle_factors(2, np.empty(2))
-        c.put_twiddle_factors(1, np.empty(2))
-        # Can still be accessed.
-        assert_equal(list(c._dict.keys()), [2, 1])
-
-        c.put_twiddle_factors(3, np.empty(2))
-        # 1 and 3 can still be accessed - c[2] has been touched least recently
-        # and is thus evicted.
-        assert_equal(list(c._dict.keys()), [1, 3])
-
-        # One last test. We will add a single large item that is slightly
-        # bigger then the cache size. Some small items can still be added.
-        c = _FFTCache(max_size_in_mb=0.01, max_item_count=5)
-        c.put_twiddle_factors(1, np.ones(3000, dtype=np.float32))
-        c.put_twiddle_factors(2, np.ones(2, dtype=np.float32))
-        c.put_twiddle_factors(3, np.ones(2, dtype=np.float32))
-        c.put_twiddle_factors(4, np.ones(2, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [1, 2, 3, 4])
-
-        # One more big item. This time it is 6 smaller ones but they are
-        # counted as one big item.
-        for _ in range(6):
-            c.put_twiddle_factors(5, np.ones(500, dtype=np.float32))
-        # '1' no longer in the cache. Rest still in the cache.
-        assert_equal(list(c._dict.keys()), [2, 3, 4, 5])
-
-        # Another big item - should now be the only item in the cache.
-        c.put_twiddle_factors(6, np.ones(4000, dtype=np.float32))
-        assert_equal(list(c._dict.keys()), [6])
diff --git a/numpy/fft/tests/test_fftpack.py b/numpy/fft/tests/test_pocketfft.py
similarity index 76%
rename from numpy/fft/tests/test_fftpack.py
rename to numpy/fft/tests/test_pocketfft.py
index 8d6cd84..08f8007 100644
--- a/numpy/fft/tests/test_fftpack.py
+++ b/numpy/fft/tests/test_pocketfft.py
@@ -1,6 +1,7 @@
 from __future__ import division, absolute_import, print_function
 
 import numpy as np
+import pytest
 from numpy.random import random
 from numpy.testing import (
         assert_array_almost_equal, assert_array_equal, assert_raises,
@@ -28,6 +29,16 @@
 
 class TestFFT1D(object):
 
+    def test_identity(self):
+        maxlen = 512
+        x = random(maxlen) + 1j*random(maxlen)
+        xr = random(maxlen)
+        for i in range(1,maxlen):
+            assert_array_almost_equal(np.fft.ifft(np.fft.fft(x[0:i])), x[0:i],
+                                      decimal=12)
+            assert_array_almost_equal(np.fft.irfft(np.fft.rfft(xr[0:i]),i),
+                                      xr[0:i], decimal=12)
+
     def test_fft(self):
         x = random(30) + 1j*random(30)
         assert_array_almost_equal(fft1(x), np.fft.fft(x))
@@ -146,6 +157,53 @@
                     assert_array_almost_equal(x_norm,
                                               np.linalg.norm(tmp))
 
+    @pytest.mark.parametrize("dtype", [np.half, np.single, np.double,
+                                       np.longdouble])
+    def test_dtypes(self, dtype):
+        # make sure that all input precisions are accepted and internally
+        # converted to 64bit
+        x = random(30).astype(dtype)
+        assert_array_almost_equal(np.fft.ifft(np.fft.fft(x)), x)
+        assert_array_almost_equal(np.fft.irfft(np.fft.rfft(x)), x)
+
+
+@pytest.mark.parametrize(
+        "dtype",
+        [np.float32, np.float64, np.complex64, np.complex128])
+@pytest.mark.parametrize("order", ["F", 'non-contiguous'])
+@pytest.mark.parametrize(
+        "fft",
+        [np.fft.fft, np.fft.fft2, np.fft.fftn,
+         np.fft.ifft, np.fft.ifft2, np.fft.ifftn])
+def test_fft_with_order(dtype, order, fft):
+    # Check that FFT/IFFT produces identical results for C, Fortran and
+    # non contiguous arrays
+    rng = np.random.RandomState(42)
+    X = rng.rand(8, 7, 13).astype(dtype, copy=False)
+    if order == 'F':
+        Y = np.asfortranarray(X)
+    else:
+        # Make a non contiguous array
+        Y = X[::-1]
+        X = np.ascontiguousarray(X[::-1])
+
+    if fft.__name__.endswith('fft'):
+        for axis in range(3):
+            X_res = fft(X, axis=axis)
+            Y_res = fft(Y, axis=axis)
+            assert_array_almost_equal(X_res, Y_res)
+    elif fft.__name__.endswith(('fft2', 'fftn')):
+        axes = [(0, 1), (1, 2), (0, 2)]
+        if fft.__name__.endswith('fftn'):
+            axes.extend([(0,), (1,), (2,), None])
+        for ax in axes:
+            X_res = fft(X, axes=ax)
+            Y_res = fft(Y, axes=ax)
+            assert_array_almost_equal(X_res, Y_res)
+    else:
+        raise ValueError
+
+
 class TestFFTThreadSafe(object):
     threads = 16
     input_shape = (800, 200)
diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py
index 30237b7..816f762 100644
--- a/numpy/lib/_datasource.py
+++ b/numpy/lib/_datasource.py
@@ -20,17 +20,18 @@
 Example::
 
     >>> # Create a DataSource, use os.curdir (default) for local storage.
-    >>> ds = datasource.DataSource()
+    >>> from numpy import DataSource
+    >>> ds = DataSource()
     >>>
     >>> # Open a remote file.
     >>> # DataSource downloads the file, stores it locally in:
     >>> #     './www.google.com/index.html'
     >>> # opens the file and returns a file object.
-    >>> fp = ds.open('http://www.google.com/index.html')
+    >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
     >>>
     >>> # Use the file as you normally would
-    >>> fp.read()
-    >>> fp.close()
+    >>> fp.read() # doctest: +SKIP
+    >>> fp.close() # doctest: +SKIP
 
 """
 from __future__ import division, absolute_import, print_function
@@ -156,6 +157,7 @@
 
     Examples
     --------
+    >>> import gzip
     >>> np.lib._datasource._file_openers.keys()
     [None, '.bz2', '.gz', '.xz', '.lzma']
     >>> np.lib._datasource._file_openers['.gz'] is gzip.open
@@ -290,7 +292,7 @@
     URLs require a scheme string (``http://``) to be used, without it they
     will fail::
 
-        >>> repos = DataSource()
+        >>> repos = np.DataSource()
         >>> repos.exists('www.google.com/index.html')
         False
         >>> repos.exists('http://www.google.com/index.html')
@@ -302,17 +304,17 @@
     --------
     ::
 
-        >>> ds = DataSource('/home/guido')
-        >>> urlname = 'http://www.google.com/index.html'
-        >>> gfile = ds.open('http://www.google.com/index.html')  # remote file
+        >>> ds = np.DataSource('/home/guido')
+        >>> urlname = 'http://www.google.com/'
+        >>> gfile = ds.open('http://www.google.com/')
         >>> ds.abspath(urlname)
-        '/home/guido/www.google.com/site/index.html'
+        '/home/guido/www.google.com/index.html'
 
-        >>> ds = DataSource(None)  # use with temporary file
+        >>> ds = np.DataSource(None)  # use with temporary file
         >>> ds.open('/home/guido/foobar.txt')
         <open file '/home/guido.foobar.txt', mode 'r' at 0x91d4430>
         >>> ds.abspath('/home/guido/foobar.txt')
-        '/tmp/tmpy4pgsP/home/guido/foobar.txt'
+        '/tmp/.../home/guido/foobar.txt'
 
     """
 
@@ -545,6 +547,11 @@
         is accessible if it exists in either location.
 
         """
+
+        # First test for local path
+        if os.path.exists(path):
+            return True
+
         # We import this here because importing urllib2 is slow and
         # a significant fraction of numpy's total import time.
         if sys.version_info[0] >= 3:
@@ -554,10 +561,6 @@
             from urllib2 import urlopen
             from urllib2 import URLError
 
-        # Test local path
-        if os.path.exists(path):
-            return True
-
         # Test cached url
         upath = self.abspath(path)
         if os.path.exists(upath):
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 8a042f1..0ebd39b 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -146,11 +146,17 @@
     >>> dt = np.dtype([('name', 'S4'), ('x', float), ('y', float),
     ...                ('block', int, (2, 3))])
     >>> np.lib._iotools.flatten_dtype(dt)
-    [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32')]
+    [dtype('S4'), dtype('float64'), dtype('float64'), dtype('int64')]
     >>> np.lib._iotools.flatten_dtype(dt, flatten_base=True)
-    [dtype('|S4'), dtype('float64'), dtype('float64'), dtype('int32'),
-     dtype('int32'), dtype('int32'), dtype('int32'), dtype('int32'),
-     dtype('int32')]
+    [dtype('S4'),
+     dtype('float64'),
+     dtype('float64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64'),
+     dtype('int64')]
 
     """
     names = ndtype.names
@@ -309,13 +315,13 @@
     --------
     >>> validator = np.lib._iotools.NameValidator()
     >>> validator(['file', 'field2', 'with space', 'CaSe'])
-    ['file_', 'field2', 'with_space', 'CaSe']
+    ('file_', 'field2', 'with_space', 'CaSe')
 
     >>> validator = np.lib._iotools.NameValidator(excludelist=['excl'],
-                                                  deletechars='q',
-                                                  case_sensitive='False')
+    ...                                           deletechars='q',
+    ...                                           case_sensitive=False)
     >>> validator(['excl', 'field2', 'no_q', 'with space', 'CaSe'])
-    ['excl_', 'field2', 'no_', 'with_space', 'case']
+    ('EXCL', 'FIELD2', 'NO_Q', 'WITH_SPACE', 'CASE')
 
     """
     #
@@ -599,7 +605,7 @@
     --------
     >>> import dateutil.parser
     >>> import datetime
-    >>> dateparser = datetustil.parser.parse
+    >>> dateparser = dateutil.parser.parse
     >>> defaultdate = datetime.date(2000, 1, 1)
     >>> StringConverter.upgrade_mapper(dateparser, default=defaultdate)
         """
diff --git a/numpy/lib/_version.py b/numpy/lib/_version.py
index c3563a7..8aa999f 100644
--- a/numpy/lib/_version.py
+++ b/numpy/lib/_version.py
@@ -47,9 +47,12 @@
     >>> from numpy.lib import NumpyVersion
     >>> if NumpyVersion(np.__version__) < '1.7.0':
     ...     print('skip')
-    skip
+    >>> # skip
 
     >>> NumpyVersion('1.7')  # raises ValueError, add ".0"
+    Traceback (most recent call last):
+        ...
+    ValueError: Not a valid numpy version string
 
     """
 
diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py
index 4f63710..b236cc4 100644
--- a/numpy/lib/arraypad.py
+++ b/numpy/lib/arraypad.py
@@ -1100,10 +1100,10 @@
     --------
     >>> a = [1, 2, 3, 4, 5]
     >>> np.pad(a, (2,3), 'constant', constant_values=(4, 6))
-    array([4, 4, 1, 2, 3, 4, 5, 6, 6, 6])
+    array([4, 4, 1, ..., 6, 6, 6])
 
     >>> np.pad(a, (2, 3), 'edge')
-    array([1, 1, 1, 2, 3, 4, 5, 5, 5, 5])
+    array([1, 1, 1, ..., 5, 5, 5])
 
     >>> np.pad(a, (2, 3), 'linear_ramp', end_values=(5, -4))
     array([ 5,  3,  1,  2,  3,  4,  5,  2, -1, -4])
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index fd64ecb..b53d8c0 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -82,7 +82,7 @@
     array([ 1,  2,  3, -7])
 
     >>> np.ediff1d(x, to_begin=-99, to_end=np.array([88, 99]))
-    array([-99,   1,   2,   3,  -7,  88,  99])
+    array([-99,   1,   2, ...,  -7,  88,  99])
 
     The returned array is always 1D.
 
@@ -94,8 +94,7 @@
     # force a 1d array
     ary = np.asanyarray(ary).ravel()
 
-    # we have unit tests enforcing
-    # propagation of the dtype of input
+    # enforce propagation of the dtype of input
     # ary to returned result
     dtype_req = ary.dtype
 
@@ -106,23 +105,22 @@
     if to_begin is None:
         l_begin = 0
     else:
-        to_begin = np.asanyarray(to_begin)
-        if not np.can_cast(to_begin, dtype_req):
-            raise TypeError("dtype of to_begin must be compatible "
-                            "with input ary")
-
-        to_begin = to_begin.ravel()
+        _to_begin = np.asanyarray(to_begin, dtype=dtype_req)
+        if not np.all(_to_begin == to_begin):
+            raise ValueError("cannot convert 'to_begin' to array with dtype "
+                            "'%r' as required for input ary" % dtype_req)
+        to_begin = _to_begin.ravel()
         l_begin = len(to_begin)
 
     if to_end is None:
         l_end = 0
     else:
-        to_end = np.asanyarray(to_end)
-        if not np.can_cast(to_end, dtype_req):
-            raise TypeError("dtype of to_end must be compatible "
-                            "with input ary")
-
-        to_end = to_end.ravel()
+        _to_end = np.asanyarray(to_end, dtype=dtype_req)
+        # check that casting has not overflowed
+        if not np.all(_to_end == to_end):
+            raise ValueError("cannot convert 'to_end' to array with dtype "
+                             "'%r' as required for input ary" % dtype_req)
+        to_end = _to_end.ravel()
         l_end = len(to_end)
 
     # do the calculation in place and copy to_begin and to_end
@@ -241,13 +239,11 @@
     >>> a = np.array(['a', 'b', 'b', 'c', 'a'])
     >>> u, indices = np.unique(a, return_index=True)
     >>> u
-    array(['a', 'b', 'c'],
-           dtype='|S1')
+    array(['a', 'b', 'c'], dtype='<U1')
     >>> indices
     array([0, 1, 3])
     >>> a[indices]
-    array(['a', 'b', 'c'],
-           dtype='|S1')
+    array(['a', 'b', 'c'], dtype='<U1')
 
     Reconstruct the input array from the unique values:
 
@@ -256,9 +252,9 @@
     >>> u
     array([1, 2, 3, 4, 6])
     >>> indices
-    array([0, 1, 4, 3, 1, 2, 1])
+    array([0, 1, 4, ..., 1, 2, 1])
     >>> u[indices]
-    array([1, 2, 6, 4, 2, 3, 2])
+    array([1, 2, 6, ..., 2, 3, 2])
 
     """
     ar = np.asanyarray(ar)
@@ -661,8 +657,8 @@
     >>> test_elements = [1, 2, 4, 8]
     >>> mask = np.isin(element, test_elements)
     >>> mask
-    array([[ False,  True],
-           [ True,  False]])
+    array([[False,  True],
+           [ True, False]])
     >>> element[mask]
     array([2, 4])
 
@@ -676,7 +672,7 @@
     >>> mask = np.isin(element, test_elements, invert=True)
     >>> mask
     array([[ True, False],
-           [ False, True]])
+           [False,  True]])
     >>> element[mask]
     array([0, 6])
 
@@ -685,14 +681,14 @@
 
     >>> test_set = {1, 2, 4, 8}
     >>> np.isin(element, test_set)
-    array([[ False, False],
-           [ False, False]])
+    array([[False, False],
+           [False, False]])
 
     Casting the set to a list gives the expected result:
 
     >>> np.isin(element, list(test_set))
-    array([[ False,  True],
-           [ True,  False]])
+    array([[False,  True],
+           [ True, False]])
     """
     element = np.asarray(element)
     return in1d(element, test_elements, assume_unique=assume_unique,
diff --git a/numpy/lib/arrayterator.py b/numpy/lib/arrayterator.py
index f2d4fe9..c166685 100644
--- a/numpy/lib/arrayterator.py
+++ b/numpy/lib/arrayterator.py
@@ -80,9 +80,8 @@
 
     >>> for subarr in a_itor:
     ...     if not subarr.all():
-    ...         print(subarr, subarr.shape)
-    ...
-    [[[[0 1]]]] (1, 1, 1, 2)
+    ...         print(subarr, subarr.shape) # doctest: +SKIP
+    >>> # [[[[0 1]]]] (1, 1, 1, 2)
 
     """
 
@@ -160,7 +159,7 @@
         ...     if not subarr:
         ...         print(subarr, type(subarr))
         ...
-        0 <type 'numpy.int32'>
+        0 <class 'numpy.int64'>
 
         """
         for block in self:
diff --git a/numpy/lib/financial.py b/numpy/lib/financial.py
index e1e2974..2166874 100644
--- a/numpy/lib/financial.py
+++ b/numpy/lib/financial.py
@@ -127,7 +127,7 @@
 
     >>> a = np.array((0.05, 0.06, 0.07))/12
     >>> np.fv(a, 10*12, -100, -100)
-    array([ 15692.92889434,  16569.87435405,  17509.44688102])
+    array([ 15692.92889434,  16569.87435405,  17509.44688102]) # may vary
 
     """
     when = _convert_when(when)
@@ -275,7 +275,7 @@
     If you only had $150/month to pay towards the loan, how long would it take
     to pay-off a loan of $8,000 at 7% annual interest?
 
-    >>> print(round(np.nper(0.07/12, -150, 8000), 5))
+    >>> print(np.round(np.nper(0.07/12, -150, 8000), 5))
     64.07335
 
     So, over 64 months would be required to pay off the loan.
@@ -286,10 +286,10 @@
     >>> np.nper(*(np.ogrid[0.07/12: 0.08/12: 0.01/12,
     ...                    -150   : -99     : 50    ,
     ...                    8000   : 9001    : 1000]))
-    array([[[  64.07334877,   74.06368256],
-            [ 108.07548412,  127.99022654]],
-           [[  66.12443902,   76.87897353],
-            [ 114.70165583,  137.90124779]]])
+    array([[[ 64.07334877,  74.06368256],
+            [108.07548412, 127.99022654]],
+           [[ 66.12443902,  76.87897353],
+            [114.70165583, 137.90124779]]])
 
     """
     when = _convert_when(when)
@@ -539,7 +539,7 @@
 
     >>> a = np.array((0.05, 0.04, 0.03))/12
     >>> np.pv(a, 10*12, -100, 15692.93)
-    array([ -100.00067132,  -649.26771385, -1273.78633713])
+    array([ -100.00067132,  -649.26771385, -1273.78633713]) # may vary
 
     So, to end up with the same $15692.93 under the same $100 per month
     "savings plan," for annual interest rates of 4% and 3%, one would
@@ -704,15 +704,15 @@
 
     Examples
     --------
-    >>> round(irr([-100, 39, 59, 55, 20]), 5)
+    >>> round(np.irr([-100, 39, 59, 55, 20]), 5)
     0.28095
-    >>> round(irr([-100, 0, 0, 74]), 5)
+    >>> round(np.irr([-100, 0, 0, 74]), 5)
     -0.0955
-    >>> round(irr([-100, 100, 0, -7]), 5)
+    >>> round(np.irr([-100, 100, 0, -7]), 5)
     -0.0833
-    >>> round(irr([-100, 100, 0, 7]), 5)
+    >>> round(np.irr([-100, 100, 0, 7]), 5)
     0.06206
-    >>> round(irr([-5, 10.5, 1, -8, 1]), 5)
+    >>> round(np.irr([-5, 10.5, 1, -8, 1]), 5)
     0.0886
 
     (Compare with the Example given for numpy.lib.financial.npv)
@@ -777,7 +777,7 @@
     Examples
     --------
     >>> np.npv(0.281,[-100, 39, 59, 55, 20])
-    -0.0084785916384548798
+    -0.0084785916384548798 # may vary
 
     (Compare with the Example given for numpy.lib.financial.irr)
 
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 5f87c8b..b61a64b 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -218,12 +218,12 @@
             [2, 3]],
            [[4, 5],
             [6, 7]]])
-    >>> flip(A, 0)
+    >>> np.flip(A, 0)
     array([[[4, 5],
             [6, 7]],
            [[0, 1],
             [2, 3]]])
-    >>> flip(A, 1)
+    >>> np.flip(A, 1)
     array([[[2, 3],
             [0, 1]],
            [[6, 7],
@@ -239,7 +239,7 @@
            [[1, 0],
             [3, 2]]])
     >>> A = np.random.randn(3,4,5)
-    >>> np.all(flip(A,2) == A[:,:,::-1,...])
+    >>> np.all(np.flip(A,2) == A[:,:,::-1,...])
     True
     """
     if not hasattr(m, 'ndim'):
@@ -359,7 +359,7 @@
 
     Examples
     --------
-    >>> data = range(1,5)
+    >>> data = list(range(1,5))
     >>> data
     [1, 2, 3, 4]
     >>> np.average(data)
@@ -373,11 +373,10 @@
            [2, 3],
            [4, 5]])
     >>> np.average(data, axis=1, weights=[1./4, 3./4])
-    array([ 0.75,  2.75,  4.75])
+    array([0.75, 2.75, 4.75])
     >>> np.average(data, weights=[1./4, 3./4])
-    
     Traceback (most recent call last):
-    ...
+        ...
     TypeError: Axis must be specified when shapes of a and weights differ.
     
     >>> a = np.ones(5, dtype=np.float128)
@@ -586,7 +585,7 @@
     ``x >= 0``.
 
     >>> np.piecewise(x, [x < 0, x >= 0], [lambda x: -x, lambda x: x])
-    array([ 2.5,  1.5,  0.5,  0.5,  1.5,  2.5])
+    array([2.5,  1.5,  0.5,  0.5,  1.5,  2.5])
 
     Apply the same function to a scalar value.
 
@@ -671,7 +670,7 @@
     >>> condlist = [x<3, x>5]
     >>> choicelist = [x, x**2]
     >>> np.select(condlist, choicelist)
-    array([ 0,  1,  2,  0,  0,  0, 36, 49, 64, 81])
+    array([ 0,  1,  2, ..., 49, 64, 81])
 
     """
     # Check the size of condlist and choicelist are the same, or abort.
@@ -854,9 +853,9 @@
     --------
     >>> f = np.array([1, 2, 4, 7, 11, 16], dtype=float)
     >>> np.gradient(f)
-    array([ 1. ,  1.5,  2.5,  3.5,  4.5,  5. ])
+    array([1. , 1.5, 2.5, 3.5, 4.5, 5. ])
     >>> np.gradient(f, 2)
-    array([ 0.5 ,  0.75,  1.25,  1.75,  2.25,  2.5 ])
+    array([0.5 ,  0.75,  1.25,  1.75,  2.25,  2.5 ])
 
     Spacing can be also specified with an array that represents the coordinates
     of the values F along the dimensions.
@@ -864,13 +863,13 @@
 
     >>> x = np.arange(f.size)
     >>> np.gradient(f, x)
-    array([ 1. ,  1.5,  2.5,  3.5,  4.5,  5. ])
+    array([1. ,  1.5,  2.5,  3.5,  4.5,  5. ])
 
     Or a non uniform one:
 
     >>> x = np.array([0., 1., 1.5, 3.5, 4., 6.], dtype=float)
     >>> np.gradient(f, x)
-    array([ 1. ,  3. ,  3.5,  6.7,  6.9,  2.5])
+    array([1. ,  3. ,  3.5,  6.7,  6.9,  2.5])
 
     For two dimensional arrays, the return will be two arrays ordered by
     axis. In this example the first array stands for the gradient in
@@ -878,8 +877,8 @@
 
     >>> np.gradient(np.array([[1, 2, 6], [3, 4, 5]], dtype=float))
     [array([[ 2.,  2., -1.],
-            [ 2.,  2., -1.]]), array([[ 1. ,  2.5,  4. ],
-            [ 1. ,  1. ,  1. ]])]
+           [ 2.,  2., -1.]]), array([[1. , 2.5, 4. ],
+           [1. , 1. , 1. ]])]
 
     In this example the spacing is also specified:
     uniform for axis=0 and non uniform for axis=1
@@ -888,17 +887,17 @@
     >>> y = [1., 1.5, 3.5]
     >>> np.gradient(np.array([[1, 2, 6], [3, 4, 5]], dtype=float), dx, y)
     [array([[ 1. ,  1. , -0.5],
-            [ 1. ,  1. , -0.5]]), array([[ 2. ,  2. ,  2. ],
-            [ 2. ,  1.7,  0.5]])]
+           [ 1. ,  1. , -0.5]]), array([[2. , 2. , 2. ],
+           [2. , 1.7, 0.5]])]
 
     It is possible to specify how boundaries are treated using `edge_order`
 
     >>> x = np.array([0, 1, 2, 3, 4])
     >>> f = x**2
     >>> np.gradient(f, edge_order=1)
-    array([ 1.,  2.,  4.,  6.,  7.])
+    array([1.,  2.,  4.,  6.,  7.])
     >>> np.gradient(f, edge_order=2)
-    array([-0.,  2.,  4.,  6.,  8.])
+    array([0., 2., 4., 6., 8.])
 
     The `axis` keyword can be used to specify a subset of axes of which the
     gradient is calculated
@@ -1151,7 +1150,7 @@
     """
     Calculate the n-th discrete difference along the given axis.
 
-    The first difference is given by ``out[n] = a[n+1] - a[n]`` along
+    The first difference is given by ``out[i] = a[i+1] - a[i]`` along
     the given axis, higher differences are calculated by using `diff`
     recursively.
 
@@ -1200,7 +1199,7 @@
     >>> np.diff(u8_arr)
     array([255], dtype=uint8)
     >>> u8_arr[1,...] - u8_arr[0,...]
-    array(255, np.uint8)
+    255
 
     If this is not desirable, then the array should be cast to a larger
     integer type first:
@@ -1340,7 +1339,7 @@
     >>> np.interp(2.5, xp, fp)
     1.0
     >>> np.interp([0, 1, 1.5, 2.72, 3.14], xp, fp)
-    array([ 3. ,  3. ,  2.5 ,  0.56,  0. ])
+    array([3.  , 3.  , 2.5 , 0.56, 0.  ])
     >>> UNDEF = -99.0
     >>> np.interp(3.14, xp, fp, right=UNDEF)
     -99.0
@@ -1364,7 +1363,7 @@
     >>> xp = [190, -190, 350, -350]
     >>> fp = [5, 10, 3, 4]
     >>> np.interp(x, xp, fp, period=360)
-    array([7.5, 5., 8.75, 6.25, 3., 3.25, 3.5, 3.75])
+    array([7.5 , 5.  , 8.75, 6.25, 3.  , 3.25, 3.5 , 3.75])
 
     Complex interpolation:
 
@@ -1372,7 +1371,7 @@
     >>> xp = [2,3,5]
     >>> fp = [1.0j, 0, 2+3j]
     >>> np.interp(x, xp, fp)
-    array([ 0.+1.j ,  1.+1.5j])
+    array([0.+1.j , 1.+1.5j])
 
     """
 
@@ -1445,7 +1444,7 @@
     Examples
     --------
     >>> np.angle([1.0, 1.0j, 1+1j])               # in radians
-    array([ 0.        ,  1.57079633,  0.78539816])
+    array([ 0.        ,  1.57079633,  0.78539816]) # may vary
     >>> np.angle(1+1j, deg=True)                  # in degrees
     45.0
 
@@ -1505,9 +1504,9 @@
     >>> phase = np.linspace(0, np.pi, num=5)
     >>> phase[3:] += np.pi
     >>> phase
-    array([ 0.        ,  0.78539816,  1.57079633,  5.49778714,  6.28318531])
+    array([ 0.        ,  0.78539816,  1.57079633,  5.49778714,  6.28318531]) # may vary
     >>> np.unwrap(phase)
-    array([ 0.        ,  0.78539816,  1.57079633, -0.78539816,  0.        ])
+    array([ 0.        ,  0.78539816,  1.57079633, -0.78539816,  0.        ]) # may vary
 
     """
     p = asarray(p)
@@ -1547,10 +1546,10 @@
     Examples
     --------
     >>> np.sort_complex([5, 3, 6, 2, 1])
-    array([ 1.+0.j,  2.+0.j,  3.+0.j,  5.+0.j,  6.+0.j])
+    array([1.+0.j, 2.+0.j, 3.+0.j, 5.+0.j, 6.+0.j])
 
     >>> np.sort_complex([1 + 2j, 2 - 1j, 3 - 2j, 3 - 3j, 3 + 5j])
-    array([ 1.+2.j,  2.-1.j,  3.-3.j,  3.-2.j,  3.+5.j])
+    array([1.+2.j,  2.-1.j,  3.-3.j,  3.-2.j,  3.+5.j])
 
     """
     b = array(a, copy=True)
@@ -1596,7 +1595,7 @@
     array([1, 2, 3, 0, 2, 1])
 
     >>> np.trim_zeros(a, 'b')
-    array([0, 0, 0, 1, 2, 3, 0, 2, 1])
+    array([0, 0, 0, ..., 0, 2, 1])
 
     The input data type is preserved, list/tuple in means list/tuple out.
 
@@ -1958,11 +1957,11 @@
 
     >>> out = vfunc([1, 2, 3, 4], 2)
     >>> type(out[0])
-    <type 'numpy.int32'>
+    <class 'numpy.int64'>
     >>> vfunc = np.vectorize(myfunc, otypes=[float])
     >>> out = vfunc([1, 2, 3, 4], 2)
     >>> type(out[0])
-    <type 'numpy.float64'>
+    <class 'numpy.float64'>
 
     The `excluded` argument can be used to prevent vectorizing over certain
     arguments.  This can be useful for array-like arguments of a fixed length
@@ -1990,18 +1989,18 @@
 
     >>> import scipy.stats
     >>> pearsonr = np.vectorize(scipy.stats.pearsonr,
-    ...                         signature='(n),(n)->(),()')
-    >>> pearsonr([[0, 1, 2, 3]], [[1, 2, 3, 4], [4, 3, 2, 1]])
+    ...                 signature='(n),(n)->(),()') 
+    >>> pearsonr([[0, 1, 2, 3]], [[1, 2, 3, 4], [4, 3, 2, 1]]) 
     (array([ 1., -1.]), array([ 0.,  0.]))
 
     Or for a vectorized convolution:
 
     >>> convolve = np.vectorize(np.convolve, signature='(n),(m)->(k)')
     >>> convolve(np.eye(4), [1, 2, 1])
-    array([[ 1.,  2.,  1.,  0.,  0.,  0.],
-           [ 0.,  1.,  2.,  1.,  0.,  0.],
-           [ 0.,  0.,  1.,  2.,  1.,  0.],
-           [ 0.,  0.,  0.,  1.,  2.,  1.]])
+    array([[1., 2., 1., 0., 0., 0.],
+           [0., 1., 2., 1., 0., 0.],
+           [0., 0., 1., 2., 1., 0.],
+           [0., 0., 0., 1., 2., 1.]])
 
     See Also
     --------
@@ -2311,10 +2310,14 @@
     array `m` and let ``f = fweights`` and ``a = aweights`` for brevity. The
     steps to compute the weighted covariance are as follows::
 
+        >>> m = np.arange(10, dtype=np.float64)
+        >>> f = np.arange(10) * 2
+        >>> a = np.arange(10) ** 2.
+        >>> ddof = 9 # N - 1
         >>> w = f * a
         >>> v1 = np.sum(w)
         >>> v2 = np.sum(w * a)
-        >>> m -= np.sum(m * w, axis=1, keepdims=True) / v1
+        >>> m -= np.sum(m * w, axis=None, keepdims=True) / v1
         >>> cov = np.dot(m * w, m.T) * v1 / (v1**2 - ddof * v2)
 
     Note that when ``a == 1``, the normalization factor
@@ -2346,14 +2349,14 @@
     >>> x = [-2.1, -1,  4.3]
     >>> y = [3,  1.1,  0.12]
     >>> X = np.stack((x, y), axis=0)
-    >>> print(np.cov(X))
-    [[ 11.71        -4.286     ]
-     [ -4.286        2.14413333]]
-    >>> print(np.cov(x, y))
-    [[ 11.71        -4.286     ]
-     [ -4.286        2.14413333]]
-    >>> print(np.cov(x))
-    11.71
+    >>> np.cov(X)
+    array([[11.71      , -4.286     ], # may vary
+           [-4.286     ,  2.144133]])
+    >>> np.cov(x, y)
+    array([[11.71      , -4.286     ], # may vary
+           [-4.286     ,  2.144133]])
+    >>> np.cov(x)
+    array(11.71)
 
     """
     # Check inputs
@@ -2590,12 +2593,12 @@
 
     Examples
     --------
+    >>> import matplotlib.pyplot as plt
     >>> np.blackman(12)
-    array([ -1.38777878e-17,   3.26064346e-02,   1.59903635e-01,
-             4.14397981e-01,   7.36045180e-01,   9.67046769e-01,
-             9.67046769e-01,   7.36045180e-01,   4.14397981e-01,
-             1.59903635e-01,   3.26064346e-02,  -1.38777878e-17])
-
+    array([-1.38777878e-17,   3.26064346e-02,   1.59903635e-01, # may vary
+            4.14397981e-01,   7.36045180e-01,   9.67046769e-01,
+            9.67046769e-01,   7.36045180e-01,   4.14397981e-01,
+            1.59903635e-01,   3.26064346e-02,  -1.38777878e-17])
 
     Plot the window and the frequency response:
 
@@ -2604,15 +2607,15 @@
     >>> plt.plot(window)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Blackman window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Blackman window')
     >>> plt.ylabel("Amplitude")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Amplitude')
     >>> plt.xlabel("Sample")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Sample')
     >>> plt.show()
 
     >>> plt.figure()
-    <matplotlib.figure.Figure object at 0x...>
+    <Figure size 640x480 with 0 Axes>
     >>> A = fft(window, 2048) / 25.5
     >>> mag = np.abs(fftshift(A))
     >>> freq = np.linspace(-0.5, 0.5, len(A))
@@ -2621,13 +2624,12 @@
     >>> plt.plot(freq, response)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Frequency response of Blackman window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Frequency response of Blackman window')
     >>> plt.ylabel("Magnitude [dB]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Magnitude [dB]')
     >>> plt.xlabel("Normalized frequency [cycles per sample]")
-    <matplotlib.text.Text object at 0x...>
-    >>> plt.axis('tight')
-    (-0.5, 0.5, -100.0, ...)
+    Text(0.5, 0, 'Normalized frequency [cycles per sample]')
+    >>> _ = plt.axis('tight')
     >>> plt.show()
 
     """
@@ -2699,8 +2701,9 @@
 
     Examples
     --------
+    >>> import matplotlib.pyplot as plt
     >>> np.bartlett(12)
-    array([ 0.        ,  0.18181818,  0.36363636,  0.54545455,  0.72727273,
+    array([ 0.        ,  0.18181818,  0.36363636,  0.54545455,  0.72727273, # may vary
             0.90909091,  0.90909091,  0.72727273,  0.54545455,  0.36363636,
             0.18181818,  0.        ])
 
@@ -2711,15 +2714,15 @@
     >>> plt.plot(window)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Bartlett window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Bartlett window')
     >>> plt.ylabel("Amplitude")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Amplitude')
     >>> plt.xlabel("Sample")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Sample')
     >>> plt.show()
 
     >>> plt.figure()
-    <matplotlib.figure.Figure object at 0x...>
+    <Figure size 640x480 with 0 Axes>
     >>> A = fft(window, 2048) / 25.5
     >>> mag = np.abs(fftshift(A))
     >>> freq = np.linspace(-0.5, 0.5, len(A))
@@ -2728,13 +2731,12 @@
     >>> plt.plot(freq, response)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Frequency response of Bartlett window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Frequency response of Bartlett window')
     >>> plt.ylabel("Magnitude [dB]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Magnitude [dB]')
     >>> plt.xlabel("Normalized frequency [cycles per sample]")
-    <matplotlib.text.Text object at 0x...>
-    >>> plt.axis('tight')
-    (-0.5, 0.5, -100.0, ...)
+    Text(0.5, 0, 'Normalized frequency [cycles per sample]')
+    >>> _ = plt.axis('tight')
     >>> plt.show()
 
     """
@@ -2801,26 +2803,27 @@
     Examples
     --------
     >>> np.hanning(12)
-    array([ 0.        ,  0.07937323,  0.29229249,  0.57115742,  0.82743037,
-            0.97974649,  0.97974649,  0.82743037,  0.57115742,  0.29229249,
-            0.07937323,  0.        ])
+    array([0.        , 0.07937323, 0.29229249, 0.57115742, 0.82743037,
+           0.97974649, 0.97974649, 0.82743037, 0.57115742, 0.29229249,
+           0.07937323, 0.        ])
 
     Plot the window and its frequency response:
 
+    >>> import matplotlib.pyplot as plt
     >>> from numpy.fft import fft, fftshift
     >>> window = np.hanning(51)
     >>> plt.plot(window)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Hann window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Hann window')
     >>> plt.ylabel("Amplitude")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Amplitude')
     >>> plt.xlabel("Sample")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Sample')
     >>> plt.show()
 
     >>> plt.figure()
-    <matplotlib.figure.Figure object at 0x...>
+    <Figure size 640x480 with 0 Axes>
     >>> A = fft(window, 2048) / 25.5
     >>> mag = np.abs(fftshift(A))
     >>> freq = np.linspace(-0.5, 0.5, len(A))
@@ -2829,13 +2832,13 @@
     >>> plt.plot(freq, response)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Frequency response of the Hann window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Frequency response of the Hann window')
     >>> plt.ylabel("Magnitude [dB]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Magnitude [dB]')
     >>> plt.xlabel("Normalized frequency [cycles per sample]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Normalized frequency [cycles per sample]')
     >>> plt.axis('tight')
-    (-0.5, 0.5, -100.0, ...)
+    ...
     >>> plt.show()
 
     """
@@ -2900,26 +2903,27 @@
     Examples
     --------
     >>> np.hamming(12)
-    array([ 0.08      ,  0.15302337,  0.34890909,  0.60546483,  0.84123594,
+    array([ 0.08      ,  0.15302337,  0.34890909,  0.60546483,  0.84123594, # may vary
             0.98136677,  0.98136677,  0.84123594,  0.60546483,  0.34890909,
             0.15302337,  0.08      ])
 
     Plot the window and the frequency response:
 
+    >>> import matplotlib.pyplot as plt
     >>> from numpy.fft import fft, fftshift
     >>> window = np.hamming(51)
     >>> plt.plot(window)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Hamming window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Hamming window')
     >>> plt.ylabel("Amplitude")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Amplitude')
     >>> plt.xlabel("Sample")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Sample')
     >>> plt.show()
 
     >>> plt.figure()
-    <matplotlib.figure.Figure object at 0x...>
+    <Figure size 640x480 with 0 Axes>
     >>> A = fft(window, 2048) / 25.5
     >>> mag = np.abs(fftshift(A))
     >>> freq = np.linspace(-0.5, 0.5, len(A))
@@ -2928,13 +2932,13 @@
     >>> plt.plot(freq, response)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Frequency response of Hamming window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Frequency response of Hamming window')
     >>> plt.ylabel("Magnitude [dB]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Magnitude [dB]')
     >>> plt.xlabel("Normalized frequency [cycles per sample]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Normalized frequency [cycles per sample]')
     >>> plt.axis('tight')
-    (-0.5, 0.5, -100.0, ...)
+    ...
     >>> plt.show()
 
     """
@@ -3083,9 +3087,9 @@
     Examples
     --------
     >>> np.i0([0.])
-    array(1.0)
+    array(1.0) # may vary
     >>> np.i0([0., 1. + 2j])
-    array([ 1.00000000+0.j        ,  0.18785373+0.64616944j])
+    array([ 1.00000000+0.j        ,  0.18785373+0.64616944j]) # may vary
 
     """
     x = atleast_1d(x).copy()
@@ -3180,11 +3184,12 @@
 
     Examples
     --------
+    >>> import matplotlib.pyplot as plt
     >>> np.kaiser(12, 14)
-    array([  7.72686684e-06,   3.46009194e-03,   4.65200189e-02,
-             2.29737120e-01,   5.99885316e-01,   9.45674898e-01,
-             9.45674898e-01,   5.99885316e-01,   2.29737120e-01,
-             4.65200189e-02,   3.46009194e-03,   7.72686684e-06])
+    array([7.72686684e-06, 3.46009194e-03, 4.65200189e-02, # may vary
+           2.29737120e-01, 5.99885316e-01, 9.45674898e-01,
+           9.45674898e-01, 5.99885316e-01, 2.29737120e-01,
+           4.65200189e-02, 3.46009194e-03, 7.72686684e-06])
 
 
     Plot the window and the frequency response:
@@ -3194,15 +3199,15 @@
     >>> plt.plot(window)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Kaiser window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Kaiser window')
     >>> plt.ylabel("Amplitude")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Amplitude')
     >>> plt.xlabel("Sample")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Sample')
     >>> plt.show()
 
     >>> plt.figure()
-    <matplotlib.figure.Figure object at 0x...>
+    <Figure size 640x480 with 0 Axes>
     >>> A = fft(window, 2048) / 25.5
     >>> mag = np.abs(fftshift(A))
     >>> freq = np.linspace(-0.5, 0.5, len(A))
@@ -3211,13 +3216,13 @@
     >>> plt.plot(freq, response)
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Frequency response of Kaiser window")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Frequency response of Kaiser window')
     >>> plt.ylabel("Magnitude [dB]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Magnitude [dB]')
     >>> plt.xlabel("Normalized frequency [cycles per sample]")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'Normalized frequency [cycles per sample]')
     >>> plt.axis('tight')
-    (-0.5, 0.5, -100.0, ...)
+    (-0.5, 0.5, -100.0, ...) # may vary
     >>> plt.show()
 
     """
@@ -3273,31 +3278,32 @@
 
     Examples
     --------
+    >>> import matplotlib.pyplot as plt
     >>> x = np.linspace(-4, 4, 41)
     >>> np.sinc(x)
-    array([ -3.89804309e-17,  -4.92362781e-02,  -8.40918587e-02,
+     array([-3.89804309e-17,  -4.92362781e-02,  -8.40918587e-02, # may vary
             -8.90384387e-02,  -5.84680802e-02,   3.89804309e-17,
-             6.68206631e-02,   1.16434881e-01,   1.26137788e-01,
-             8.50444803e-02,  -3.89804309e-17,  -1.03943254e-01,
+            6.68206631e-02,   1.16434881e-01,   1.26137788e-01,
+            8.50444803e-02,  -3.89804309e-17,  -1.03943254e-01,
             -1.89206682e-01,  -2.16236208e-01,  -1.55914881e-01,
-             3.89804309e-17,   2.33872321e-01,   5.04551152e-01,
-             7.56826729e-01,   9.35489284e-01,   1.00000000e+00,
-             9.35489284e-01,   7.56826729e-01,   5.04551152e-01,
-             2.33872321e-01,   3.89804309e-17,  -1.55914881e-01,
-            -2.16236208e-01,  -1.89206682e-01,  -1.03943254e-01,
-            -3.89804309e-17,   8.50444803e-02,   1.26137788e-01,
-             1.16434881e-01,   6.68206631e-02,   3.89804309e-17,
+            3.89804309e-17,   2.33872321e-01,   5.04551152e-01,
+            7.56826729e-01,   9.35489284e-01,   1.00000000e+00,
+            9.35489284e-01,   7.56826729e-01,   5.04551152e-01,
+            2.33872321e-01,   3.89804309e-17,  -1.55914881e-01,
+           -2.16236208e-01,  -1.89206682e-01,  -1.03943254e-01,
+           -3.89804309e-17,   8.50444803e-02,   1.26137788e-01,
+            1.16434881e-01,   6.68206631e-02,   3.89804309e-17,
             -5.84680802e-02,  -8.90384387e-02,  -8.40918587e-02,
             -4.92362781e-02,  -3.89804309e-17])
 
     >>> plt.plot(x, np.sinc(x))
     [<matplotlib.lines.Line2D object at 0x...>]
     >>> plt.title("Sinc Function")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 1.0, 'Sinc Function')
     >>> plt.ylabel("Amplitude")
-    <matplotlib.text.Text object at 0x...>
+    Text(0, 0.5, 'Amplitude')
     >>> plt.xlabel("X")
-    <matplotlib.text.Text object at 0x...>
+    Text(0.5, 0, 'X')
     >>> plt.show()
 
     It works in 2-D as well:
@@ -3469,18 +3475,18 @@
     >>> np.median(a)
     3.5
     >>> np.median(a, axis=0)
-    array([ 6.5,  4.5,  2.5])
+    array([6.5, 4.5, 2.5])
     >>> np.median(a, axis=1)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> m = np.median(a, axis=0)
     >>> out = np.zeros_like(m)
     >>> np.median(a, axis=0, out=m)
-    array([ 6.5,  4.5,  2.5])
+    array([6.5,  4.5,  2.5])
     >>> m
-    array([ 6.5,  4.5,  2.5])
+    array([6.5,  4.5,  2.5])
     >>> b = a.copy()
     >>> np.median(b, axis=1, overwrite_input=True)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> assert not np.all(a==b)
     >>> b = a.copy()
     >>> np.median(b, axis=None, overwrite_input=True)
@@ -3647,23 +3653,23 @@
     >>> np.percentile(a, 50)
     3.5
     >>> np.percentile(a, 50, axis=0)
-    array([[ 6.5,  4.5,  2.5]])
+    array([6.5, 4.5, 2.5])
     >>> np.percentile(a, 50, axis=1)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> np.percentile(a, 50, axis=1, keepdims=True)
-    array([[ 7.],
-           [ 2.]])
+    array([[7.],
+           [2.]])
 
     >>> m = np.percentile(a, 50, axis=0)
     >>> out = np.zeros_like(m)
     >>> np.percentile(a, 50, axis=0, out=out)
-    array([[ 6.5,  4.5,  2.5]])
+    array([6.5, 4.5, 2.5])
     >>> m
-    array([[ 6.5,  4.5,  2.5]])
+    array([6.5, 4.5, 2.5])
 
     >>> b = a.copy()
     >>> np.percentile(b, 50, axis=1, overwrite_input=True)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> assert not np.all(a == b)
 
     The different types of interpolation can be visualized graphically:
@@ -3789,21 +3795,21 @@
     >>> np.quantile(a, 0.5)
     3.5
     >>> np.quantile(a, 0.5, axis=0)
-    array([[ 6.5,  4.5,  2.5]])
+    array([6.5, 4.5, 2.5])
     >>> np.quantile(a, 0.5, axis=1)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> np.quantile(a, 0.5, axis=1, keepdims=True)
-    array([[ 7.],
-           [ 2.]])
+    array([[7.],
+           [2.]])
     >>> m = np.quantile(a, 0.5, axis=0)
     >>> out = np.zeros_like(m)
     >>> np.quantile(a, 0.5, axis=0, out=out)
-    array([[ 6.5,  4.5,  2.5]])
+    array([6.5, 4.5, 2.5])
     >>> m
-    array([[ 6.5,  4.5,  2.5]])
+    array([6.5, 4.5, 2.5])
     >>> b = a.copy()
     >>> np.quantile(b, 0.5, axis=1, overwrite_input=True)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> assert not np.all(a == b)
     """
     q = np.asanyarray(q)
@@ -3950,8 +3956,6 @@
             r = add(x1, x2)
 
     if np.any(n):
-        warnings.warn("Invalid value encountered in percentile",
-                      RuntimeWarning, stacklevel=3)
         if zerod:
             if ap.ndim == 1:
                 if out is not None:
@@ -4032,9 +4036,9 @@
     array([[0, 1, 2],
            [3, 4, 5]])
     >>> np.trapz(a, axis=0)
-    array([ 1.5,  2.5,  3.5])
+    array([1.5, 2.5, 3.5])
     >>> np.trapz(a, axis=1)
-    array([ 2.,  8.])
+    array([2.,  8.])
 
     """
     y = asanyarray(y)
@@ -4152,17 +4156,17 @@
     >>> y = np.linspace(0, 1, ny)
     >>> xv, yv = np.meshgrid(x, y)
     >>> xv
-    array([[ 0. ,  0.5,  1. ],
-           [ 0. ,  0.5,  1. ]])
+    array([[0. , 0.5, 1. ],
+           [0. , 0.5, 1. ]])
     >>> yv
-    array([[ 0.,  0.,  0.],
-           [ 1.,  1.,  1.]])
+    array([[0.,  0.,  0.],
+           [1.,  1.,  1.]])
     >>> xv, yv = np.meshgrid(x, y, sparse=True)  # make sparse output arrays
     >>> xv
-    array([[ 0. ,  0.5,  1. ]])
+    array([[0. ,  0.5,  1. ]])
     >>> yv
-    array([[ 0.],
-           [ 1.]])
+    array([[0.],
+           [1.]])
 
     `meshgrid` is very useful to evaluate functions on a grid.
 
@@ -4224,7 +4228,7 @@
     arr : array_like
       Input array.
     obj : slice, int or array of ints
-      Indicate which sub-arrays to remove.
+      Indicate indices of sub-arrays to remove along the specified axis.
     axis : int, optional
       The axis along which to delete the subarray defined by `obj`.
       If `axis` is None, `obj` is applied to the flattened array.
@@ -4245,6 +4249,7 @@
     -----
     Often it is preferable to use a boolean mask. For example:
 
+    >>> arr = np.arange(12) + 1
     >>> mask = np.ones(len(arr), dtype=bool)
     >>> mask[[0,2,4]] = False
     >>> result = arr[mask,...]
@@ -4476,7 +4481,7 @@
            [2, 2],
            [3, 3]])
     >>> np.insert(a, 1, 5)
-    array([1, 5, 1, 2, 2, 3, 3])
+    array([1, 5, 1, ..., 2, 3, 3])
     >>> np.insert(a, 1, 5, axis=1)
     array([[1, 5, 1],
            [2, 5, 2],
@@ -4496,13 +4501,13 @@
     >>> b
     array([1, 1, 2, 2, 3, 3])
     >>> np.insert(b, [2, 2], [5, 6])
-    array([1, 1, 5, 6, 2, 2, 3, 3])
+    array([1, 1, 5, ..., 2, 3, 3])
 
     >>> np.insert(b, slice(2, 4), [5, 6])
-    array([1, 1, 5, 2, 6, 2, 3, 3])
+    array([1, 1, 5, ..., 2, 3, 3])
 
     >>> np.insert(b, [2, 2], [7.13, False]) # type casting
-    array([1, 1, 7, 0, 2, 2, 3, 3])
+    array([1, 1, 7, ..., 2, 3, 3])
 
     >>> x = np.arange(8).reshape(2, 4)
     >>> idx = (1, 3)
@@ -4666,7 +4671,7 @@
     Examples
     --------
     >>> np.append([1, 2, 3], [[4, 5, 6], [7, 8, 9]])
-    array([1, 2, 3, 4, 5, 6, 7, 8, 9])
+    array([1, 2, 3, ..., 7, 8, 9])
 
     When `axis` is specified, `values` must have the correct shape.
 
@@ -4676,8 +4681,8 @@
            [7, 8, 9]])
     >>> np.append([[1, 2, 3], [4, 5, 6]], [7, 8, 9], axis=0)
     Traceback (most recent call last):
-    ...
-    ValueError: arrays must have same number of dimensions
+        ...
+    ValueError: all the input arrays must have same number of dimensions
 
     """
     arr = asanyarray(arr)
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 482eabe..7b229cc 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -645,7 +645,7 @@
 
     >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')
     >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')
-    >>> hist_0; hist1
+    >>> hist_0; hist_1
     array([1, 1, 1])
     array([2, 1, 1, 2])
     >>> bins_0; bins_1
@@ -748,14 +748,14 @@
     >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
     (array([0, 2, 1]), array([0, 1, 2, 3]))
     >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
-    (array([ 0.25,  0.25,  0.25,  0.25]), array([0, 1, 2, 3, 4]))
+    (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
     >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
     (array([1, 4, 1]), array([0, 1, 2, 3]))
 
     >>> a = np.arange(5)
     >>> hist, bin_edges = np.histogram(a, density=True)
     >>> hist
-    array([ 0.5,  0. ,  0.5,  0. ,  0. ,  0.5,  0. ,  0.5,  0. ,  0.5])
+    array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
     >>> hist.sum()
     2.4999999999999996
     >>> np.sum(hist * np.diff(bin_edges))
@@ -770,8 +770,9 @@
     >>> rng = np.random.RandomState(10)  # deterministic random data
     >>> a = np.hstack((rng.normal(size=1000),
     ...                rng.normal(loc=5, scale=2, size=1000)))
-    >>> plt.hist(a, bins='auto')  # arguments are passed to np.histogram
+    >>> _ = plt.hist(a, bins='auto')  # arguments are passed to np.histogram
     >>> plt.title("Histogram with 'auto' bins")
+    Text(0.5, 1.0, "Histogram with 'auto' bins")
     >>> plt.show()
 
     """
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index 56abe29..64c491c 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -478,7 +478,7 @@
     Examples
     --------
     >>> np.r_[np.array([1,2,3]), 0, 0, np.array([4,5,6])]
-    array([1, 2, 3, 0, 0, 4, 5, 6])
+    array([1, 2, 3, ..., 4, 5, 6])
     >>> np.r_[-1:1:6j, [0]*3, 5, 6]
     array([-1. , -0.6, -0.2,  0.2,  0.6,  1. ,  0. ,  0. ,  0. ,  5. ,  6. ])
 
@@ -538,7 +538,7 @@
            [2, 5],
            [3, 6]])
     >>> np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]
-    array([[1, 2, 3, 0, 0, 4, 5, 6]])
+    array([[1, 2, 3, ..., 4, 5, 6]])
 
     """
 
@@ -813,7 +813,7 @@
 
     >>> # tall matrices no wrap
     >>> a = np.zeros((5, 3),int)
-    >>> fill_diagonal(a, 4)
+    >>> np.fill_diagonal(a, 4)
     >>> a
     array([[4, 0, 0],
            [0, 4, 0],
@@ -823,7 +823,7 @@
 
     >>> # tall matrices wrap
     >>> a = np.zeros((5, 3),int)
-    >>> fill_diagonal(a, 4, wrap=True)
+    >>> np.fill_diagonal(a, 4, wrap=True)
     >>> a
     array([[4, 0, 0],
            [0, 4, 0],
@@ -833,7 +833,7 @@
 
     >>> # wide matrices
     >>> a = np.zeros((3, 5),int)
-    >>> fill_diagonal(a, 4, wrap=True)
+    >>> np.fill_diagonal(a, 4, wrap=True)
     >>> a
     array([[4, 0, 0, 0, 0],
            [0, 4, 0, 0, 0],
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index d73d844..77c851f 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -40,6 +40,33 @@
     ]
 
 
+def _nan_mask(a, out=None):
+    """
+    Parameters
+    ----------
+    a : array-like
+        Input array with at least 1 dimension.
+    out : ndarray, optional
+        Alternate output array in which to place the result.  The default
+        is ``None``; if provided, it must have the same shape as the
+        expected output and will prevent the allocation of a new array.
+
+    Returns
+    -------
+    y : bool ndarray or True
+        A bool array where ``np.nan`` positions are marked with ``False``
+        and other positions are marked with ``True``. If the type of ``a``
+        is such that it can't possibly contain ``np.nan``, returns ``True``.
+    """
+    # we assume that a is an array for this private function
+
+    if a.dtype.kind not in 'fc':
+        return True
+
+    y = np.isnan(a, out=out)
+    y = np.invert(y, out=y)
+    return y
+
 def _replace_nan(a, val):
     """
     If `a` is of inexact type, make a copy of `a`, replace NaNs with
@@ -271,9 +298,9 @@
     >>> np.nanmin(a)
     1.0
     >>> np.nanmin(a, axis=0)
-    array([ 1.,  2.])
+    array([1.,  2.])
     >>> np.nanmin(a, axis=1)
-    array([ 1.,  3.])
+    array([1.,  3.])
 
     When positive infinity and negative infinity are present:
 
@@ -384,9 +411,9 @@
     >>> np.nanmax(a)
     3.0
     >>> np.nanmax(a, axis=0)
-    array([ 3.,  2.])
+    array([3.,  2.])
     >>> np.nanmax(a, axis=1)
-    array([ 2.,  3.])
+    array([2.,  3.])
 
     When positive infinity and negative infinity are present:
 
@@ -601,12 +628,15 @@
     >>> np.nansum(a)
     3.0
     >>> np.nansum(a, axis=0)
-    array([ 2.,  1.])
+    array([2.,  1.])
     >>> np.nansum([1, np.nan, np.inf])
     inf
     >>> np.nansum([1, np.nan, np.NINF])
     -inf
-    >>> np.nansum([1, np.nan, np.inf, -np.inf]) # both +/- infinity present
+    >>> from numpy.testing import suppress_warnings
+    >>> with suppress_warnings() as sup:
+    ...     sup.filter(RuntimeWarning)
+    ...     np.nansum([1, np.nan, np.inf, -np.inf]) # both +/- infinity present
     nan
 
     """
@@ -677,7 +707,7 @@
     >>> np.nanprod(a)
     6.0
     >>> np.nanprod(a, axis=0)
-    array([ 3.,  2.])
+    array([3., 2.])
 
     """
     a, mask = _replace_nan(a, 1)
@@ -738,16 +768,16 @@
     >>> np.nancumsum([1])
     array([1])
     >>> np.nancumsum([1, np.nan])
-    array([ 1.,  1.])
+    array([1.,  1.])
     >>> a = np.array([[1, 2], [3, np.nan]])
     >>> np.nancumsum(a)
-    array([ 1.,  3.,  6.,  6.])
+    array([1.,  3.,  6.,  6.])
     >>> np.nancumsum(a, axis=0)
-    array([[ 1.,  2.],
-           [ 4.,  2.]])
+    array([[1.,  2.],
+           [4.,  2.]])
     >>> np.nancumsum(a, axis=1)
-    array([[ 1.,  3.],
-           [ 3.,  3.]])
+    array([[1.,  3.],
+           [3.,  3.]])
 
     """
     a, mask = _replace_nan(a, 0)
@@ -805,16 +835,16 @@
     >>> np.nancumprod([1])
     array([1])
     >>> np.nancumprod([1, np.nan])
-    array([ 1.,  1.])
+    array([1.,  1.])
     >>> a = np.array([[1, 2], [3, np.nan]])
     >>> np.nancumprod(a)
-    array([ 1.,  2.,  6.,  6.])
+    array([1.,  2.,  6.,  6.])
     >>> np.nancumprod(a, axis=0)
-    array([[ 1.,  2.],
-           [ 3.,  2.]])
+    array([[1.,  2.],
+           [3.,  2.]])
     >>> np.nancumprod(a, axis=1)
-    array([[ 1.,  2.],
-           [ 3.,  3.]])
+    array([[1.,  2.],
+           [3.,  3.]])
 
     """
     a, mask = _replace_nan(a, 1)
@@ -895,9 +925,9 @@
     >>> np.nanmean(a)
     2.6666666666666665
     >>> np.nanmean(a, axis=0)
-    array([ 2.,  4.])
+    array([2.,  4.])
     >>> np.nanmean(a, axis=1)
-    array([ 1.,  3.5])
+    array([1.,  3.5]) # may vary
 
     """
     arr, mask = _replace_nan(a, 0)
@@ -1049,19 +1079,19 @@
     >>> a = np.array([[10.0, 7, 4], [3, 2, 1]])
     >>> a[0, 1] = np.nan
     >>> a
-    array([[ 10.,  nan,   4.],
-       [  3.,   2.,   1.]])
+    array([[10., nan,  4.],
+           [ 3.,  2.,  1.]])
     >>> np.median(a)
     nan
     >>> np.nanmedian(a)
     3.0
     >>> np.nanmedian(a, axis=0)
-    array([ 6.5,  2.,  2.5])
+    array([6.5, 2. , 2.5])
     >>> np.median(a, axis=1)
-    array([ 7.,  2.])
+    array([nan,  2.])
     >>> b = a.copy()
     >>> np.nanmedian(b, axis=1, overwrite_input=True)
-    array([ 7.,  2.])
+    array([7.,  2.])
     >>> assert not np.all(a==b)
     >>> b = a.copy()
     >>> np.nanmedian(b, axis=None, overwrite_input=True)
@@ -1177,27 +1207,27 @@
     >>> a = np.array([[10., 7., 4.], [3., 2., 1.]])
     >>> a[0][1] = np.nan
     >>> a
-    array([[ 10.,  nan,   4.],
-          [  3.,   2.,   1.]])
+    array([[10.,  nan,   4.],
+          [ 3.,   2.,   1.]])
     >>> np.percentile(a, 50)
     nan
     >>> np.nanpercentile(a, 50)
-    3.5
+    3.0
     >>> np.nanpercentile(a, 50, axis=0)
-    array([ 6.5,  2.,   2.5])
+    array([6.5, 2. , 2.5])
     >>> np.nanpercentile(a, 50, axis=1, keepdims=True)
-    array([[ 7.],
-           [ 2.]])
+    array([[7.],
+           [2.]])
     >>> m = np.nanpercentile(a, 50, axis=0)
     >>> out = np.zeros_like(m)
     >>> np.nanpercentile(a, 50, axis=0, out=out)
-    array([ 6.5,  2.,   2.5])
+    array([6.5, 2. , 2.5])
     >>> m
-    array([ 6.5,  2. ,  2.5])
+    array([6.5,  2. ,  2.5])
 
     >>> b = a.copy()
     >>> np.nanpercentile(b, 50, axis=1, overwrite_input=True)
-    array([  7.,  2.])
+    array([7., 2.])
     >>> assert not np.all(a==b)
 
     """
@@ -1291,26 +1321,26 @@
     >>> a = np.array([[10., 7., 4.], [3., 2., 1.]])
     >>> a[0][1] = np.nan
     >>> a
-    array([[ 10.,  nan,   4.],
-          [  3.,   2.,   1.]])
+    array([[10.,  nan,   4.],
+          [ 3.,   2.,   1.]])
     >>> np.quantile(a, 0.5)
     nan
     >>> np.nanquantile(a, 0.5)
-    3.5
+    3.0
     >>> np.nanquantile(a, 0.5, axis=0)
-    array([ 6.5,  2.,   2.5])
+    array([6.5, 2. , 2.5])
     >>> np.nanquantile(a, 0.5, axis=1, keepdims=True)
-    array([[ 7.],
-           [ 2.]])
+    array([[7.],
+           [2.]])
     >>> m = np.nanquantile(a, 0.5, axis=0)
     >>> out = np.zeros_like(m)
     >>> np.nanquantile(a, 0.5, axis=0, out=out)
-    array([ 6.5,  2.,   2.5])
+    array([6.5, 2. , 2.5])
     >>> m
-    array([ 6.5,  2. ,  2.5])
+    array([6.5,  2. ,  2.5])
     >>> b = a.copy()
     >>> np.nanquantile(b, 0.5, axis=1, overwrite_input=True)
-    array([  7.,  2.])
+    array([7., 2.])
     >>> assert not np.all(a==b)
     """
     a = np.asanyarray(a)
@@ -1465,12 +1495,12 @@
     Examples
     --------
     >>> a = np.array([[1, np.nan], [3, 4]])
-    >>> np.var(a)
+    >>> np.nanvar(a)
     1.5555555555555554
     >>> np.nanvar(a, axis=0)
-    array([ 1.,  0.])
+    array([1.,  0.])
     >>> np.nanvar(a, axis=1)
-    array([ 0.,  0.25])
+    array([0.,  0.25])  # may vary
 
     """
     arr, mask = _replace_nan(a, 0)
@@ -1619,9 +1649,9 @@
     >>> np.nanstd(a)
     1.247219128924647
     >>> np.nanstd(a, axis=0)
-    array([ 1.,  0.])
+    array([1., 0.])
     >>> np.nanstd(a, axis=1)
-    array([ 0.,  0.5])
+    array([0.,  0.5]) # may vary
 
     """
     var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index db6a8e5..704fea1 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -168,13 +168,13 @@
     >>> x = np.arange(10)
     >>> y = np.sin(x)
     >>> np.savez(outfile, x=x, y=y)
-    >>> outfile.seek(0)
+    >>> _ = outfile.seek(0)
 
     >>> npz = np.load(outfile)
     >>> isinstance(npz, np.lib.io.NpzFile)
     True
-    >>> npz.files
-    ['y', 'x']
+    >>> sorted(npz.files)
+    ['x', 'y']
     >>> npz['x']  # getitem access
     array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
     >>> npz.f.x  # attribute lookup
@@ -502,7 +502,7 @@
     >>> x = np.arange(10)
     >>> np.save(outfile, x)
 
-    >>> outfile.seek(0) # Only needed here to simulate closing & reopening file
+    >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file
     >>> np.load(outfile)
     array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
 
@@ -597,10 +597,10 @@
     Using `savez` with \\*args, the arrays are saved with default names.
 
     >>> np.savez(outfile, x, y)
-    >>> outfile.seek(0) # Only needed here to simulate closing & reopening file
+    >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file
     >>> npzfile = np.load(outfile)
     >>> npzfile.files
-    ['arr_1', 'arr_0']
+    ['arr_0', 'arr_1']
     >>> npzfile['arr_0']
     array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
 
@@ -608,10 +608,10 @@
 
     >>> outfile = TemporaryFile()
     >>> np.savez(outfile, x=x, y=y)
-    >>> outfile.seek(0)
+    >>> _ = outfile.seek(0)
     >>> npzfile = np.load(outfile)
-    >>> npzfile.files
-    ['y', 'x']
+    >>> sorted(npzfile.files)
+    ['x', 'y']
     >>> npzfile['x']
     array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
 
@@ -829,7 +829,7 @@
         `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``.
         Default: None.
     skiprows : int, optional
-        Skip the first `skiprows` lines; default: 0.
+        Skip the first `skiprows` lines, including comments; default: 0.
     usecols : int or sequence, optional
         Which columns to read, with 0 being the first. For example,
         ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
@@ -891,21 +891,21 @@
     >>> from io import StringIO   # StringIO behaves like a file object
     >>> c = StringIO(u"0 1\\n2 3")
     >>> np.loadtxt(c)
-    array([[ 0.,  1.],
-           [ 2.,  3.]])
+    array([[0., 1.],
+           [2., 3.]])
 
     >>> d = StringIO(u"M 21 72\\nF 35 58")
     >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
     ...                      'formats': ('S1', 'i4', 'f4')})
-    array([('M', 21, 72.0), ('F', 35, 58.0)],
-          dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')])
+    array([(b'M', 21, 72.), (b'F', 35, 58.)],
+          dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')])
 
     >>> c = StringIO(u"1,0,2\\n3,0,4")
     >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
     >>> x
-    array([ 1.,  3.])
+    array([1., 3.])
     >>> y
-    array([ 2.,  4.])
+    array([2., 4.])
 
     """
     # Type conversions for Py3 convenience
@@ -1481,17 +1481,17 @@
     Examples
     --------
     >>> f = open('test.dat', 'w')
-    >>> f.write("1312 foo\\n1534  bar\\n444   qux")
+    >>> _ = f.write("1312 foo\\n1534  bar\\n444   qux")
     >>> f.close()
 
     >>> regexp = r"(\\d+)\\s+(...)"  # match [digits, whitespace, anything]
     >>> output = np.fromregex('test.dat', regexp,
     ...                       [('num', np.int64), ('key', 'S3')])
     >>> output
-    array([(1312L, 'foo'), (1534L, 'bar'), (444L, 'qux')],
-          dtype=[('num', '<i8'), ('key', '|S3')])
+    array([(1312, b'foo'), (1534, b'bar'), ( 444, b'qux')],
+          dtype=[('num', '<i8'), ('key', 'S3')])
     >>> output['num']
-    array([1312, 1534,  444], dtype=int64)
+    array([1312, 1534,  444])
 
     """
     own_fh = False
@@ -1674,26 +1674,26 @@
     >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'),
     ... ('mystring','S5')], delimiter=",")
     >>> data
-    array((1, 1.3, 'abcde'),
-          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])
+    array((1, 1.3, b'abcde'),
+          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
 
     Using dtype = None
 
-    >>> s.seek(0) # needed for StringIO example only
+    >>> _ = s.seek(0) # needed for StringIO example only
     >>> data = np.genfromtxt(s, dtype=None,
     ... names = ['myint','myfloat','mystring'], delimiter=",")
     >>> data
-    array((1, 1.3, 'abcde'),
-          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])
+    array((1, 1.3, b'abcde'),
+          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
 
     Specifying dtype and names
 
-    >>> s.seek(0)
+    >>> _ = s.seek(0)
     >>> data = np.genfromtxt(s, dtype="i8,f8,S5",
     ... names=['myint','myfloat','mystring'], delimiter=",")
     >>> data
-    array((1, 1.3, 'abcde'),
-          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])
+    array((1, 1.3, b'abcde'),
+          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
 
     An example with fixed-width columns
 
@@ -1701,8 +1701,8 @@
     >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'],
     ...     delimiter=[1,3,5])
     >>> data
-    array((1, 1.3, 'abcde'),
-          dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', '|S5')])
+    array((1, 1.3, b'abcde'),
+          dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', 'S5')])
 
     """
     if max_rows is not None:
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index e3defdc..b55764b 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -110,7 +110,7 @@
     Given a sequence of a polynomial's zeros:
 
     >>> np.poly((0, 0, 0)) # Multiple root example
-    array([1, 0, 0, 0])
+    array([1., 0., 0., 0.])
 
     The line above represents z**3 + 0*z**2 + 0*z + 0.
 
@@ -119,14 +119,14 @@
 
     The line above represents z**3 - z/4
 
-    >>> np.poly((np.random.random(1.)[0], 0, np.random.random(1.)[0]))
-    array([ 1.        , -0.77086955,  0.08618131,  0.        ]) #random
+    >>> np.poly((np.random.random(1)[0], 0, np.random.random(1)[0]))
+    array([ 1.        , -0.77086955,  0.08618131,  0.        ]) # random
 
     Given a square array object:
 
     >>> P = np.array([[0, 1./3], [-1./2, 0]])
     >>> np.poly(P)
-    array([ 1.        ,  0.        ,  0.16666667])
+    array([1.        , 0.        , 0.16666667])
 
     Note how in all cases the leading coefficient is always 1.
 
@@ -295,7 +295,7 @@
     >>> p = np.poly1d([1,1,1])
     >>> P = np.polyint(p)
     >>> P
-    poly1d([ 0.33333333,  0.5       ,  1.        ,  0.        ])
+    poly1d([ 0.33333333,  0.5       ,  1.        ,  0.        ]) # may vary
     >>> np.polyder(P) == p
     True
 
@@ -310,7 +310,7 @@
     0.0
     >>> P = np.polyint(p, 3, k=[6,5,3])
     >>> P
-    poly1d([ 0.01666667,  0.04166667,  0.16666667,  3. ,  5. ,  3. ])
+    poly1d([ 0.01666667,  0.04166667,  0.16666667,  3. ,  5. ,  3. ]) # may vary
 
     Note that 3 = 6 / 2!, and that the constants are given in the order of
     integrations. Constant of the highest-order polynomial term comes first:
@@ -404,7 +404,7 @@
     >>> np.polyder(p, 3)
     poly1d([6])
     >>> np.polyder(p, 4)
-    poly1d([ 0.])
+    poly1d([0.])
 
     """
     m = int(m)
@@ -552,28 +552,29 @@
     >>> y = np.array([0.0, 0.8, 0.9, 0.1, -0.8, -1.0])
     >>> z = np.polyfit(x, y, 3)
     >>> z
-    array([ 0.08703704, -0.81349206,  1.69312169, -0.03968254])
+    array([ 0.08703704, -0.81349206,  1.69312169, -0.03968254]) # may vary
 
     It is convenient to use `poly1d` objects for dealing with polynomials:
 
     >>> p = np.poly1d(z)
     >>> p(0.5)
-    0.6143849206349179
+    0.6143849206349179 # may vary
     >>> p(3.5)
-    -0.34732142857143039
+    -0.34732142857143039 # may vary
     >>> p(10)
-    22.579365079365115
+    22.579365079365115 # may vary
 
     High-order polynomials may oscillate wildly:
 
     >>> p30 = np.poly1d(np.polyfit(x, y, 30))
-    /... RankWarning: Polyfit may be poorly conditioned...
+    ...
+    >>> # RankWarning: Polyfit may be poorly conditioned...
     >>> p30(4)
-    -0.80000000000000204
+    -0.80000000000000204 # may vary
     >>> p30(5)
-    -0.99999999999999445
+    -0.99999999999999445 # may vary
     >>> p30(4.5)
-    -0.10547061179440398
+    -0.10547061179440398 # may vary
 
     Illustration:
 
@@ -703,6 +704,8 @@
     for polynomials of high degree the values may be inaccurate due to
     rounding errors. Use carefully.
 
+    If `x` is a subtype of `ndarray` the return value will be of the same type.
+
     References
     ----------
     .. [1] I. N. Bronshtein, K. A. Semendyayev, and K. A. Hirsch (Eng.
@@ -714,18 +717,18 @@
     >>> np.polyval([3,0,1], 5)  # 3 * 5**2 + 0 * 5**1 + 1
     76
     >>> np.polyval([3,0,1], np.poly1d(5))
-    poly1d([ 76.])
+    poly1d([76.])
     >>> np.polyval(np.poly1d([3,0,1]), 5)
     76
     >>> np.polyval(np.poly1d([3,0,1]), np.poly1d(5))
-    poly1d([ 76.])
+    poly1d([76.])
 
     """
     p = NX.asarray(p)
     if isinstance(x, poly1d):
         y = 0
     else:
-        x = NX.asarray(x)
+        x = NX.asanyarray(x)
         y = NX.zeros_like(x)
     for i in range(len(p)):
         y = y * x + p[i]
@@ -951,7 +954,7 @@
     >>> x = np.array([3.0, 5.0, 2.0])
     >>> y = np.array([2.0, 1.0])
     >>> np.polydiv(x, y)
-    (array([ 1.5 ,  1.75]), array([ 0.25]))
+    (array([1.5 , 1.75]), array([0.25]))
 
     """
     truepoly = (isinstance(u, poly1d) or isinstance(u, poly1d))
@@ -1046,7 +1049,7 @@
     >>> p.r
     array([-1.+1.41421356j, -1.-1.41421356j])
     >>> p(p.r)
-    array([ -4.44089210e-16+0.j,  -4.44089210e-16+0.j])
+    array([ -4.44089210e-16+0.j,  -4.44089210e-16+0.j]) # may vary
 
     These numbers in the previous line represent (0, 0) to machine precision
 
@@ -1073,7 +1076,7 @@
     poly1d([ 1,  4, 10, 12,  9])
 
     >>> (p**3 + 4) / p
-    (poly1d([  1.,   4.,  10.,  12.,   9.]), poly1d([ 4.]))
+    (poly1d([ 1.,  4., 10., 12.,  9.]), poly1d([4.]))
 
     ``asarray(p)`` gives the coefficient array, so polynomials can be
     used in all functions that accept arrays:
@@ -1095,7 +1098,7 @@
     Construct a polynomial from its roots:
 
     >>> np.poly1d([1, 2], True)
-    poly1d([ 1, -3,  2])
+    poly1d([ 1., -3.,  2.])
 
     This is the same polynomial as obtained by:
 
diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index fcc0d9a..5ff35f0 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -57,11 +57,10 @@
     Examples
     --------
     >>> from numpy.lib import recfunctions as rfn
-    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)])
+    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', np.int64), ('B', np.float64)])
     >>> b = np.zeros((3,), dtype=a.dtype)
     >>> rfn.recursive_fill_fields(a, b)
-    array([(1, 10.0), (2, 20.0), (0, 0.0)],
-          dtype=[('A', '<i4'), ('B', '<f8')])
+    array([(1, 10.), (2, 20.), (0,  0.)], dtype=[('A', '<i8'), ('B', '<f8')])
 
     """
     newdtype = output.dtype
@@ -89,11 +88,11 @@
 
     Examples
     --------
-    >>> dt = np.dtype([(('a', 'A'), int), ('b', float, 3)])
+    >>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
     >>> dt.descr
-    [(('a', 'A'), '<i4'), ('b', '<f8', (3,))]
+    [(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
     >>> get_fieldspec(dt)
-    [(('a', 'A'), dtype('int32')), ('b', dtype(('<f8', (3,))))]
+    [(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
 
     """
     if dtype.names is None:
@@ -120,10 +119,15 @@
     Examples
     --------
     >>> from numpy.lib import recfunctions as rfn
-    >>> rfn.get_names(np.empty((1,), dtype=int)) is None
-    True
+    >>> rfn.get_names(np.empty((1,), dtype=int))
+    Traceback (most recent call last):
+        ...
+    AttributeError: 'numpy.ndarray' object has no attribute 'names'
+
     >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]))
-    ('A', 'B')
+    Traceback (most recent call last):
+        ...
+    AttributeError: 'numpy.ndarray' object has no attribute 'names'
     >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
     >>> rfn.get_names(adtype)
     ('a', ('b', ('ba', 'bb')))
@@ -153,9 +157,13 @@
     --------
     >>> from numpy.lib import recfunctions as rfn
     >>> rfn.get_names_flat(np.empty((1,), dtype=int)) is None
-    True
+    Traceback (most recent call last):
+        ...
+    AttributeError: 'numpy.ndarray' object has no attribute 'names'
     >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)]))
-    ('A', 'B')
+    Traceback (most recent call last):
+        ...
+    AttributeError: 'numpy.ndarray' object has no attribute 'names'
     >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
     >>> rfn.get_names_flat(adtype)
     ('a', 'b', 'ba', 'bb')
@@ -403,20 +411,18 @@
     --------
     >>> from numpy.lib import recfunctions as rfn
     >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
-    masked_array(data = [(1, 10.0) (2, 20.0) (--, 30.0)],
-                 mask = [(False, False) (False, False) (True, False)],
-           fill_value = (999999, 1e+20),
-                dtype = [('f0', '<i4'), ('f1', '<f8')])
+    array([( 1, 10.), ( 2, 20.), (-1, 30.)],
+          dtype=[('f0', '<i8'), ('f1', '<f8')])
 
-    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])),
-    ...              usemask=False)
-    array([(1, 10.0), (2, 20.0), (-1, 30.0)],
-          dtype=[('f0', '<i4'), ('f1', '<f8')])
-    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', int)]),
+    >>> rfn.merge_arrays((np.array([1, 2], dtype=np.int64),
+    ...         np.array([10., 20., 30.])), usemask=False)
+     array([(1, 10.0), (2, 20.0), (-1, 30.0)],
+             dtype=[('f0', '<i8'), ('f1', '<f8')])
+    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', np.int64)]),
     ...               np.array([10., 20., 30.])),
     ...              usemask=False, asrecarray=True)
-    rec.array([(1, 10.0), (2, 20.0), (-1, 30.0)],
-              dtype=[('a', '<i4'), ('f1', '<f8')])
+    rec.array([( 1, 10.), ( 2, 20.), (-1, 30.)],
+              dtype=[('a', '<i8'), ('f1', '<f8')])
 
     Notes
     -----
@@ -547,16 +553,14 @@
     --------
     >>> from numpy.lib import recfunctions as rfn
     >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
-    ...   dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
+    ...   dtype=[('a', np.int64), ('b', [('ba', np.double), ('bb', np.int64)])])
     >>> rfn.drop_fields(a, 'a')
-    array([((2.0, 3),), ((5.0, 6),)],
-          dtype=[('b', [('ba', '<f8'), ('bb', '<i4')])])
+    array([((2., 3),), ((5., 6),)],
+          dtype=[('b', [('ba', '<f8'), ('bb', '<i8')])])
     >>> rfn.drop_fields(a, 'ba')
-    array([(1, (3,)), (4, (6,))],
-          dtype=[('a', '<i4'), ('b', [('bb', '<i4')])])
+    array([(1, (3,)), (4, (6,))], dtype=[('a', '<i8'), ('b', [('bb', '<i8')])])
     >>> rfn.drop_fields(a, ['ba', 'bb'])
-    array([(1,), (4,)],
-          dtype=[('a', '<i4')])
+    array([(1,), (4,)], dtype=[('a', '<i8')])
     """
     if _is_string_like(drop_names):
         drop_names = [drop_names]
@@ -648,8 +652,8 @@
     >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
     ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
     >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
-    array([(1, (2.0, [3.0, 30.0])), (4, (5.0, [6.0, 60.0]))],
-          dtype=[('A', '<i4'), ('b', [('ba', '<f8'), ('BB', '<f8', 2)])])
+    array([(1, (2., [ 3., 30.])), (4, (5., [ 6., 60.]))],
+          dtype=[('A', '<i8'), ('b', [('ba', '<f8'), ('BB', '<f8', (2,))])])
 
     """
     def _recursive_rename_fields(ndtype, namemapper):
@@ -834,18 +838,18 @@
     ...     print("offsets:", [d.fields[name][1] for name in d.names])
     ...     print("itemsize:", d.itemsize)
     ...
-    >>> dt = np.dtype('u1,i4,f4', align=True)
+    >>> dt = np.dtype('u1,<i4,<f4', align=True)
     >>> dt
-    dtype({'names':['f0','f1','f2'], 'formats':['u1','<i4','<f8'], 'offsets':[0,4,8], 'itemsize':16}, align=True)
+    dtype({'names':['f0','f1','f2'], 'formats':['u1','<i8','<f8'], 'offsets':[0,8,16], 'itemsize':24}, align=True)
     >>> print_offsets(dt)
-    offsets: [0, 4, 8]
-    itemsize: 16
+    offsets: [0, 8, 16]
+    itemsize: 24
     >>> packed_dt = repack_fields(dt)
     >>> packed_dt
-    dtype([('f0', 'u1'), ('f1', '<i4'), ('f2', '<f8')])
+    dtype([('f0', 'u1'), ('f1', '<i8'), ('f2', '<f8')])
     >>> print_offsets(packed_dt)
-    offsets: [0, 1, 5]
-    itemsize: 13
+    offsets: [0, 1, 9]
+    itemsize: 17
 
     """
     if not isinstance(a, np.dtype):
@@ -1244,15 +1248,16 @@
     True
     >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
     >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
-    ...   dtype=[('A', '|S3'), ('B', float), ('C', float)])
+    ...   dtype=[('A', '|S3'), ('B', np.double), ('C', np.double)])
     >>> test = rfn.stack_arrays((z,zz))
     >>> test
-    masked_array(data = [('A', 1.0, --) ('B', 2.0, --) ('a', 10.0, 100.0) ('b', 20.0, 200.0)
-     ('c', 30.0, 300.0)],
-                 mask = [(False, False, True) (False, False, True) (False, False, False)
-     (False, False, False) (False, False, False)],
-           fill_value = ('N/A', 1e+20, 1e+20),
-                dtype = [('A', '|S3'), ('B', '<f8'), ('C', '<f8')])
+    masked_array(data=[(b'A', 1.0, --), (b'B', 2.0, --), (b'a', 10.0, 100.0),
+                       (b'b', 20.0, 200.0), (b'c', 30.0, 300.0)],
+                 mask=[(False, False,  True), (False, False,  True),
+                       (False, False, False), (False, False, False),
+                       (False, False, False)],
+           fill_value=(b'N/A', 1.e+20, 1.e+20),
+                dtype=[('A', 'S3'), ('B', '<f8'), ('C', '<f8')])
 
     """
     if isinstance(arrays, ndarray):
@@ -1331,7 +1336,10 @@
     >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
     ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
     >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
-    ... # XXX: judging by the output, the ignoremask flag has no effect
+    (masked_array(data=[(1,), (1,), (2,), (2,)],
+                 mask=[(False,), (False,), (False,), (False,)],
+           fill_value=(999999,),
+                dtype=[('a', '<i8')]), array([0, 1, 3, 4]))
     """
     a = np.asanyarray(a).ravel()
     # Get a dictionary of fields
diff --git a/numpy/lib/scimath.py b/numpy/lib/scimath.py
index 9ca0068..5ac790c 100644
--- a/numpy/lib/scimath.py
+++ b/numpy/lib/scimath.py
@@ -59,7 +59,7 @@
     >>> a = np.array([1,2,3],np.short)
 
     >>> ac = np.lib.scimath._tocomplex(a); ac
-    array([ 1.+0.j,  2.+0.j,  3.+0.j], dtype=complex64)
+    array([1.+0.j, 2.+0.j, 3.+0.j], dtype=complex64)
 
     >>> ac.dtype
     dtype('complex64')
@@ -70,7 +70,7 @@
     >>> b = np.array([1,2,3],np.double)
 
     >>> bc = np.lib.scimath._tocomplex(b); bc
-    array([ 1.+0.j,  2.+0.j,  3.+0.j])
+    array([1.+0.j, 2.+0.j, 3.+0.j])
 
     >>> bc.dtype
     dtype('complex128')
@@ -81,13 +81,13 @@
     >>> c = np.array([1,2,3],np.csingle)
 
     >>> cc = np.lib.scimath._tocomplex(c); cc
-    array([ 1.+0.j,  2.+0.j,  3.+0.j], dtype=complex64)
+    array([1.+0.j,  2.+0.j,  3.+0.j], dtype=complex64)
 
     >>> c *= 2; c
-    array([ 2.+0.j,  4.+0.j,  6.+0.j], dtype=complex64)
+    array([2.+0.j,  4.+0.j,  6.+0.j], dtype=complex64)
 
     >>> cc
-    array([ 1.+0.j,  2.+0.j,  3.+0.j], dtype=complex64)
+    array([1.+0.j,  2.+0.j,  3.+0.j], dtype=complex64)
     """
     if issubclass(arr.dtype.type, (nt.single, nt.byte, nt.short, nt.ubyte,
                                    nt.ushort, nt.csingle)):
@@ -170,7 +170,7 @@
     array([0, 1])
 
     >>> np.lib.scimath._fix_real_abs_gt_1([0,2])
-    array([ 0.+0.j,  2.+0.j])
+    array([0.+0.j, 2.+0.j])
     """
     x = asarray(x)
     if any(isreal(x) & (abs(x) > 1)):
@@ -212,14 +212,14 @@
     >>> np.lib.scimath.sqrt(1)
     1.0
     >>> np.lib.scimath.sqrt([1, 4])
-    array([ 1.,  2.])
+    array([1.,  2.])
 
     But it automatically handles negative inputs:
 
     >>> np.lib.scimath.sqrt(-1)
-    (0.0+1.0j)
+    1j
     >>> np.lib.scimath.sqrt([-1,4])
-    array([ 0.+1.j,  2.+0.j])
+    array([0.+1.j, 2.+0.j])
 
     """
     x = _fix_real_lt_zero(x)
@@ -317,7 +317,7 @@
     1.0
 
     >>> np.emath.log10([-10**1, -10**2, 10**2])
-    array([ 1.+1.3644j,  2.+1.3644j,  2.+0.j    ])
+    array([1.+1.3644j, 2.+1.3644j, 2.+0.j    ])
 
     """
     x = _fix_real_lt_zero(x)
@@ -354,9 +354,9 @@
     >>> np.set_printoptions(precision=4)
 
     >>> np.lib.scimath.logn(2, [4, 8])
-    array([ 2.,  3.])
+    array([2., 3.])
     >>> np.lib.scimath.logn(2, [-4, -8, 8])
-    array([ 2.+4.5324j,  3.+4.5324j,  3.+0.j    ])
+    array([2.+4.5324j, 3.+4.5324j, 3.+0.j    ])
 
     """
     x = _fix_real_lt_zero(x)
@@ -405,7 +405,7 @@
     >>> np.emath.log2(8)
     3.0
     >>> np.emath.log2([-4, -8, 8])
-    array([ 2.+4.5324j,  3.+4.5324j,  3.+0.j    ])
+    array([2.+4.5324j, 3.+4.5324j, 3.+0.j    ])
 
     """
     x = _fix_real_lt_zero(x)
@@ -451,9 +451,9 @@
     >>> np.lib.scimath.power([2, 4], 2)
     array([ 4, 16])
     >>> np.lib.scimath.power([2, 4], -2)
-    array([ 0.25  ,  0.0625])
+    array([0.25  ,  0.0625])
     >>> np.lib.scimath.power([-2, 4], 2)
-    array([  4.+0.j,  16.+0.j])
+    array([ 4.-0.j, 16.+0.j])
 
     """
     x = _fix_real_lt_zero(x)
@@ -499,7 +499,7 @@
     0.0
 
     >>> np.emath.arccos([1,2])
-    array([ 0.-0.j   ,  0.+1.317j])
+    array([0.-0.j   , 0.-1.317j])
 
     """
     x = _fix_real_abs_gt_1(x)
@@ -545,7 +545,7 @@
     0.0
 
     >>> np.emath.arcsin([0,1])
-    array([ 0.    ,  1.5708])
+    array([0.    , 1.5708])
 
     """
     x = _fix_real_abs_gt_1(x)
@@ -589,11 +589,14 @@
     --------
     >>> np.set_printoptions(precision=4)
 
-    >>> np.emath.arctanh(np.eye(2))
-    array([[ Inf,   0.],
-           [  0.,  Inf]])
+    >>> from numpy.testing import suppress_warnings
+    >>> with suppress_warnings() as sup:
+    ...     sup.filter(RuntimeWarning)
+    ...     np.emath.arctanh(np.eye(2))
+    array([[inf,  0.],
+           [ 0., inf]])
     >>> np.emath.arctanh([1j])
-    array([ 0.+0.7854j])
+    array([0.+0.7854j])
 
     """
     x = _fix_real_abs_gt_1(x)
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index f56c4f4..e088a6c 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -129,7 +129,7 @@
            [40, 50, 60]])
     >>> ai = np.argsort(a, axis=1); ai
     array([[0, 2, 1],
-           [1, 2, 0]], dtype=int64)
+           [1, 2, 0]])
     >>> np.take_along_axis(a, ai, axis=1)
     array([[10, 20, 30],
            [40, 50, 60]])
@@ -142,7 +142,7 @@
     >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
     >>> ai
     array([[1],
-           [0], dtype=int64)
+           [0]])
     >>> np.take_along_axis(a, ai, axis=1)
     array([[30],
            [60]])
@@ -152,10 +152,10 @@
 
     >>> ai_min = np.expand_dims(np.argmin(a, axis=1), axis=1)
     >>> ai_max = np.expand_dims(np.argmax(a, axis=1), axis=1)
-    >>> ai = np.concatenate([ai_min, ai_max], axis=axis)
-    >> ai
+    >>> ai = np.concatenate([ai_min, ai_max], axis=1)
+    >>> ai
     array([[0, 1],
-           [1, 0]], dtype=int64)
+           [1, 0]])
     >>> np.take_along_axis(a, ai, axis=1)
     array([[10, 30],
            [40, 60]])
@@ -243,7 +243,7 @@
     >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
     >>> ai
     array([[1],
-           [0]], dtype=int64)
+           [0]])
     >>> np.put_along_axis(a, ai, 99, axis=1)
     >>> a
     array([[10, 99, 20],
@@ -330,9 +330,9 @@
     ...     return (a[0] + a[-1]) * 0.5
     >>> b = np.array([[1,2,3], [4,5,6], [7,8,9]])
     >>> np.apply_along_axis(my_func, 0, b)
-    array([ 4.,  5.,  6.])
+    array([4., 5., 6.])
     >>> np.apply_along_axis(my_func, 1, b)
-    array([ 2.,  5.,  8.])
+    array([2.,  5.,  8.])
 
     For a function that returns a 1D array, the number of dimensions in
     `outarr` is the same as `arr`.
@@ -732,11 +732,11 @@
     --------
     >>> x = np.arange(8.0)
     >>> np.array_split(x, 3)
-        [array([ 0.,  1.,  2.]), array([ 3.,  4.,  5.]), array([ 6.,  7.])]
+        [array([0.,  1.,  2.]), array([3.,  4.,  5.]), array([6.,  7.])]
 
     >>> x = np.arange(7.0)
     >>> np.array_split(x, 3)
-        [array([ 0.,  1.,  2.]), array([ 3.,  4.]), array([ 5.,  6.])]
+        [array([0.,  1.,  2.]), array([3.,  4.]), array([5.,  6.])]
 
     """
     try:
@@ -828,14 +828,14 @@
     --------
     >>> x = np.arange(9.0)
     >>> np.split(x, 3)
-    [array([ 0.,  1.,  2.]), array([ 3.,  4.,  5.]), array([ 6.,  7.,  8.])]
+    [array([0.,  1.,  2.]), array([3.,  4.,  5.]), array([6.,  7.,  8.])]
 
     >>> x = np.arange(8.0)
     >>> np.split(x, [3, 5, 6, 10])
-    [array([ 0.,  1.,  2.]),
-     array([ 3.,  4.]),
-     array([ 5.]),
-     array([ 6.,  7.]),
+    [array([0.,  1.,  2.]),
+     array([3.,  4.]),
+     array([5.]),
+     array([6.,  7.]),
      array([], dtype=float64)]
 
     """
@@ -872,43 +872,43 @@
     --------
     >>> x = np.arange(16.0).reshape(4, 4)
     >>> x
-    array([[  0.,   1.,   2.,   3.],
-           [  4.,   5.,   6.,   7.],
-           [  8.,   9.,  10.,  11.],
-           [ 12.,  13.,  14.,  15.]])
+    array([[ 0.,   1.,   2.,   3.],
+           [ 4.,   5.,   6.,   7.],
+           [ 8.,   9.,  10.,  11.],
+           [12.,  13.,  14.,  15.]])
     >>> np.hsplit(x, 2)
     [array([[  0.,   1.],
            [  4.,   5.],
            [  8.,   9.],
-           [ 12.,  13.]]),
+           [12.,  13.]]),
      array([[  2.,   3.],
            [  6.,   7.],
-           [ 10.,  11.],
-           [ 14.,  15.]])]
+           [10.,  11.],
+           [14.,  15.]])]
     >>> np.hsplit(x, np.array([3, 6]))
-    [array([[  0.,   1.,   2.],
-           [  4.,   5.,   6.],
-           [  8.,   9.,  10.],
-           [ 12.,  13.,  14.]]),
-     array([[  3.],
-           [  7.],
-           [ 11.],
-           [ 15.]]),
-     array([], dtype=float64)]
+    [array([[ 0.,   1.,   2.],
+           [ 4.,   5.,   6.],
+           [ 8.,   9.,  10.],
+           [12.,  13.,  14.]]),
+     array([[ 3.],
+           [ 7.],
+           [11.],
+           [15.]]),
+     array([], shape=(4, 0), dtype=float64)]
 
     With a higher dimensional array the split is still along the second axis.
 
     >>> x = np.arange(8.0).reshape(2, 2, 2)
     >>> x
-    array([[[ 0.,  1.],
-            [ 2.,  3.]],
-           [[ 4.,  5.],
-            [ 6.,  7.]]])
+    array([[[0.,  1.],
+            [2.,  3.]],
+           [[4.,  5.],
+            [6.,  7.]]])
     >>> np.hsplit(x, 2)
-    [array([[[ 0.,  1.]],
-           [[ 4.,  5.]]]),
-     array([[[ 2.,  3.]],
-           [[ 6.,  7.]]])]
+    [array([[[0.,  1.]],
+           [[4.,  5.]]]),
+     array([[[2.,  3.]],
+           [[6.,  7.]]])]
 
     """
     if _nx.ndim(ary) == 0:
@@ -936,35 +936,31 @@
     --------
     >>> x = np.arange(16.0).reshape(4, 4)
     >>> x
-    array([[  0.,   1.,   2.,   3.],
-           [  4.,   5.,   6.,   7.],
-           [  8.,   9.,  10.,  11.],
-           [ 12.,  13.,  14.,  15.]])
+    array([[ 0.,   1.,   2.,   3.],
+           [ 4.,   5.,   6.,   7.],
+           [ 8.,   9.,  10.,  11.],
+           [12.,  13.,  14.,  15.]])
     >>> np.vsplit(x, 2)
-    [array([[ 0.,  1.,  2.,  3.],
-           [ 4.,  5.,  6.,  7.]]),
-     array([[  8.,   9.,  10.,  11.],
-           [ 12.,  13.,  14.,  15.]])]
+    [array([[0., 1., 2., 3.],
+           [4., 5., 6., 7.]]), array([[ 8.,  9., 10., 11.],
+           [12., 13., 14., 15.]])]
     >>> np.vsplit(x, np.array([3, 6]))
-    [array([[  0.,   1.,   2.,   3.],
-           [  4.,   5.,   6.,   7.],
-           [  8.,   9.,  10.,  11.]]),
-     array([[ 12.,  13.,  14.,  15.]]),
-     array([], dtype=float64)]
+    [array([[ 0.,  1.,  2.,  3.],
+           [ 4.,  5.,  6.,  7.],
+           [ 8.,  9., 10., 11.]]), array([[12., 13., 14., 15.]]), array([], shape=(0, 4), dtype=float64)]
 
     With a higher dimensional array the split is still along the first axis.
 
     >>> x = np.arange(8.0).reshape(2, 2, 2)
     >>> x
-    array([[[ 0.,  1.],
-            [ 2.,  3.]],
-           [[ 4.,  5.],
-            [ 6.,  7.]]])
+    array([[[0.,  1.],
+            [2.,  3.]],
+           [[4.,  5.],
+            [6.,  7.]]])
     >>> np.vsplit(x, 2)
-    [array([[[ 0.,  1.],
-            [ 2.,  3.]]]),
-     array([[[ 4.,  5.],
-            [ 6.,  7.]]])]
+    [array([[[0., 1.],
+            [2., 3.]]]), array([[[4., 5.],
+            [6., 7.]]])]
 
     """
     if _nx.ndim(ary) < 2:
@@ -989,30 +985,28 @@
     --------
     >>> x = np.arange(16.0).reshape(2, 2, 4)
     >>> x
-    array([[[  0.,   1.,   2.,   3.],
-            [  4.,   5.,   6.,   7.]],
-           [[  8.,   9.,  10.,  11.],
-            [ 12.,  13.,  14.,  15.]]])
+    array([[[ 0.,   1.,   2.,   3.],
+            [ 4.,   5.,   6.,   7.]],
+           [[ 8.,   9.,  10.,  11.],
+            [12.,  13.,  14.,  15.]]])
     >>> np.dsplit(x, 2)
-    [array([[[  0.,   1.],
-            [  4.,   5.]],
-           [[  8.,   9.],
-            [ 12.,  13.]]]),
-     array([[[  2.,   3.],
-            [  6.,   7.]],
-           [[ 10.,  11.],
-            [ 14.,  15.]]])]
+    [array([[[ 0.,  1.],
+            [ 4.,  5.]],
+           [[ 8.,  9.],
+            [12., 13.]]]), array([[[ 2.,  3.],
+            [ 6.,  7.]],
+           [[10., 11.],
+            [14., 15.]]])]
     >>> np.dsplit(x, np.array([3, 6]))
-    [array([[[  0.,   1.,   2.],
-            [  4.,   5.,   6.]],
-           [[  8.,   9.,  10.],
-            [ 12.,  13.,  14.]]]),
-     array([[[  3.],
-            [  7.]],
-           [[ 11.],
-            [ 15.]]]),
-     array([], dtype=float64)]
-
+    [array([[[ 0.,   1.,   2.],
+            [ 4.,   5.,   6.]],
+           [[ 8.,   9.,  10.],
+            [12.,  13.,  14.]]]),
+     array([[[ 3.],
+            [ 7.]],
+           [[11.],
+            [15.]]]),
+    array([], shape=(2, 2, 0), dtype=float64)]
     """
     if _nx.ndim(ary) < 3:
         raise ValueError('dsplit only works on arrays of 3 or more dimensions')
@@ -1092,15 +1086,15 @@
     Examples
     --------
     >>> np.kron([1,10,100], [5,6,7])
-    array([  5,   6,   7,  50,  60,  70, 500, 600, 700])
+    array([  5,   6,   7, ..., 500, 600, 700])
     >>> np.kron([5,6,7], [1,10,100])
-    array([  5,  50, 500,   6,  60, 600,   7,  70, 700])
+    array([  5,  50, 500, ...,   7,  70, 700])
 
     >>> np.kron(np.eye(2), np.ones((2,2)))
-    array([[ 1.,  1.,  0.,  0.],
-           [ 1.,  1.,  0.,  0.],
-           [ 0.,  0.,  1.,  1.],
-           [ 0.,  0.,  1.,  1.]])
+    array([[1.,  1.,  0.,  0.],
+           [1.,  1.,  0.,  0.],
+           [0.,  0.,  1.,  1.],
+           [0.,  0.,  1.,  1.]])
 
     >>> a = np.arange(100).reshape((2,5,2,5))
     >>> b = np.arange(24).reshape((2,3,4))
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index a17fc66..93d4b27 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -136,8 +136,8 @@
          np.nan),
         # should fail because attempting
         # to downcast to smaller int type:
-        (np.array([1, 2, 3], dtype=np.int32),
-         np.array([5, 7, 2], dtype=np.int64),
+        (np.array([1, 2, 3], dtype=np.int16),
+         np.array([5, 1<<20, 2], dtype=np.int32),
          None),
         # should fail because attempting to cast
         # two special floating point values
@@ -152,8 +152,8 @@
         # specifically, raise an appropriate
         # Exception when attempting to append or
         # prepend with an incompatible type
-        msg = 'must be compatible'
-        with assert_raises_regex(TypeError, msg):
+        msg = 'cannot convert'
+        with assert_raises_regex(ValueError, msg):
             ediff1d(ary=ary,
                     to_end=append,
                     to_begin=prepend)
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 3d4b0e3..d9a97db 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -4,6 +4,7 @@
 import warnings
 import sys
 import decimal
+import types
 import pytest
 
 import numpy as np
@@ -24,6 +25,7 @@
 
 from numpy.compat import long
 
+PY2 = sys.version_info[0] == 2
 
 def get_mat(n):
     data = np.arange(n)
@@ -353,9 +355,9 @@
         assert_equal(type(np.average(a, weights=w)), subclass)
 
     def test_upcasting(self):
-        types = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
+        typs = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
                  ('f4', 'f4', 'f4'), ('f4', 'f8', 'f8')]
-        for at, wt, rt in types:
+        for at, wt, rt in typs:
             a = np.array([[1,2],[3,4]], dtype=at)
             w = np.array([[1,2],[3,4]], dtype=wt)
             assert_equal(np.average(a, weights=w).dtype, np.dtype(rt))
@@ -1498,6 +1500,49 @@
             f(x)
 
 
+class TestLeaks(object):
+    class A(object):
+        iters = 20
+
+        def bound(self, *args):
+            return 0
+
+        @staticmethod
+        def unbound(*args):
+            return 0
+
+    @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+    @pytest.mark.parametrize('name, incr', [
+            ('bound', A.iters),
+            ('unbound', 0),
+            ])
+    def test_frompyfunc_leaks(self, name, incr):
+        # exposed in gh-11867 as np.vectorized, but the problem stems from
+        # frompyfunc.
+        # class.attribute = np.frompyfunc(<method>) creates a
+        # reference cycle if <method> is a bound class method. It requires a
+        # gc collection cycle to break the cycle (on CPython 3)
+        import gc
+        A_func = getattr(self.A, name)
+        gc.disable()
+        try:
+            refcount = sys.getrefcount(A_func)
+            for i in range(self.A.iters):
+                a = self.A()
+                a.f = np.frompyfunc(getattr(a, name), 1, 1)
+                out = a.f(np.arange(10))
+            a = None
+            if PY2:
+                assert_equal(sys.getrefcount(A_func), refcount)
+            else:
+                # A.func is part of a reference cycle if incr is non-zero
+                assert_equal(sys.getrefcount(A_func), refcount + incr)
+            for i in range(5):
+                gc.collect()
+            assert_equal(sys.getrefcount(A_func), refcount)
+        finally:
+            gc.enable()
+
 class TestDigitize(object):
 
     def test_forward(self):
@@ -2391,11 +2436,8 @@
         assert_equal(np.percentile(x, 100), 3.5)
         assert_equal(np.percentile(x, 50), 1.75)
         x[1] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(x, 0), np.nan)
-            assert_equal(np.percentile(x, 0, interpolation='nearest'), np.nan)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.percentile(x, 0), np.nan)
+        assert_equal(np.percentile(x, 0, interpolation='nearest'), np.nan)
 
     def test_api(self):
         d = np.ones(5)
@@ -2733,85 +2775,63 @@
     def test_nan_behavior(self):
         a = np.arange(24, dtype=float)
         a[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3), np.nan)
-            assert_equal(np.percentile(a, 0.3, axis=0), np.nan)
-            assert_equal(np.percentile(a, [0.3, 0.6], axis=0),
-                         np.array([np.nan] * 2))
-            assert_(w[0].category is RuntimeWarning)
-            assert_(w[1].category is RuntimeWarning)
-            assert_(w[2].category is RuntimeWarning)
+        assert_equal(np.percentile(a, 0.3), np.nan)
+        assert_equal(np.percentile(a, 0.3, axis=0), np.nan)
+        assert_equal(np.percentile(a, [0.3, 0.6], axis=0),
+                     np.array([np.nan] * 2))
 
         a = np.arange(24, dtype=float).reshape(2, 3, 4)
         a[1, 2, 3] = np.nan
         a[1, 1, 2] = np.nan
 
         # no axis
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3), np.nan)
-            assert_equal(np.percentile(a, 0.3).ndim, 0)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.percentile(a, 0.3), np.nan)
+        assert_equal(np.percentile(a, 0.3).ndim, 0)
 
         # axis0 zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, 0)
         b[2, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3, 0), b)
+        assert_equal(np.percentile(a, 0.3, 0), b)
 
         # axis0 not zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
                           [0.3, 0.6], 0)
         b[:, 2, 3] = np.nan
         b[:, 1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, [0.3, 0.6], 0), b)
+        assert_equal(np.percentile(a, [0.3, 0.6], 0), b)
 
         # axis1 zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, 1)
         b[1, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3, 1), b)
+        assert_equal(np.percentile(a, 0.3, 1), b)
         # axis1 not zerod
         b = np.percentile(
             np.arange(24, dtype=float).reshape(2, 3, 4), [0.3, 0.6], 1)
         b[:, 1, 3] = np.nan
         b[:, 1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, [0.3, 0.6], 1), b)
+        assert_equal(np.percentile(a, [0.3, 0.6], 1), b)
 
         # axis02 zerod
         b = np.percentile(
             np.arange(24, dtype=float).reshape(2, 3, 4), 0.3, (0, 2))
         b[1] = np.nan
         b[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, 0.3, (0, 2)), b)
+        assert_equal(np.percentile(a, 0.3, (0, 2)), b)
         # axis02 not zerod
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
                           [0.3, 0.6], (0, 2))
         b[:, 1] = np.nan
         b[:, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(a, [0.3, 0.6], (0, 2)), b)
+        assert_equal(np.percentile(a, [0.3, 0.6], (0, 2)), b)
         # axis02 not zerod with nearest interpolation
         b = np.percentile(np.arange(24, dtype=float).reshape(2, 3, 4),
                           [0.3, 0.6], (0, 2), interpolation='nearest')
         b[:, 1] = np.nan
         b[:, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.percentile(
-                a, [0.3, 0.6], (0, 2), interpolation='nearest'), b)
+        assert_equal(np.percentile(
+            a, [0.3, 0.6], (0, 2), interpolation='nearest'), b)
 
 
 class TestQuantile(object):
@@ -2858,10 +2878,7 @@
         # check array scalar result
         assert_equal(np.median(a).ndim, 0)
         a[1] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a).ndim, 0)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.median(a).ndim, 0)
 
     def test_axis_keyword(self):
         a3 = np.array([[2, 3],
@@ -2960,58 +2977,43 @@
     def test_nan_behavior(self):
         a = np.arange(24, dtype=float)
         a[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a), np.nan)
-            assert_equal(np.median(a, axis=0), np.nan)
-            assert_(w[0].category is RuntimeWarning)
-            assert_(w[1].category is RuntimeWarning)
+        assert_equal(np.median(a), np.nan)
+        assert_equal(np.median(a, axis=0), np.nan)
 
         a = np.arange(24, dtype=float).reshape(2, 3, 4)
         a[1, 2, 3] = np.nan
         a[1, 1, 2] = np.nan
 
         # no axis
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a), np.nan)
-            assert_equal(np.median(a).ndim, 0)
-            assert_(w[0].category is RuntimeWarning)
+        assert_equal(np.median(a), np.nan)
+        assert_equal(np.median(a).ndim, 0)
 
         # axis0
         b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), 0)
         b[2, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a, 0), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.median(a, 0), b)
 
         # axis1
         b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), 1)
         b[1, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a, 1), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.median(a, 1), b)
 
         # axis02
         b = np.median(np.arange(24, dtype=float).reshape(2, 3, 4), (0, 2))
         b[1] = np.nan
         b[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.median(a, (0, 2)), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.median(a, (0, 2)), b)
 
     def test_empty(self):
-        # empty arrays
+        # mean(empty array) emits two warnings: empty slice and divide by 0
         a = np.array([], dtype=float)
         with warnings.catch_warnings(record=True) as w:
             warnings.filterwarnings('always', '', RuntimeWarning)
             assert_equal(np.median(a), np.nan)
             assert_(w[0].category is RuntimeWarning)
+            assert_equal(len(w), 2)
 
         # multiple dimensions
         a = np.array([], dtype=float, ndmin=3)
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 504372f..b7261c6 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -1,8 +1,10 @@
 from __future__ import division, absolute_import, print_function
 
 import warnings
+import pytest
 
 import numpy as np
+from numpy.lib.nanfunctions import _nan_mask
 from numpy.testing import (
     assert_, assert_equal, assert_almost_equal, assert_no_warnings,
     assert_raises, assert_array_equal, suppress_warnings
@@ -925,3 +927,29 @@
         p = p.tolist()
         np.nanquantile(np.arange(100.), p, interpolation="midpoint")
         assert_array_equal(p, p0)
+
+@pytest.mark.parametrize("arr, expected", [
+    # array of floats with some nans
+    (np.array([np.nan, 5.0, np.nan, np.inf]),
+     np.array([False, True, False, True])),
+    # int64 array that can't possibly have nans
+    (np.array([1, 5, 7, 9], dtype=np.int64),
+     True),
+    # bool array that can't possibly have nans
+    (np.array([False, True, False, True]),
+     True),
+    # 2-D complex array with nans
+    (np.array([[np.nan, 5.0],
+               [np.nan, np.inf]], dtype=np.complex64),
+     np.array([[False, True],
+               [False, True]])),
+    ])
+def test__nan_mask(arr, expected):
+    for out in [None, np.empty(arr.shape, dtype=np.bool_)]:
+        actual = _nan_mask(arr, out=out)
+        assert_equal(actual, expected)
+        # the above won't distinguish between True proper
+        # and an array of True values; we want True proper
+        # for types that can't possibly contain NaN
+        if type(expected) is not np.ndarray:
+            assert actual is True
diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
index 11f8a5a..0696936 100644
--- a/numpy/lib/tests/test_recfunctions.py
+++ b/numpy/lib/tests/test_recfunctions.py
@@ -221,9 +221,9 @@
                          ( 5, ( 6.,  7), [ 8.,  9.]),
                          (10, (11., 12), [13., 14.]),
                          (15, (16., 17), [18., 19.])],
-                     dtype=[('a', '<i4'),
-                            ('b', [('f0', '<f4'), ('f1', '<u2')]),
-                            ('c', '<f4', (2,))])
+                     dtype=[('a', 'i4'),
+                            ('b', [('f0', 'f4'), ('f1', 'u2')]),
+                            ('c', 'f4', (2,))])
         assert_equal(out, want)
 
         d = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 27d8486..e165c9b 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -77,13 +77,13 @@
     --------
     >>> A = np.diag([1.,2.,3.])
     >>> A
-    array([[ 1.,  0.,  0.],
-           [ 0.,  2.,  0.],
-           [ 0.,  0.,  3.]])
+    array([[1.,  0.,  0.],
+           [0.,  2.,  0.],
+           [0.,  0.,  3.]])
     >>> np.fliplr(A)
-    array([[ 0.,  0.,  1.],
-           [ 0.,  2.,  0.],
-           [ 3.,  0.,  0.]])
+    array([[0.,  0.,  1.],
+           [0.,  2.,  0.],
+           [3.,  0.,  0.]])
 
     >>> A = np.random.randn(2,3,5)
     >>> np.all(np.fliplr(A) == A[:,::-1,...])
@@ -129,13 +129,13 @@
     --------
     >>> A = np.diag([1.0, 2, 3])
     >>> A
-    array([[ 1.,  0.,  0.],
-           [ 0.,  2.,  0.],
-           [ 0.,  0.,  3.]])
+    array([[1.,  0.,  0.],
+           [0.,  2.,  0.],
+           [0.,  0.,  3.]])
     >>> np.flipud(A)
-    array([[ 0.,  0.,  3.],
-           [ 0.,  2.,  0.],
-           [ 1.,  0.,  0.]])
+    array([[0.,  0.,  3.],
+           [0.,  2.,  0.],
+           [1.,  0.,  0.]])
 
     >>> A = np.random.randn(2,3,5)
     >>> np.all(np.flipud(A) == A[::-1,...])
@@ -191,9 +191,9 @@
     array([[1, 0],
            [0, 1]])
     >>> np.eye(3, k=1)
-    array([[ 0.,  1.,  0.],
-           [ 0.,  0.,  1.],
-           [ 0.,  0.,  0.]])
+    array([[0.,  1.,  0.],
+           [0.,  0.,  1.],
+           [0.,  0.,  0.]])
 
     """
     if M is None:
@@ -378,9 +378,9 @@
            [1, 1, 1, 1, 1]])
 
     >>> np.tri(3, 5, -1)
-    array([[ 0.,  0.,  0.,  0.,  0.],
-           [ 1.,  0.,  0.,  0.,  0.],
-           [ 1.,  1.,  0.,  0.,  0.]])
+    array([[0.,  0.,  0.,  0.,  0.],
+           [1.,  0.,  0.,  0.,  0.],
+           [1.,  1.,  0.,  0.,  0.]])
 
     """
     if M is None:
@@ -540,7 +540,7 @@
     of the differences between the values of the input vector:
 
     >>> np.linalg.det(np.vander(x))
-    48.000000000000043
+    48.000000000000043 # may vary
     >>> (5-3)*(5-2)*(5-1)*(3-2)*(3-1)*(2-1)
     48
 
@@ -644,7 +644,7 @@
 
     Examples
     --------
-    >>> import matplotlib as mpl
+    >>> from matplotlib.image import NonUniformImage
     >>> import matplotlib.pyplot as plt
 
     Construct a 2-D histogram with variable bin width. First define the bin
@@ -666,6 +666,7 @@
     >>> ax = fig.add_subplot(131, title='imshow: square bins')
     >>> plt.imshow(H, interpolation='nearest', origin='low',
     ...         extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
+    <matplotlib.image.AxesImage object at 0x...>
 
     :func:`pcolormesh <matplotlib.pyplot.pcolormesh>` can display actual edges:
 
@@ -673,13 +674,14 @@
     ...         aspect='equal')
     >>> X, Y = np.meshgrid(xedges, yedges)
     >>> ax.pcolormesh(X, Y, H)
+    <matplotlib.collections.QuadMesh object at 0x...>
 
     :class:`NonUniformImage <matplotlib.image.NonUniformImage>` can be used to
     display actual bin edges with interpolation:
 
     >>> ax = fig.add_subplot(133, title='NonUniformImage: interpolated',
     ...         aspect='equal', xlim=xedges[[0, -1]], ylim=yedges[[0, -1]])
-    >>> im = mpl.image.NonUniformImage(ax, interpolation='bilinear')
+    >>> im = NonUniformImage(ax, interpolation='bilinear')
     >>> xcenters = (xedges[:-1] + xedges[1:]) / 2
     >>> ycenters = (yedges[:-1] + yedges[1:]) / 2
     >>> im.set_data(xcenters, ycenters, H)
@@ -829,7 +831,7 @@
     Both for indexing:
 
     >>> a[il1]
-    array([ 0,  4,  5,  8,  9, 10, 12, 13, 14, 15])
+    array([ 0,  4,  5, ..., 13, 14, 15])
 
     And for assigning values:
 
@@ -944,7 +946,7 @@
     Both for indexing:
 
     >>> a[iu1]
-    array([ 0,  1,  2,  3,  5,  6,  7, 10, 11, 15])
+    array([ 0,  1,  2, ..., 10, 11, 15])
 
     And for assigning values:
 
diff --git a/numpy/lib/type_check.py b/numpy/lib/type_check.py
index 90b1e9a..f555177 100644
--- a/numpy/lib/type_check.py
+++ b/numpy/lib/type_check.py
@@ -105,11 +105,11 @@
     Examples
     --------
     >>> np.asfarray([2, 3])
-    array([ 2.,  3.])
+    array([2.,  3.])
     >>> np.asfarray([2, 3], dtype='float')
-    array([ 2.,  3.])
+    array([2.,  3.])
     >>> np.asfarray([2, 3], dtype='int8')
-    array([ 2.,  3.])
+    array([2.,  3.])
 
     """
     if not _nx.issubdtype(dtype, _nx.inexact):
@@ -146,13 +146,13 @@
     --------
     >>> a = np.array([1+2j, 3+4j, 5+6j])
     >>> a.real
-    array([ 1.,  3.,  5.])
+    array([1.,  3.,  5.])
     >>> a.real = 9
     >>> a
-    array([ 9.+2.j,  9.+4.j,  9.+6.j])
+    array([9.+2.j,  9.+4.j,  9.+6.j])
     >>> a.real = np.array([9, 8, 7])
     >>> a
-    array([ 9.+2.j,  8.+4.j,  7.+6.j])
+    array([9.+2.j,  8.+4.j,  7.+6.j])
     >>> np.real(1 + 1j)
     1.0
 
@@ -192,10 +192,10 @@
     --------
     >>> a = np.array([1+2j, 3+4j, 5+6j])
     >>> a.imag
-    array([ 2.,  4.,  6.])
+    array([2.,  4.,  6.])
     >>> a.imag = np.array([8, 10, 12])
     >>> a
-    array([ 1. +8.j,  3.+10.j,  5.+12.j])
+    array([1. +8.j,  3.+10.j,  5.+12.j])
     >>> np.imag(1 + 1j)
     1.0
 
@@ -422,11 +422,13 @@
     0.0
     >>> x = np.array([np.inf, -np.inf, np.nan, -128, 128])
     >>> np.nan_to_num(x)
-    array([  1.79769313e+308,  -1.79769313e+308,   0.00000000e+000,
-            -1.28000000e+002,   1.28000000e+002])
+    array([ 1.79769313e+308, -1.79769313e+308,  0.00000000e+000, # may vary
+           -1.28000000e+002,  1.28000000e+002])
     >>> y = np.array([complex(np.inf, np.nan), np.nan, complex(np.nan, np.inf)])
+    array([  1.79769313e+308,  -1.79769313e+308,   0.00000000e+000, # may vary
+         -1.28000000e+002,   1.28000000e+002])
     >>> np.nan_to_num(y)
-    array([  1.79769313e+308 +0.00000000e+000j,
+    array([  1.79769313e+308 +0.00000000e+000j, # may vary
              0.00000000e+000 +0.00000000e+000j,
              0.00000000e+000 +1.79769313e+308j])
     """
@@ -490,12 +492,12 @@
     Examples
     --------
     >>> np.finfo(float).eps
-    2.2204460492503131e-16
+    2.2204460492503131e-16 # may vary
 
     >>> np.real_if_close([2.1 + 4e-14j], tol=1000)
-    array([ 2.1])
+    array([2.1])
     >>> np.real_if_close([2.1 + 4e-13j], tol=1000)
-    array([ 2.1 +4.00000000e-13j])
+    array([2.1+4.e-13j])
 
     """
     a = asanyarray(a)
@@ -538,7 +540,6 @@
     --------
     >>> np.asscalar(np.array([24]))
     24
-
     """
 
     # 2018-10-10, 1.16
@@ -672,11 +673,11 @@
     Examples
     --------
     >>> np.common_type(np.arange(2, dtype=np.float32))
-    <type 'numpy.float32'>
+    <class 'numpy.float32'>
     >>> np.common_type(np.arange(2, dtype=np.float32), np.arange(2))
-    <type 'numpy.float64'>
+    <class 'numpy.float64'>
     >>> np.common_type(np.arange(4), np.array([45, 6.j]), np.array([45.0]))
-    <type 'numpy.complex128'>
+    <class 'numpy.complex128'>
 
     """
     is_complex = False
diff --git a/numpy/lib/ufunclike.py b/numpy/lib/ufunclike.py
index 9a9e6f9..5c411e8 100644
--- a/numpy/lib/ufunclike.py
+++ b/numpy/lib/ufunclike.py
@@ -154,11 +154,11 @@
     Examples
     --------
     >>> np.isposinf(np.PINF)
-    array(True, dtype=bool)
+    True
     >>> np.isposinf(np.inf)
-    array(True, dtype=bool)
+    True
     >>> np.isposinf(np.NINF)
-    array(False, dtype=bool)
+    False
     >>> np.isposinf([-np.inf, 0., np.inf])
     array([False, False,  True])
 
@@ -224,11 +224,11 @@
     Examples
     --------
     >>> np.isneginf(np.NINF)
-    array(True, dtype=bool)
+    True
     >>> np.isneginf(np.inf)
-    array(False, dtype=bool)
+    False
     >>> np.isneginf(np.PINF)
-    array(False, dtype=bool)
+    False
     >>> np.isneginf([-np.inf, 0., np.inf])
     array([ True, False, False])
 
diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py
index 84edf40..6b112f3 100644
--- a/numpy/lib/utils.py
+++ b/numpy/lib/utils.py
@@ -150,10 +150,8 @@
     Warning:
 
     >>> olduint = np.deprecate(np.uint)
+    DeprecationWarning: `uint64` is deprecated! # may vary
     >>> olduint(6)
-    /usr/lib/python2.5/site-packages/numpy/lib/utils.py:114:
-    DeprecationWarning: uint32 is deprecated
-      warnings.warn(str1, DeprecationWarning, stacklevel=2)
     6
 
     """
@@ -201,8 +199,8 @@
     >>> low, high = np.byte_bounds(I)
     >>> high - low == I.size*I.itemsize
     True
-    >>> I = np.eye(2, dtype='G'); I.dtype
-    dtype('complex192')
+    >>> I = np.eye(2); I.dtype
+    dtype('float64')
     >>> low, high = np.byte_bounds(I)
     >>> high - low == I.size*I.itemsize
     True
@@ -263,17 +261,17 @@
     >>> np.who()
     Name            Shape            Bytes            Type
     ===========================================================
-    a               10               40               int32
+    a               10               80               int64
     b               20               160              float64
-    Upper bound on total bytes  =       200
+    Upper bound on total bytes  =       240
 
     >>> d = {'x': np.arange(2.0), 'y': np.arange(3.0), 'txt': 'Some str',
     ... 'idx':5}
     >>> np.who(d)
     Name            Shape            Bytes            Type
     ===========================================================
-    y               3                24               float64
     x               2                16               float64
+    y               3                24               float64
     Upper bound on total bytes  =       40
 
     """
@@ -733,7 +731,7 @@
 
     Examples
     --------
-    >>> np.lookfor('binary representation')
+    >>> np.lookfor('binary representation') # doctest: +SKIP
     Search results for 'binary representation'
     ------------------------------------------
     numpy.binary_repr
@@ -1104,7 +1102,7 @@
     >>> np.safe_eval('open("/home/user/.ssh/id_dsa").read()')
     Traceback (most recent call last):
       ...
-    SyntaxError: Unsupported source construct: compiler.ast.CallFunc
+    ValueError: malformed node or string: <_ast.Call object at 0x...>
 
     """
     # Local import to speed up numpy's import time.
@@ -1142,17 +1140,12 @@
         n = n.filled(False)
     if result.ndim == 0:
         if n == True:
-            warnings.warn("Invalid value encountered in median",
-                          RuntimeWarning, stacklevel=3)
             if out is not None:
                 out[...] = data.dtype.type(np.nan)
                 result = out
             else:
                 result = data.dtype.type(np.nan)
     elif np.count_nonzero(n.ravel()) > 0:
-        warnings.warn("Invalid value encountered in median for" +
-                      " %d results" % np.count_nonzero(n.ravel()),
-                      RuntimeWarning, stacklevel=3)
         result[n] = np.nan
     return result
 
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 8363d73..17e84be 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -26,7 +26,7 @@
     add, multiply, sqrt, fastCopyAndTranspose, sum, isfinite,
     finfo, errstate, geterrobj, moveaxis, amin, amax, product, abs,
     atleast_2d, intp, asanyarray, object_, matmul,
-    swapaxes, divide, count_nonzero, isnan
+    swapaxes, divide, count_nonzero, isnan, sign
 )
 from numpy.core.multiarray import normalize_axis_index
 from numpy.core.overrides import set_module
@@ -377,7 +377,7 @@
     >>> b = np.array([9,8])
     >>> x = np.linalg.solve(a, b)
     >>> x
-    array([ 2.,  3.])
+    array([2.,  3.])
 
     Check that the solution is correct:
 
@@ -535,10 +535,10 @@
 
     >>> a = np.array([[[1., 2.], [3., 4.]], [[1, 3], [3, 5]]])
     >>> inv(a)
-    array([[[-2. ,  1. ],
-            [ 1.5, -0.5]],
-           [[-5. ,  2. ],
-            [ 3. , -1. ]]])
+    array([[[-2.  ,  1.  ],
+            [ 1.5 , -0.5 ]],
+           [[-1.25,  0.75],
+            [ 0.75, -0.25]]])
 
     """
     a, wrap = _makearray(a)
@@ -730,21 +730,21 @@
     --------
     >>> A = np.array([[1,-2j],[2j,5]])
     >>> A
-    array([[ 1.+0.j,  0.-2.j],
+    array([[ 1.+0.j, -0.-2.j],
            [ 0.+2.j,  5.+0.j]])
     >>> L = np.linalg.cholesky(A)
     >>> L
-    array([[ 1.+0.j,  0.+0.j],
-           [ 0.+2.j,  1.+0.j]])
+    array([[1.+0.j, 0.+0.j],
+           [0.+2.j, 1.+0.j]])
     >>> np.dot(L, L.T.conj()) # verify that L * L.H = A
-    array([[ 1.+0.j,  0.-2.j],
-           [ 0.+2.j,  5.+0.j]])
+    array([[1.+0.j, 0.-2.j],
+           [0.+2.j, 5.+0.j]])
     >>> A = [[1,-2j],[2j,5]] # what happens if A is only array_like?
     >>> np.linalg.cholesky(A) # an ndarray object is returned
-    array([[ 1.+0.j,  0.+0.j],
-           [ 0.+2.j,  1.+0.j]])
+    array([[1.+0.j, 0.+0.j],
+           [0.+2.j, 1.+0.j]])
     >>> # But a matrix object is returned if A is a matrix object
-    >>> LA.cholesky(np.matrix(A))
+    >>> np.linalg.cholesky(np.matrix(A))
     matrix([[ 1.+0.j,  0.+0.j],
             [ 0.+2.j,  1.+0.j]])
 
@@ -878,9 +878,9 @@
            [1, 1],
            [2, 1]])
     >>> b = np.array([1, 0, 2, 1])
-    >>> q, r = LA.qr(A)
+    >>> q, r = np.linalg.qr(A)
     >>> p = np.dot(q.T, b)
-    >>> np.dot(LA.inv(r), p)
+    >>> np.dot(np.linalg.inv(r), p)
     array([  1.1e-16,   1.0e+00])
 
     """
@@ -1049,7 +1049,7 @@
     >>> A = np.dot(Q, D)
     >>> A = np.dot(A, Q.T)
     >>> LA.eigvals(A)
-    array([ 1., -1.])
+    array([ 1., -1.]) # random
 
     """
     a, wrap = _makearray(a)
@@ -1131,24 +1131,24 @@
     >>> from numpy import linalg as LA
     >>> a = np.array([[1, -2j], [2j, 5]])
     >>> LA.eigvalsh(a)
-    array([ 0.17157288,  5.82842712])
+    array([ 0.17157288,  5.82842712]) # may vary
 
     >>> # demonstrate the treatment of the imaginary part of the diagonal
     >>> a = np.array([[5+2j, 9-2j], [0+2j, 2-1j]])
     >>> a
-    array([[ 5.+2.j,  9.-2.j],
-           [ 0.+2.j,  2.-1.j]])
+    array([[5.+2.j, 9.-2.j],
+           [0.+2.j, 2.-1.j]])
     >>> # with UPLO='L' this is numerically equivalent to using LA.eigvals()
     >>> # with:
     >>> b = np.array([[5.+0.j, 0.-2.j], [0.+2.j, 2.-0.j]])
     >>> b
-    array([[ 5.+0.j,  0.-2.j],
-           [ 0.+2.j,  2.+0.j]])
+    array([[5.+0.j, 0.-2.j],
+           [0.+2.j, 2.+0.j]])
     >>> wa = LA.eigvalsh(a)
     >>> wb = LA.eigvals(b)
     >>> wa; wb
-    array([ 1.,  6.])
-    array([ 6.+0.j,  1.+0.j])
+    array([1., 6.])
+    array([6.+0.j, 1.+0.j])
 
     """
     UPLO = UPLO.upper()
@@ -1264,19 +1264,19 @@
 
     >>> w, v = LA.eig(np.diag((1, 2, 3)))
     >>> w; v
-    array([ 1.,  2.,  3.])
-    array([[ 1.,  0.,  0.],
-           [ 0.,  1.,  0.],
-           [ 0.,  0.,  1.]])
+    array([1., 2., 3.])
+    array([[1., 0., 0.],
+           [0., 1., 0.],
+           [0., 0., 1.]])
 
     Real matrix possessing complex e-values and e-vectors; note that the
     e-values are complex conjugates of each other.
 
     >>> w, v = LA.eig(np.array([[1, -1], [1, 1]]))
     >>> w; v
-    array([ 1. + 1.j,  1. - 1.j])
-    array([[ 0.70710678+0.j        ,  0.70710678+0.j        ],
-           [ 0.00000000-0.70710678j,  0.00000000+0.70710678j]])
+    array([1.+1.j, 1.-1.j])
+    array([[0.70710678+0.j        , 0.70710678-0.j        ],
+           [0.        -0.70710678j, 0.        +0.70710678j]])
 
     Complex-valued matrix with real e-values (but complex-valued e-vectors);
     note that a.conj().T = a, i.e., a is Hermitian.
@@ -1284,9 +1284,9 @@
     >>> a = np.array([[1, 1j], [-1j, 1]])
     >>> w, v = LA.eig(a)
     >>> w; v
-    array([  2.00000000e+00+0.j,   5.98651912e-36+0.j]) # i.e., {2, 0}
-    array([[ 0.00000000+0.70710678j,  0.70710678+0.j        ],
-           [ 0.70710678+0.j        ,  0.00000000+0.70710678j]])
+    array([2.+0.j, 0.+0.j])
+    array([[ 0.        +0.70710678j,  0.70710678+0.j        ], # may vary
+           [ 0.70710678+0.j        , -0.        +0.70710678j]])
 
     Be careful about round-off error!
 
@@ -1294,9 +1294,9 @@
     >>> # Theor. e-values are 1 +/- 1e-9
     >>> w, v = LA.eig(a)
     >>> w; v
-    array([ 1.,  1.])
-    array([[ 1.,  0.],
-           [ 0.,  1.]])
+    array([1., 1.])
+    array([[1., 0.],
+           [0., 1.]])
 
     """
     a, wrap = _makearray(a)
@@ -1392,49 +1392,49 @@
     >>> from numpy import linalg as LA
     >>> a = np.array([[1, -2j], [2j, 5]])
     >>> a
-    array([[ 1.+0.j,  0.-2.j],
+    array([[ 1.+0.j, -0.-2.j],
            [ 0.+2.j,  5.+0.j]])
     >>> w, v = LA.eigh(a)
     >>> w; v
-    array([ 0.17157288,  5.82842712])
-    array([[-0.92387953+0.j        , -0.38268343+0.j        ],
-           [ 0.00000000+0.38268343j,  0.00000000-0.92387953j]])
+    array([0.17157288, 5.82842712])
+    array([[-0.92387953+0.j        , -0.38268343+0.j        ], # may vary
+           [ 0.        +0.38268343j,  0.        -0.92387953j]])
 
     >>> np.dot(a, v[:, 0]) - w[0] * v[:, 0] # verify 1st e-val/vec pair
-    array([2.77555756e-17 + 0.j, 0. + 1.38777878e-16j])
+    array([5.55111512e-17+0.0000000e+00j, 0.00000000e+00+1.2490009e-16j])
     >>> np.dot(a, v[:, 1]) - w[1] * v[:, 1] # verify 2nd e-val/vec pair
-    array([ 0.+0.j,  0.+0.j])
+    array([0.+0.j, 0.+0.j])
 
     >>> A = np.matrix(a) # what happens if input is a matrix object
     >>> A
-    matrix([[ 1.+0.j,  0.-2.j],
+    matrix([[ 1.+0.j, -0.-2.j],
             [ 0.+2.j,  5.+0.j]])
     >>> w, v = LA.eigh(A)
     >>> w; v
-    array([ 0.17157288,  5.82842712])
-    matrix([[-0.92387953+0.j        , -0.38268343+0.j        ],
-            [ 0.00000000+0.38268343j,  0.00000000-0.92387953j]])
+    array([0.17157288, 5.82842712])
+    matrix([[-0.92387953+0.j        , -0.38268343+0.j        ], # may vary
+            [ 0.        +0.38268343j,  0.        -0.92387953j]])
 
     >>> # demonstrate the treatment of the imaginary part of the diagonal
     >>> a = np.array([[5+2j, 9-2j], [0+2j, 2-1j]])
     >>> a
-    array([[ 5.+2.j,  9.-2.j],
-           [ 0.+2.j,  2.-1.j]])
+    array([[5.+2.j, 9.-2.j],
+           [0.+2.j, 2.-1.j]])
     >>> # with UPLO='L' this is numerically equivalent to using LA.eig() with:
     >>> b = np.array([[5.+0.j, 0.-2.j], [0.+2.j, 2.-0.j]])
     >>> b
-    array([[ 5.+0.j,  0.-2.j],
-           [ 0.+2.j,  2.+0.j]])
+    array([[5.+0.j, 0.-2.j],
+           [0.+2.j, 2.+0.j]])
     >>> wa, va = LA.eigh(a)
     >>> wb, vb = LA.eig(b)
     >>> wa; wb
-    array([ 1.,  6.])
-    array([ 6.+0.j,  1.+0.j])
+    array([1., 6.])
+    array([6.+0.j, 1.+0.j])
     >>> va; vb
-    array([[-0.44721360-0.j        , -0.89442719+0.j        ],
-           [ 0.00000000+0.89442719j,  0.00000000-0.4472136j ]])
-    array([[ 0.89442719+0.j       ,  0.00000000-0.4472136j],
-           [ 0.00000000-0.4472136j,  0.89442719+0.j       ]])
+    array([[-0.4472136 +0.j        , -0.89442719+0.j        ], # may vary
+           [ 0.        +0.89442719j,  0.        -0.4472136j ]])
+    array([[ 0.89442719+0.j       , -0.        +0.4472136j],
+           [-0.        +0.4472136j,  0.89442719+0.j       ]])
     """
     UPLO = UPLO.upper()
     if UPLO not in ('L', 'U'):
@@ -1461,12 +1461,12 @@
 
 # Singular value decomposition
 
-def _svd_dispatcher(a, full_matrices=None, compute_uv=None):
+def _svd_dispatcher(a, full_matrices=None, compute_uv=None, hermitian=None):
     return (a,)
 
 
 @array_function_dispatch(_svd_dispatcher)
-def svd(a, full_matrices=True, compute_uv=True):
+def svd(a, full_matrices=True, compute_uv=True, hermitian=False):
     """
     Singular Value Decomposition.
 
@@ -1504,6 +1504,12 @@
         size as those of the input `a`. The size of the last two dimensions
         depends on the value of `full_matrices`. Only returned when
         `compute_uv` is True.
+    hermitian : bool, optional
+        If True, `a` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+        ..versionadded:: 1.17.0
 
     Raises
     ------
@@ -1590,6 +1596,24 @@
 
     """
     a, wrap = _makearray(a)
+
+    if hermitian:
+        # note: lapack returns eigenvalues in reverse order to our contract.
+        # reversing is cheap by design in numpy, so we do so to be consistent
+        if compute_uv:
+            s, u = eigh(a)
+            s = s[..., ::-1]
+            u = u[..., ::-1]
+            # singular values are unsigned, move the sign into v
+            vt = transpose(u * sign(s)[..., None, :]).conjugate()
+            s = abs(s)
+            return wrap(u), s, wrap(vt)
+        else:
+            s = eigvalsh(a)
+            s = s[..., ::-1]
+            s = abs(s)
+            return s
+
     _assertRankAtLeast2(a)
     t, result_t = _commonType(a)
 
@@ -1705,9 +1729,9 @@
     >>> LA.cond(a, 2)
     1.4142135623730951
     >>> LA.cond(a, -2)
-    0.70710678118654746
+    0.70710678118654746 # may vary
     >>> min(LA.svd(a, compute_uv=0))*min(LA.svd(LA.inv(a), compute_uv=0))
-    0.70710678118654746
+    0.70710678118654746 # may vary
 
     """
     x = asarray(x)  # in case we have a matrix
@@ -1844,10 +1868,7 @@
     M = asarray(M)
     if M.ndim < 2:
         return int(not all(M==0))
-    if hermitian:
-        S = abs(eigvalsh(M))
-    else:
-        S = svd(M, compute_uv=False)
+    S = svd(M, compute_uv=False, hermitian=hermitian)
     if tol is None:
         tol = S.max(axis=-1, keepdims=True) * max(M.shape[-2:]) * finfo(S.dtype).eps
     else:
@@ -1857,12 +1878,12 @@
 
 # Generalized inverse
 
-def _pinv_dispatcher(a, rcond=None):
+def _pinv_dispatcher(a, rcond=None, hermitian=None):
     return (a,)
 
 
 @array_function_dispatch(_pinv_dispatcher)
-def pinv(a, rcond=1e-15):
+def pinv(a, rcond=1e-15, hermitian=False):
     """
     Compute the (Moore-Penrose) pseudo-inverse of a matrix.
 
@@ -1882,6 +1903,12 @@
         Singular values smaller (in modulus) than
         `rcond` * largest_singular_value (again, in modulus)
         are set to zero. Broadcasts against the stack of matrices
+    hermitian : bool, optional
+        If True, `a` is assumed to be Hermitian (symmetric if real-valued),
+        enabling a more efficient method for finding singular values.
+        Defaults to False.
+
+        ..versionadded:: 1.17.0
 
     Returns
     -------
@@ -1935,7 +1962,7 @@
         res = empty(a.shape[:-2] + (n, m), dtype=a.dtype)
         return wrap(res)
     a = a.conjugate()
-    u, s, vt = svd(a, full_matrices=False)
+    u, s, vt = svd(a, full_matrices=False, hermitian=hermitian)
 
     # discard small singular values
     cutoff = rcond[..., newaxis] * amax(s, axis=-1, keepdims=True)
@@ -2002,7 +2029,7 @@
     >>> a = np.array([[1, 2], [3, 4]])
     >>> (sign, logdet) = np.linalg.slogdet(a)
     >>> (sign, logdet)
-    (-1, 0.69314718055994529)
+    (-1, 0.69314718055994529) # may vary
     >>> sign * np.exp(logdet)
     -2.0
 
@@ -2074,7 +2101,7 @@
 
     >>> a = np.array([[1, 2], [3, 4]])
     >>> np.linalg.det(a)
-    -2.0
+    -2.0 # may vary
 
     Computing determinants for a stack of matrices:
 
@@ -2181,15 +2208,15 @@
            [ 3.,  1.]])
 
     >>> m, c = np.linalg.lstsq(A, y, rcond=None)[0]
-    >>> print(m, c)
-    1.0 -0.95
+    >>> m, c
+    (1.0 -0.95) # may vary
 
     Plot the data along with the fitted line:
 
     >>> import matplotlib.pyplot as plt
-    >>> plt.plot(x, y, 'o', label='Original data', markersize=10)
-    >>> plt.plot(x, m*x + c, 'r', label='Fitted line')
-    >>> plt.legend()
+    >>> _ = plt.plot(x, y, 'o', label='Original data', markersize=10)
+    >>> _ = plt.plot(x, m*x + c, 'r', label='Fitted line')
+    >>> _ = plt.legend()
     >>> plt.show()
 
     """
@@ -2367,7 +2394,7 @@
     >>> from numpy import linalg as LA
     >>> a = np.arange(9) - 4
     >>> a
-    array([-4, -3, -2, -1,  0,  1,  2,  3,  4])
+    array([-4, -3, -2, ...,  2,  3,  4])
     >>> b = a.reshape((3, 3))
     >>> b
     array([[-4, -3, -2],
@@ -2403,13 +2430,13 @@
     7.3484692283495345
 
     >>> LA.norm(a, -2)
-    nan
+    0.0
     >>> LA.norm(b, -2)
-    1.8570331885190563e-016
+    1.8570331885190563e-016 # may vary
     >>> LA.norm(a, 3)
-    5.8480354764257312
+    5.8480354764257312 # may vary
     >>> LA.norm(a, -3)
-    nan
+    0.0
 
     Using the `axis` argument to compute vector norms:
 
@@ -2584,18 +2611,20 @@
 
     >>> from numpy.linalg import multi_dot
     >>> # Prepare some data
-    >>> A = np.random.random(10000, 100)
-    >>> B = np.random.random(100, 1000)
-    >>> C = np.random.random(1000, 5)
-    >>> D = np.random.random(5, 333)
+    >>> A = np.random.random((10000, 100))
+    >>> B = np.random.random((100, 1000))
+    >>> C = np.random.random((1000, 5))
+    >>> D = np.random.random((5, 333))
     >>> # the actual dot multiplication
-    >>> multi_dot([A, B, C, D])
+    >>> _ = multi_dot([A, B, C, D])
 
     instead of::
 
-    >>> np.dot(np.dot(np.dot(A, B), C), D)
+    >>> _ = np.dot(np.dot(np.dot(A, B), C), D)
+    ...
     >>> # or
-    >>> A.dot(B).dot(C).dot(D)
+    >>> _ = A.dot(B).dot(C).dot(D)
+    ...
 
     Notes
     -----
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 235488c..831c059 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -633,6 +633,20 @@
         assert_(isinstance(a, np.ndarray))
 
 
+class SVDBaseTests(object):
+    hermitian = False
+
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        u, s, vh = linalg.svd(x)
+        assert_equal(u.dtype, dtype)
+        assert_equal(s.dtype, get_real_dtype(dtype))
+        assert_equal(vh.dtype, dtype)
+        s = linalg.svd(x, compute_uv=False, hermitian=self.hermitian)
+        assert_equal(s.dtype, get_real_dtype(dtype))
+
+
 class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
 
     def do(self, a, b, tags):
@@ -644,32 +658,37 @@
         assert_(consistent_subclass(vt, a))
 
 
-class TestSVD(SVDCases):
-    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
-    def test_types(self, dtype):
-        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
-        u, s, vh = linalg.svd(x)
-        assert_equal(u.dtype, dtype)
-        assert_equal(s.dtype, get_real_dtype(dtype))
-        assert_equal(vh.dtype, dtype)
-        s = linalg.svd(x, compute_uv=False)
-        assert_equal(s.dtype, get_real_dtype(dtype))
-
+class TestSVD(SVDCases, SVDBaseTests):
     def test_empty_identity(self):
         """ Empty input should put an identity matrix in u or vh """
         x = np.empty((4, 0))
-        u, s, vh = linalg.svd(x, compute_uv=True)
+        u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian)
         assert_equal(u.shape, (4, 4))
         assert_equal(vh.shape, (0, 0))
         assert_equal(u, np.eye(4))
 
         x = np.empty((0, 4))
-        u, s, vh = linalg.svd(x, compute_uv=True)
+        u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian)
         assert_equal(u.shape, (0, 0))
         assert_equal(vh.shape, (4, 4))
         assert_equal(vh, np.eye(4))
 
 
+class SVDHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        u, s, vt = linalg.svd(a, 0, hermitian=True)
+        assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :],
+                                           np.asarray(vt)),
+                        rtol=get_rtol(u.dtype))
+        assert_(consistent_subclass(u, a))
+        assert_(consistent_subclass(vt, a))
+
+
+class TestSVDHermitian(SVDHermitianCases, SVDBaseTests):
+    hermitian = True
+
+
 class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
     # cond(x, p) for p in (None, 2, -2)
 
@@ -797,6 +816,20 @@
     pass
 
 
+class PinvHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        a_ginv = linalg.pinv(a, hermitian=True)
+        # `a @ a_ginv == I` does not hold if a is singular
+        dot = dot_generalized
+        assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11)
+        assert_(consistent_subclass(a_ginv, a))
+
+
+class TestPinvHermitian(PinvHermitianCases):
+    pass
+
+
 class DetCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
 
     def do(self, a, b, tags):
@@ -1962,3 +1995,10 @@
         ainv = linalg.tensorinv(a, ind=1)
         b = np.ones(24)
         assert_allclose(np.tensordot(ainv, b, 1), np.linalg.tensorsolve(a, b))
+
+
+def test_unsupported_commontype():
+    # linalg gracefully handles unsupported type
+    arr = np.array([[1, -2], [2, 5]], dtype='float16')
+    with assert_raises_regex(TypeError, "unsupported in linalg"):
+        linalg.cholesky(arr)
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 96d7207..63a6159 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -516,18 +516,18 @@
     array([0, 1, 2, 3, 4])
     >>> a = ma.masked_where(a < 3, a)
     >>> a
-    masked_array(data = [-- -- -- 3 4],
-          mask = [ True  True  True False False],
-          fill_value=999999)
+    masked_array(data=[--, --, --, 3, 4],
+                 mask=[ True,  True,  True, False, False],
+           fill_value=999999)
     >>> ma.set_fill_value(a, -999)
     >>> a
-    masked_array(data = [-- -- -- 3 4],
-          mask = [ True  True  True False False],
-          fill_value=-999)
+    masked_array(data=[--, --, --, 3, 4],
+                 mask=[ True,  True,  True, False, False],
+           fill_value=-999)
 
     Nothing happens if `a` is not a masked array.
 
-    >>> a = range(5)
+    >>> a = list(range(5))
     >>> a
     [0, 1, 2, 3, 4]
     >>> ma.set_fill_value(a, 100)
@@ -689,13 +689,12 @@
     >>> import numpy.ma as ma
     >>> a = ma.masked_equal([[1,2],[3,4]], 2)
     >>> a
-    masked_array(data =
-     [[1 --]
-     [3 4]],
-          mask =
-     [[False  True]
-     [False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[1, --],
+            [3, 4]],
+      mask=[[False,  True],
+            [False, False]],
+      fill_value=2)
     >>> ma.getdata(a)
     array([[1, 2],
            [3, 4]])
@@ -752,20 +751,19 @@
     --------
     >>> x = np.ma.array([1., -1, np.nan, np.inf], mask=[1] + [0]*3)
     >>> x
-    masked_array(data = [-- -1.0 nan inf],
-                 mask = [ True False False False],
-           fill_value = 1e+20)
+    masked_array(data=[--, -1.0, nan, inf],
+                 mask=[ True, False, False, False],
+           fill_value=1e+20)
     >>> np.ma.fix_invalid(x)
-    masked_array(data = [-- -1.0 -- --],
-                 mask = [ True False  True  True],
-           fill_value = 1e+20)
+    masked_array(data=[--, -1.0, --, --],
+                 mask=[ True, False,  True,  True],
+           fill_value=1e+20)
 
     >>> fixed = np.ma.fix_invalid(x)
     >>> fixed.data
-    array([  1.00000000e+00,  -1.00000000e+00,   1.00000000e+20,
-             1.00000000e+20])
+    array([ 1.e+00, -1.e+00,  1.e+20,  1.e+20])
     >>> x.data
-    array([  1.,  -1.,  NaN,  Inf])
+    array([ 1., -1., nan, inf])
 
     """
     a = masked_array(a, copy=copy, mask=mask, subok=True)
@@ -1346,9 +1344,9 @@
     --------
     >>> import numpy.ma as ma
     >>> dtype = np.dtype({'names':['foo', 'bar'],
-                          'formats':[np.float32, int]})
+    ...                   'formats':[np.float32, np.int64]})
     >>> dtype
-    dtype([('foo', '<f4'), ('bar', '<i4')])
+    dtype([('foo', '<f4'), ('bar', '<i8')])
     >>> ma.make_mask_descr(dtype)
     dtype([('foo', '|b1'), ('bar', '|b1')])
     >>> ma.make_mask_descr(np.float32)
@@ -1381,13 +1379,12 @@
     >>> import numpy.ma as ma
     >>> a = ma.masked_equal([[1,2],[3,4]], 2)
     >>> a
-    masked_array(data =
-     [[1 --]
-     [3 4]],
-          mask =
-     [[False  True]
-     [False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[1, --],
+            [3, 4]],
+      mask=[[False,  True],
+            [False, False]],
+      fill_value=2)
     >>> ma.getmask(a)
     array([[False,  True],
            [False, False]])
@@ -1402,12 +1399,11 @@
 
     >>> b = ma.masked_array([[1,2],[3,4]])
     >>> b
-    masked_array(data =
-     [[1 2]
-     [3 4]],
-          mask =
-     False,
-          fill_value=999999)
+    masked_array(
+      data=[[1, 2],
+            [3, 4]],
+      mask=False,
+      fill_value=999999)
     >>> ma.nomask
     False
     >>> ma.getmask(b) == ma.nomask
@@ -1445,13 +1441,12 @@
     >>> import numpy.ma as ma
     >>> a = ma.masked_equal([[1,2],[3,4]], 2)
     >>> a
-    masked_array(data =
-     [[1 --]
-     [3 4]],
-          mask =
-     [[False  True]
-     [False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[1, --],
+            [3, 4]],
+      mask=[[False,  True],
+            [False, False]],
+      fill_value=2)
     >>> ma.getmaskarray(a)
     array([[False,  True],
            [False, False]])
@@ -1460,13 +1455,12 @@
 
     >>> b = ma.masked_array([[1,2],[3,4]])
     >>> b
-    masked_array(data =
-     [[1 2]
-     [3 4]],
-          mask =
-     False,
-          fill_value=999999)
-    >>> >ma.getmaskarray(b)
+    masked_array(
+      data=[[1, 2],
+            [3, 4]],
+      mask=False,
+      fill_value=999999)
+    >>> ma.getmaskarray(b)
     array([[False, False],
            [False, False]])
 
@@ -1504,9 +1498,9 @@
     >>> import numpy.ma as ma
     >>> m = ma.masked_equal([0, 1, 0, 2, 3], 0)
     >>> m
-    masked_array(data = [-- 1 -- 2 3],
-          mask = [ True False  True False False],
-          fill_value=999999)
+    masked_array(data=[--, 1, --, 2, 3],
+                 mask=[ True, False,  True, False, False],
+           fill_value=0)
     >>> ma.is_mask(m)
     False
     >>> ma.is_mask(m.mask)
@@ -1527,14 +1521,14 @@
     Arrays with complex dtypes don't return True.
 
     >>> dtype = np.dtype({'names':['monty', 'pithon'],
-                          'formats':[bool, bool]})
+    ...                   'formats':[bool, bool]})
     >>> dtype
     dtype([('monty', '|b1'), ('pithon', '|b1')])
     >>> m = np.array([(True, False), (False, True), (True, False)],
-                     dtype=dtype)
+    ...              dtype=dtype)
     >>> m
-    array([(True, False), (False, True), (True, False)],
-          dtype=[('monty', '|b1'), ('pithon', '|b1')])
+    array([( True, False), (False,  True), ( True, False)],
+          dtype=[('monty', '?'), ('pithon', '?')])
     >>> ma.is_mask(m)
     False
 
@@ -1600,7 +1594,7 @@
 
     >>> m = np.zeros(4)
     >>> m
-    array([ 0.,  0.,  0.,  0.])
+    array([0., 0., 0., 0.])
     >>> ma.make_mask(m)
     False
     >>> ma.make_mask(m, shrink=False)
@@ -1616,11 +1610,11 @@
     >>> arr
     [(1, 0), (0, 1), (1, 0), (1, 0)]
     >>> dtype = np.dtype({'names':['man', 'mouse'],
-                          'formats':[int, int]})
+    ...                   'formats':[np.int64, np.int64]})
     >>> arr = np.array(arr, dtype=dtype)
     >>> arr
     array([(1, 0), (0, 1), (1, 0), (1, 0)],
-          dtype=[('man', '<i4'), ('mouse', '<i4')])
+          dtype=[('man', '<i8'), ('mouse', '<i8')])
     >>> ma.make_mask(arr, dtype=dtype)
     array([(True, False), (False, True), (True, False), (True, False)],
           dtype=[('man', '|b1'), ('mouse', '|b1')])
@@ -1679,9 +1673,9 @@
     Defining a more complex dtype.
 
     >>> dtype = np.dtype({'names':['foo', 'bar'],
-                          'formats':[np.float32, int]})
+    ...                   'formats':[np.float32, np.int64]})
     >>> dtype
-    dtype([('foo', '<f4'), ('bar', '<i4')])
+    dtype([('foo', '<f4'), ('bar', '<i8')])
     >>> ma.make_mask_none((3,), dtype=dtype)
     array([(False, False), (False, False), (False, False)],
           dtype=[('foo', '|b1'), ('bar', '|b1')])
@@ -1779,16 +1773,16 @@
     Examples
     --------
     >>> mask = np.array([0, 0, 1])
-    >>> flatten_mask(mask)
+    >>> np.ma.flatten_mask(mask)
     array([False, False,  True])
 
     >>> mask = np.array([(0, 0), (0, 1)], dtype=[('a', bool), ('b', bool)])
-    >>> flatten_mask(mask)
+    >>> np.ma.flatten_mask(mask)
     array([False, False, False,  True])
 
     >>> mdtype = [('a', bool), ('b', [('ba', bool), ('bb', bool)])]
     >>> mask = np.array([(0, (0, 0)), (0, (0, 1))], dtype=mdtype)
-    >>> flatten_mask(mask)
+    >>> np.ma.flatten_mask(mask)
     array([False, False, False, False, False,  True])
 
     """
@@ -1873,38 +1867,39 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_where(a <= 2, a)
-    masked_array(data = [-- -- -- 3],
-          mask = [ True  True  True False],
-          fill_value=999999)
+    masked_array(data=[--, --, --, 3],
+                 mask=[ True,  True,  True, False],
+           fill_value=999999)
 
     Mask array `b` conditional on `a`.
 
     >>> b = ['a', 'b', 'c', 'd']
     >>> ma.masked_where(a == 2, b)
-    masked_array(data = [a b -- d],
-          mask = [False False  True False],
-          fill_value=N/A)
+    masked_array(data=['a', 'b', --, 'd'],
+                 mask=[False, False,  True, False],
+           fill_value='N/A',
+                dtype='<U1')
 
     Effect of the `copy` argument.
 
     >>> c = ma.masked_where(a <= 2, a)
     >>> c
-    masked_array(data = [-- -- -- 3],
-          mask = [ True  True  True False],
-          fill_value=999999)
+    masked_array(data=[--, --, --, 3],
+                 mask=[ True,  True,  True, False],
+           fill_value=999999)
     >>> c[0] = 99
     >>> c
-    masked_array(data = [99 -- -- 3],
-          mask = [False  True  True False],
-          fill_value=999999)
+    masked_array(data=[99, --, --, 3],
+                 mask=[False,  True,  True, False],
+           fill_value=999999)
     >>> a
     array([0, 1, 2, 3])
     >>> c = ma.masked_where(a <= 2, a, copy=False)
     >>> c[0] = 99
     >>> c
-    masked_array(data = [99 -- -- 3],
-          mask = [False  True  True False],
-          fill_value=999999)
+    masked_array(data=[99, --, --, 3],
+                 mask=[False,  True,  True, False],
+           fill_value=999999)
     >>> a
     array([99,  1,  2,  3])
 
@@ -1913,19 +1908,19 @@
     >>> a = np.arange(4)
     >>> a = ma.masked_where(a == 2, a)
     >>> a
-    masked_array(data = [0 1 -- 3],
-          mask = [False False  True False],
-          fill_value=999999)
+    masked_array(data=[0, 1, --, 3],
+                 mask=[False, False,  True, False],
+           fill_value=999999)
     >>> b = np.arange(4)
     >>> b = ma.masked_where(b == 0, b)
     >>> b
-    masked_array(data = [-- 1 2 3],
-          mask = [ True False False False],
-          fill_value=999999)
+    masked_array(data=[--, 1, 2, 3],
+                 mask=[ True, False, False, False],
+           fill_value=999999)
     >>> ma.masked_where(a == 3, b)
-    masked_array(data = [-- 1 -- --],
-          mask = [ True False  True  True],
-          fill_value=999999)
+    masked_array(data=[--, 1, --, --],
+                 mask=[ True, False,  True,  True],
+           fill_value=999999)
 
     """
     # Make sure that condition is a valid standard-type mask.
@@ -1965,9 +1960,9 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_greater(a, 2)
-    masked_array(data = [0 1 2 --],
-          mask = [False False False  True],
-          fill_value=999999)
+    masked_array(data=[0, 1, 2, --],
+                 mask=[False, False, False,  True],
+           fill_value=999999)
 
     """
     return masked_where(greater(x, value), x, copy=copy)
@@ -1991,9 +1986,9 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_greater_equal(a, 2)
-    masked_array(data = [0 1 -- --],
-          mask = [False False  True  True],
-          fill_value=999999)
+    masked_array(data=[0, 1, --, --],
+                 mask=[False, False,  True,  True],
+           fill_value=999999)
 
     """
     return masked_where(greater_equal(x, value), x, copy=copy)
@@ -2017,9 +2012,9 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_less(a, 2)
-    masked_array(data = [-- -- 2 3],
-          mask = [ True  True False False],
-          fill_value=999999)
+    masked_array(data=[--, --, 2, 3],
+                 mask=[ True,  True, False, False],
+           fill_value=999999)
 
     """
     return masked_where(less(x, value), x, copy=copy)
@@ -2043,9 +2038,9 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_less_equal(a, 2)
-    masked_array(data = [-- -- -- 3],
-          mask = [ True  True  True False],
-          fill_value=999999)
+    masked_array(data=[--, --, --, 3],
+                 mask=[ True,  True,  True, False],
+           fill_value=999999)
 
     """
     return masked_where(less_equal(x, value), x, copy=copy)
@@ -2069,9 +2064,9 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_not_equal(a, 2)
-    masked_array(data = [-- -- 2 --],
-          mask = [ True  True False  True],
-          fill_value=999999)
+    masked_array(data=[--, --, 2, --],
+                 mask=[ True,  True, False,  True],
+           fill_value=999999)
 
     """
     return masked_where(not_equal(x, value), x, copy=copy)
@@ -2097,9 +2092,9 @@
     >>> a
     array([0, 1, 2, 3])
     >>> ma.masked_equal(a, 2)
-    masked_array(data = [0 1 -- 3],
-          mask = [False False  True False],
-          fill_value=999999)
+    masked_array(data=[0, 1, --, 3],
+                 mask=[False, False,  True, False],
+           fill_value=2)
 
     """
     output = masked_where(equal(x, value), x, copy=copy)
@@ -2128,16 +2123,16 @@
     >>> import numpy.ma as ma
     >>> x = [0.31, 1.2, 0.01, 0.2, -0.4, -1.1]
     >>> ma.masked_inside(x, -0.3, 0.3)
-    masked_array(data = [0.31 1.2 -- -- -0.4 -1.1],
-          mask = [False False  True  True False False],
-          fill_value=1e+20)
+    masked_array(data=[0.31, 1.2, --, --, -0.4, -1.1],
+                 mask=[False, False,  True,  True, False, False],
+           fill_value=1e+20)
 
     The order of `v1` and `v2` doesn't matter.
 
     >>> ma.masked_inside(x, 0.3, -0.3)
-    masked_array(data = [0.31 1.2 -- -- -0.4 -1.1],
-          mask = [False False  True  True False False],
-          fill_value=1e+20)
+    masked_array(data=[0.31, 1.2, --, --, -0.4, -1.1],
+                 mask=[False, False,  True,  True, False, False],
+           fill_value=1e+20)
 
     """
     if v2 < v1:
@@ -2168,16 +2163,16 @@
     >>> import numpy.ma as ma
     >>> x = [0.31, 1.2, 0.01, 0.2, -0.4, -1.1]
     >>> ma.masked_outside(x, -0.3, 0.3)
-    masked_array(data = [-- -- 0.01 0.2 -- --],
-          mask = [ True  True False False  True  True],
-          fill_value=1e+20)
+    masked_array(data=[--, --, 0.01, 0.2, --, --],
+                 mask=[ True,  True, False, False,  True,  True],
+           fill_value=1e+20)
 
     The order of `v1` and `v2` doesn't matter.
 
     >>> ma.masked_outside(x, 0.3, -0.3)
-    masked_array(data = [-- -- 0.01 0.2 -- --],
-          mask = [ True  True False False  True  True],
-          fill_value=1e+20)
+    masked_array(data=[--, --, 0.01, 0.2, --, --],
+                 mask=[ True,  True, False, False,  True,  True],
+           fill_value=1e+20)
 
     """
     if v2 < v1:
@@ -2222,20 +2217,27 @@
     >>> food = np.array(['green_eggs', 'ham'], dtype=object)
     >>> # don't eat spoiled food
     >>> eat = ma.masked_object(food, 'green_eggs')
-    >>> print(eat)
-    [-- ham]
+    >>> eat
+    masked_array(data=[--, 'ham'],
+                 mask=[ True, False],
+           fill_value='green_eggs',
+                dtype=object)
     >>> # plain ol` ham is boring
     >>> fresh_food = np.array(['cheese', 'ham', 'pineapple'], dtype=object)
     >>> eat = ma.masked_object(fresh_food, 'green_eggs')
-    >>> print(eat)
-    [cheese ham pineapple]
+    >>> eat
+    masked_array(data=['cheese', 'ham', 'pineapple'],
+                 mask=False,
+           fill_value='green_eggs',
+                dtype=object)
 
     Note that `mask` is set to ``nomask`` if possible.
 
     >>> eat
-    masked_array(data = [cheese ham pineapple],
-          mask = False,
-          fill_value=?)
+    masked_array(data=['cheese', 'ham', 'pineapple'],
+                 mask=False,
+           fill_value='green_eggs',
+                dtype=object)
 
     """
     if isMaskedArray(x):
@@ -2290,16 +2292,16 @@
     >>> import numpy.ma as ma
     >>> x = np.array([1, 1.1, 2, 1.1, 3])
     >>> ma.masked_values(x, 1.1)
-    masked_array(data = [1.0 -- 2.0 -- 3.0],
-          mask = [False  True False  True False],
-          fill_value=1.1)
+    masked_array(data=[1.0, --, 2.0, --, 3.0],
+                 mask=[False,  True, False,  True, False],
+           fill_value=1.1)
 
     Note that `mask` is set to ``nomask`` if possible.
 
     >>> ma.masked_values(x, 1.5)
-    masked_array(data = [ 1.   1.1  2.   1.1  3. ],
-          mask = False,
-          fill_value=1.5)
+    masked_array(data=[1. , 1.1, 2. , 1.1, 3. ],
+                 mask=False,
+           fill_value=1.5)
 
     For integers, the fill value will be different in general to the
     result of ``masked_equal``.
@@ -2308,13 +2310,13 @@
     >>> x
     array([0, 1, 2, 3, 4])
     >>> ma.masked_values(x, 2)
-    masked_array(data = [0 1 -- 3 4],
-          mask = [False False  True False False],
-          fill_value=2)
+    masked_array(data=[0, 1, --, 3, 4],
+                 mask=[False, False,  True, False, False],
+           fill_value=2)
     >>> ma.masked_equal(x, 2)
-    masked_array(data = [0 1 -- 3 4],
-          mask = [False False  True False False],
-          fill_value=999999)
+    masked_array(data=[0, 1, --, 3, 4],
+                 mask=[False, False,  True, False, False],
+           fill_value=2)
 
     """
     xnew = filled(x, value)
@@ -2348,11 +2350,11 @@
     >>> a[2] = np.NaN
     >>> a[3] = np.PINF
     >>> a
-    array([  0.,   1.,  NaN,  Inf,   4.])
+    array([ 0.,  1., nan, inf,  4.])
     >>> ma.masked_invalid(a)
-    masked_array(data = [0.0 1.0 -- -- 4.0],
-          mask = [False False  True  True False],
-          fill_value=1e+20)
+    masked_array(data=[0.0, 1.0, --, --, 4.0],
+                 mask=[False, False,  True,  True, False],
+           fill_value=1e+20)
 
     """
     a = np.array(a, copy=copy, subok=True)
@@ -2513,7 +2515,7 @@
     --------
     >>> ndtype = [('a', int), ('b', float)]
     >>> a = np.array([(1, 1), (2, 2)], dtype=ndtype)
-    >>> flatten_structured_array(a)
+    >>> np.ma.flatten_structured_array(a)
     array([[1., 1.],
            [2., 2.]])
 
@@ -2684,9 +2686,7 @@
         >>> fl.next()
         3
         >>> fl.next()
-        masked_array(data = --,
-                     mask = True,
-               fill_value = 1e+20)
+        masked
         >>> fl.next()
         Traceback (most recent call last):
           File "<stdin>", line 1, in <module>
@@ -3551,6 +3551,11 @@
         array([[False, False],
                [False, False]])
         >>> x.shrink_mask()
+        masked_array(
+          data=[[1, 2],
+                [3, 4]],
+          mask=False,
+          fill_value=999999)
         >>> x.mask
         False
 
@@ -3639,7 +3644,7 @@
         -inf
         >>> x.set_fill_value(np.pi)
         >>> x.fill_value
-        3.1415926535897931
+        3.1415926535897931 # may vary
 
         Reset to default:
 
@@ -3688,9 +3693,9 @@
         --------
         >>> x = np.ma.array([1,2,3,4,5], mask=[0,0,1,0,1], fill_value=-999)
         >>> x.filled()
-        array([1, 2, -999, 4, -999])
+        array([   1,    2, -999,    4, -999])
         >>> type(x.filled())
-        <type 'numpy.ndarray'>
+        <class 'numpy.ndarray'>
 
         Subclassing is preserved. This means that if, e.g., the data part of
         the masked array is a recarray, `filled` returns a recarray:
@@ -3755,7 +3760,7 @@
         >>> x.compressed()
         array([0, 1])
         >>> type(x.compressed())
-        <type 'numpy.ndarray'>
+        <class 'numpy.ndarray'>
 
         """
         data = ndarray.ravel(self._data)
@@ -3797,25 +3802,29 @@
         Examples
         --------
         >>> x = np.ma.array([[1,2,3],[4,5,6],[7,8,9]], mask=[0] + [1,0]*4)
-        >>> print(x)
-        [[1 -- 3]
-         [-- 5 --]
-         [7 -- 9]]
+        >>> x
+        masked_array(
+          data=[[1, --, 3],
+                [--, 5, --],
+                [7, --, 9]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
         >>> x.compress([1, 0, 1])
-        masked_array(data = [1 3],
-              mask = [False False],
-              fill_value=999999)
+        masked_array(data=[1, 3],
+                     mask=[False, False],
+               fill_value=999999)
 
         >>> x.compress([1, 0, 1], axis=1)
-        masked_array(data =
-         [[1 3]
-         [-- --]
-         [7 9]],
-              mask =
-         [[False False]
-         [ True  True]
-         [False False]],
-              fill_value=999999)
+        masked_array(
+          data=[[1, 3],
+                [--, --],
+                [7, 9]],
+          mask=[[False, False],
+                [ True,  True],
+                [False, False]],
+          fill_value=999999)
 
         """
         # Get the basic components
@@ -4348,9 +4357,9 @@
         --------
         >>> x = np.ma.array([1+1.j, -2j, 3.45+1.6j], mask=[False, True, False])
         >>> x.get_imag()
-        masked_array(data = [1.0 -- 1.6],
-                     mask = [False  True False],
-               fill_value = 1e+20)
+        masked_array(data=[1.0, --, 1.6],
+                     mask=[False,  True, False],
+               fill_value=1e+20)
 
         """
         result = self._data.imag.view(type(self))
@@ -4383,9 +4392,9 @@
         --------
         >>> x = np.ma.array([1+1.j, -2j, 3.45+1.6j], mask=[False, True, False])
         >>> x.get_real()
-        masked_array(data = [1.0 -- 3.45],
-                     mask = [False  True False],
-               fill_value = 1e+20)
+        masked_array(data=[1.0, --, 3.45],
+                     mask=[False,  True, False],
+               fill_value=1e+20)
 
         """
         result = self._data.real.view(type(self))
@@ -4431,13 +4440,12 @@
         >>> a = ma.arange(6).reshape((2, 3))
         >>> a[1, :] = ma.masked
         >>> a
-        masked_array(data =
-         [[0 1 2]
-         [-- -- --]],
-                     mask =
-         [[False False False]
-         [ True  True  True]],
-               fill_value = 999999)
+        masked_array(
+          data=[[0, 1, 2],
+                [--, --, --]],
+          mask=[[False, False, False],
+                [ True,  True,  True]],
+          fill_value=999999)
         >>> a.count()
         3
 
@@ -4522,12 +4530,20 @@
         Examples
         --------
         >>> x = np.ma.array([[1,2,3],[4,5,6],[7,8,9]], mask=[0] + [1,0]*4)
-        >>> print(x)
-        [[1 -- 3]
-         [-- 5 --]
-         [7 -- 9]]
-        >>> print(x.ravel())
-        [1 -- 3 -- 5 -- 7 -- 9]
+        >>> x
+        masked_array(
+          data=[[1, --, 3],
+                [--, 5, --],
+                [7, --, 9]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
+        >>> x.ravel()
+        masked_array(data=[1, --, 3, --, 5, --, 7, --, 9],
+                     mask=[False,  True, False,  True, False,  True, False,  True,
+                           False],
+               fill_value=999999)
 
         """
         r = ndarray.ravel(self._data, order=order).view(type(self))
@@ -4576,15 +4592,25 @@
         Examples
         --------
         >>> x = np.ma.array([[1,2],[3,4]], mask=[1,0,0,1])
-        >>> print(x)
-        [[-- 2]
-         [3 --]]
+        >>> x
+        masked_array(
+          data=[[--, 2],
+                [3, --]],
+          mask=[[ True, False],
+                [False,  True]],
+          fill_value=999999)
         >>> x = x.reshape((4,1))
-        >>> print(x)
-        [[--]
-         [2]
-         [3]
-         [--]]
+        >>> x
+        masked_array(
+          data=[[--],
+                [2],
+                [3],
+                [--]],
+          mask=[[ True],
+                [False],
+                [False],
+                [ True]],
+          fill_value=999999)
 
         """
         kwargs.update(order=kwargs.get('order', 'C'))
@@ -4641,21 +4667,36 @@
         Examples
         --------
         >>> x = np.ma.array([[1,2,3],[4,5,6],[7,8,9]], mask=[0] + [1,0]*4)
-        >>> print(x)
-        [[1 -- 3]
-         [-- 5 --]
-         [7 -- 9]]
+        >>> x
+        masked_array(
+          data=[[1, --, 3],
+                [--, 5, --],
+                [7, --, 9]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
         >>> x.put([0,4,8],[10,20,30])
-        >>> print(x)
-        [[10 -- 3]
-         [-- 20 --]
-         [7 -- 30]]
+        >>> x
+        masked_array(
+          data=[[10, --, 3],
+                [--, 20, --],
+                [7, --, 30]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
 
         >>> x.put(4,999)
-        >>> print(x)
-        [[10 -- 3]
-         [-- 999 --]
-         [7 -- 30]]
+        >>> x
+        masked_array(
+          data=[[10, --, 3],
+                [--, 999, --],
+                [7, --, 30]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
 
         """
         # Hard mask: Get rid of the values/indices that fall on masked data
@@ -4695,14 +4736,14 @@
         --------
         >>> x = np.ma.array([1, 2, 3], mask=[0, 1, 1])
         >>> x.ids()
-        (166670640, 166659832)
+        (166670640, 166659832) # may vary
 
         If the array has no mask, the address of `nomask` is returned. This address
         is typically not close to the data in memory:
 
         >>> x = np.ma.array([1, 2, 3])
         >>> x.ids()
-        (166691080, 3083169284L)
+        (166691080, 3083169284L) # may vary
 
         """
         if self._mask is nomask:
@@ -4851,13 +4892,12 @@
         >>> import numpy.ma as ma
         >>> x = ma.array(np.eye(3))
         >>> x
-        masked_array(data =
-         [[ 1.  0.  0.]
-         [ 0.  1.  0.]
-         [ 0.  0.  1.]],
-              mask =
-         False,
-              fill_value=1e+20)
+        masked_array(
+          data=[[1., 0., 0.],
+                [0., 1., 0.],
+                [0., 0., 1.]],
+          mask=False,
+          fill_value=1e+20)
         >>> x.nonzero()
         (array([0, 1, 2]), array([0, 1, 2]))
 
@@ -4865,15 +4905,14 @@
 
         >>> x[1, 1] = ma.masked
         >>> x
-        masked_array(data =
-         [[1.0 0.0 0.0]
-         [0.0 -- 0.0]
-         [0.0 0.0 1.0]],
-              mask =
-         [[False False False]
-         [False  True False]
-         [False False False]],
-              fill_value=1e+20)
+        masked_array(
+          data=[[1.0, 0.0, 0.0],
+                [0.0, --, 0.0],
+                [0.0, 0.0, 1.0]],
+          mask=[[False, False, False],
+                [False,  True, False],
+                [False, False, False]],
+          fill_value=1e+20)
         >>> x.nonzero()
         (array([0, 2]), array([0, 2]))
 
@@ -4890,13 +4929,12 @@
 
         >>> a = ma.array([[1,2,3],[4,5,6],[7,8,9]])
         >>> a > 3
-        masked_array(data =
-         [[False False False]
-         [ True  True  True]
-         [ True  True  True]],
-              mask =
-         False,
-              fill_value=999999)
+        masked_array(
+          data=[[False, False, False],
+                [ True,  True,  True],
+                [ True,  True,  True]],
+          mask=False,
+          fill_value=True)
         >>> ma.nonzero(a > 3)
         (array([1, 1, 1, 2, 2, 2]), array([0, 1, 2, 0, 1, 2]))
 
@@ -4978,18 +5016,27 @@
         Examples
         --------
         >>> x = np.ma.array([[1,2,3],[4,5,6],[7,8,9]], mask=[0] + [1,0]*4)
-        >>> print(x)
-        [[1 -- 3]
-         [-- 5 --]
-         [7 -- 9]]
-        >>> print(x.sum())
+        >>> x
+        masked_array(
+          data=[[1, --, 3],
+                [--, 5, --],
+                [7, --, 9]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
+        >>> x.sum()
         25
-        >>> print(x.sum(axis=1))
-        [4 5 16]
-        >>> print(x.sum(axis=0))
-        [8 5 12]
+        >>> x.sum(axis=1)
+        masked_array(data=[4, 5, 16],
+                     mask=[False, False, False],
+               fill_value=999999)
+        >>> x.sum(axis=0)
+        masked_array(data=[8, 5, 12],
+                     mask=[False, False, False],
+               fill_value=999999)
         >>> print(type(x.sum(axis=0, dtype=np.int64)[0]))
-        <type 'numpy.int64'>
+        <class 'numpy.int64'>
 
         """
         kwargs = {} if keepdims is np._NoValue else {'keepdims': keepdims}
@@ -5040,8 +5087,11 @@
         Examples
         --------
         >>> marr = np.ma.array(np.arange(10), mask=[0,0,0,1,1,1,0,0,0,0])
-        >>> print(marr.cumsum())
-        [0 1 3 -- -- -- 9 16 24 33]
+        >>> marr.cumsum()
+        masked_array(data=[0, 1, 3, --, --, --, 9, 16, 24, 33],
+                     mask=[False, False, False,  True,  True,  True, False, False,
+                           False, False],
+               fill_value=999999)
 
         """
         result = self.filled(0).cumsum(axis=axis, dtype=dtype, out=out)
@@ -5145,9 +5195,9 @@
         --------
         >>> a = np.ma.array([1,2,3], mask=[False, False, True])
         >>> a
-        masked_array(data = [1 2 --],
-                     mask = [False False  True],
-               fill_value = 999999)
+        masked_array(data=[1, 2, --],
+                     mask=[False, False,  True],
+               fill_value=999999)
         >>> a.mean()
         1.5
 
@@ -5200,9 +5250,9 @@
         --------
         >>> a = np.ma.array([1,2,3])
         >>> a.anom()
-        masked_array(data = [-1.  0.  1.],
-                     mask = False,
-               fill_value = 1e+20)
+        masked_array(data=[-1.,  0.,  1.],
+                     mask=False,
+               fill_value=1e+20)
 
         """
         m = self.mean(axis, dtype)
@@ -5382,9 +5432,9 @@
         --------
         >>> a = np.ma.array([3,2,1], mask=[False, False, True])
         >>> a
-        masked_array(data = [3 2 --],
-                     mask = [False False  True],
-               fill_value = 999999)
+        masked_array(data=[3, 2, --],
+                     mask=[False, False,  True],
+               fill_value=999999)
         >>> a.argsort()
         array([1, 0, 2])
 
@@ -5432,15 +5482,19 @@
 
         Examples
         --------
-        >>> x = np.ma.array(arange(4), mask=[1,1,0,0])
+        >>> x = np.ma.array(np.arange(4), mask=[1,1,0,0])
         >>> x.shape = (2,2)
-        >>> print(x)
-        [[-- --]
-         [2 3]]
-        >>> print(x.argmin(axis=0, fill_value=-1))
-        [0 0]
-        >>> print(x.argmin(axis=0, fill_value=9))
-        [1 1]
+        >>> x
+        masked_array(
+          data=[[--, --],
+                [2, 3]],
+          mask=[[ True,  True],
+                [False, False]],
+          fill_value=999999)
+        >>> x.argmin(axis=0, fill_value=-1)
+        array([0, 0])
+        >>> x.argmin(axis=0, fill_value=9)
+        array([1, 1])
 
         """
         if fill_value is None:
@@ -5531,23 +5585,29 @@
 
         Examples
         --------
-        >>> a = ma.array([1, 2, 5, 4, 3],mask=[0, 1, 0, 1, 0])
+        >>> a = np.ma.array([1, 2, 5, 4, 3],mask=[0, 1, 0, 1, 0])
         >>> # Default
         >>> a.sort()
-        >>> print(a)
-        [1 3 5 -- --]
+        >>> a
+        masked_array(data=[1, 3, 5, --, --],
+                     mask=[False, False, False,  True,  True],
+               fill_value=999999)
 
-        >>> a = ma.array([1, 2, 5, 4, 3],mask=[0, 1, 0, 1, 0])
+        >>> a = np.ma.array([1, 2, 5, 4, 3],mask=[0, 1, 0, 1, 0])
         >>> # Put missing values in the front
         >>> a.sort(endwith=False)
-        >>> print(a)
-        [-- -- 1 3 5]
+        >>> a
+        masked_array(data=[--, --, 1, 3, 5],
+                     mask=[ True,  True, False, False, False],
+               fill_value=999999)
 
-        >>> a = ma.array([1, 2, 5, 4, 3],mask=[0, 1, 0, 1, 0])
+        >>> a = np.ma.array([1, 2, 5, 4, 3],mask=[0, 1, 0, 1, 0])
         >>> # fill_value takes over endwith
         >>> a.sort(endwith=False, fill_value=3)
-        >>> print(a)
-        [1 -- -- 3 5]
+        >>> a
+        masked_array(data=[1, --, --, 3, 5],
+                     mask=[False,  True,  True, False, False],
+               fill_value=999999)
 
         """
         if self._mask is nomask:
@@ -5653,27 +5713,36 @@
         Examples
         --------
         >>> x = np.ma.array(np.arange(6), mask=[0 ,1, 0, 0, 0 ,1]).reshape(3, 2)
-        >>> print(x)
-        [[0 --]
-         [2 3]
-         [4 --]]
+        >>> x
+        masked_array(
+          data=[[0, --],
+                [2, 3],
+                [4, --]],
+          mask=[[False,  True],
+                [False, False],
+                [False,  True]],
+          fill_value=999999)
         >>> x.mini()
-        0
+        masked_array(data=0,
+                     mask=False,
+               fill_value=999999)
         >>> x.mini(axis=0)
-        masked_array(data = [0 3],
-                     mask = [False False],
-               fill_value = 999999)
-        >>> print(x.mini(axis=1))
-        [0 2 4]
+        masked_array(data=[0, 3],
+                     mask=[False, False],
+               fill_value=999999)
+        >>> x.mini(axis=1)
+        masked_array(data=[0, 2, 4],
+                     mask=[False, False, False],
+               fill_value=999999)
 
         There is a small difference between `mini` and `min`:
 
         >>> x[:,1].mini(axis=0)
-        masked_array(data = --,
-                     mask = True,
-               fill_value = 999999)
+        masked_array(data=3,
+                     mask=False,
+               fill_value=999999)
         >>> x[:,1].min(axis=0)
-        masked
+        3
         """
 
         # 2016-04-13, 1.13.0, gh-8764
@@ -5926,7 +5995,7 @@
         --------
         >>> x = np.ma.array(np.array([[1, 2], [3, 4]]), mask=[[0, 1], [1, 0]])
         >>> x.tobytes()
-        '\\x01\\x00\\x00\\x00?B\\x0f\\x00?B\\x0f\\x00\\x04\\x00\\x00\\x00'
+        b'\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00?B\\x0f\\x00\\x00\\x00\\x00\\x00?B\\x0f\\x00\\x00\\x00\\x00\\x00\\x04\\x00\\x00\\x00\\x00\\x00\\x00\\x00'
 
         """
         return self.filled(fill_value).tobytes(order=order)
@@ -5974,14 +6043,20 @@
         Examples
         --------
         >>> x = np.ma.array([[1,2,3],[4,5,6],[7,8,9]], mask=[0] + [1,0]*4)
-        >>> print(x)
-        [[1 -- 3]
-         [-- 5 --]
-         [7 -- 9]]
-        >>> print(x.toflex())
-        [[(1, False) (2, True) (3, False)]
-         [(4, True) (5, False) (6, True)]
-         [(7, False) (8, True) (9, False)]]
+        >>> x
+        masked_array(
+          data=[[1, --, 3],
+                [--, 5, --],
+                [7, --, 9]],
+          mask=[[False,  True, False],
+                [ True, False,  True],
+                [False,  True, False]],
+          fill_value=999999)
+        >>> x.toflex()
+        array([[(1, False), (2,  True), (3, False)],
+               [(4,  True), (5, False), (6,  True)],
+               [(7, False), (8,  True), (9, False)]],
+              dtype=[('_data', '<i8'), ('_mask', '?')])
 
         """
         # Get the basic dtype.
@@ -6228,15 +6303,14 @@
            [ 0.,  0.,  1.]])
     >>> m = ma.masked_values(a, 0)
     >>> m
-    masked_array(data =
-     [[1.0 -- --]
-     [-- 1.0 --]
-     [-- -- 1.0]],
-          mask =
-     [[False  True  True]
-     [ True False  True]
-     [ True  True False]],
-          fill_value=0.0)
+    masked_array(
+      data=[[1.0, --, --],
+            [--, 1.0, --],
+            [--, --, 1.0]],
+      mask=[[False,  True,  True],
+            [ True, False,  True],
+            [ True,  True, False]],
+      fill_value=0.0)
     >>> ma.isMaskedArray(a)
     False
     >>> ma.isMaskedArray(m)
@@ -6400,16 +6474,16 @@
     >>> import numpy.ma as ma
     >>> x = ma.masked_equal([0, 1, 0, 2, 3], 0)
     >>> x
-    masked_array(data = [-- 1 -- 2 3],
-          mask = [ True False  True False False],
-          fill_value=999999)
+    masked_array(data=[--, 1, --, 2, 3],
+                 mask=[ True, False,  True, False, False],
+           fill_value=0)
     >>> ma.is_masked(x)
     True
     >>> x = ma.masked_equal([0, 1, 0, 2, 3], 42)
     >>> x
-    masked_array(data = [0 1 0 2 3],
-          mask = False,
-          fill_value=999999)
+    masked_array(data=[0, 1, 0, 2, 3],
+                 mask=False,
+           fill_value=42)
     >>> ma.is_masked(x)
     False
 
@@ -6759,17 +6833,17 @@
     >>> a[1] = ma.masked
     >>> b = ma.arange(2, 5)
     >>> a
-    masked_array(data = [0 -- 2],
-                 mask = [False  True False],
-           fill_value = 999999)
+    masked_array(data=[0, --, 2],
+                 mask=[False,  True, False],
+           fill_value=999999)
     >>> b
-    masked_array(data = [2 3 4],
-                 mask = False,
-           fill_value = 999999)
+    masked_array(data=[2, 3, 4],
+                 mask=False,
+           fill_value=999999)
     >>> ma.concatenate([a, b])
-    masked_array(data = [0 -- 2 2 3 4],
-                 mask = [False  True False False False False],
-           fill_value = 999999)
+    masked_array(data=[0, --, 2, 2, 3, 4],
+                 mask=[False,  True, False, False, False, False],
+           fill_value=999999)
 
     """
     d = np.concatenate([getdata(a) for a in arrays], axis)
@@ -6924,24 +6998,21 @@
     >>> import numpy.ma as ma
     >>> x = ma.arange(4).reshape((2,2))
     >>> x[1, 1] = ma.masked
-    >>>> x
-    masked_array(data =
-     [[0 1]
-     [2 --]],
-                 mask =
-     [[False False]
-     [False  True]],
-           fill_value = 999999)
+    >>> x
+    masked_array(
+      data=[[0, 1],
+            [2, --]],
+      mask=[[False, False],
+            [False,  True]],
+      fill_value=999999)
 
     >>> ma.transpose(x)
-    masked_array(data =
-     [[0 2]
-     [1 --]],
-                 mask =
-     [[False False]
-     [False  True]],
-           fill_value = 999999)
-
+    masked_array(
+      data=[[0, 2],
+            [1, --]],
+      mask=[[False, False],
+            [False,  True]],
+      fill_value=999999)
     """
     # We can't use 'frommethod', as 'transpose' doesn't take keywords
     try:
@@ -6988,39 +7059,39 @@
     >>> a = ma.array([[1, 2] ,[3, 4]])
     >>> a[0, 1] = ma.masked
     >>> a
-    masked_array(data =
-     [[1 --]
-     [3 4]],
-                 mask =
-     [[False  True]
-     [False False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[1, --],
+            [3, 4]],
+      mask=[[False,  True],
+            [False, False]],
+      fill_value=999999)
     >>> np.resize(a, (3, 3))
-    array([[1, 2, 3],
-           [4, 1, 2],
-           [3, 4, 1]])
+    masked_array(
+      data=[[1, 2, 3],
+            [4, 1, 2],
+            [3, 4, 1]],
+      mask=False,
+      fill_value=999999)
     >>> ma.resize(a, (3, 3))
-    masked_array(data =
-     [[1 -- 3]
-     [4 1 --]
-     [3 4 1]],
-                 mask =
-     [[False  True False]
-     [False False  True]
-     [False False False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[1, --, 3],
+            [4, 1, --],
+            [3, 4, 1]],
+      mask=[[False,  True, False],
+            [False, False,  True],
+            [False, False, False]],
+      fill_value=999999)
 
     A MaskedArray is always returned, regardless of the input type.
 
     >>> a = np.array([[1, 2] ,[3, 4]])
     >>> ma.resize(a, (3, 3))
-    masked_array(data =
-     [[1 2 3]
-     [4 1 2]
-     [3 4 1]],
-                 mask =
-     False,
-           fill_value = 999999)
+    masked_array(
+      data=[[1, 2, 3],
+            [4, 1, 2],
+            [3, 4, 1]],
+      mask=False,
+      fill_value=999999)
 
     """
     # We can't use _frommethods here, as N.resize is notoriously whiny.
@@ -7111,14 +7182,24 @@
     >>> x = np.ma.array(np.arange(9.).reshape(3, 3), mask=[[0, 1, 0],
     ...                                                    [1, 0, 1],
     ...                                                    [0, 1, 0]])
-    >>> print(x)
-    [[0.0 -- 2.0]
-     [-- 4.0 --]
-     [6.0 -- 8.0]]
-    >>> print(np.ma.where(x > 5, x, -3.1416))
-    [[-3.1416 -- -3.1416]
-     [-- -3.1416 --]
-     [6.0 -- 8.0]]
+    >>> x
+    masked_array(
+      data=[[0.0, --, 2.0],
+            [--, 4.0, --],
+            [6.0, --, 8.0]],
+      mask=[[False,  True, False],
+            [ True, False,  True],
+            [False,  True, False]],
+      fill_value=1e+20)
+    >>> np.ma.where(x > 5, x, -3.1416)
+    masked_array(
+      data=[[-3.1416, --, -3.1416],
+            [--, -3.1416, --],
+            [6.0, --, 8.0]],
+      mask=[[False,  True, False],
+            [ True, False,  True],
+            [False,  True, False]],
+      fill_value=1e+20)
 
     """
 
@@ -7198,9 +7279,9 @@
     >>> choice = np.array([[1,1,1], [2,2,2], [3,3,3]])
     >>> a = np.array([2, 1, 0])
     >>> np.ma.choose(a, choice)
-    masked_array(data = [3 2 1],
-          mask = False,
-          fill_value=999999)
+    masked_array(data=[3, 2, 1],
+                 mask=False,
+           fill_value=999999)
 
     """
     def fmask(x):
@@ -7323,25 +7404,23 @@
            [0, 0, 0]])
     >>> a = ma.masked_equal(a, 1)
     >>> a
-    masked_array(data =
-     [[0 0 0]
-     [0 -- 0]
-     [0 0 0]],
-          mask =
-     [[False False False]
-     [False  True False]
-     [False False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, 0, 0],
+            [0, --, 0],
+            [0, 0, 0]],
+      mask=[[False, False, False],
+            [False,  True, False],
+            [False, False, False]],
+      fill_value=1)
     >>> ma.mask_rowcols(a)
-    masked_array(data =
-     [[0 -- 0]
-     [-- -- --]
-     [0 -- 0]],
-          mask =
-     [[False  True False]
-     [ True  True  True]
-     [False  True False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, --, 0],
+            [--, --, --],
+            [0, --, 0]],
+      mask=[[False,  True, False],
+            [ True,  True,  True],
+            [False,  True, False]],
+      fill_value=1)
 
     """
     a = array(a, subok=False)
@@ -7402,24 +7481,22 @@
 
     Examples
     --------
-    >>> a = ma.array([[1, 2, 3], [4, 5, 6]], mask=[[1, 0, 0], [0, 0, 0]])
-    >>> b = ma.array([[1, 2], [3, 4], [5, 6]], mask=[[1, 0], [0, 0], [0, 0]])
+    >>> a = np.ma.array([[1, 2, 3], [4, 5, 6]], mask=[[1, 0, 0], [0, 0, 0]])
+    >>> b = np.ma.array([[1, 2], [3, 4], [5, 6]], mask=[[1, 0], [0, 0], [0, 0]])
     >>> np.ma.dot(a, b)
-    masked_array(data =
-     [[21 26]
-     [45 64]],
-                 mask =
-     [[False False]
-     [False False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[21, 26],
+            [45, 64]],
+      mask=[[False, False],
+            [False, False]],
+      fill_value=999999)
     >>> np.ma.dot(a, b, strict=True)
-    masked_array(data =
-     [[-- --]
-     [-- 64]],
-                 mask =
-     [[ True  True]
-     [ True False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[--, --],
+            [--, 64]],
+      mask=[[ True,  True],
+            [ True, False]],
+      fill_value=999999)
 
     """
     # !!!: Works only with 2D arrays. There should be a way to get it to run
@@ -7587,18 +7664,18 @@
 
     Examples
     --------
-    >>> a = ma.array([1e10, 1e-7, 42.0], mask=[0, 0, 1])
+    >>> a = np.ma.array([1e10, 1e-7, 42.0], mask=[0, 0, 1])
     >>> a
-    masked_array(data = [10000000000.0 1e-07 --],
-          mask = [False False  True],
-          fill_value=1e+20)
+    masked_array(data=[10000000000.0, 1e-07, --],
+                 mask=[False, False,  True],
+           fill_value=1e+20)
 
-    >>> b = array([1e10, 1e-7, -42.0])
+    >>> b = np.array([1e10, 1e-7, -42.0])
     >>> b
     array([  1.00000000e+10,   1.00000000e-07,  -4.20000000e+01])
-    >>> ma.allequal(a, b, fill_value=False)
+    >>> np.ma.allequal(a, b, fill_value=False)
     False
-    >>> ma.allequal(a, b)
+    >>> np.ma.allequal(a, b)
     True
 
     """
@@ -7664,29 +7741,29 @@
 
     Examples
     --------
-    >>> a = ma.array([1e10, 1e-7, 42.0], mask=[0, 0, 1])
+    >>> a = np.ma.array([1e10, 1e-7, 42.0], mask=[0, 0, 1])
     >>> a
-    masked_array(data = [10000000000.0 1e-07 --],
-                 mask = [False False  True],
-           fill_value = 1e+20)
-    >>> b = ma.array([1e10, 1e-8, -42.0], mask=[0, 0, 1])
-    >>> ma.allclose(a, b)
+    masked_array(data=[10000000000.0, 1e-07, --],
+                 mask=[False, False,  True],
+           fill_value=1e+20)
+    >>> b = np.ma.array([1e10, 1e-8, -42.0], mask=[0, 0, 1])
+    >>> np.ma.allclose(a, b)
     False
 
-    >>> a = ma.array([1e10, 1e-8, 42.0], mask=[0, 0, 1])
-    >>> b = ma.array([1.00001e10, 1e-9, -42.0], mask=[0, 0, 1])
-    >>> ma.allclose(a, b)
+    >>> a = np.ma.array([1e10, 1e-8, 42.0], mask=[0, 0, 1])
+    >>> b = np.ma.array([1.00001e10, 1e-9, -42.0], mask=[0, 0, 1])
+    >>> np.ma.allclose(a, b)
     True
-    >>> ma.allclose(a, b, masked_equal=False)
+    >>> np.ma.allclose(a, b, masked_equal=False)
     False
 
     Masked values are not compared directly.
 
-    >>> a = ma.array([1e10, 1e-8, 42.0], mask=[0, 0, 1])
-    >>> b = ma.array([1.00001e10, 1e-9, 42.0], mask=[0, 0, 1])
-    >>> ma.allclose(a, b)
+    >>> a = np.ma.array([1e10, 1e-8, 42.0], mask=[0, 0, 1])
+    >>> b = np.ma.array([1.00001e10, 1e-9, 42.0], mask=[0, 0, 1])
+    >>> np.ma.allclose(a, b)
     True
-    >>> ma.allclose(a, b, masked_equal=False)
+    >>> np.ma.allclose(a, b, masked_equal=False)
     False
 
     """
@@ -7753,15 +7830,14 @@
     --------
     >>> x = np.arange(10.).reshape(2, 5)
     >>> x
-    array([[ 0.,  1.,  2.,  3.,  4.],
-           [ 5.,  6.,  7.,  8.,  9.]])
+    array([[0., 1., 2., 3., 4.],
+           [5., 6., 7., 8., 9.]])
     >>> np.ma.asarray(x)
-    masked_array(data =
-     [[ 0.  1.  2.  3.  4.]
-     [ 5.  6.  7.  8.  9.]],
-                 mask =
-     False,
-           fill_value = 1e+20)
+    masked_array(
+      data=[[0., 1., 2., 3., 4.],
+            [5., 6., 7., 8., 9.]],
+      mask=False,
+      fill_value=1e+20)
     >>> type(np.ma.asarray(x))
     <class 'numpy.ma.core.MaskedArray'>
 
@@ -7801,15 +7877,14 @@
     --------
     >>> x = np.arange(10.).reshape(2, 5)
     >>> x
-    array([[ 0.,  1.,  2.,  3.,  4.],
-           [ 5.,  6.,  7.,  8.,  9.]])
+    array([[0., 1., 2., 3., 4.],
+           [5., 6., 7., 8., 9.]])
     >>> np.ma.asanyarray(x)
-    masked_array(data =
-     [[ 0.  1.  2.  3.  4.]
-     [ 5.  6.  7.  8.  9.]],
-                 mask =
-     False,
-           fill_value = 1e+20)
+    masked_array(
+      data=[[0., 1., 2., 3., 4.],
+            [5., 6., 7., 8., 9.]],
+      mask=False,
+      fill_value=1e+20)
     >>> type(np.ma.asanyarray(x))
     <class 'numpy.ma.core.MaskedArray'>
 
@@ -7953,39 +8028,38 @@
     >>> x = np.ma.array(np.arange(9).reshape(3, 3), mask=[0] + [1, 0] * 4)
     >>> rec = x.toflex()
     >>> rec
-    array([[(0, False), (1, True), (2, False)],
-           [(3, True), (4, False), (5, True)],
-           [(6, False), (7, True), (8, False)]],
-          dtype=[('_data', '<i4'), ('_mask', '|b1')])
+    array([[(0, False), (1,  True), (2, False)],
+           [(3,  True), (4, False), (5,  True)],
+           [(6, False), (7,  True), (8, False)]],
+          dtype=[('_data', '<i8'), ('_mask', '?')])
     >>> x2 = np.ma.fromflex(rec)
     >>> x2
-    masked_array(data =
-     [[0 -- 2]
-     [-- 4 --]
-     [6 -- 8]],
-                 mask =
-     [[False  True False]
-     [ True False  True]
-     [False  True False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[0, --, 2],
+            [--, 4, --],
+            [6, --, 8]],
+      mask=[[False,  True, False],
+            [ True, False,  True],
+            [False,  True, False]],
+      fill_value=999999)
 
     Extra fields can be present in the structured array but are discarded:
 
     >>> dt = [('_data', '<i4'), ('_mask', '|b1'), ('field3', '<f4')]
     >>> rec2 = np.zeros((2, 2), dtype=dt)
     >>> rec2
-    array([[(0, False, 0.0), (0, False, 0.0)],
-           [(0, False, 0.0), (0, False, 0.0)]],
-          dtype=[('_data', '<i4'), ('_mask', '|b1'), ('field3', '<f4')])
+    array([[(0, False, 0.), (0, False, 0.)],
+           [(0, False, 0.), (0, False, 0.)]],
+          dtype=[('_data', '<i4'), ('_mask', '?'), ('field3', '<f4')])
     >>> y = np.ma.fromflex(rec2)
     >>> y
-    masked_array(data =
-     [[0 0]
-     [0 0]],
-                 mask =
-     [[False False]
-     [False False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[0, 0],
+            [0, 0]],
+      mask=[[False, False],
+            [False, False]],
+      fill_value=999999,
+      dtype=int32)
 
     """
     return masked_array(fxarray['_data'], mask=fxarray['_mask'])
@@ -8086,7 +8160,10 @@
     >>> import numpy.ma as ma
     >>> a = ma.masked_values([1, 2, 3], 2)
     >>> b = ma.masked_values([[4, 5, 6], [7, 8, 9]], 7)
-    >>> print(ma.append(a, b))
-    [1 -- 3 4 5 6 -- 8 9]
+    >>> ma.append(a, b)
+    masked_array(data=[1, --, 3, 4, 5, 6, --, 8, 9],
+                 mask=[False,  True, False, False, False, False,  True, False,
+                       False],
+           fill_value=999999)
     """
     return concatenate([a, b], axis)
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index 3be4d36..2e3b84e 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -81,15 +81,14 @@
     >>> a[1, 2] = ma.masked
     >>> a[2, 1] = ma.masked
     >>> a
-    masked_array(data =
-     [[0 1 2]
-     [-- 4 --]
-     [6 -- 8]],
-          mask =
-     [[False False False]
-     [ True False  True]
-     [False  True False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, 1, 2],
+            [--, 4, --],
+            [6, --, 8]],
+      mask=[[False, False, False],
+            [ True, False,  True],
+            [False,  True, False]],
+      fill_value=999999)
     >>> ma.count_masked(a)
     3
 
@@ -132,15 +131,15 @@
     --------
     >>> import numpy.ma as ma
     >>> ma.masked_all((3, 3))
-    masked_array(data =
-     [[-- -- --]
-     [-- -- --]
-     [-- -- --]],
-          mask =
-     [[ True  True  True]
-     [ True  True  True]
-     [ True  True  True]],
-          fill_value=1e+20)
+    masked_array(
+      data=[[--, --, --],
+            [--, --, --],
+            [--, --, --]],
+      mask=[[ True,  True,  True],
+            [ True,  True,  True],
+            [ True,  True,  True]],
+      fill_value=1e+20,
+      dtype=float64)
 
     The `dtype` parameter defines the underlying data type.
 
@@ -188,16 +187,16 @@
     >>> import numpy.ma as ma
     >>> arr = np.zeros((2, 3), dtype=np.float32)
     >>> arr
-    array([[ 0.,  0.,  0.],
-           [ 0.,  0.,  0.]], dtype=float32)
+    array([[0., 0., 0.],
+           [0., 0., 0.]], dtype=float32)
     >>> ma.masked_all_like(arr)
-    masked_array(data =
-     [[-- -- --]
-     [-- -- --]],
-          mask =
-     [[ True  True  True]
-     [ True  True  True]],
-          fill_value=1e+20)
+    masked_array(
+      data=[[--, --, --],
+            [--, --, --]],
+      mask=[[ True,  True,  True],
+            [ True,  True,  True]],
+      fill_value=1e+20,
+      dtype=float32)
 
     The dtype of the masked array matches the dtype of `arr`.
 
@@ -492,28 +491,45 @@
 
     Examples
     --------
-    >>> a = ma.arange(24).reshape(2,3,4)
-    >>> a[:,0,1] = ma.masked
-    >>> a[:,1,:] = ma.masked
-    >>> print(a)
-    [[[0 -- 2 3]
-      [-- -- -- --]
-      [8 9 10 11]]
-
-     [[12 -- 14 15]
-      [-- -- -- --]
-      [20 21 22 23]]]
-    >>> print(ma.apply_over_axes(ma.sum, a, [0,2]))
-    [[[46]
-      [--]
-      [124]]]
+    >>> a = np.ma.arange(24).reshape(2,3,4)
+    >>> a[:,0,1] = np.ma.masked
+    >>> a[:,1,:] = np.ma.masked
+    >>> a
+    masked_array(
+      data=[[[0, --, 2, 3],
+             [--, --, --, --],
+             [8, 9, 10, 11]],
+            [[12, --, 14, 15],
+             [--, --, --, --],
+             [20, 21, 22, 23]]],
+      mask=[[[False,  True, False, False],
+             [ True,  True,  True,  True],
+             [False, False, False, False]],
+            [[False,  True, False, False],
+             [ True,  True,  True,  True],
+             [False, False, False, False]]],
+      fill_value=999999)
+    >>> np.ma.apply_over_axes(np.ma.sum, a, [0,2])
+    masked_array(
+      data=[[[46],
+             [--],
+             [124]]],
+      mask=[[[False],
+             [ True],
+             [False]]],
+      fill_value=999999)
 
     Tuple axis arguments to ufuncs are equivalent:
 
-    >>> print(ma.sum(a, axis=(0,2)).reshape((1,-1,1)))
-    [[[46]
-      [--]
-      [124]]]
+    >>> np.ma.sum(a, axis=(0,2)).reshape((1,-1,1))
+    masked_array(
+      data=[[[46],
+             [--],
+             [124]]],
+      mask=[[[False],
+             [ True],
+             [False]]],
+      fill_value=999999)
     """
 
 
@@ -558,14 +574,19 @@
     1.25
 
     >>> x = np.ma.arange(6.).reshape(3, 2)
-    >>> print(x)
-    [[ 0.  1.]
-     [ 2.  3.]
-     [ 4.  5.]]
+    >>> x
+    masked_array(
+      data=[[0., 1.],
+            [2., 3.],
+            [4., 5.]],
+      mask=False,
+      fill_value=1e+20)
     >>> avg, sumweights = np.ma.average(x, axis=0, weights=[1, 2, 3],
     ...                                 returned=True)
-    >>> print(avg)
-    [2.66666666667 3.66666666667]
+    >>> avg
+    masked_array(data=[2.6666666666666665, 3.6666666666666665],
+                 mask=[False, False],
+           fill_value=1e+20)
 
     """
     a = asarray(a)
@@ -676,9 +697,9 @@
     >>> np.ma.median(x)
     2.5
     >>> np.ma.median(x, axis=-1, overwrite_input=True)
-    masked_array(data = [ 2.  5.],
-                 mask = False,
-           fill_value = 1e+20)
+    masked_array(data=[2.0, 5.0],
+                 mask=[False, False],
+           fill_value=1e+20)
 
     """
     if not hasattr(a, 'mask'):
@@ -856,15 +877,14 @@
     ...                                                   [1, 0, 0],
     ...                                                   [0, 0, 0]])
     >>> x
-    masked_array(data =
-     [[-- 1 2]
-     [-- 4 5]
-     [6 7 8]],
-                 mask =
-     [[ True False False]
-     [ True False False]
-     [False False False]],
-           fill_value = 999999)
+    masked_array(
+      data=[[--, 1, 2],
+            [--, 4, 5],
+            [6, 7, 8]],
+      mask=[[ True, False, False],
+            [ True, False, False],
+            [False, False, False]],
+      fill_value=999999)
 
     >>> np.ma.compress_rowcols(x)
     array([[7, 8]])
@@ -937,25 +957,24 @@
            [0, 0, 0]])
     >>> a = ma.masked_equal(a, 1)
     >>> a
-    masked_array(data =
-     [[0 0 0]
-     [0 -- 0]
-     [0 0 0]],
-          mask =
-     [[False False False]
-     [False  True False]
-     [False False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, 0, 0],
+            [0, --, 0],
+            [0, 0, 0]],
+      mask=[[False, False, False],
+            [False,  True, False],
+            [False, False, False]],
+      fill_value=1)
+
     >>> ma.mask_rows(a)
-    masked_array(data =
-     [[0 0 0]
-     [-- -- --]
-     [0 0 0]],
-          mask =
-     [[False False False]
-     [ True  True  True]
-     [False False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, 0, 0],
+            [--, --, --],
+            [0, 0, 0]],
+      mask=[[False, False, False],
+            [ True,  True,  True],
+            [False, False, False]],
+      fill_value=1)
 
     """
     return mask_rowcols(a, 0)
@@ -982,25 +1001,23 @@
            [0, 0, 0]])
     >>> a = ma.masked_equal(a, 1)
     >>> a
-    masked_array(data =
-     [[0 0 0]
-     [0 -- 0]
-     [0 0 0]],
-          mask =
-     [[False False False]
-     [False  True False]
-     [False False False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, 0, 0],
+            [0, --, 0],
+            [0, 0, 0]],
+      mask=[[False, False, False],
+            [False,  True, False],
+            [False, False, False]],
+      fill_value=1)
     >>> ma.mask_cols(a)
-    masked_array(data =
-     [[0 -- 0]
-     [0 -- 0]
-     [0 -- 0]],
-          mask =
-     [[False  True False]
-     [False  True False]
-     [False  True False]],
-          fill_value=999999)
+    masked_array(
+      data=[[0, --, 0],
+            [0, --, 0],
+            [0, --, 0]],
+      mask=[[False,  True, False],
+            [False,  True, False],
+            [False,  True, False]],
+      fill_value=1)
 
     """
     return mask_rowcols(a, 1)
@@ -1078,12 +1095,12 @@
 
     Examples
     --------
-    >>> x = array([1, 3, 3, 3], mask=[0, 0, 0, 1])
-    >>> y = array([3, 1, 1, 1], mask=[0, 0, 0, 1])
-    >>> intersect1d(x, y)
-    masked_array(data = [1 3 --],
-                 mask = [False False  True],
-           fill_value = 999999)
+    >>> x = np.ma.array([1, 3, 3, 3], mask=[0, 0, 0, 1])
+    >>> y = np.ma.array([3, 1, 1, 1], mask=[0, 0, 0, 1])
+    >>> np.ma.intersect1d(x, y)
+    masked_array(data=[1, 3, --],
+                 mask=[False, False,  True],
+           fill_value=999999)
 
     """
     if assume_unique:
@@ -1216,9 +1233,9 @@
     --------
     >>> x = np.ma.array([1, 2, 3, 4], mask=[0, 1, 0, 1])
     >>> np.ma.setdiff1d(x, [1, 2])
-    masked_array(data = [3 --],
-                 mask = [False  True],
-           fill_value = 999999)
+    masked_array(data=[3, --],
+                 mask=[False,  True],
+           fill_value=999999)
 
     """
     if assume_unique:
@@ -1483,7 +1500,9 @@
     Examples
     --------
     >>> np.ma.mr_[np.ma.array([1,2,3]), 0, 0, np.ma.array([4,5,6])]
-    array([1, 2, 3, 0, 0, 4, 5, 6])
+    masked_array(data=[1, 2, 3, ..., 4, 5, 6],
+                 mask=False,
+           fill_value=999999)
 
     """
     def __init__(self):
@@ -1524,19 +1543,19 @@
     Examples
     --------
     >>> a = np.ma.arange(10)
-    >>> flatnotmasked_edges(a)
-    [0,-1]
+    >>> np.ma.flatnotmasked_edges(a)
+    array([0, 9])
 
     >>> mask = (a < 3) | (a > 8) | (a == 5)
     >>> a[mask] = np.ma.masked
     >>> np.array(a[~a.mask])
     array([3, 4, 6, 7, 8])
 
-    >>> flatnotmasked_edges(a)
+    >>> np.ma.flatnotmasked_edges(a)
     array([3, 8])
 
     >>> a[:] = np.ma.masked
-    >>> print(flatnotmasked_edges(ma))
+    >>> print(np.ma.flatnotmasked_edges(a))
     None
 
     """
@@ -1588,7 +1607,7 @@
     >>> np.array(am[~am.mask])
     array([0, 1, 2, 3, 6])
 
-    >>> np.ma.notmasked_edges(ma)
+    >>> np.ma.notmasked_edges(am)
     array([0, 6])
 
     """
@@ -1709,15 +1728,11 @@
     [slice(0, 1, None), slice(2, 4, None), slice(7, 9, None), slice(11, 12, None)]
 
     >>> np.ma.notmasked_contiguous(ma, axis=0)
-    [[slice(0, 1, None), slice(2, 3, None)],  # column broken into two segments
-     [],                                      # fully masked column
-     [slice(0, 1, None)],
-     [slice(0, 3, None)]]
+    [[slice(0, 1, None), slice(2, 3, None)], [], [slice(0, 1, None)], [slice(0, 3, None)]]
 
     >>> np.ma.notmasked_contiguous(ma, axis=1)
-    [[slice(0, 1, None), slice(2, 4, None)],  # row broken into two segments
-     [slice(3, 4, None)],
-     [slice(0, 1, None), slice(3, 4, None)]]
+    [[slice(0, 1, None), slice(2, 4, None)], [slice(3, 4, None)], [slice(0, 1, None), slice(3, 4, None)]]
+
     """
     a = asarray(a)
     nd = a.ndim
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 2775b11..e0dbf1b 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -2401,9 +2401,9 @@
         assert_equal(xm, y + 1)
 
         (x, _, xm) = self.floatdata
-        id1 = x.data.ctypes._data
+        id1 = x.data.ctypes.data
         x += 1.
-        assert_(id1 == x.data.ctypes._data)
+        assert_(id1 == x.data.ctypes.data)
         assert_equal(x, y + 1.)
 
     def test_inplace_addition_array(self):
diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py
index 5243cf7..afcfd12 100644
--- a/numpy/ma/tests/test_extras.py
+++ b/numpy/ma/tests/test_extras.py
@@ -891,61 +891,51 @@
                            expected)
 
     def test_nan(self):
-        with suppress_warnings() as w:
-            w.record(RuntimeWarning)
-            for mask in (False, np.zeros(6, dtype=bool)):
-                dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
-                dm.mask = mask
-
-                # scalar result
-                r = np.ma.median(dm, axis=None)
-                assert_(np.isscalar(r))
-                assert_array_equal(r, np.nan)
-                r = np.ma.median(dm.ravel(), axis=0)
-                assert_(np.isscalar(r))
-                assert_array_equal(r, np.nan)
-
-                r = np.ma.median(dm, axis=0)
-                assert_equal(type(r), MaskedArray)
-                assert_array_equal(r, [1, np.nan, 3])
-                r = np.ma.median(dm, axis=1)
-                assert_equal(type(r), MaskedArray)
-                assert_array_equal(r, [np.nan, 2])
-                r = np.ma.median(dm, axis=-1)
-                assert_equal(type(r), MaskedArray)
-                assert_array_equal(r, [np.nan, 2])
-
+        for mask in (False, np.zeros(6, dtype=bool)):
             dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
-            dm[:, 2] = np.ma.masked
-            assert_array_equal(np.ma.median(dm, axis=None), np.nan)
-            assert_array_equal(np.ma.median(dm, axis=0), [1, np.nan, 3])
-            assert_array_equal(np.ma.median(dm, axis=1), [np.nan, 1.5])
-            assert_equal([x.category is RuntimeWarning for x in w.log],
-                         [True]*13)
+            dm.mask = mask
+
+            # scalar result
+            r = np.ma.median(dm, axis=None)
+            assert_(np.isscalar(r))
+            assert_array_equal(r, np.nan)
+            r = np.ma.median(dm.ravel(), axis=0)
+            assert_(np.isscalar(r))
+            assert_array_equal(r, np.nan)
+
+            r = np.ma.median(dm, axis=0)
+            assert_equal(type(r), MaskedArray)
+            assert_array_equal(r, [1, np.nan, 3])
+            r = np.ma.median(dm, axis=1)
+            assert_equal(type(r), MaskedArray)
+            assert_array_equal(r, [np.nan, 2])
+            r = np.ma.median(dm, axis=-1)
+            assert_equal(type(r), MaskedArray)
+            assert_array_equal(r, [np.nan, 2])
+
+        dm = np.ma.array([[1, np.nan, 3], [1, 2, 3]])
+        dm[:, 2] = np.ma.masked
+        assert_array_equal(np.ma.median(dm, axis=None), np.nan)
+        assert_array_equal(np.ma.median(dm, axis=0), [1, np.nan, 3])
+        assert_array_equal(np.ma.median(dm, axis=1), [np.nan, 1.5])
 
     def test_out_nan(self):
-        with warnings.catch_warnings(record=True):
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            o = np.ma.masked_array(np.zeros((4,)))
-            d = np.ma.masked_array(np.ones((3, 4)))
-            d[2, 1] = np.nan
-            d[2, 2] = np.ma.masked
-            assert_equal(np.ma.median(d, 0, out=o), o)
-            o = np.ma.masked_array(np.zeros((3,)))
-            assert_equal(np.ma.median(d, 1, out=o), o)
-            o = np.ma.masked_array(np.zeros(()))
-            assert_equal(np.ma.median(d, out=o), o)
+        o = np.ma.masked_array(np.zeros((4,)))
+        d = np.ma.masked_array(np.ones((3, 4)))
+        d[2, 1] = np.nan
+        d[2, 2] = np.ma.masked
+        assert_equal(np.ma.median(d, 0, out=o), o)
+        o = np.ma.masked_array(np.zeros((3,)))
+        assert_equal(np.ma.median(d, 1, out=o), o)
+        o = np.ma.masked_array(np.zeros(()))
+        assert_equal(np.ma.median(d, out=o), o)
 
     def test_nan_behavior(self):
         a = np.ma.masked_array(np.arange(24, dtype=float))
         a[::3] = np.ma.masked
         a[2] = np.nan
-        with suppress_warnings() as w:
-            w.record(RuntimeWarning)
-            assert_array_equal(np.ma.median(a), np.nan)
-            assert_array_equal(np.ma.median(a, axis=0), np.nan)
-            assert_(w.log[0].category is RuntimeWarning)
-            assert_(w.log[1].category is RuntimeWarning)
+        assert_array_equal(np.ma.median(a), np.nan)
+        assert_array_equal(np.ma.median(a, axis=0), np.nan)
 
         a = np.ma.masked_array(np.arange(24, dtype=float).reshape(2, 3, 4))
         a.mask = np.arange(a.size) % 2 == 1
@@ -954,39 +944,26 @@
         a[1, 1, 2] = np.nan
 
         # no axis
-        with suppress_warnings() as w:
-            w.record(RuntimeWarning)
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_array_equal(np.ma.median(a), np.nan)
-            assert_(np.isscalar(np.ma.median(a)))
-            assert_(w.log[0].category is RuntimeWarning)
+        assert_array_equal(np.ma.median(a), np.nan)
+        assert_(np.isscalar(np.ma.median(a)))
 
         # axis0
         b = np.ma.median(aorig, axis=0)
         b[2, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.ma.median(a, 0), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.ma.median(a, 0), b)
 
         # axis1
         b = np.ma.median(aorig, axis=1)
         b[1, 3] = np.nan
         b[1, 2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.ma.median(a, 1), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.ma.median(a, 1), b)
 
         # axis02
         b = np.ma.median(aorig, axis=(0, 2))
         b[1] = np.nan
         b[2] = np.nan
-        with warnings.catch_warnings(record=True) as w:
-            warnings.filterwarnings('always', '', RuntimeWarning)
-            assert_equal(np.ma.median(a, (0, 2)), b)
-            assert_equal(len(w), 1)
+        assert_equal(np.ma.median(a, (0, 2)), b)
 
     def test_ambigous_fill(self):
         # 255 is max value, used as filler for sort
diff --git a/numpy/matlib.py b/numpy/matlib.py
index 004e5f0..9e11594 100644
--- a/numpy/matlib.py
+++ b/numpy/matlib.py
@@ -39,11 +39,11 @@
     --------
     >>> import numpy.matlib
     >>> np.matlib.empty((2, 2))    # filled with random data
-    matrix([[  6.76425276e-320,   9.79033856e-307],
-            [  7.39337286e-309,   3.22135945e-309]])        #random
+    matrix([[  6.76425276e-320,   9.79033856e-307], # random
+            [  7.39337286e-309,   3.22135945e-309]])
     >>> np.matlib.empty((2, 2), dtype=int)
-    matrix([[ 6600475,        0],
-            [ 6586976, 22740995]])                          #random
+    matrix([[ 6600475,        0], # random
+            [ 6586976, 22740995]])
 
     """
     return ndarray.__new__(matrix, shape, dtype, order=order)
@@ -82,11 +82,11 @@
     Examples
     --------
     >>> np.matlib.ones((2,3))
-    matrix([[ 1.,  1.,  1.],
-            [ 1.,  1.,  1.]])
+    matrix([[1.,  1.,  1.],
+            [1.,  1.,  1.]])
 
     >>> np.matlib.ones(2)
-    matrix([[ 1.,  1.]])
+    matrix([[1.,  1.]])
 
     """
     a = ndarray.__new__(matrix, shape, dtype, order=order)
@@ -126,11 +126,11 @@
     --------
     >>> import numpy.matlib
     >>> np.matlib.zeros((2, 3))
-    matrix([[ 0.,  0.,  0.],
-            [ 0.,  0.,  0.]])
+    matrix([[0.,  0.,  0.],
+            [0.,  0.,  0.]])
 
     >>> np.matlib.zeros(2)
-    matrix([[ 0.,  0.]])
+    matrix([[0.,  0.]])
 
     """
     a = ndarray.__new__(matrix, shape, dtype, order=order)
@@ -210,9 +210,9 @@
     --------
     >>> import numpy.matlib
     >>> np.matlib.eye(3, k=1, dtype=float)
-    matrix([[ 0.,  1.,  0.],
-            [ 0.,  0.,  1.],
-            [ 0.,  0.,  0.]])
+    matrix([[0.,  1.,  0.],
+            [0.,  0.,  1.],
+            [0.,  0.,  0.]])
 
     """
     return asmatrix(np.eye(n, M=M, k=k, dtype=dtype, order=order))
@@ -243,19 +243,20 @@
 
     Examples
     --------
+    >>> np.random.seed(123)
     >>> import numpy.matlib
     >>> np.matlib.rand(2, 3)
-    matrix([[ 0.68340382,  0.67926887,  0.83271405],
-            [ 0.00793551,  0.20468222,  0.95253525]])       #random
+    matrix([[0.69646919, 0.28613933, 0.22685145],
+            [0.55131477, 0.71946897, 0.42310646]])
     >>> np.matlib.rand((2, 3))
-    matrix([[ 0.84682055,  0.73626594,  0.11308016],
-            [ 0.85429008,  0.3294825 ,  0.89139555]])       #random
+    matrix([[0.9807642 , 0.68482974, 0.4809319 ],
+            [0.39211752, 0.34317802, 0.72904971]])
 
     If the first argument is a tuple, other arguments are ignored:
 
     >>> np.matlib.rand((2, 3), 4)
-    matrix([[ 0.46898646,  0.15163588,  0.95188261],
-            [ 0.59208621,  0.09561818,  0.00583606]])       #random
+    matrix([[0.43857224, 0.0596779 , 0.39804426],
+            [0.73799541, 0.18249173, 0.17545176]])
 
     """
     if isinstance(args[0], tuple):
@@ -294,18 +295,19 @@
 
     Examples
     --------
+    >>> np.random.seed(123)
     >>> import numpy.matlib
     >>> np.matlib.randn(1)
-    matrix([[-0.09542833]])                                 #random
+    matrix([[-1.0856306]])
     >>> np.matlib.randn(1, 2, 3)
-    matrix([[ 0.16198284,  0.0194571 ,  0.18312985],
-            [-0.7509172 ,  1.61055   ,  0.45298599]])       #random
+    matrix([[ 0.99734545,  0.2829785 , -1.50629471],
+            [-0.57860025,  1.65143654, -2.42667924]])
 
     Two-by-four matrix of samples from :math:`N(3, 6.25)`:
 
     >>> 2.5 * np.matlib.randn((2, 4)) + 3
-    matrix([[ 4.74085004,  8.89381862,  4.09042411,  4.83721922],
-            [ 7.52373709,  5.07933944, -2.64043543,  0.45610557]])  #random
+    matrix([[1.92771843, 6.16484065, 0.83314899, 1.30278462],
+            [2.76322758, 6.72847407, 1.40274501, 1.8900451 ]])
 
     """
     if isinstance(args[0], tuple):
diff --git a/numpy/matrixlib/defmatrix.py b/numpy/matrixlib/defmatrix.py
index 93b344c..6f8eadf 100644
--- a/numpy/matrixlib/defmatrix.py
+++ b/numpy/matrixlib/defmatrix.py
@@ -104,9 +104,9 @@
     Examples
     --------
     >>> a = np.matrix('1 2; 3 4')
-    >>> print(a)
-    [[1 2]
-     [3 4]]
+    >>> a
+    matrix([[1, 2],
+            [3, 4]])
 
     >>> np.matrix([[1, 2], [3, 4]])
     matrix([[1, 2],
@@ -310,12 +310,12 @@
         matrix([[3],
                 [7]])
         >>> x.sum(axis=1, dtype='float')
-        matrix([[ 3.],
-                [ 7.]])
-        >>> out = np.zeros((1, 2), dtype='float')
-        >>> x.sum(axis=1, dtype='float', out=out)
-        matrix([[ 3.],
-                [ 7.]])
+        matrix([[3.],
+                [7.]])
+        >>> out = np.zeros((2, 1), dtype='float')
+        >>> x.sum(axis=1, dtype='float', out=np.asmatrix(out))
+        matrix([[3.],
+                [7.]])
 
         """
         return N.ndarray.sum(self, axis, dtype, out, keepdims=True)._collapse(axis)
@@ -437,7 +437,7 @@
         >>> x.mean()
         5.5
         >>> x.mean(0)
-        matrix([[ 4.,  5.,  6.,  7.]])
+        matrix([[4., 5., 6., 7.]])
         >>> x.mean(1)
         matrix([[ 1.5],
                 [ 5.5],
@@ -469,9 +469,9 @@
                 [ 4,  5,  6,  7],
                 [ 8,  9, 10, 11]])
         >>> x.std()
-        3.4520525295346629
+        3.4520525295346629 # may vary
         >>> x.std(0)
-        matrix([[ 3.26598632,  3.26598632,  3.26598632,  3.26598632]])
+        matrix([[ 3.26598632,  3.26598632,  3.26598632,  3.26598632]]) # may vary
         >>> x.std(1)
         matrix([[ 1.11803399],
                 [ 1.11803399],
@@ -505,11 +505,11 @@
         >>> x.var()
         11.916666666666666
         >>> x.var(0)
-        matrix([[ 10.66666667,  10.66666667,  10.66666667,  10.66666667]])
+        matrix([[ 10.66666667,  10.66666667,  10.66666667,  10.66666667]]) # may vary
         >>> x.var(1)
-        matrix([[ 1.25],
-                [ 1.25],
-                [ 1.25]])
+        matrix([[1.25],
+                [1.25],
+                [1.25]])
 
         """
         return N.ndarray.var(self, axis, dtype, out, ddof, keepdims=True)._collapse(axis)
@@ -824,7 +824,7 @@
         matrix([[-2. ,  1. ],
                 [ 1.5, -0.5]])
         >>> m.getI() * m
-        matrix([[ 1.,  0.],
+        matrix([[ 1.,  0.], # may vary
                 [ 0.,  1.]])
 
         """
@@ -886,7 +886,8 @@
                 [ 4,  5,  6,  7],
                 [ 8,  9, 10, 11]])
         >>> x.getA1()
-        array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
+        array([ 0,  1,  2, ...,  9, 10, 11])
+
 
         """
         return self.__array__().ravel()
@@ -986,10 +987,10 @@
                 [  4. -4.j,   5. -5.j,   6. -6.j,   7. -7.j],
                 [  8. -8.j,   9. -9.j,  10.-10.j,  11.-11.j]])
         >>> z.getH()
-        matrix([[  0. +0.j,   4. +4.j,   8. +8.j],
-                [  1. +1.j,   5. +5.j,   9. +9.j],
-                [  2. +2.j,   6. +6.j,  10.+10.j],
-                [  3. +3.j,   7. +7.j,  11.+11.j]])
+        matrix([[ 0. -0.j,  4. +4.j,  8. +8.j],
+                [ 1. +1.j,  5. +5.j,  9. +9.j],
+                [ 2. +2.j,  6. +6.j, 10.+10.j],
+                [ 3. +3.j,  7. +7.j, 11.+11.j]])
 
         """
         if issubclass(self.dtype.type, N.complexfloating):
diff --git a/numpy/polynomial/chebyshev.py b/numpy/polynomial/chebyshev.py
index 92cdb18..e0734e1 100644
--- a/numpy/polynomial/chebyshev.py
+++ b/numpy/polynomial/chebyshev.py
@@ -361,12 +361,12 @@
     >>> from numpy import polynomial as P
     >>> p = P.Polynomial(range(4))
     >>> p
-    Polynomial([ 0.,  1.,  2.,  3.], domain=[-1,  1], window=[-1,  1])
+    Polynomial([0., 1., 2., 3.], domain=[-1,  1], window=[-1,  1])
     >>> c = p.convert(kind=P.Chebyshev)
     >>> c
-    Chebyshev([ 1.  ,  3.25,  1.  ,  0.75], domain=[-1,  1], window=[-1,  1])
+    Chebyshev([1.  , 3.25, 1.  , 0.75], domain=[-1.,  1.], window=[-1.,  1.])
     >>> P.chebyshev.poly2cheb(range(4))
-    array([ 1.  ,  3.25,  1.  ,  0.75])
+    array([1.  , 3.25, 1.  , 0.75])
 
     """
     [pol] = pu.as_series([pol])
@@ -413,12 +413,12 @@
     >>> from numpy import polynomial as P
     >>> c = P.Chebyshev(range(4))
     >>> c
-    Chebyshev([ 0.,  1.,  2.,  3.], [-1.,  1.])
+    Chebyshev([0., 1., 2., 3.], domain=[-1,  1], window=[-1,  1])
     >>> p = c.convert(kind=P.Polynomial)
     >>> p
-    Polynomial([ -2.,  -8.,   4.,  12.], [-1.,  1.])
+    Polynomial([-2., -8.,  4., 12.], domain=[-1.,  1.], window=[-1.,  1.])
     >>> P.chebyshev.cheb2poly(range(4))
-    array([ -2.,  -8.,   4.,  12.])
+    array([-2.,  -8.,   4.,  12.])
 
     """
     from .polynomial import polyadd, polysub, polymulx
@@ -538,7 +538,7 @@
     array([ 0.  , -0.25,  0.  ,  0.25])
     >>> j = complex(0,1)
     >>> C.chebfromroots((-j,j)) # x^2 + 1 relative to the standard basis
-    array([ 1.5+0.j,  0.0+0.j,  0.5+0.j])
+    array([1.5+0.j, 0. +0.j, 0.5+0.j])
 
     """
     if len(roots) == 0:
@@ -594,7 +594,7 @@
     >>> c1 = (1,2,3)
     >>> c2 = (3,2,1)
     >>> C.chebadd(c1,c2)
-    array([ 4.,  4.,  4.])
+    array([4., 4., 4.])
 
     """
     # c1, c2 are trimmed copies
@@ -688,7 +688,7 @@
     --------
     >>> from numpy.polynomial import chebyshev as C
     >>> C.chebmulx([1,2,3])
-    array([ 1.,  2.5,  3.,  1.5,  2.])
+    array([1. , 2.5, 1. , 1.5])
 
     """
     # c is a trimmed copy
@@ -796,10 +796,10 @@
     >>> c1 = (1,2,3)
     >>> c2 = (3,2,1)
     >>> C.chebdiv(c1,c2) # quotient "intuitive," remainder not
-    (array([ 3.]), array([-8., -4.]))
+    (array([3.]), array([-8., -4.]))
     >>> c2 = (0,1,2,3)
     >>> C.chebdiv(c2,c1) # neither "intuitive"
-    (array([ 0.,  2.]), array([-2., -4.]))
+    (array([0., 2.]), array([-2., -4.]))
 
     """
     # c1, c2 are trimmed copies
@@ -853,7 +853,7 @@
     --------
     >>> from numpy.polynomial import chebyshev as C
     >>> C.chebpow([1, 2, 3, 4], 2)
-    array([15.5, 22. , 16. , 14. , 12.5, 12. ,  8. ])
+    array([15.5, 22. , 16. , ..., 12.5, 12. ,  8. ])
 
     """
     # c is a trimmed copy
@@ -928,13 +928,13 @@
     >>> from numpy.polynomial import chebyshev as C
     >>> c = (1,2,3,4)
     >>> C.chebder(c)
-    array([ 14.,  12.,  24.])
+    array([14., 12., 24.])
     >>> C.chebder(c,3)
-    array([ 96.])
+    array([96.])
     >>> C.chebder(c,scl=-1)
     array([-14., -12., -24.])
     >>> C.chebder(c,2,-1)
-    array([ 12.,  96.])
+    array([12.,  96.])
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -1048,8 +1048,8 @@
     >>> C.chebint(c)
     array([ 0.5, -0.5,  0.5,  0.5])
     >>> C.chebint(c,3)
-    array([ 0.03125   , -0.1875    ,  0.04166667, -0.05208333,  0.01041667,
-            0.00625   ])
+    array([ 0.03125   , -0.1875    ,  0.04166667, -0.05208333,  0.01041667, # may vary
+        0.00625   ])
     >>> C.chebint(c, k=3)
     array([ 3.5, -0.5,  0.5,  0.5])
     >>> C.chebint(c,lbnd=-2)
@@ -1674,7 +1674,7 @@
         warnings can be turned off by
 
         >>> import warnings
-        >>> warnings.simplefilter('ignore', RankWarning)
+        >>> warnings.simplefilter('ignore', np.RankWarning)
 
     See Also
     --------
@@ -1885,7 +1885,7 @@
     --------
     >>> import numpy.polynomial.chebyshev as cheb
     >>> cheb.chebroots((-1, 1,-1, 1)) # T3 - T2 + T1 - T0 has real roots
-    array([ -5.00000000e-01,   2.60860684e-17,   1.00000000e+00])
+    array([ -5.00000000e-01,   2.60860684e-17,   1.00000000e+00]) # may vary
 
     """
     # c is a trimmed copy
diff --git a/numpy/polynomial/hermite.py b/numpy/polynomial/hermite.py
index 4905f36..93c9fc5 100644
--- a/numpy/polynomial/hermite.py
+++ b/numpy/polynomial/hermite.py
@@ -114,7 +114,7 @@
     --------
     >>> from numpy.polynomial.hermite import poly2herm
     >>> poly2herm(np.arange(4))
-    array([ 1.   ,  2.75 ,  0.5  ,  0.375])
+    array([1.   ,  2.75 ,  0.5  ,  0.375])
 
     """
     [pol] = pu.as_series([pol])
@@ -160,7 +160,7 @@
     --------
     >>> from numpy.polynomial.hermite import herm2poly
     >>> herm2poly([ 1.   ,  2.75 ,  0.5  ,  0.375])
-    array([ 0.,  1.,  2.,  3.])
+    array([0., 1., 2., 3.])
 
     """
     from .polynomial import polyadd, polysub, polymulx
@@ -280,10 +280,10 @@
     >>> from numpy.polynomial.hermite import hermfromroots, hermval
     >>> coef = hermfromroots((-1, 0, 1))
     >>> hermval((-1, 0, 1), coef)
-    array([ 0.,  0.,  0.])
+    array([0.,  0.,  0.])
     >>> coef = hermfromroots((-1j, 1j))
     >>> hermval((-1j, 1j), coef)
-    array([ 0.+0.j,  0.+0.j])
+    array([0.+0.j, 0.+0.j])
 
     """
     if len(roots) == 0:
@@ -337,7 +337,7 @@
     --------
     >>> from numpy.polynomial.hermite import hermadd
     >>> hermadd([1, 2, 3], [1, 2, 3, 4])
-    array([ 2.,  4.,  6.,  4.])
+    array([2., 4., 6., 4.])
 
     """
     # c1, c2 are trimmed copies
@@ -385,7 +385,7 @@
     --------
     >>> from numpy.polynomial.hermite import hermsub
     >>> hermsub([1, 2, 3, 4], [1, 2, 3])
-    array([ 0.,  0.,  0.,  4.])
+    array([0.,  0.,  0.,  4.])
 
     """
     # c1, c2 are trimmed copies
@@ -435,7 +435,7 @@
     --------
     >>> from numpy.polynomial.hermite import hermmulx
     >>> hermmulx([1, 2, 3])
-    array([ 2. ,  6.5,  1. ,  1.5])
+    array([2. , 6.5, 1. , 1.5])
 
     """
     # c is a trimmed copy
@@ -488,7 +488,7 @@
     --------
     >>> from numpy.polynomial.hermite import hermmul
     >>> hermmul([1, 2, 3], [0, 1, 2])
-    array([ 52.,  29.,  52.,   7.,   6.])
+    array([52.,  29.,  52.,   7.,   6.])
 
     """
     # s1, s2 are trimmed copies
@@ -557,11 +557,11 @@
     --------
     >>> from numpy.polynomial.hermite import hermdiv
     >>> hermdiv([ 52.,  29.,  52.,   7.,   6.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 0.]))
+    (array([1., 2., 3.]), array([0.]))
     >>> hermdiv([ 54.,  31.,  52.,   7.,   6.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 2.,  2.]))
+    (array([1., 2., 3.]), array([2., 2.]))
     >>> hermdiv([ 53.,  30.,  52.,   7.,   6.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 1.,  1.]))
+    (array([1., 2., 3.]), array([1., 1.]))
 
     """
     # c1, c2 are trimmed copies
@@ -617,7 +617,7 @@
     --------
     >>> from numpy.polynomial.hermite import hermpow
     >>> hermpow([1, 2, 3], 2)
-    array([ 81.,  52.,  82.,  12.,   9.])
+    array([81.,  52.,  82.,  12.,   9.])
 
     """
     # c is a trimmed copy
@@ -690,9 +690,9 @@
     --------
     >>> from numpy.polynomial.hermite import hermder
     >>> hermder([ 1. ,  0.5,  0.5,  0.5])
-    array([ 1.,  2.,  3.])
+    array([1., 2., 3.])
     >>> hermder([-0.5,  1./2.,  1./8.,  1./12.,  1./16.], m=2)
-    array([ 1.,  2.,  3.])
+    array([1., 2., 3.])
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -799,15 +799,15 @@
     --------
     >>> from numpy.polynomial.hermite import hermint
     >>> hermint([1,2,3]) # integrate once, value 0 at 0.
-    array([ 1. ,  0.5,  0.5,  0.5])
+    array([1. , 0.5, 0.5, 0.5])
     >>> hermint([1,2,3], m=2) # integrate twice, value & deriv 0 at 0
-    array([-0.5       ,  0.5       ,  0.125     ,  0.08333333,  0.0625    ])
+    array([-0.5       ,  0.5       ,  0.125     ,  0.08333333,  0.0625    ]) # may vary
     >>> hermint([1,2,3], k=1) # integrate once, value 1 at 0.
-    array([ 2. ,  0.5,  0.5,  0.5])
+    array([2. , 0.5, 0.5, 0.5])
     >>> hermint([1,2,3], lbnd=-1) # integrate once, value 0 at -1
     array([-2. ,  0.5,  0.5,  0.5])
     >>> hermint([1,2,3], m=2, k=[1,2], lbnd=-1)
-    array([ 1.66666667, -0.5       ,  0.125     ,  0.08333333,  0.0625    ])
+    array([ 1.66666667, -0.5       ,  0.125     ,  0.08333333,  0.0625    ]) # may vary
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -918,8 +918,8 @@
     >>> hermval(1, coef)
     11.0
     >>> hermval([[1,2],[3,4]], coef)
-    array([[  11.,   51.],
-           [ 115.,  203.]])
+    array([[ 11.,   51.],
+           [115.,  203.]])
 
     """
     c = np.array(c, ndmin=1, copy=0)
@@ -1437,7 +1437,7 @@
         warnings can be turned off by
 
         >>> import warnings
-        >>> warnings.simplefilter('ignore', RankWarning)
+        >>> warnings.simplefilter('ignore', np.RankWarning)
 
     See Also
     --------
@@ -1490,7 +1490,7 @@
     >>> err = np.random.randn(len(x))/10
     >>> y = hermval(x, [1, 2, 3]) + err
     >>> hermfit(x, y, 2)
-    array([ 0.97902637,  1.99849131,  3.00006   ])
+    array([1.0218, 1.9986, 2.9999]) # may vary
 
     """
     x = np.asarray(x) + 0.0
@@ -1656,9 +1656,9 @@
     >>> from numpy.polynomial.hermite import hermroots, hermfromroots
     >>> coef = hermfromroots([-1, 0, 1])
     >>> coef
-    array([ 0.   ,  0.25 ,  0.   ,  0.125])
+    array([0.   ,  0.25 ,  0.   ,  0.125])
     >>> hermroots(coef)
-    array([ -1.00000000e+00,  -1.38777878e-17,   1.00000000e+00])
+    array([-1.00000000e+00, -1.38777878e-17,  1.00000000e+00])
 
     """
     # c is a trimmed copy
diff --git a/numpy/polynomial/hermite_e.py b/numpy/polynomial/hermite_e.py
index 6cb044a..bafb4b9 100644
--- a/numpy/polynomial/hermite_e.py
+++ b/numpy/polynomial/hermite_e.py
@@ -161,7 +161,7 @@
     --------
     >>> from numpy.polynomial.hermite_e import herme2poly
     >>> herme2poly([  2.,  10.,   2.,   3.])
-    array([ 0.,  1.,  2.,  3.])
+    array([0.,  1.,  2.,  3.])
 
     """
     from .polynomial import polyadd, polysub, polymulx
@@ -281,10 +281,10 @@
     >>> from numpy.polynomial.hermite_e import hermefromroots, hermeval
     >>> coef = hermefromroots((-1, 0, 1))
     >>> hermeval((-1, 0, 1), coef)
-    array([ 0.,  0.,  0.])
+    array([0., 0., 0.])
     >>> coef = hermefromroots((-1j, 1j))
     >>> hermeval((-1j, 1j), coef)
-    array([ 0.+0.j,  0.+0.j])
+    array([0.+0.j, 0.+0.j])
 
     """
     if len(roots) == 0:
@@ -338,7 +338,7 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermeadd
     >>> hermeadd([1, 2, 3], [1, 2, 3, 4])
-    array([ 2.,  4.,  6.,  4.])
+    array([2.,  4.,  6.,  4.])
 
     """
     # c1, c2 are trimmed copies
@@ -386,7 +386,7 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermesub
     >>> hermesub([1, 2, 3, 4], [1, 2, 3])
-    array([ 0.,  0.,  0.,  4.])
+    array([0., 0., 0., 4.])
 
     """
     # c1, c2 are trimmed copies
@@ -432,7 +432,7 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermemulx
     >>> hermemulx([1, 2, 3])
-    array([ 2.,  7.,  2.,  3.])
+    array([2.,  7.,  2.,  3.])
 
     """
     # c is a trimmed copy
@@ -485,7 +485,7 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermemul
     >>> hermemul([1, 2, 3], [0, 1, 2])
-    array([ 14.,  15.,  28.,   7.,   6.])
+    array([14.,  15.,  28.,   7.,   6.])
 
     """
     # s1, s2 are trimmed copies
@@ -554,9 +554,9 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermediv
     >>> hermediv([ 14.,  15.,  28.,   7.,   6.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 0.]))
+    (array([1., 2., 3.]), array([0.]))
     >>> hermediv([ 15.,  17.,  28.,   7.,   6.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 1.,  2.]))
+    (array([1., 2., 3.]), array([1., 2.]))
 
     """
     # c1, c2 are trimmed copies
@@ -612,7 +612,7 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermepow
     >>> hermepow([1, 2, 3], 2)
-    array([ 23.,  28.,  46.,  12.,   9.])
+    array([23.,  28.,  46.,  12.,   9.])
 
     """
     # c is a trimmed copy
@@ -685,9 +685,9 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermeder
     >>> hermeder([ 1.,  1.,  1.,  1.])
-    array([ 1.,  2.,  3.])
+    array([1.,  2.,  3.])
     >>> hermeder([-0.25,  1.,  1./2.,  1./3.,  1./4 ], m=2)
-    array([ 1.,  2.,  3.])
+    array([1.,  2.,  3.])
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -794,15 +794,15 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermeint
     >>> hermeint([1, 2, 3]) # integrate once, value 0 at 0.
-    array([ 1.,  1.,  1.,  1.])
+    array([1., 1., 1., 1.])
     >>> hermeint([1, 2, 3], m=2) # integrate twice, value & deriv 0 at 0
-    array([-0.25      ,  1.        ,  0.5       ,  0.33333333,  0.25      ])
+    array([-0.25      ,  1.        ,  0.5       ,  0.33333333,  0.25      ]) # may vary
     >>> hermeint([1, 2, 3], k=1) # integrate once, value 1 at 0.
-    array([ 2.,  1.,  1.,  1.])
+    array([2., 1., 1., 1.])
     >>> hermeint([1, 2, 3], lbnd=-1) # integrate once, value 0 at -1
     array([-1.,  1.,  1.,  1.])
     >>> hermeint([1, 2, 3], m=2, k=[1, 2], lbnd=-1)
-    array([ 1.83333333,  0.        ,  0.5       ,  0.33333333,  0.25      ])
+    array([ 1.83333333,  0.        ,  0.5       ,  0.33333333,  0.25      ]) # may vary
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -913,8 +913,8 @@
     >>> hermeval(1, coef)
     3.0
     >>> hermeval([[1,2],[3,4]], coef)
-    array([[  3.,  14.],
-           [ 31.,  54.]])
+    array([[ 3., 14.],
+           [31., 54.]])
 
     """
     c = np.array(c, ndmin=1, copy=0)
@@ -1430,7 +1430,7 @@
         warnings can be turned off by
 
         >>> import warnings
-        >>> warnings.simplefilter('ignore', RankWarning)
+        >>> warnings.simplefilter('ignore', np.RankWarning)
 
     See Also
     --------
@@ -1480,10 +1480,11 @@
     --------
     >>> from numpy.polynomial.hermite_e import hermefit, hermeval
     >>> x = np.linspace(-10, 10)
+    >>> np.random.seed(123)
     >>> err = np.random.randn(len(x))/10
     >>> y = hermeval(x, [1, 2, 3]) + err
     >>> hermefit(x, y, 2)
-    array([ 1.01690445,  1.99951418,  2.99948696])
+    array([ 1.01690445,  1.99951418,  2.99948696]) # may vary
 
     """
     x = np.asarray(x) + 0.0
@@ -1650,9 +1651,9 @@
     >>> from numpy.polynomial.hermite_e import hermeroots, hermefromroots
     >>> coef = hermefromroots([-1, 0, 1])
     >>> coef
-    array([ 0.,  2.,  0.,  1.])
+    array([0., 2., 0., 1.])
     >>> hermeroots(coef)
-    array([-1.,  0.,  1.])
+    array([-1.,  0.,  1.]) # may vary
 
     """
     # c is a trimmed copy
diff --git a/numpy/polynomial/laguerre.py b/numpy/polynomial/laguerre.py
index a116d20..9207c9a 100644
--- a/numpy/polynomial/laguerre.py
+++ b/numpy/polynomial/laguerre.py
@@ -160,7 +160,7 @@
     --------
     >>> from numpy.polynomial.laguerre import lag2poly
     >>> lag2poly([ 23., -63.,  58., -18.])
-    array([ 0.,  1.,  2.,  3.])
+    array([0., 1., 2., 3.])
 
     """
     from .polynomial import polyadd, polysub, polymulx
@@ -277,10 +277,10 @@
     >>> from numpy.polynomial.laguerre import lagfromroots, lagval
     >>> coef = lagfromroots((-1, 0, 1))
     >>> lagval((-1, 0, 1), coef)
-    array([ 0.,  0.,  0.])
+    array([0.,  0.,  0.])
     >>> coef = lagfromroots((-1j, 1j))
     >>> lagval((-1j, 1j), coef)
-    array([ 0.+0.j,  0.+0.j])
+    array([0.+0.j, 0.+0.j])
 
     """
     if len(roots) == 0:
@@ -334,7 +334,7 @@
     --------
     >>> from numpy.polynomial.laguerre import lagadd
     >>> lagadd([1, 2, 3], [1, 2, 3, 4])
-    array([ 2.,  4.,  6.,  4.])
+    array([2.,  4.,  6.,  4.])
 
 
     """
@@ -383,7 +383,7 @@
     --------
     >>> from numpy.polynomial.laguerre import lagsub
     >>> lagsub([1, 2, 3, 4], [1, 2, 3])
-    array([ 0.,  0.,  0.,  4.])
+    array([0.,  0.,  0.,  4.])
 
     """
     # c1, c2 are trimmed copies
@@ -433,7 +433,7 @@
     --------
     >>> from numpy.polynomial.laguerre import lagmulx
     >>> lagmulx([1, 2, 3])
-    array([ -1.,  -1.,  11.,  -9.])
+    array([-1.,  -1.,  11.,  -9.])
 
     """
     # c is a trimmed copy
@@ -556,9 +556,9 @@
     --------
     >>> from numpy.polynomial.laguerre import lagdiv
     >>> lagdiv([  8., -13.,  38., -51.,  36.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 0.]))
+    (array([1., 2., 3.]), array([0.]))
     >>> lagdiv([  9., -12.,  38., -51.,  36.], [0, 1, 2])
-    (array([ 1.,  2.,  3.]), array([ 1.,  1.]))
+    (array([1., 2., 3.]), array([1., 1.]))
 
     """
     # c1, c2 are trimmed copies
@@ -687,9 +687,9 @@
     --------
     >>> from numpy.polynomial.laguerre import lagder
     >>> lagder([ 1.,  1.,  1., -3.])
-    array([ 1.,  2.,  3.])
+    array([1.,  2.,  3.])
     >>> lagder([ 1.,  0.,  0., -4.,  3.], m=2)
-    array([ 1.,  2.,  3.])
+    array([1.,  2.,  3.])
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -805,9 +805,9 @@
     >>> lagint([1,2,3], k=1)
     array([ 2.,  1.,  1., -3.])
     >>> lagint([1,2,3], lbnd=-1)
-    array([ 11.5,   1. ,   1. ,  -3. ])
+    array([11.5,  1. ,  1. , -3. ])
     >>> lagint([1,2], m=2, k=[1,2], lbnd=-1)
-    array([ 11.16666667,  -5.        ,  -3.        ,   2.        ])
+    array([ 11.16666667,  -5.        ,  -3.        ,   2.        ]) # may vary
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -1436,7 +1436,7 @@
         warnings can be turned off by
 
         >>> import warnings
-        >>> warnings.simplefilter('ignore', RankWarning)
+        >>> warnings.simplefilter('ignore', np.RankWarning)
 
     See Also
     --------
@@ -1489,7 +1489,7 @@
     >>> err = np.random.randn(len(x))/10
     >>> y = lagval(x, [1, 2, 3]) + err
     >>> lagfit(x, y, 2)
-    array([ 0.96971004,  2.00193749,  3.00288744])
+    array([ 0.96971004,  2.00193749,  3.00288744]) # may vary
 
     """
     x = np.asarray(x) + 0.0
@@ -1656,7 +1656,7 @@
     >>> coef
     array([  2.,  -8.,  12.,  -6.])
     >>> lagroots(coef)
-    array([ -4.44089210e-16,   1.00000000e+00,   2.00000000e+00])
+    array([-4.4408921e-16,  1.0000000e+00,  2.0000000e+00])
 
     """
     # c is a trimmed copy
diff --git a/numpy/polynomial/legendre.py b/numpy/polynomial/legendre.py
index e9c2459..f81bc00 100644
--- a/numpy/polynomial/legendre.py
+++ b/numpy/polynomial/legendre.py
@@ -136,10 +136,10 @@
     >>> from numpy import polynomial as P
     >>> p = P.Polynomial(np.arange(4))
     >>> p
-    Polynomial([ 0.,  1.,  2.,  3.], domain=[-1,  1], window=[-1,  1])
+    Polynomial([0.,  1.,  2.,  3.], domain=[-1,  1], window=[-1,  1])
     >>> c = P.Legendre(P.legendre.poly2leg(p.coef))
     >>> c
-    Legendre([ 1.  ,  3.25,  1.  ,  0.75], domain=[-1,  1], window=[-1,  1])
+    Legendre([ 1.  ,  3.25,  1.  ,  0.75], domain=[-1,  1], window=[-1,  1]) # may vary
 
     """
     [pol] = pu.as_series([pol])
@@ -183,12 +183,13 @@
 
     Examples
     --------
+    >>> from numpy import polynomial as P
     >>> c = P.Legendre(range(4))
     >>> c
-    Legendre([ 0.,  1.,  2.,  3.], [-1.,  1.])
+    Legendre([0., 1., 2., 3.], domain=[-1,  1], window=[-1,  1])
     >>> p = c.convert(kind=P.Polynomial)
     >>> p
-    Polynomial([-1. , -3.5,  3. ,  7.5], [-1.,  1.])
+    Polynomial([-1. , -3.5,  3. ,  7.5], domain=[-1.,  1.], window=[-1.,  1.])
     >>> P.leg2poly(range(4))
     array([-1. , -3.5,  3. ,  7.5])
 
@@ -310,7 +311,7 @@
     array([ 0. , -0.4,  0. ,  0.4])
     >>> j = complex(0,1)
     >>> L.legfromroots((-j,j)) # x^2 + 1 relative to the standard basis
-    array([ 1.33333333+0.j,  0.00000000+0.j,  0.66666667+0.j])
+    array([ 1.33333333+0.j,  0.00000000+0.j,  0.66666667+0.j]) # may vary
 
     """
     if len(roots) == 0:
@@ -366,7 +367,7 @@
     >>> c1 = (1,2,3)
     >>> c2 = (3,2,1)
     >>> L.legadd(c1,c2)
-    array([ 4.,  4.,  4.])
+    array([4.,  4.,  4.])
 
     """
     # c1, c2 are trimmed copies
@@ -468,7 +469,7 @@
     --------
     >>> from numpy.polynomial import legendre as L
     >>> L.legmulx([1,2,3])
-    array([ 0.66666667, 2.2, 1.33333333, 1.8])
+    array([ 0.66666667, 2.2, 1.33333333, 1.8]) # may vary
 
     """
     # c is a trimmed copy
@@ -525,8 +526,8 @@
     >>> from numpy.polynomial import legendre as L
     >>> c1 = (1,2,3)
     >>> c2 = (3,2)
-    >>> P.legmul(c1,c2) # multiplication requires "reprojection"
-    array([  4.33333333,  10.4       ,  11.66666667,   3.6       ])
+    >>> L.legmul(c1,c2) # multiplication requires "reprojection"
+    array([  4.33333333,  10.4       ,  11.66666667,   3.6       ]) # may vary
 
     """
     # s1, s2 are trimmed copies
@@ -597,10 +598,10 @@
     >>> c1 = (1,2,3)
     >>> c2 = (3,2,1)
     >>> L.legdiv(c1,c2) # quotient "intuitive," remainder not
-    (array([ 3.]), array([-8., -4.]))
+    (array([3.]), array([-8., -4.]))
     >>> c2 = (0,1,2,3)
     >>> L.legdiv(c2,c1) # neither "intuitive"
-    (array([-0.07407407,  1.66666667]), array([-1.03703704, -2.51851852]))
+    (array([-0.07407407,  1.66666667]), array([-1.03703704, -2.51851852])) # may vary
 
     """
     # c1, c2 are trimmed copies
@@ -729,7 +730,7 @@
     >>> L.legder(c)
     array([  6.,   9.,  20.])
     >>> L.legder(c, 3)
-    array([ 60.])
+    array([60.])
     >>> L.legder(c, scl=-1)
     array([ -6.,  -9., -20.])
     >>> L.legder(c, 2,-1)
@@ -845,16 +846,16 @@
     >>> from numpy.polynomial import legendre as L
     >>> c = (1,2,3)
     >>> L.legint(c)
-    array([ 0.33333333,  0.4       ,  0.66666667,  0.6       ])
+    array([ 0.33333333,  0.4       ,  0.66666667,  0.6       ]) # may vary
     >>> L.legint(c, 3)
-    array([  1.66666667e-02,  -1.78571429e-02,   4.76190476e-02,
-            -1.73472348e-18,   1.90476190e-02,   9.52380952e-03])
+    array([  1.66666667e-02,  -1.78571429e-02,   4.76190476e-02, # may vary
+             -1.73472348e-18,   1.90476190e-02,   9.52380952e-03])
     >>> L.legint(c, k=3)
-    array([ 3.33333333,  0.4       ,  0.66666667,  0.6       ])
+     array([ 3.33333333,  0.4       ,  0.66666667,  0.6       ]) # may vary
     >>> L.legint(c, lbnd=-2)
-    array([ 7.33333333,  0.4       ,  0.66666667,  0.6       ])
+    array([ 7.33333333,  0.4       ,  0.66666667,  0.6       ]) # may vary
     >>> L.legint(c, scl=2)
-    array([ 0.66666667,  0.8       ,  1.33333333,  1.2       ])
+    array([ 0.66666667,  0.8       ,  1.33333333,  1.2       ]) # may vary
 
     """
     c = np.array(c, ndmin=1, copy=1)
@@ -1476,7 +1477,7 @@
         warnings can be turned off by
 
         >>> import warnings
-        >>> warnings.simplefilter('ignore', RankWarning)
+        >>> warnings.simplefilter('ignore', np.RankWarning)
 
     See Also
     --------
@@ -1686,7 +1687,7 @@
     --------
     >>> import numpy.polynomial.legendre as leg
     >>> leg.legroots((1, 2, 3, 4)) # 4L_3 + 3L_2 + 2L_1 + 1L_0, all real roots
-    array([-0.85099543, -0.11407192,  0.51506735])
+    array([-0.85099543, -0.11407192,  0.51506735]) # may vary
 
     """
     # c is a trimmed copy
diff --git a/numpy/polynomial/polynomial.py b/numpy/polynomial/polynomial.py
index 259cd31..69599e3 100644
--- a/numpy/polynomial/polynomial.py
+++ b/numpy/polynomial/polynomial.py
@@ -185,7 +185,7 @@
     array([ 0., -1.,  0.,  1.])
     >>> j = complex(0,1)
     >>> P.polyfromroots((-j,j)) # complex returned, though values are real
-    array([ 1.+0.j,  0.+0.j,  1.+0.j])
+    array([1.+0.j,  0.+0.j,  1.+0.j])
 
     """
     if len(roots) == 0:
@@ -233,7 +233,7 @@
     >>> c1 = (1,2,3)
     >>> c2 = (3,2,1)
     >>> sum = P.polyadd(c1,c2); sum
-    array([ 4.,  4.,  4.])
+    array([4.,  4.,  4.])
     >>> P.polyval(2, sum) # 4 + 4(2) + 4(2**2)
     28.0
 
@@ -401,9 +401,9 @@
     >>> c1 = (1,2,3)
     >>> c2 = (3,2,1)
     >>> P.polydiv(c1,c2)
-    (array([ 3.]), array([-8., -4.]))
+    (array([3.]), array([-8., -4.]))
     >>> P.polydiv(c2,c1)
-    (array([ 0.33333333]), array([ 2.66666667,  1.33333333]))
+    (array([ 0.33333333]), array([ 2.66666667,  1.33333333])) # may vary
 
     """
     # c1, c2 are trimmed copies
@@ -529,7 +529,7 @@
     >>> P.polyder(c) # (d/dx)(c) = 2 + 6x + 12x**2
     array([  2.,   6.,  12.])
     >>> P.polyder(c,3) # (d**3/dx**3)(c) = 24
-    array([ 24.])
+    array([24.])
     >>> P.polyder(c,scl=-1) # (d/d(-x))(c) = -2 - 6x - 12x**2
     array([ -2.,  -6., -12.])
     >>> P.polyder(c,2,-1) # (d**2/d(-x)**2)(c) = 6 + 24x
@@ -636,14 +636,14 @@
     >>> from numpy.polynomial import polynomial as P
     >>> c = (1,2,3)
     >>> P.polyint(c) # should return array([0, 1, 1, 1])
-    array([ 0.,  1.,  1.,  1.])
+    array([0.,  1.,  1.,  1.])
     >>> P.polyint(c,3) # should return array([0, 0, 0, 1/6, 1/12, 1/20])
-    array([ 0.        ,  0.        ,  0.        ,  0.16666667,  0.08333333,
-            0.05      ])
+    array([ 0.        ,  0.        ,  0.        ,  0.16666667,  0.08333333, # may vary
+            0.05      ])
     >>> P.polyint(c,k=3) # should return array([3, 1, 1, 1])
-    array([ 3.,  1.,  1.,  1.])
+    array([3.,  1.,  1.,  1.])
     >>> P.polyint(c,lbnd=-2) # should return array([6, 1, 1, 1])
-    array([ 6.,  1.,  1.,  1.])
+    array([6.,  1.,  1.,  1.])
     >>> P.polyint(c,scl=-2) # should return array([0, -2, -2, -2])
     array([ 0., -2., -2., -2.])
 
@@ -761,17 +761,17 @@
     array([[0, 1],
            [2, 3]])
     >>> polyval(a, [1,2,3])
-    array([[  1.,   6.],
-           [ 17.,  34.]])
+    array([[ 1.,   6.],
+           [17.,  34.]])
     >>> coef = np.arange(4).reshape(2,2) # multidimensional coefficients
     >>> coef
     array([[0, 1],
            [2, 3]])
     >>> polyval([1,2], coef, tensor=True)
-    array([[ 2.,  4.],
-           [ 4.,  7.]])
+    array([[2.,  4.],
+           [4.,  7.]])
     >>> polyval([1,2], coef, tensor=False)
-    array([ 2.,  7.])
+    array([2.,  7.])
 
     """
     c = np.array(c, ndmin=1, copy=0)
@@ -851,8 +851,8 @@
     array([[0, 1],
            [2, 3]])
     >>> polyvalfromroots(a, [-1, 0, 1])
-    array([[ -0.,   0.],
-           [  6.,  24.]])
+    array([[-0.,   0.],
+           [ 6.,  24.]])
     >>> r = np.arange(-2, 2).reshape(2,2) # multidimensional coefficients
     >>> r # each column of r defines one polynomial
     array([[-2, -1],
@@ -1363,7 +1363,7 @@
         be turned off by:
 
         >>> import warnings
-        >>> warnings.simplefilter('ignore', RankWarning)
+        >>> warnings.simplefilter('ignore', np.RankWarning)
 
     See Also
     --------
@@ -1410,26 +1410,27 @@
 
     Examples
     --------
+    >>> np.random.seed(123)
     >>> from numpy.polynomial import polynomial as P
     >>> x = np.linspace(-1,1,51) # x "data": [-1, -0.96, ..., 0.96, 1]
     >>> y = x**3 - x + np.random.randn(len(x)) # x^3 - x + N(0,1) "noise"
     >>> c, stats = P.polyfit(x,y,3,full=True)
+    >>> np.random.seed(123)
     >>> c # c[0], c[2] should be approx. 0, c[1] approx. -1, c[3] approx. 1
-    array([ 0.01909725, -1.30598256, -0.00577963,  1.02644286])
+    array([ 0.01909725, -1.30598256, -0.00577963,  1.02644286]) # may vary
     >>> stats # note the large SSR, explaining the rather poor results
-    [array([ 38.06116253]), 4, array([ 1.38446749,  1.32119158,  0.50443316,
-    0.28853036]), 1.1324274851176597e-014]
+    [array([ 38.06116253]), 4, array([ 1.38446749,  1.32119158,  0.50443316, # may vary
+             0.28853036]), 1.1324274851176597e-014]
 
     Same thing without the added noise
 
     >>> y = x**3 - x
     >>> c, stats = P.polyfit(x,y,3,full=True)
     >>> c # c[0], c[2] should be "very close to 0", c[1] ~= -1, c[3] ~= 1
-    array([ -1.73362882e-17,  -1.00000000e+00,  -2.67471909e-16,
-             1.00000000e+00])
+    array([-6.36925336e-18, -1.00000000e+00, -4.08053781e-16,  1.00000000e+00])
     >>> stats # note the minuscule SSR
-    [array([  7.46346754e-31]), 4, array([ 1.38446749,  1.32119158,
-    0.50443316,  0.28853036]), 1.1324274851176597e-014]
+    [array([  7.46346754e-31]), 4, array([ 1.38446749,  1.32119158, # may vary
+               0.50443316,  0.28853036]), 1.1324274851176597e-014]
 
     """
     x = np.asarray(x) + 0.0
@@ -1591,7 +1592,7 @@
     dtype('float64')
     >>> j = complex(0,1)
     >>> poly.polyroots(poly.polyfromroots((-j,0,j)))
-    array([  0.00000000e+00+0.j,   0.00000000e+00+1.j,   2.77555756e-17-1.j])
+    array([  0.00000000e+00+0.j,   0.00000000e+00+1.j,   2.77555756e-17-1.j]) # may vary
 
     """
     # c is a trimmed copy
diff --git a/numpy/polynomial/polyutils.py b/numpy/polynomial/polyutils.py
index c1ed0c9..eff4a8e 100644
--- a/numpy/polynomial/polyutils.py
+++ b/numpy/polynomial/polyutils.py
@@ -156,19 +156,19 @@
     >>> from numpy.polynomial import polyutils as pu
     >>> a = np.arange(4)
     >>> pu.as_series(a)
-    [array([ 0.]), array([ 1.]), array([ 2.]), array([ 3.])]
+    [array([0.]), array([1.]), array([2.]), array([3.])]
     >>> b = np.arange(6).reshape((2,3))
     >>> pu.as_series(b)
-    [array([ 0.,  1.,  2.]), array([ 3.,  4.,  5.])]
+    [array([0., 1., 2.]), array([3., 4., 5.])]
 
     >>> pu.as_series((1, np.arange(3), np.arange(2, dtype=np.float16)))
-    [array([ 1.]), array([ 0.,  1.,  2.]), array([ 0.,  1.])]
+    [array([1.]), array([0., 1., 2.]), array([0., 1.])]
 
     >>> pu.as_series([2, [1.1, 0.]])
-    [array([ 2.]), array([ 1.1])]
+    [array([2.]), array([1.1])]
 
     >>> pu.as_series([2, [1.1, 0.]], trim=False)
-    [array([ 2.]), array([ 1.1,  0. ])]
+    [array([2.]), array([1.1, 0. ])]
 
     """
     arrays = [np.array(a, ndmin=1, copy=0) for a in alist]
@@ -233,12 +233,12 @@
     --------
     >>> from numpy.polynomial import polyutils as pu
     >>> pu.trimcoef((0,0,3,0,5,0,0))
-    array([ 0.,  0.,  3.,  0.,  5.])
+    array([0.,  0.,  3.,  0.,  5.])
     >>> pu.trimcoef((0,0,1e-3,0,1e-5,0,0),1e-3) # item == tol is trimmed
-    array([ 0.])
+    array([0.])
     >>> i = complex(0,1) # works for complex
     >>> pu.trimcoef((3e-4,1e-3*(1-i),5e-4,2e-5*(1+i)), 1e-3)
-    array([ 0.0003+0.j   ,  0.0010-0.001j])
+    array([0.0003+0.j   , 0.001 -0.001j])
 
     """
     if tol < 0:
@@ -332,10 +332,10 @@
     >>> pu.mapparms((-1,1),(-1,1))
     (0.0, 1.0)
     >>> pu.mapparms((1,-1),(-1,1))
-    (0.0, -1.0)
+    (-0.0, -1.0)
     >>> i = complex(0,1)
     >>> pu.mapparms((-i,-1),(1,i))
-    ((1+1j), (1+0j))
+    ((1+1j), (1-0j))
 
     """
     oldlen = old[1] - old[0]
@@ -390,10 +390,10 @@
     >>> x = np.linspace(-1,1,6); x
     array([-1. , -0.6, -0.2,  0.2,  0.6,  1. ])
     >>> x_out = pu.mapdomain(x, old_domain, new_domain); x_out
-    array([ 0.        ,  1.25663706,  2.51327412,  3.76991118,  5.02654825,
+    array([ 0.        ,  1.25663706,  2.51327412,  3.76991118,  5.02654825, # may vary
             6.28318531])
     >>> x - pu.mapdomain(x_out, new_domain, old_domain)
-    array([ 0.,  0.,  0.,  0.,  0.,  0.])
+    array([0., 0., 0., 0., 0., 0.])
 
     Also works for complex numbers (and thus can be used to map any line in
     the complex plane to any other line therein).
@@ -402,9 +402,9 @@
     >>> old = (-1 - i, 1 + i)
     >>> new = (-1 + i, 1 - i)
     >>> z = np.linspace(old[0], old[1], 6); z
-    array([-1.0-1.j , -0.6-0.6j, -0.2-0.2j,  0.2+0.2j,  0.6+0.6j,  1.0+1.j ])
-    >>> new_z = P.mapdomain(z, old, new); new_z
-    array([-1.0+1.j , -0.6+0.6j, -0.2+0.2j,  0.2-0.2j,  0.6-0.6j,  1.0-1.j ])
+    array([-1. -1.j , -0.6-0.6j, -0.2-0.2j,  0.2+0.2j,  0.6+0.6j,  1. +1.j ])
+    >>> new_z = pu.mapdomain(z, old, new); new_z
+    array([-1.0+1.j , -0.6+0.6j, -0.2+0.2j,  0.2-0.2j,  0.6-0.6j,  1.0-1.j ]) # may vary
 
     """
     x = np.asanyarray(x)
diff --git a/numpy/polynomial/tests/test_polynomial.py b/numpy/polynomial/tests/test_polynomial.py
index 0c93be2..562aa90 100644
--- a/numpy/polynomial/tests/test_polynomial.py
+++ b/numpy/polynomial/tests/test_polynomial.py
@@ -9,7 +9,7 @@
 import numpy.polynomial.polynomial as poly
 from numpy.testing import (
     assert_almost_equal, assert_raises, assert_equal, assert_,
-    )
+    assert_array_equal)
 
 
 def trim(x):
@@ -147,6 +147,19 @@
             assert_equal(poly.polyval(x, [1, 0]).shape, dims)
             assert_equal(poly.polyval(x, [1, 0, 0]).shape, dims)
 
+        #check masked arrays are processed correctly
+        mask = [False, True, False]
+        mx = np.ma.array([1, 2, 3], mask=mask)
+        res = np.polyval([7, 5, 3], mx)
+        assert_array_equal(res.mask, mask)
+
+        #check subtypes of ndarray are preserved
+        class C(np.ndarray):
+            pass
+
+        cx = np.array([1, 2, 3]).view(C)
+        assert_equal(type(np.polyval([2, 3, 4], cx)), C)
+
     def test_polyvalfromroots(self):
         # check exception for broadcasting x values over root array with
         # too few dimensions
diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
index 21bc73e..f922c59 100644
--- a/numpy/random/mtrand/mtrand.pyx
+++ b/numpy/random/mtrand/mtrand.pyx
@@ -1,3 +1,5 @@
+# cython: language_level=3
+
 # mtrand.pyx -- A Pyrex wrapper of Jean-Sebastien Roy's RandomKit
 #
 # Copyright 2005 Robert Kern (robert.kern@gmail.com)
@@ -844,16 +846,16 @@
         Examples
         --------
         >>> np.random.random_sample()
-        0.47108547995356098
+        0.47108547995356098 # random
         >>> type(np.random.random_sample())
-        <type 'float'>
+        <class 'float'>
         >>> np.random.random_sample((5,))
-        array([ 0.30220482,  0.86820401,  0.1654503 ,  0.11659149,  0.54323428])
+        array([ 0.30220482,  0.86820401,  0.1654503 ,  0.11659149,  0.54323428]) # random
 
         Three-by-two array of random numbers from [-5, 0):
 
         >>> 5 * np.random.random_sample((3, 2)) - 5
-        array([[-3.99149989, -0.52338984],
+        array([[-3.99149989, -0.52338984], # random
                [-2.99091858, -0.79479508],
                [-1.23204345, -1.75224494]])
 
@@ -954,14 +956,14 @@
         Examples
         --------
         >>> np.random.randint(2, size=10)
-        array([1, 0, 0, 0, 1, 1, 0, 0, 1, 0])
+        array([1, 0, 0, 0, 1, 1, 0, 0, 1, 0]) # random
         >>> np.random.randint(1, size=10)
         array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 
         Generate a 2 x 4 array of ints between 0 and 4, inclusive:
 
         >>> np.random.randint(5, size=(2, 4))
-        array([[4, 0, 2, 1],
+        array([[4, 0, 2, 1], # random
                [3, 2, 2, 0]])
 
         """
@@ -1076,34 +1078,34 @@
         Generate a uniform random sample from np.arange(5) of size 3:
 
         >>> np.random.choice(5, 3)
-        array([0, 3, 4])
+        array([0, 3, 4]) # random
         >>> #This is equivalent to np.random.randint(0,5,3)
 
         Generate a non-uniform random sample from np.arange(5) of size 3:
 
         >>> np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0])
-        array([3, 3, 0])
+        array([3, 3, 0]) # random
 
         Generate a uniform random sample from np.arange(5) of size 3 without
         replacement:
 
         >>> np.random.choice(5, 3, replace=False)
-        array([3,1,0])
+        array([3,1,0]) # random
         >>> #This is equivalent to np.random.permutation(np.arange(5))[:3]
 
         Generate a non-uniform random sample from np.arange(5) of size
         3 without replacement:
 
         >>> np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0])
-        array([2, 3, 0])
+        array([2, 3, 0]) # random
 
         Any of the above can be repeated with an arbitrary array-like
         instead of just integers. For instance:
 
         >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher']
         >>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3])
-        array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'],
-              dtype='|S11')
+        array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'], # random
+              dtype='<U11')
 
         """
 
@@ -1139,9 +1141,12 @@
                 raise ValueError("'p' must be 1-dimensional")
             if p.size != pop_size:
                 raise ValueError("'a' and 'p' must have same size")
+            p_sum = kahan_sum(pix, d)
+            if np.isnan(p_sum):
+                raise ValueError("probabilities contain NaN")
             if np.logical_or.reduce(p < 0):
                 raise ValueError("probabilities are not non-negative")
-            if abs(kahan_sum(pix, d) - 1.) > atol:
+            if abs(p_sum - 1.) > atol:
                 raise ValueError("probabilities do not sum to 1")
 
         shape = size
@@ -1470,11 +1475,11 @@
         Examples
         --------
         >>> np.random.random_integers(5)
-        4
+        4 # random
         >>> type(np.random.random_integers(5))
-        <type 'int'>
+        <class 'numpy.int64'>
         >>> np.random.random_integers(5, size=(3,2))
-        array([[5, 4],
+        array([[5, 4], # random
                [3, 3],
                [4, 5]])
 
@@ -1483,7 +1488,7 @@
         :math:`{0, 5/8, 10/8, 15/8, 20/8}`):
 
         >>> 2.5 * (np.random.random_integers(5, size=(5,)) - 1) / 4.
-        array([ 0.625,  1.25 ,  0.625,  0.625,  2.5  ])
+        array([ 0.625,  1.25 ,  0.625,  0.625,  2.5  ]) # random
 
         Roll two six sided dice 1000 times and sum the results:
 
@@ -2068,8 +2073,8 @@
 
         The lower bound for the top 1% of the samples is :
 
-        >>> sort(s)[-10]
-        7.61988120985
+        >>> np.sort(s)[-10]
+        7.61988120985 # random
 
         So there is about a 1% chance that the F statistic will exceed 7.62,
         the measured value is 36, so the null hypothesis is rejected at the 1%
@@ -2166,6 +2171,7 @@
         >>> NF = np.histogram(nc_vals, bins=50, density=True)
         >>> c_vals = np.random.f(dfnum, dfden, 1000000)
         >>> F = np.histogram(c_vals, bins=50, density=True)
+        >>> import matplotlib.pyplot as plt
         >>> plt.plot(F[1][1:], F[0])
         >>> plt.plot(NF[1][1:], NF[0])
         >>> plt.show()
@@ -2261,7 +2267,7 @@
         Examples
         --------
         >>> np.random.chisquare(2,4)
-        array([ 1.89920014,  9.00867716,  3.13710533,  5.62318272])
+        array([ 1.89920014,  9.00867716,  3.13710533,  5.62318272]) # random
 
         """
         cdef ndarray odf
@@ -2443,6 +2449,7 @@
         --------
         Draw samples and plot the distribution:
 
+        >>> import matplotlib.pyplot as plt
         >>> s = np.random.standard_cauchy(1000000)
         >>> s = s[(s>-25) & (s<25)]  # truncate distribution so it plots well
         >>> plt.hist(s, bins=100)
@@ -3279,12 +3286,13 @@
 
         >>> loc, scale = 10, 1
         >>> s = np.random.logistic(loc, scale, 10000)
+        >>> import matplotlib.pyplot as plt
         >>> count, bins, ignored = plt.hist(s, bins=50)
 
         #   plot against distribution
 
         >>> def logist(x, loc, scale):
-        ...     return exp((loc-x)/scale)/(scale*(1+exp((loc-x)/scale))**2)
+        ...     return np.exp((loc-x)/scale)/(scale*(1+np.exp((loc-x)/scale))**2)
         >>> plt.plot(bins, logist(bins, loc, scale)*count.max()/\\
         ... logist(bins, loc, scale).max())
         >>> plt.show()
@@ -3479,6 +3487,7 @@
         --------
         Draw values from the distribution and plot the histogram
 
+        >>> from matplotlib.pyplot import hist
         >>> values = hist(np.random.rayleigh(3, 100000), bins=200, density=True)
 
         Wave heights tend to follow a Rayleigh distribution. If the mean wave
@@ -3492,7 +3501,7 @@
         The percentage of waves larger than 3 meters is:
 
         >>> 100.*sum(s>3)/1000000.
-        0.087300000000000003
+        0.087300000000000003 # random
 
         """
         cdef ndarray oscale
@@ -3873,9 +3882,9 @@
         single success after drilling 5 wells, after 6 wells, etc.?
 
         >>> s = np.random.negative_binomial(1, 0.1, 100000)
-        >>> for i in range(1, 11):
+        >>> for i in range(1, 11): # doctest: +SKIP
         ...    probability = sum(s<i) / 100000.
-        ...    print i, "wells drilled, probability of one success =", probability
+        ...    print(i, "wells drilled, probability of one success =", probability)
 
         """
         cdef ndarray on
@@ -4233,6 +4242,7 @@
         >>> ngood, nbad, nsamp = 100, 2, 10
         # number of good, number of bad, and number of samples
         >>> s = np.random.hypergeometric(ngood, nbad, nsamp, 1000)
+        >>> from matplotlib.pyplot import hist
         >>> hist(s)
         #   note that it is very unlikely to grab both bad items
 
@@ -4342,14 +4352,15 @@
 
         >>> a = .6
         >>> s = np.random.logseries(a, 10000)
+        >>> import matplotlib.pyplot as plt
         >>> count, bins, ignored = plt.hist(s)
 
         #   plot against distribution
 
         >>> def logseries(k, p):
-        ...     return -p**k/(k*log(1-p))
+        ...     return -p**k/(k*np.log(1-p))
         >>> plt.plot(bins, logseries(bins, a)*count.max()/
-                     logseries(bins, a).max(), 'r')
+        ...          logseries(bins, a).max(), 'r')
         >>> plt.show()
 
         """
@@ -4474,7 +4485,7 @@
         standard deviation:
 
         >>> list((x[0,0,:] - mean) < 0.6)
-        [True, True]
+        [True, True] # random
 
         """
         from numpy.dual import svd
@@ -4580,14 +4591,14 @@
         Throw a dice 20 times:
 
         >>> np.random.multinomial(20, [1/6.]*6, size=1)
-        array([[4, 1, 7, 5, 2, 1]])
+        array([[4, 1, 7, 5, 2, 1]]) # random
 
         It landed 4 times on 1, once on 2, etc.
 
         Now, throw the dice 20 times, and 20 times again:
 
         >>> np.random.multinomial(20, [1/6.]*6, size=2)
-        array([[3, 4, 3, 3, 4, 3],
+        array([[3, 4, 3, 3, 4, 3], # random
                [2, 4, 3, 4, 0, 7]])
 
         For the first run, we threw 3 times 1, 4 times 2, etc.  For the second,
@@ -4596,7 +4607,7 @@
         A loaded die is more likely to land on number 6:
 
         >>> np.random.multinomial(100, [1/7.]*5 + [2/7.])
-        array([11, 16, 14, 17, 16, 26])
+        array([11, 16, 14, 17, 16, 26]) # random
 
         The probability inputs should be normalized. As an implementation
         detail, the value of the last entry is ignored and assumed to take
@@ -4605,7 +4616,7 @@
         other should be sampled like so:
 
         >>> np.random.multinomial(100, [1.0 / 3, 2.0 / 3])  # RIGHT
-        array([38, 62])
+        array([38, 62]) # random
 
         not like:
 
@@ -4659,8 +4670,9 @@
 
         Draw `size` samples of dimension k from a Dirichlet distribution. A
         Dirichlet-distributed random variable can be seen as a multivariate
-        generalization of a Beta distribution. Dirichlet pdf is the conjugate
-        prior of a multinomial in Bayesian inference.
+        generalization of a Beta distribution. The Dirichlet distribution
+        is a conjugate prior of a multinomial distribution in Bayesian
+        inference.
 
         Parameters
         ----------
@@ -4684,13 +4696,24 @@
 
         Notes
         -----
-        .. math:: X \\approx \\prod_{i=1}^{k}{x^{\\alpha_i-1}_i}
 
-        Uses the following property for computation: for each dimension,
-        draw a random sample y_i from a standard gamma generator of shape
-        `alpha_i`, then
-        :math:`X = \\frac{1}{\\sum_{i=1}^k{y_i}} (y_1, \\ldots, y_n)` is
-        Dirichlet distributed.
+        The Dirichlet distribution is a distribution over vectors
+        :math:`x` that fulfil the conditions :math:`x_i>0` and
+        :math:`\\sum_{i=1}^k x_i = 1`.
+
+        The probability density function :math:`p` of a
+        Dirichlet-distributed random vector :math:`X` is
+        proportional to
+
+        .. math:: p(x) \\propto \\prod_{i=1}^{k}{x^{\\alpha_i-1}_i},
+
+        where :math:`\\alpha` is a vector containing the positive
+        concentration parameters.
+
+        The method uses the following property for computation: let :math:`Y`
+        be a random vector which has components that follow a standard gamma
+        distribution, then :math:`X = \\frac{1}{\\sum_{i=1}^k{Y_i}} Y`
+        is Dirichlet-distributed.
 
         References
         ----------
@@ -4710,6 +4733,7 @@
 
         >>> s = np.random.dirichlet((10, 5, 3), 20).transpose()
 
+        >>> import matplotlib.pyplot as plt
         >>> plt.barh(range(20), s[0])
         >>> plt.barh(range(20), s[1], left=s[0], color='g')
         >>> plt.barh(range(20), s[2], left=s[0]+s[1], color='r')
@@ -4798,14 +4822,14 @@
         >>> arr = np.arange(10)
         >>> np.random.shuffle(arr)
         >>> arr
-        [1 7 5 2 9 4 3 6 0 8]
+        [1 7 5 2 9 4 3 6 0 8] # random
 
         Multi-dimensional arrays are only shuffled along the first axis:
 
         >>> arr = np.arange(9).reshape((3, 3))
         >>> np.random.shuffle(arr)
         >>> arr
-        array([[3, 4, 5],
+        array([[3, 4, 5], # random
                [6, 7, 8],
                [0, 1, 2]])
 
@@ -4885,14 +4909,14 @@
         Examples
         --------
         >>> np.random.permutation(10)
-        array([1, 7, 4, 3, 0, 9, 2, 5, 8, 6])
+        array([1, 7, 4, 3, 0, 9, 2, 5, 8, 6]) # random
 
         >>> np.random.permutation([1, 4, 9, 12, 15])
-        array([15,  1,  9,  4, 12])
+        array([15,  1,  9,  4, 12]) # random
 
         >>> arr = np.arange(9).reshape((3, 3))
         >>> np.random.permutation(arr)
-        array([[6, 7, 8],
+        array([[6, 7, 8], # random
                [0, 1, 2],
                [3, 4, 5]])
 
diff --git a/numpy/random/mtrand/numpy.pxd b/numpy/random/mtrand/numpy.pxd
index 9092fa1..1b4fe6c 100644
--- a/numpy/random/mtrand/numpy.pxd
+++ b/numpy/random/mtrand/numpy.pxd
@@ -1,3 +1,5 @@
+# cython: language_level=3
+
 # :Author:    Travis Oliphant
 from cpython.exc cimport PyErr_Print
 
diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py
index d0bb92a..d4721bc 100644
--- a/numpy/random/tests/test_random.py
+++ b/numpy/random/tests/test_random.py
@@ -448,6 +448,11 @@
         assert_equal(np.random.choice(['a', 'b'], size=(3, 0, 4)).shape, (3, 0, 4))
         assert_raises(ValueError, np.random.choice, [], 10)
 
+    def test_choice_nan_probabilities(self):
+        a = np.array([42, 1, 2])
+        p = [None, None, None]
+        assert_raises(ValueError, np.random.choice, a, p=p)
+
     def test_bytes(self):
         np.random.seed(self.seed)
         actual = np.random.bytes(10)
diff --git a/numpy/testing/_private/nosetester.py b/numpy/testing/_private/nosetester.py
index 1728d9d..19569a5 100644
--- a/numpy/testing/_private/nosetester.py
+++ b/numpy/testing/_private/nosetester.py
@@ -92,7 +92,7 @@
 
     Alternatively, calling::
 
-    >>> run_module_suite(file_to_run="numpy/tests/test_matlib.py")
+    >>> run_module_suite(file_to_run="numpy/tests/test_matlib.py")  # doctest: +SKIP
 
     from an interpreter will run all the test routine in 'test_matlib.py'.
     """
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 55306e4..1f7b516 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -318,8 +318,9 @@
     Examples
     --------
     >>> np.testing.assert_equal([4,5], [4,6])
-    ...
-    <type 'exceptions.AssertionError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Items are not equal:
     item=1
      ACTUAL: 5
@@ -510,21 +511,24 @@
     >>> import numpy.testing as npt
     >>> npt.assert_almost_equal(2.3333333333333, 2.33333334)
     >>> npt.assert_almost_equal(2.3333333333333, 2.33333334, decimal=10)
-    ...
-    <type 'exceptions.AssertionError'>:
-    Items are not equal:
-     ACTUAL: 2.3333333333333002
-     DESIRED: 2.3333333399999998
+    Traceback (most recent call last):
+        ...
+    AssertionError:
+    Arrays are not almost equal to 10 decimals
+     ACTUAL: 2.3333333333333
+     DESIRED: 2.33333334
 
     >>> npt.assert_almost_equal(np.array([1.0,2.3333333333333]),
     ...                         np.array([1.0,2.33333334]), decimal=9)
-    ...
-    <type 'exceptions.AssertionError'>:
-    Arrays are not almost equal
-    <BLANKLINE>
-    (mismatch 50.0%)
-     x: array([ 1.        ,  2.33333333])
-     y: array([ 1.        ,  2.33333334])
+    Traceback (most recent call last):
+        ...
+    AssertionError:
+    Arrays are not almost equal to 9 decimals
+    Mismatch: 50%
+    Max absolute difference: 6.66669964e-09
+    Max relative difference: 2.85715698e-09
+     x: array([1.         , 2.333333333])
+     y: array([1.        , 2.33333334])
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -626,14 +630,15 @@
     --------
     >>> np.testing.assert_approx_equal(0.12345677777777e-20, 0.1234567e-20)
     >>> np.testing.assert_approx_equal(0.12345670e-20, 0.12345671e-20,
-                                       significant=8)
+    ...                                significant=8)
     >>> np.testing.assert_approx_equal(0.12345670e-20, 0.12345672e-20,
-                                       significant=8)
-    ...
-    <type 'exceptions.AssertionError'>:
+    ...                                significant=8)
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Items are not equal to 8 significant digits:
-     ACTUAL: 1.234567e-021
-     DESIRED: 1.2345672000000001e-021
+     ACTUAL: 1.234567e-21
+     DESIRED: 1.2345672e-21
 
     the evaluated condition that raises the exception is
 
@@ -660,10 +665,10 @@
         sc_actual = actual/scale
     except ZeroDivisionError:
         sc_actual = 0.0
-    msg = build_err_msg([actual, desired], err_msg,
-                header='Items are not equal to %d significant digits:' %
-                                 significant,
-                verbose=verbose)
+    msg = build_err_msg(
+        [actual, desired], err_msg,
+        header='Items are not equal to %d significant digits:' % significant,
+        verbose=verbose)
     try:
         # If one of desired/actual is not finite, handle it specially here:
         # check that both are nan if any is a nan, and test for equality
@@ -686,7 +691,7 @@
                          header='', precision=6, equal_nan=True,
                          equal_inf=True):
     __tracebackhide__ = True  # Hide traceback for py.test
-    from numpy.core import array, isnan, inf, bool_
+    from numpy.core import array, array2string, isnan, inf, bool_, errstate
 
     x = array(x, copy=False, subok=True)
     y = array(y, copy=False, subok=True)
@@ -782,15 +787,31 @@
             reduced = val.ravel()
             cond = reduced.all()
             reduced = reduced.tolist()
+
         # The below comparison is a hack to ensure that fully masked
         # results, for which val.ravel().all() returns np.ma.masked,
         # do not trigger a failure (np.ma.masked != True evaluates as
         # np.ma.masked, which is falsy).
         if cond != True:
             mismatch = 100.0 * reduced.count(0) / ox.size
-            msg = build_err_msg([ox, oy],
-                                err_msg
-                                + '\n(mismatch %s%%)' % (mismatch,),
+            remarks = ['Mismatch: {:.3g}%'.format(mismatch)]
+
+            with errstate(invalid='ignore', divide='ignore'):
+                # ignore errors for non-numeric types
+                with contextlib.suppress(TypeError):
+                    error = abs(x - y)
+                    max_abs_error = error.max()
+                    remarks.append('Max absolute difference: '
+                                   + array2string(max_abs_error))
+
+                    # note: this definition of relative error matches that one
+                    # used by assert_allclose (found in np.isclose)
+                    max_rel_error = (error / abs(y)).max()
+                    remarks.append('Max relative difference: '
+                                   + array2string(max_rel_error))
+
+            err_msg += '\n' + '\n'.join(remarks)
+            msg = build_err_msg([ox, oy], err_msg,
                                 verbose=verbose, header=header,
                                 names=('x', 'y'), precision=precision)
             raise AssertionError(msg)
@@ -850,14 +871,15 @@
 
     >>> np.testing.assert_array_equal([1.0,np.pi,np.nan],
     ...                               [1, np.sqrt(np.pi)**2, np.nan])
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
     AssertionError:
     Arrays are not equal
-    <BLANKLINE>
-    (mismatch 50.0%)
-     x: array([ 1.        ,  3.14159265,         NaN])
-     y: array([ 1.        ,  3.14159265,         NaN])
+    Mismatch: 33.3%
+    Max absolute difference: 4.4408921e-16
+    Max relative difference: 1.41357986e-16
+     x: array([1.      , 3.141593,      nan])
+     y: array([1.      , 3.141593,      nan])
 
     Use `assert_allclose` or one of the nulp (number of floating point values)
     functions for these cases instead:
@@ -922,26 +944,29 @@
     the first assert does not raise an exception
 
     >>> np.testing.assert_array_almost_equal([1.0,2.333,np.nan],
-                                             [1.0,2.333,np.nan])
+    ...                                      [1.0,2.333,np.nan])
 
     >>> np.testing.assert_array_almost_equal([1.0,2.33333,np.nan],
     ...                                      [1.0,2.33339,np.nan], decimal=5)
-    ...
-    <type 'exceptions.AssertionError'>:
+    Traceback (most recent call last):
+        ...
     AssertionError:
-    Arrays are not almost equal
-    <BLANKLINE>
-    (mismatch 50.0%)
-     x: array([ 1.     ,  2.33333,      NaN])
-     y: array([ 1.     ,  2.33339,      NaN])
+    Arrays are not almost equal to 5 decimals
+    Mismatch: 33.3%
+    Max absolute difference: 6.e-05
+    Max relative difference: 2.57136612e-05
+     x: array([1.     , 2.33333,     nan])
+     y: array([1.     , 2.33339,     nan])
 
     >>> np.testing.assert_array_almost_equal([1.0,2.33333,np.nan],
     ...                                      [1.0,2.33333, 5], decimal=5)
-    <type 'exceptions.ValueError'>:
-    ValueError:
-    Arrays are not almost equal
-     x: array([ 1.     ,  2.33333,      NaN])
-     y: array([ 1.     ,  2.33333,  5.     ])
+    Traceback (most recent call last):
+        ...
+    AssertionError:
+    Arrays are not almost equal to 5 decimals
+    x and y nan location mismatch:
+     x: array([1.     , 2.33333,     nan])
+     y: array([1.     , 2.33333, 5.     ])
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -1022,27 +1047,34 @@
     --------
     >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1.1, 2.0, np.nan])
     >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1, 2.0, np.nan])
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Arrays are not less-ordered
-    (mismatch 50.0%)
-     x: array([  1.,   1.,  NaN])
-     y: array([  1.,   2.,  NaN])
+    Mismatch: 33.3%
+    Max absolute difference: 1.
+    Max relative difference: 0.5
+     x: array([ 1.,  1., nan])
+     y: array([ 1.,  2., nan])
 
     >>> np.testing.assert_array_less([1.0, 4.0], 3)
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Arrays are not less-ordered
-    (mismatch 50.0%)
-     x: array([ 1.,  4.])
+    Mismatch: 50%
+    Max absolute difference: 2.
+    Max relative difference: 0.66666667
+     x: array([1., 4.])
      y: array(3)
 
     >>> np.testing.assert_array_less([1.0, 2.0, 3.0], [4])
-    ...
-    <type 'exceptions.ValueError'>:
+    Traceback (most recent call last):
+        ...
+    AssertionError:
     Arrays are not less-ordered
     (shapes (3,), (1,) mismatch)
-     x: array([ 1.,  2.,  3.])
+     x: array([1., 2., 3.])
      y: array([4])
 
     """
@@ -1147,7 +1179,7 @@
     argument to the ``test()`` call. For example, to run all tests (including
     doctests) for `numpy.lib`:
 
-    >>> np.lib.test(doctests=True) #doctest: +SKIP
+    >>> np.lib.test(doctests=True)  # doctest: +SKIP
     """
     from numpy.compat import npy_load_module
     import doctest
@@ -1329,7 +1361,7 @@
     return
 
 
-def measure(code_str,times=1,label=None):
+def measure(code_str, times=1, label=None):
     """
     Return elapsed time for executing code in the namespace of the caller.
 
@@ -1356,9 +1388,9 @@
 
     Examples
     --------
-    >>> etime = np.testing.measure('for i in range(1000): np.sqrt(i**2)',
-    ...                            times=times)
-    >>> print("Time for a single execution : ", etime / times, "s")
+    >>> times = 10
+    >>> etime = np.testing.measure('for i in range(1000): np.sqrt(i**2)', times=times)
+    >>> print("Time for a single execution : ", etime / times, "s")  # doctest: +SKIP
     Time for a single execution :  0.005 s
 
     """
@@ -1443,7 +1475,7 @@
     --------
     >>> x = [1e-5, 1e-3, 1e-1]
     >>> y = np.arccos(np.cos(x))
-    >>> assert_allclose(x, y, rtol=1e-5, atol=0)
+    >>> np.testing.assert_allclose(x, y, rtol=1e-5, atol=0)
 
     """
     __tracebackhide__ = True  # Hide traceback for py.test
@@ -1897,7 +1929,8 @@
     Examples
     --------
     >>> import warnings
-    >>> with clear_and_catch_warnings(modules=[np.core.fromnumeric]):
+    >>> with np.testing.clear_and_catch_warnings(
+    ...         modules=[np.core.fromnumeric]):
     ...     warnings.simplefilter('always')
     ...     warnings.filterwarnings('ignore', module='np.core.fromnumeric')
     ...     # do something that raises a warning but ignore those in
@@ -1978,25 +2011,28 @@
 
     Examples
     --------
-    >>> with suppress_warnings() as sup:
-    ...     sup.filter(DeprecationWarning, "Some text")
-    ...     sup.filter(module=np.ma.core)
-    ...     log = sup.record(FutureWarning, "Does this occur?")
-    ...     command_giving_warnings()
-    ...     # The FutureWarning was given once, the filtered warnings were
-    ...     # ignored. All other warnings abide outside settings (may be
-    ...     # printed/error)
-    ...     assert_(len(log) == 1)
-    ...     assert_(len(sup.log) == 1)  # also stored in log attribute
 
-    Or as a decorator:
+    With a context manager::
 
-    >>> sup = suppress_warnings()
-    >>> sup.filter(module=np.ma.core)  # module must match exact
-    >>> @sup
-    >>> def some_function():
-    ...     # do something which causes a warning in np.ma.core
-    ...     pass
+        with np.testing.suppress_warnings() as sup:
+            sup.filter(DeprecationWarning, "Some text")
+            sup.filter(module=np.ma.core)
+            log = sup.record(FutureWarning, "Does this occur?")
+            command_giving_warnings()
+            # The FutureWarning was given once, the filtered warnings were
+            # ignored. All other warnings abide outside settings (may be
+            # printed/error)
+            assert_(len(log) == 1)
+            assert_(len(sup.log) == 1)  # also stored in log attribute
+
+    Or as a decorator::
+
+        sup = np.testing.suppress_warnings()
+        sup.filter(module=np.ma.core)  # module must match exactly
+        @sup
+        def some_function():
+            # do something which causes a warning in np.ma.core
+            pass
     """
     def __init__(self, forwarding_rule="always"):
         self._entered = False
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 43afafa..c376a38 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -327,24 +327,22 @@
         self._test_not_equal(x, y)
 
     def test_error_message(self):
-        try:
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(np.array([1, 2]), np.array([[1, 2]]))
-        except AssertionError as e:
-            msg = str(e)
-            msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
-            msg_reference = textwrap.dedent("""\
+        msg = str(exc_info.value)
+        msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
+        msg_reference = textwrap.dedent("""\
 
-            Arrays are not equal
+        Arrays are not equal
 
-            (shapes (2,), (1, 2) mismatch)
-             x: array([1, 2])
-             y: array([[1, 2]])""")
-            try:
-                assert_equal(msg, msg_reference)
-            except AssertionError:
-                assert_equal(msg2, msg_reference)
-        else:
-            raise AssertionError("Did not raise")
+        (shapes (2,), (1, 2) mismatch)
+         x: array([1, 2])
+         y: array([[1, 2]])""")
+
+        try:
+            assert_equal(msg, msg_reference)
+        except AssertionError:
+            assert_equal(msg2, msg_reference)
 
 
 class TestArrayAlmostEqual(_GenericTest):
@@ -509,38 +507,53 @@
         x = np.array([1.00000000001, 2.00000000002, 3.00003])
         y = np.array([1.00000000002, 2.00000000003, 3.00004])
 
-        # test with a different amount of decimal digits
-        # note that we only check for the formatting of the arrays themselves
-        b = ('x: array([1.00000000001, 2.00000000002, 3.00003     '
-             ' ])\n y: array([1.00000000002, 2.00000000003, 3.00004      ])')
-        try:
+        # Test with a different amount of decimal digits
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(x, y, decimal=12)
-        except AssertionError as e:
-            # remove anything that's not the array string
-            assert_equal(str(e).split('%)\n ')[1], b)
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 100%')
+        assert_equal(msgs[4], 'Max absolute difference: 1.e-05')
+        assert_equal(msgs[5], 'Max relative difference: 3.33328889e-06')
+        assert_equal(
+            msgs[6],
+            ' x: array([1.00000000001, 2.00000000002, 3.00003      ])')
+        assert_equal(
+            msgs[7],
+            ' y: array([1.00000000002, 2.00000000003, 3.00004      ])')
 
-        # with the default value of decimal digits, only the 3rd element differs
-        # note that we only check for the formatting of the arrays themselves
-        b = ('x: array([1.     , 2.     , 3.00003])\n y: array([1.     , '
-             '2.     , 3.00004])')
-        try:
+        # With the default value of decimal digits, only the 3rd element
+        # differs. Note that we only check for the formatting of the arrays
+        # themselves.
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(x, y)
-        except AssertionError as e:
-            # remove anything that's not the array string
-            assert_equal(str(e).split('%)\n ')[1], b)
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 33.3%')
+        assert_equal(msgs[4], 'Max absolute difference: 1.e-05')
+        assert_equal(msgs[5], 'Max relative difference: 3.33328889e-06')
+        assert_equal(msgs[6], ' x: array([1.     , 2.     , 3.00003])')
+        assert_equal(msgs[7], ' y: array([1.     , 2.     , 3.00004])')
 
-        # Check the error message when input includes inf or nan
+        # Check the error message when input includes inf
         x = np.array([np.inf, 0])
         y = np.array([np.inf, 1])
-        try:
+        with pytest.raises(AssertionError) as exc_info:
             self._assert_func(x, y)
-        except AssertionError as e:
-            msgs = str(e).split('\n')
-            # assert error percentage is 50%
-            assert_equal(msgs[3], '(mismatch 50.0%)')
-            # assert output array contains inf
-            assert_equal(msgs[4], ' x: array([inf,  0.])')
-            assert_equal(msgs[5], ' y: array([inf,  1.])')
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 50%')
+        assert_equal(msgs[4], 'Max absolute difference: 1.')
+        assert_equal(msgs[5], 'Max relative difference: 1.')
+        assert_equal(msgs[6], ' x: array([inf,  0.])')
+        assert_equal(msgs[7], ' y: array([inf,  1.])')
+
+        # Check the error message when dividing by zero
+        x = np.array([1, 2])
+        y = np.array([0, 0])
+        with pytest.raises(AssertionError) as exc_info:
+            self._assert_func(x, y)
+        msgs = str(exc_info.value).split('\n')
+        assert_equal(msgs[3], 'Mismatch: 100%')
+        assert_equal(msgs[4], 'Max absolute difference: 2')
+        assert_equal(msgs[5], 'Max relative difference: inf')
 
     def test_subclass_that_cannot_be_bool(self):
         # While we cannot guarantee testing functions will always work for
@@ -829,12 +842,12 @@
     def test_report_fail_percentage(self):
         a = np.array([1, 1, 1, 1])
         b = np.array([1, 1, 1, 2])
-        try:
+
+        with pytest.raises(AssertionError) as exc_info:
             assert_allclose(a, b)
-            msg = ''
-        except AssertionError as exc:
-            msg = exc.args[0]
-        assert_("mismatch 25.0%" in msg)
+        msg = str(exc_info.value)
+        assert_('Mismatch: 25%\nMax absolute difference: 1\n'
+                'Max relative difference: 0.5' in msg)
 
     def test_equal_nan(self):
         a = np.array([np.nan])
@@ -1117,12 +1130,10 @@
         assert_string_equal("hello", "hello")
         assert_string_equal("hello\nmultiline", "hello\nmultiline")
 
-        try:
+        with pytest.raises(AssertionError) as exc_info:
             assert_string_equal("foo\nbar", "hello\nbar")
-        except AssertionError as exc:
-            assert_equal(str(exc), "Differences in strings:\n- foo\n+ hello")
-        else:
-            raise AssertionError("exception not raised")
+        msg = str(exc_info.value)
+        assert_equal(msg, "Differences in strings:\n- foo\n+ hello")
 
         assert_raises(AssertionError,
                       lambda: assert_string_equal("foo", "hello"))
diff --git a/numpy/tests/test_ctypeslib.py b/numpy/tests/test_ctypeslib.py
index 53b75db..521208c 100644
--- a/numpy/tests/test_ctypeslib.py
+++ b/numpy/tests/test_ctypeslib.py
@@ -2,6 +2,7 @@
 
 import sys
 import pytest
+import weakref
 
 import numpy as np
 from numpy.ctypeslib import ndpointer, load_library, as_array
@@ -260,3 +261,107 @@
             b = np.ctypeslib.as_array(newpnt, (N,))
             # now delete both, which should cleanup both objects
             del newpnt, b
+
+    def test_segmentation_fault(self):
+        arr = np.zeros((224, 224, 3))
+        c_arr = np.ctypeslib.as_ctypes(arr)
+        arr_ref = weakref.ref(arr)
+        del arr
+
+        # check the reference wasn't cleaned up
+        assert_(arr_ref() is not None)
+
+        # check we avoid the segfault
+        c_arr[0][0][0]
+
+
+@pytest.mark.skipif(ctypes is None,
+                    reason="ctypes not available on this python installation")
+class TestAsCtypesType(object):
+    """ Test conversion from dtypes to ctypes types """
+    def test_scalar(self):
+        dt = np.dtype('<u2')
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, ctypes.c_uint16.__ctype_le__)
+
+        dt = np.dtype('>u2')
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, ctypes.c_uint16.__ctype_be__)
+
+        dt = np.dtype('u2')
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, ctypes.c_uint16)
+
+    def test_subarray(self):
+        dt = np.dtype((np.int32, (2, 3)))
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_equal(ct, 2 * (3 * ctypes.c_int32))
+
+    def test_structure(self):
+        dt = np.dtype([
+            ('a', np.uint16),
+            ('b', np.uint32),
+        ])
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Structure))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('b', ctypes.c_uint32),
+        ])
+
+    def test_structure_aligned(self):
+        dt = np.dtype([
+            ('a', np.uint16),
+            ('b', np.uint32),
+        ], align=True)
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Structure))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('', ctypes.c_char * 2),  # padding
+            ('b', ctypes.c_uint32),
+        ])
+
+    def test_union(self):
+        dt = np.dtype(dict(
+            names=['a', 'b'],
+            offsets=[0, 0],
+            formats=[np.uint16, np.uint32]
+        ))
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Union))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('b', ctypes.c_uint32),
+        ])
+
+    def test_padded_union(self):
+        dt = np.dtype(dict(
+            names=['a', 'b'],
+            offsets=[0, 0],
+            formats=[np.uint16, np.uint32],
+            itemsize=5,
+        ))
+
+        ct = np.ctypeslib.as_ctypes_type(dt)
+        assert_(issubclass(ct, ctypes.Union))
+        assert_equal(ctypes.sizeof(ct), dt.itemsize)
+        assert_equal(ct._fields_, [
+            ('a', ctypes.c_uint16),
+            ('b', ctypes.c_uint32),
+            ('', ctypes.c_char * 5),  # padding
+        ])
+
+    def test_overlapping(self):
+        dt = np.dtype(dict(
+            names=['a', 'b'],
+            offsets=[0, 2],
+            formats=[np.uint32, np.uint32]
+        ))
+        assert_raises(NotImplementedError, np.ctypeslib.as_ctypes_type, dt)
diff --git a/numpy/tests/test_scripts.py b/numpy/tests/test_scripts.py
index 9e27cc6..e42dc25 100644
--- a/numpy/tests/test_scripts.py
+++ b/numpy/tests/test_scripts.py
@@ -8,7 +8,7 @@
 import os
 import pytest
 from os.path import join as pathjoin, isfile, dirname
-from subprocess import Popen, PIPE
+import subprocess
 
 import numpy as np
 from numpy.compat.py3k import basestring
@@ -17,74 +17,13 @@
 is_inplace = isfile(pathjoin(dirname(np.__file__),  '..', 'setup.py'))
 
 
-def run_command(cmd, check_code=True):
-    """ Run command sequence `cmd` returning exit code, stdout, stderr
-
-    Parameters
-    ----------
-    cmd : str or sequence
-        string with command name or sequence of strings defining command
-    check_code : {True, False}, optional
-        If True, raise error for non-zero return code
-
-    Returns
-    -------
-    returncode : int
-        return code from execution of `cmd`
-    stdout : bytes (python 3) or str (python 2)
-        stdout from `cmd`
-    stderr : bytes (python 3) or str (python 2)
-        stderr from `cmd`
-
-    Raises
-    ------
-    RuntimeError
-        If `check_code` is True, and return code !=0
-    """
-    cmd = [cmd] if isinstance(cmd, basestring) else list(cmd)
-    if os.name == 'nt':
-        # Quote any arguments with spaces. The quotes delimit the arguments
-        # on Windows, and the arguments might be file paths with spaces.
-        # On Unix the list elements are each separate arguments.
-        cmd = ['"{0}"'.format(c) if ' ' in c else c for c in cmd]
-    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
-    stdout, stderr = proc.communicate()
-    if proc.poll() is None:
-        proc.terminate()
-    if check_code and proc.returncode != 0:
-        raise RuntimeError('\n'.join(
-            ['Command "{0}" failed with',
-             'stdout', '------', '{1}', '',
-             'stderr', '------', '{2}']).format(cmd, stdout, stderr))
-    return proc.returncode, stdout, stderr
-
-
-@pytest.mark.skipif(is_inplace, reason="Cannot test f2py command inplace")
-@pytest.mark.xfail(reason="Test is unreliable")
-def test_f2py():
-    # test that we can run f2py script
-
-    def try_f2py_commands(cmds):
-        success = 0
-        for f2py_cmd in cmds:
-            try:
-                code, stdout, stderr = run_command([f2py_cmd, '-v'])
-                assert_equal(stdout.strip(), b'2')
-                success += 1
-            except Exception:
-                pass
-        return success
-
+def find_f2py_commands():
     if sys.platform == 'win32':
-        # Only the single 'f2py' script is installed in windows.
         exe_dir = dirname(sys.executable)
         if exe_dir.endswith('Scripts'): # virtualenv
-            f2py_cmds = [os.path.join(exe_dir, 'f2py')]
+            return [os.path.join(exe_dir, 'f2py')]
         else:
-            f2py_cmds = [os.path.join(exe_dir, "Scripts", 'f2py')]
-        success = try_f2py_commands(f2py_cmds)
-        msg = "Warning: f2py not found in path"
-        assert_(success == 1, msg)
+            return [os.path.join(exe_dir, "Scripts", 'f2py')]
     else:
         # Three scripts are installed in Unix-like systems:
         # 'f2py', 'f2py{major}', and 'f2py{major.minor}'. For example,
@@ -93,7 +32,18 @@
         version = sys.version_info
         major = str(version.major)
         minor = str(version.minor)
-        f2py_cmds = ('f2py', 'f2py' + major, 'f2py' + major + '.' + minor)
-        success = try_f2py_commands(f2py_cmds)
-        msg = "Warning: not all of %s, %s, and %s are found in path" % f2py_cmds
-        assert_(success == 3, msg)
+        return ['f2py', 'f2py' + major, 'f2py' + major + '.' + minor]
+
+
+@pytest.mark.skipif(is_inplace, reason="Cannot test f2py command inplace")
+@pytest.mark.xfail(reason="Test is unreliable")
+@pytest.mark.parametrize('f2py_cmd', find_f2py_commands())
+def test_f2py(f2py_cmd):
+    # test that we can run f2py script
+    stdout = subprocess.check_output([f2py_cmd, '-v'])
+    assert_equal(stdout.strip(), b'2')
+
+
+def test_pep338():
+    stdout = subprocess.check_output([sys.executable, '-mnumpy.f2py', '-v'])
+    assert_equal(stdout.strip(), b'2')
diff --git a/pavement.py b/pavement.py
index f2c5688..2a5225f 100644
--- a/pavement.py
+++ b/pavement.py
@@ -42,13 +42,12 @@
 #-----------------------------------
 
 # Path to the release notes
-RELEASE_NOTES = 'doc/release/1.16.0-notes.rst'
+RELEASE_NOTES = 'doc/release/1.17.0-notes.rst'
 
 
 #-------------------------------------------------------
 # Hardcoded build/install dirs, virtualenv options, etc.
 #-------------------------------------------------------
-DEFAULT_PYTHON = "2.7"
 
 # Where to put the release installers
 options(installers=Bunch(releasedir="release",
diff --git a/pytest.ini b/pytest.ini
index 1a49e5d..4748e35 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,7 +1,7 @@
 [pytest]
 addopts = -l
 norecursedirs = doc tools numpy/linalg/lapack_lite numpy/core/code_generators
-doctest_optionflags = NORMALIZE_WHITESPACE
+doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS ALLOW_UNICODE ALLOW_BYTES
 
 filterwarnings =
     error
diff --git a/runtests.py b/runtests.py
index 81c7c10..4286671 100755
--- a/runtests.py
+++ b/runtests.py
@@ -73,8 +73,8 @@
                         help="just build, do not run any tests")
     parser.add_argument("--doctests", action="store_true", default=False,
                         help="Run doctests in module")
-    #parser.add_argument("--refguide-check", action="store_true", default=False,
-                        #help="Run refguide check (do not run regular tests.)")
+    parser.add_argument("--refguide-check", action="store_true", default=False,
+                        help="Run refguide check (do not run regular tests.)")
     parser.add_argument("--coverage", action="store_true", default=False,
                         help=("report coverage of project code. HTML output goes "
                               "under build/coverage"))
@@ -202,6 +202,14 @@
             shutil.rmtree(dst_dir)
         extra_argv += ['--cov-report=html:' + dst_dir]
 
+    if args.refguide_check:
+        cmd = [os.path.join(ROOT_DIR, 'tools', 'refguide_check.py'),
+               '--doctests']
+        if args.submodule:
+            cmd += [args.submodule]
+        os.execv(sys.executable, [sys.executable] + cmd)
+        sys.exit(0)
+
     if args.bench:
         # Run ASV
         items = extra_argv
@@ -335,7 +343,6 @@
             # add flags used as werrors
             warnings_as_errors = ' '.join([
                 # from tools/travis-test.sh
-                '-Werror=declaration-after-statement',
                 '-Werror=vla',
                 '-Werror=nonnull',
                 '-Werror=pointer-arith',
diff --git a/setup.py b/setup.py
index cc20fa6..da25ddd 100755
--- a/setup.py
+++ b/setup.py
@@ -27,13 +27,10 @@
 import textwrap
 
 
-if sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 4):
-    raise RuntimeError("Python version 2.7 or >= 3.4 required.")
+if sys.version_info[:2] < (3, 5):
+    raise RuntimeError("Python version >= 3.5 required.")
 
-if sys.version_info[0] >= 3:
-    import builtins
-else:
-    import __builtin__ as builtins
+import builtins
 
 
 CLASSIFIERS = """\
@@ -43,10 +40,7 @@
 License :: OSI Approved
 Programming Language :: C
 Programming Language :: Python
-Programming Language :: Python :: 2
-Programming Language :: Python :: 2.7
 Programming Language :: Python :: 3
-Programming Language :: Python :: 3.4
 Programming Language :: Python :: 3.5
 Programming Language :: Python :: 3.6
 Programming Language :: Python :: 3.7
@@ -60,7 +54,7 @@
 """
 
 MAJOR               = 1
-MINOR               = 16
+MINOR               = 17
 MICRO               = 0
 ISRELEASED          = False
 VERSION             = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
@@ -375,7 +369,7 @@
         platforms = ["Windows", "Linux", "Solaris", "Mac OS-X", "Unix"],
         test_suite='nose.collector',
         cmdclass={"sdist": sdist_checked},
-        python_requires='>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*',
+        python_requires='>=3.5',
         zip_safe=False,
         entry_points={
             'console_scripts': f2py_cmds
diff --git a/shippable.yml b/shippable.yml
index 6a92c0f..82ee946 100644
--- a/shippable.yml
+++ b/shippable.yml
@@ -1,17 +1,17 @@
 branches:
     only:
        - master
+       - maintenance/*
 
 language: python
 
 python:
     # use versions available for job image
-    # aarch64_u16pytall:v6.7.4 
+    # aarch64_u16pytall:v6.7.4
     # (what we currently have access to by default)
     # this is a bit restrictive in terms
     # of version availability / control,
     # but it is convenient
-    - 2.7
     - 3.7
 
 runtime:
diff --git a/tools/changelog.py b/tools/changelog.py
index 84e046c..b135b14 100755
--- a/tools/changelog.py
+++ b/tools/changelog.py
@@ -42,8 +42,10 @@
 from git import Repo
 from github import Github
 
-UTF8Writer = codecs.getwriter('utf8')
-sys.stdout = UTF8Writer(sys.stdout)
+if sys.version_info.major < 3:
+    UTF8Writer = codecs.getwriter('utf8')
+    sys.stdout = UTF8Writer(sys.stdout)
+
 this_repo = Repo(os.path.join(os.path.dirname(__file__), ".."))
 
 author_msg =\
diff --git a/tools/refguide_check.py b/tools/refguide_check.py
new file mode 100644
index 0000000..531eeac
--- /dev/null
+++ b/tools/refguide_check.py
@@ -0,0 +1,957 @@
+#!/usr/bin/env python
+"""
+refguide_check.py [OPTIONS] [-- ARGS]
+
+Check for a NumPy submodule whether the objects in its __all__ dict
+correspond to the objects included in the reference guide.
+
+Example of usage::
+
+    $ python refguide_check.py optimize
+
+Note that this is a helper script to be able to check if things are missing;
+the output of this script does need to be checked manually.  In some cases
+objects are left out of the refguide for a good reason (it's an alias of
+another function, or deprecated, or ...)
+
+Another use of this helper script is to check validity of code samples
+in docstrings. This is different from doctesting [we do not aim to have
+numpy docstrings doctestable!], this is just to make sure that code in
+docstrings is valid python::
+
+    $ python refguide_check.py --doctests optimize
+
+"""
+from __future__ import print_function
+
+import sys
+import os
+import re
+import copy
+import inspect
+import warnings
+import doctest
+import tempfile
+import io
+import docutils.core
+from docutils.parsers.rst import directives
+import shutil
+import glob
+from doctest import NORMALIZE_WHITESPACE, ELLIPSIS, IGNORE_EXCEPTION_DETAIL
+from argparse import ArgumentParser
+from pkg_resources import parse_version
+
+import sphinx
+import numpy as np
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'doc', 'sphinxext'))
+from numpydoc.docscrape_sphinx import get_doc_object
+
+if parse_version(sphinx.__version__) >= parse_version('1.5'):
+    # Enable specific Sphinx directives
+    from sphinx.directives import SeeAlso, Only
+    directives.register_directive('seealso', SeeAlso)
+    directives.register_directive('only', Only)
+else:
+    # Remove sphinx directives that don't run without Sphinx environment.
+    # Sphinx < 1.5 installs all directives on import...
+    directives._directives.pop('versionadded', None)
+    directives._directives.pop('versionchanged', None)
+    directives._directives.pop('moduleauthor', None)
+    directives._directives.pop('sectionauthor', None)
+    directives._directives.pop('codeauthor', None)
+    directives._directives.pop('toctree', None)
+
+
+BASE_MODULE = "numpy"
+
+PUBLIC_SUBMODULES = [
+    'core',
+    'doc.structured_arrays',
+    'f2py',
+    'linalg',
+    'lib',
+    'lib.recfunctions',
+    'fft',
+    'ma',
+    'polynomial',
+    'matrixlib',
+    'random',
+    'testing',
+]
+
+# Docs for these modules are included in the parent module
+OTHER_MODULE_DOCS = {
+    'fftpack.convolve': 'fftpack',
+    'io.wavfile': 'io',
+    'io.arff': 'io',
+}
+
+# these names are known to fail doctesting and we like to keep it that way
+# e.g. sometimes pseudocode is acceptable etc
+DOCTEST_SKIPLIST = set([
+    # cases where NumPy docstrings import things from SciPy:
+    'numpy.lib.vectorize',
+    'numpy.random.standard_gamma',
+    'numpy.random.gamma',
+    'numpy.random.vonmises',
+    'numpy.random.power',
+    'numpy.random.zipf',
+    # remote / local file IO with DataSource is problematic in doctest:
+    'numpy.lib.DataSource',
+    'numpy.lib.Repository',
+])
+
+# these names are not required to be present in ALL despite being in
+# autosummary:: listing
+REFGUIDE_ALL_SKIPLIST = [
+    r'scipy\.sparse\.linalg',
+    r'scipy\.spatial\.distance',
+    r'scipy\.linalg\.blas\.[sdczi].*',
+    r'scipy\.linalg\.lapack\.[sdczi].*',
+]
+
+# these names are not required to be in an autosummary:: listing
+# despite being in ALL
+REFGUIDE_AUTOSUMMARY_SKIPLIST = [
+    # NOTE: should NumPy have a better match between autosummary
+    # listings and __all__? For now, TR isn't convinced this is a
+    # priority -- focus on just getting docstrings executed / correct
+    r'numpy\.*',
+]
+# deprecated windows in scipy.signal namespace
+for name in ('barthann', 'bartlett', 'blackmanharris', 'blackman', 'bohman',
+             'boxcar', 'chebwin', 'cosine', 'exponential', 'flattop',
+             'gaussian', 'general_gaussian', 'hamming', 'hann', 'hanning',
+             'kaiser', 'nuttall', 'parzen', 'slepian', 'triang', 'tukey'):
+    REFGUIDE_AUTOSUMMARY_SKIPLIST.append(r'scipy\.signal\.' + name)
+
+HAVE_MATPLOTLIB = False
+
+
+def short_path(path, cwd=None):
+    """
+    Return relative or absolute path name, whichever is shortest.
+    """
+    if not isinstance(path, str):
+        return path
+    if cwd is None:
+        cwd = os.getcwd()
+    abspath = os.path.abspath(path)
+    relpath = os.path.relpath(path, cwd)
+    if len(abspath) <= len(relpath):
+        return abspath
+    return relpath
+
+
+def find_names(module, names_dict):
+    # Refguide entries:
+    #
+    # - 3 spaces followed by function name, and maybe some spaces, some
+    #   dashes, and an explanation; only function names listed in
+    #   refguide are formatted like this (mostly, there may be some false
+    #   positives)
+    #
+    # - special directives, such as data and function
+    #
+    # - (scipy.constants only): quoted list
+    #
+    patterns = [
+        r"^\s\s\s([a-z_0-9A-Z]+)(\s+-+.*)?$",
+        r"^\.\. (?:data|function)::\s*([a-z_0-9A-Z]+)\s*$"
+    ]
+
+    if module.__name__ == 'scipy.constants':
+        patterns += ["^``([a-z_0-9A-Z]+)``"]
+
+    patterns = [re.compile(pattern) for pattern in patterns]
+    module_name = module.__name__
+
+    for line in module.__doc__.splitlines():
+        res = re.search(r"^\s*\.\. (?:currentmodule|module):: ([a-z0-9A-Z_.]+)\s*$", line)
+        if res:
+            module_name = res.group(1)
+            continue
+
+        for pattern in patterns:
+            res = re.match(pattern, line)
+            if res is not None:
+                name = res.group(1)
+                entry = '.'.join([module_name, name])
+                names_dict.setdefault(module_name, set()).add(name)
+                break
+
+
+def get_all_dict(module):
+    """Return a copy of the __all__ dict with irrelevant items removed."""
+    if hasattr(module, "__all__"):
+        all_dict = copy.deepcopy(module.__all__)
+    else:
+        all_dict = copy.deepcopy(dir(module))
+        all_dict = [name for name in all_dict
+                    if not name.startswith("_")]
+    for name in ['absolute_import', 'division', 'print_function']:
+        try:
+            all_dict.remove(name)
+        except ValueError:
+            pass
+    if not all_dict:
+        # Must be a pure documentation module like doc.structured_arrays
+        all_dict.append('__doc__')
+
+    # Modules are almost always private; real submodules need a separate
+    # run of refguide_check.
+    all_dict = [name for name in all_dict
+                if not inspect.ismodule(getattr(module, name, None))]
+
+    deprecated = []
+    not_deprecated = []
+    for name in all_dict:
+        f = getattr(module, name, None)
+        if callable(f) and is_deprecated(f):
+            deprecated.append(name)
+        else:
+            not_deprecated.append(name)
+
+    others = set(dir(module)).difference(set(deprecated)).difference(set(not_deprecated))
+
+    return not_deprecated, deprecated, others
+
+
def compare(all_dict, others, names, module_name):
    """Return sets of objects only in __all__, only in refguide, or missing.

    Skiplist patterns are matched against the fully qualified
    ``module_name.name`` string.
    """
    def skiplisted(skiplist, name):
        full = module_name + '.' + name
        return any(re.match(pattern, full) for pattern in skiplist)

    only_all = {name for name in all_dict
                if name not in names
                and not skiplisted(REFGUIDE_AUTOSUMMARY_SKIPLIST, name)}

    only_ref = set()
    missing = set()
    for name in names:
        if name in all_dict:
            continue
        if skiplisted(REFGUIDE_ALL_SKIPLIST, name):
            # Skiplisted names are fine to omit from __all__, but they must
            # at least exist on the module somewhere.
            if name not in others:
                missing.add(name)
        else:
            only_ref.add(name)

    return only_all, only_ref, missing
+
def is_deprecated(f):
    """Return True iff calling `f` raises a DeprecationWarning.

    Probes `f` with a bogus keyword argument while warnings are escalated to
    errors; any other outcome (normal return or a different exception) means
    the callable is treated as not deprecated.
    """
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("error")
        try:
            f(**{"not a kwarg": None})
        except DeprecationWarning:
            deprecated = True
        except Exception:
            deprecated = False
        else:
            deprecated = False
        return deprecated
+
def check_items(all_dict, names, deprecated, others, module_name, dots=True):
    """Cross-check `module_name`'s ``__all__`` against the refguide listing.

    Parameters
    ----------
    all_dict : list
        Non-deprecated names from ``__all__`` (see `get_all_dict`).
    names : set
        Names found in the refguide autosummary listings.
    deprecated : list
        Deprecated names from ``__all__``.
    others : set
        Remaining ``dir(module)`` entries.
    module_name : str
        Fully qualified module name.
    dots : bool
        Whether to emit a '.'/'F' progress character.

    Returns
    -------
    list with a single ``(None, success, output)`` tuple; the name is None
    because this result applies to the module as a whole.
    """
    num_all = len(all_dict)
    num_ref = len(names)

    output = ""

    output += "Non-deprecated objects in __all__: %i\n" % num_all
    output += "Objects in refguide: %i\n\n" % num_ref

    only_all, only_ref, missing = compare(all_dict, others, names, module_name)
    # Deprecated names are never in all_dict, so ones documented in the
    # refguide surface in only_ref; split them out for a dedicated report.
    dep_in_ref = set(only_ref).intersection(deprecated)
    only_ref = set(only_ref).difference(deprecated)

    if len(dep_in_ref) > 0:
        output += "Deprecated objects in refguide::\n\n"
        # BUG FIX: list the deprecated objects that actually appear in the
        # refguide (dep_in_ref), not every deprecated object in the module.
        for name in sorted(dep_in_ref):
            output += "    " + name + "\n"

    if len(only_all) == len(only_ref) == len(missing) == 0:
        if dots:
            output_dot('.')
        return [(None, True, output)]
    else:
        if len(only_all) > 0:
            output += "ERROR: objects in %s.__all__ but not in refguide::\n\n" % module_name
            for name in sorted(only_all):
                output += "    " + name + "\n"

            output += "\nThis issue can be fixed by adding these objects to\n"
            output += "the function listing in __init__.py for this module\n"

        if len(only_ref) > 0:
            output += "ERROR: objects in refguide but not in %s.__all__::\n\n" % module_name
            for name in sorted(only_ref):
                output += "    " + name + "\n"

            output += "\nThis issue should likely be fixed by removing these objects\n"
            output += "from the function listing in __init__.py for this module\n"
            output += "or adding them to __all__.\n"

        if len(missing) > 0:
            output += "ERROR: missing objects::\n\n"
            for name in sorted(missing):
                output += "    " + name + "\n"

        if dots:
            output_dot('F')
        return [(None, False, output)]
+
+
def validate_rst_syntax(text, name, dots=True):
    """Validate reST syntax of `text` by running it through docutils.

    Parameters
    ----------
    text : str or None
        Docstring text to validate; None is reported as missing docs.
    name : str
        Label used in error output (item name, optionally file-prefixed).
    dots : bool
        Whether to emit a progress character to stderr.

    Returns
    -------
    (success, output) : (bool, str)
    """
    if text is None:
        if dots:
            output_dot('E')
        return False, "ERROR: %s: no documentation" % (name,)

    # Sphinx-specific roles/directives that plain docutils does not know
    # about; "Unknown ..." errors mentioning these are not real failures.
    ok_unknown_items = set([
        'mod', 'currentmodule', 'autosummary', 'data',
        'obj', 'versionadded', 'versionchanged', 'module', 'class',
        'ref', 'func', 'toctree', 'moduleauthor', 'term', 'c:member',
        'sectionauthor', 'codeauthor', 'eq', 'doi', 'DOI', 'arXiv', 'arxiv'
    ])

    # Run through docutils
    error_stream = io.StringIO()

    def resolve(name, is_label=False):
        # Dummy resolver so unresolved name references do not error out.
        return ("http://foo", name)

    # The token is passed as the "source path", so it prefixes every error
    # message and lets us split individual errors apart again below.
    token = '<RST-VALIDATE-SYNTAX-CHECK>'

    docutils.core.publish_doctree(
        text, token,
        settings_overrides = dict(halt_level=5,
                                  traceback=True,
                                  default_reference_context='title-reference',
                                  default_role='emphasis',
                                  link_base='',
                                  resolve_name=resolve,
                                  stylesheet_path='',
                                  raw_enabled=0,
                                  file_insertion_enabled=0,
                                  warning_stream=error_stream))

    # Print errors, disregarding unimportant ones
    error_msg = error_stream.getvalue()
    errors = error_msg.split(token)
    success = True
    output = ""

    for error in errors:
        lines = error.splitlines()
        if not lines:
            continue

        m = re.match(r'.*Unknown (?:interpreted text role|directive type) "(.*)".*$', lines[0])
        if m:
            if m.group(1) in ok_unknown_items:
                continue

        # The "label" option of the math directive is Sphinx-only; ignore.
        m = re.match(r'.*Error in "math" directive:.*unknown option: "label"', " ".join(lines), re.S)
        if m:
            continue

        output += name + lines[0] + "::\n    " + "\n    ".join(lines[1:]).rstrip() + "\n"
        success = False

    if not success:
        # Append the line-numbered source so errors are easy to locate.
        output += "    " + "-"*72 + "\n"
        for lineno, line in enumerate(text.splitlines()):
            output += "    %-4d    %s\n" % (lineno+1, line)
        output += "    " + "-"*72 + "\n\n"

    if dots:
        output_dot('.' if success else 'F')
    return success, output
+
+
def output_dot(msg='.', stream=sys.stderr):
    """Emit one progress character (e.g. '.', 'F', 'E') and flush at once."""
    stream.write(msg)
    stream.flush()
+
+
def check_rest(module, names, dots=True):
    """
    Check reStructuredText formatting of docstrings

    Returns: [(name, success_flag, output), ...]
    """

    # Objects of these types carry data, not docstrings worth validating.
    try:
        skip_types = (dict, str, unicode, float, int)
    except NameError:
        # python 3
        skip_types = (dict, str, float, int)


    results = []

    # NOTE(review): __name__[6:] strips a 6-character prefix (incl. the dot),
    # presumably "numpy." / BASE_MODULE — confirm against BASE_MODULE.
    if module.__name__[6:] not in OTHER_MODULE_DOCS:
        results += [(module.__name__,) +
                    validate_rst_syntax(inspect.getdoc(module),
                                        module.__name__, dots=dots)]

    for name in names:
        full_name = module.__name__ + '.' + name
        obj = getattr(module, name, None)

        if obj is None:
            results.append((full_name, False, "%s has no docstring" % (full_name,)))
            continue
        elif isinstance(obj, skip_types):
            continue

        if inspect.ismodule(obj):
            # NOTE(review): getdoc() may return None for an undocumented
            # submodule, and re.search below would then raise TypeError —
            # confirm this cannot happen in practice.
            text = inspect.getdoc(obj)
        else:
            try:
                text = str(get_doc_object(obj))
            except Exception:
                import traceback
                results.append((full_name, False,
                                "Error in docstring format!\n" +
                                traceback.format_exc()))
                continue

        # Control characters usually mean a docstring with un-raw escapes.
        m = re.search("([\x00-\x09\x0b-\x1f])", text)
        if m:
            msg = ("Docstring contains a non-printable character %r! "
                   "Maybe forgot r\"\"\"?" % (m.group(1),))
            results.append((full_name, False, msg))
            continue

        try:
            src_file = short_path(inspect.getsourcefile(obj))
        except TypeError:
            # Built-ins and C extensions have no retrievable source file.
            src_file = None

        if src_file:
            file_full_name = src_file + ':' + full_name
        else:
            file_full_name = full_name

        results.append((full_name,) + validate_rst_syntax(text, file_full_name, dots=dots))

    return results
+
+
### Doctest helpers ####

# the namespace to run examples in
DEFAULT_NAMESPACE = {'np': np}

# the namespace to do checks in; lets doctest outputs such as
# "array([1, 2])" be eval()-ed back into comparable objects.
# NOTE(review): np.matrix is pending deprecation in newer NumPy releases —
# confirm it should remain recognized here.
CHECK_NAMESPACE = {
      'np': np,
      'assert_allclose': np.testing.assert_allclose,
      'assert_equal': np.testing.assert_equal,
      # recognize numpy repr's
      'array': np.array,
      'matrix': np.matrix,
      'int64': np.int64,
      'uint64': np.uint64,
      'int8': np.int8,
      'int32': np.int32,
      'float32': np.float32,
      'float64': np.float64,
      'dtype': np.dtype,
      'nan': np.nan,
      'NaN': np.nan,
      'inf': np.inf,
      'Inf': np.inf,}
+
+
class DTRunner(doctest.DocTestRunner):
    """DocTestRunner that labels reports with the item name being tested.

    The item name is used at most once (then reset to None), so repeated
    reports for the same item are not re-labelled.
    """
    DIVIDER = "\n"

    def __init__(self, item_name, checker=None, verbose=None, optionflags=0):
        # Name that labels subsequent reports; cleared after first use.
        self._item_name = item_name
        doctest.DocTestRunner.__init__(self, checker=checker, verbose=verbose,
                                       optionflags=optionflags)

    def _report_item_name(self, out, new_line=False):
        # Emit a separating newline once per item, then forget the name.
        if self._item_name is not None:
            if new_line:
                out("\n")
            self._item_name = None

    def report_start(self, out, test, example):
        # Stash the example source on the checker: Checker.check_output
        # reads it (as self._source) to skip matplotlib-style examples.
        self._checker._source = example.source
        return doctest.DocTestRunner.report_start(self, out, test, example)

    def report_success(self, out, test, example, got):
        if self._verbose:
            self._report_item_name(out, new_line=True)
        return doctest.DocTestRunner.report_success(self, out, test, example, got)

    def report_unexpected_exception(self, out, test, example, exc_info):
        self._report_item_name(out)
        return doctest.DocTestRunner.report_unexpected_exception(
            self, out, test, example, exc_info)

    def report_failure(self, out, test, example, got):
        self._report_item_name(out)
        return doctest.DocTestRunner.report_failure(self, out, test,
                                                    example, got)
+
class Checker(doctest.OutputChecker):
    """Output checker with relaxed comparison rules for numpy doctests.

    On top of the vanilla doctest comparison it tolerates: object
    addresses, outputs marked random, matplotlib-related examples,
    Python 2 long literals, printed arrays without commas, and namedtuple
    reprs; numeric results are compared with np.allclose(atol, rtol).

    ``self._source`` is assigned externally by DTRunner.report_start with
    the current example's source text.
    """
    obj_pattern = re.compile('at 0x[0-9a-fA-F]+>')
    int_pattern = re.compile('^[0-9]+L?$')
    vanilla = doctest.OutputChecker()
    rndm_markers = {'# random', '# Random', '#random', '#Random', "# may vary"}
    # If the example source contains any of these, it is matplotlib-related
    # and its output is not checked at all.
    stopwords = {'plt.', '.hist', '.show', '.ylim', '.subplot(',
                 'set_title', 'imshow', 'plt.show', '.axis(', '.plot(',
                 '.bar(', '.title', '.ylabel', '.xlabel', 'set_ylim', 'set_xlim',
                 '# reformatted', '.set_xlabel(', '.set_ylabel(', '.set_zlabel(',
                 '.set(xlim=', '.set(ylim=', '.set(xlabel=', '.set(ylabel='}

    def __init__(self, parse_namedtuples=True, ns=None, atol=1e-8, rtol=1e-2):
        self.parse_namedtuples = parse_namedtuples
        self.atol, self.rtol = atol, rtol
        if ns is None:
            self.ns = dict(CHECK_NAMESPACE)
        else:
            self.ns = ns

    def check_output(self, want, got, optionflags):
        # cut it short if they are equal
        if want == got:
            return True

        # skip stopwords in source
        if any(word in self._source for word in self.stopwords):
            return True

        # skip random stuff
        if any(word in want for word in self.rndm_markers):
            return True

        # skip function/object addresses
        if self.obj_pattern.search(got):
            return True

        # ignore comments (e.g. signal.freqresp)
        if want.lstrip().startswith("#"):
            return True

        # python 2 long integers are equal to python 3 integers
        if self.int_pattern.match(want) and self.int_pattern.match(got):
            if want.rstrip("L\r\n") == got.rstrip("L\r\n"):
                return True

        # try the standard doctest
        try:
            if self.vanilla.check_output(want, got, optionflags):
                return True
        except Exception:
            pass

        # OK then, convert strings to objects
        try:
            a_want = eval(want, dict(self.ns))
            a_got = eval(got, dict(self.ns))
        except Exception:
            # Maybe we're printing a numpy array? This produces invalid python
            # code: `print(np.arange(3))` produces "[0 1 2]" w/o commas between
            # values. So, reinsert commas and retry.
            # TODO: handle (1) abbreviation (`print(np.arange(10000))`), and
            #              (2) n-dim arrays with n > 1
            s_want = want.strip()
            s_got = got.strip()
            cond = (s_want.startswith("[") and s_want.endswith("]") and
                    s_got.startswith("[") and s_got.endswith("]"))
            if cond:
                s_want = ", ".join(s_want[1:-1].split())
                s_got = ", ".join(s_got[1:-1].split())
                return self.check_output(s_want, s_got, optionflags)

            if not self.parse_namedtuples:
                return False
            # suppose that "want" is a tuple, and "got" is something like
            # MoodResult(statistic=10, pvalue=0.1).
            # Then convert the latter to the tuple (10, 0.1),
            # and then compare the tuples.
            try:
                # NOTE(review): if eval(want) above failed, a_want is unbound
                # here; the resulting NameError is swallowed by the except
                # below — presumably an intentional best-effort, confirm.
                num = len(a_want)
                # Use raw strings: the previous non-raw '\w'/'\(' escapes are
                # invalid escape sequences (DeprecationWarning, and a future
                # SyntaxError); the regex itself is unchanged.
                regex = (r'[\w\d_]+\(' +
                         ', '.join([r'[\w\d_]+=(.+)'] * num) +
                         r'\)')
                grp = re.findall(regex, got.replace('\n', ' '))
                if len(grp) > 1:  # no more than one for now
                    return False
                # fold it back to a tuple
                got_again = '(' + ', '.join(grp[0]) + ')'
                return self.check_output(want, got_again, optionflags)
            except Exception:
                return False

        # ... and defer to numpy
        try:
            return self._do_check(a_want, a_got)
        except Exception:
            # heterog tuple, eg (1, np.array([1., 2.]))
            # (indentation of this try/except normalized to 4 spaces; it was
            # inconsistently indented with 3.)
            try:
                return all(self._do_check(w, g) for w, g in zip(a_want, a_got))
            except (TypeError, ValueError):
                return False

    def _do_check(self, want, got):
        # This should be done exactly as written to correctly handle all of
        # numpy-comparable objects, strings, and heterogeneous tuples
        try:
            if want == got:
                return True
        except Exception:
            pass
        return np.allclose(want, got, atol=self.atol, rtol=self.rtol)
+
+
def _run_doctests(tests, full_name, verbose, doctest_warnings):
    """Run modified doctests for the set of `tests`.

    Returns: (success, output) — a bool and the list of captured output
    strings.
    """
    # NOTE(review): these flag names must be imported from doctest at module
    # level (not visible in this chunk) — confirm.
    flags = NORMALIZE_WHITESPACE | ELLIPSIS | IGNORE_EXCEPTION_DETAIL
    runner = DTRunner(full_name, checker=Checker(), optionflags=flags,
                      verbose=verbose)

    output = []
    success = True
    # Collect all runner output in memory instead of printing it.
    def out(msg):
        output.append(msg)

    class MyStderr(object):
        """Redirect stderr to the current stdout"""
        def write(self, msg):
            if doctest_warnings:
                sys.stdout.write(msg)
            else:
                out(msg)

        # a flush method is required when a doctest uses multiprocessing
        # multiprocessing/popen_fork.py flushes sys.stderr
        def flush(self):
            if doctest_warnings:
                sys.stdout.flush()

    # Run tests, trying to restore global state afterward
    old_printoptions = np.get_printoptions()
    old_errstate = np.seterr()
    old_stderr = sys.stderr
    cwd = os.getcwd()
    tmpdir = tempfile.mkdtemp()
    sys.stderr = MyStderr()
    try:
        # Run in a scratch directory so examples that write files leave no
        # residue in the source tree.
        os.chdir(tmpdir)

        # try to ensure random seed is NOT reproducible
        np.random.seed(None)

        for t in tests:
            # Shorten absolute paths in reports to keep output readable.
            t.filename = short_path(t.filename, cwd)
            fails, successes = runner.run(t, out=out)
            if fails > 0:
                success = False
    finally:
        # Restore stderr/cwd/printoptions/errstate even if a test blew up.
        sys.stderr = old_stderr
        os.chdir(cwd)
        shutil.rmtree(tmpdir)
        np.set_printoptions(**old_printoptions)
        np.seterr(**old_errstate)

    return success, output
+
+
def check_doctests(module, verbose, ns=None,
                   dots=True, doctest_warnings=False):
    """Check code in docstrings of the module's public symbols.

    Parameters
    ----------
    module : module object
    verbose : bool
    ns : dict or None
        Namespace examples run in; defaults to a copy of DEFAULT_NAMESPACE.
    dots : bool
        Emit a '.'/'F' progress character per item.
    doctest_warnings : bool
        Forward doctest stderr to stdout instead of capturing it.

    Returns: list of [(item_name, success_flag, output), ...]
    """
    if ns is None:
        ns = dict(DEFAULT_NAMESPACE)

    # Loop over non-deprecated items
    results = []

    for name in get_all_dict(module)[0]:
        full_name = module.__name__ + '.' + name

        if full_name in DOCTEST_SKIPLIST:
            continue

        try:
            obj = getattr(module, name)
        except AttributeError:
            import traceback
            results.append((full_name, False,
                            "Missing item!\n" +
                            traceback.format_exc()))
            continue

        finder = doctest.DocTestFinder()
        try:
            tests = finder.find(obj, name, globs=dict(ns))
        except Exception:
            import traceback
            results.append((full_name, False,
                            "Failed to get doctests!\n" +
                            traceback.format_exc()))
            continue

        success, output = _run_doctests(tests, full_name, verbose,
                                        doctest_warnings)

        if dots:
            output_dot('.' if success else 'F')

        results.append((full_name, success, "".join(output)))

        # Close any figures a doctest may have opened, to bound memory use.
        if HAVE_MATPLOTLIB:
            import matplotlib.pyplot as plt
            plt.close('all')

    return results
+
+
def check_doctests_testfile(fname, verbose, ns=None,
                   dots=True, doctest_warnings=False):
    """Check code in a text file.

    Mimic `check_doctests` above, differing mostly in test discovery.
    (which is borrowed from stdlib's doctest.testfile here,
     https://github.com/python-git/python/blob/master/Lib/doctest.py)

    Returns: list of [(item_name, success_flag, output), ...]

    Notes
    -----

    We also try to weed out pseudocode:
    * We maintain a list of exceptions which signal pseudocode,
    * We split the text file into "blocks" of code separated by empty lines
      and/or intervening text.
    * If a block contains a marker, the whole block is then assumed to be
      pseudocode. It is then not being doctested.

    The rationale is that typically, the text looks like this:

    blah
    <BLANKLINE>
    >>> from numpy import some_module   # pseudocode!
    >>> func = some_module.some_function
    >>> func(42)                  # still pseudocode
    146
    <BLANKLINE>
    blah
    <BLANKLINE>
    >>> 2 + 3        # real code, doctest it
    5

    """
    results = []

    if ns is None:
        ns = dict(DEFAULT_NAMESPACE)

    # Skiplist matching is by bare filename (e.g. "basics.rst"), not path.
    _, short_name = os.path.split(fname)
    if short_name in DOCTEST_SKIPLIST:
        return results

    full_name = fname
    if sys.version_info.major <= 2:
        with open(fname) as f:
            text = f.read()
    else:
        with open(fname, encoding='utf-8') as f:
            text = f.read()

    # Substrings whose presence marks a whole block as pseudocode.
    PSEUDOCODE = set(['some_function', 'some_module', 'import example',
                      'ctypes.CDLL',     # likely need compiling, skip it
                      'integrate.nquad(func,'  # ctypes integrate tutorial
    ])

    # split the text into "blocks" and try to detect and omit pseudocode blocks.
    parser = doctest.DocTestParser()
    good_parts = []
    for part in text.split('\n\n'):
        tests = parser.get_doctest(part, ns, fname, fname, 0)
        if any(word in ex.source for word in PSEUDOCODE
                                 for ex in tests.examples):
            # omit it
            pass
        else:
            # `part` looks like a good code, let's doctest it
            good_parts += [part]

    # Reassemble the good bits and doctest them:
    good_text = '\n\n'.join(good_parts)
    tests = parser.get_doctest(good_text, ns, fname, fname, 0)
    success, output = _run_doctests([tests], full_name, verbose,
                                    doctest_warnings)

    if dots:
        output_dot('.' if success else 'F')

    results.append((full_name, success, "".join(output)))

    # Close any figures the doctests may have opened, to bound memory use.
    if HAVE_MATPLOTLIB:
        import matplotlib.pyplot as plt
        plt.close('all')

    return results
+
+
def init_matplotlib():
    """Probe for matplotlib and set the module-global HAVE_MATPLOTLIB flag.

    When available, the non-interactive Agg backend is selected so that
    doctests never open GUI windows.
    """
    global HAVE_MATPLOTLIB

    try:
        import matplotlib
    except ImportError:
        HAVE_MATPLOTLIB = False
    else:
        matplotlib.use('Agg')
        HAVE_MATPLOTLIB = True
+
+
def main(argv):
    """Command-line entry point.

    Checks the requested submodules (default: all public ones) and, unless
    --skip-tutorial is given, the tutorial .rst files; exits the process
    with status 0 when every check passes and 1 otherwise.
    """
    parser = ArgumentParser(usage=__doc__.lstrip())
    parser.add_argument("module_names", metavar="SUBMODULES", default=[],
                        nargs='*', help="Submodules to check (default: all public)")
    parser.add_argument("--doctests", action="store_true", help="Run also doctests")
    parser.add_argument("-v", "--verbose", action="count", default=0)
    parser.add_argument("--doctest-warnings", action="store_true",
                        help="Enforce warning checking for doctests")
    parser.add_argument("--skip-tutorial", action="store_true",
                        help="Skip running doctests in the tutorial.")
    args = parser.parse_args(argv)

    modules = []
    names_dict = {}

    # An explicit module list implies the tutorial check is unwanted.
    if args.module_names:
        args.skip_tutorial = True
    else:
        args.module_names = list(PUBLIC_SUBMODULES)

    os.environ['SCIPY_PIL_IMAGE_VIEWER'] = 'true'

    # Also import modules whose docs live under another module's pages.
    module_names = list(args.module_names)
    for name in list(module_names):
        if name in OTHER_MODULE_DOCS:
            name = OTHER_MODULE_DOCS[name]
            if name not in module_names:
                module_names.append(name)

    for submodule_name in module_names:
        module_name = BASE_MODULE + '.' + submodule_name
        __import__(module_name)
        module = sys.modules[module_name]

        if submodule_name not in OTHER_MODULE_DOCS:
            find_names(module, names_dict)

        # Only modules the user actually asked for are checked.
        if submodule_name in args.module_names:
            modules.append(module)

    dots = True
    success = True
    results = []

    print("Running checks for %d modules:" % (len(modules),))

    if args.doctests or not args.skip_tutorial:
        init_matplotlib()

    for module in modules:
        if dots:
            if module is not modules[0]:
                sys.stderr.write(' ')
            sys.stderr.write(module.__name__ + ' ')
            sys.stderr.flush()

        all_dict, deprecated, others = get_all_dict(module)
        names = names_dict.get(module.__name__, set())

        mod_results = []
        mod_results += check_items(all_dict, names, deprecated, others, module.__name__)
        mod_results += check_rest(module, set(names).difference(deprecated),
                                  dots=dots)
        if args.doctests:
            mod_results += check_doctests(module, (args.verbose >= 2), dots=dots,
                                          doctest_warnings=args.doctest_warnings)

        for v in mod_results:
            assert isinstance(v, tuple), v

        results.append((module, mod_results))

    if dots:
        sys.stderr.write("\n")
        sys.stderr.flush()

    if not args.skip_tutorial:
        base_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')
        tut_path = os.path.join(base_dir, 'doc', 'source', 'tutorial', '*.rst')
        print('\nChecking tutorial files at %s:' % os.path.relpath(tut_path, os.getcwd()))
        for filename in sorted(glob.glob(tut_path)):
            if dots:
                sys.stderr.write('\n')
                sys.stderr.write(os.path.split(filename)[1] + ' ')
                sys.stderr.flush()

            tut_results = check_doctests_testfile(filename, (args.verbose >= 2),
                    dots=dots, doctest_warnings=args.doctest_warnings)

            # The report loop below expects a module-like object carrying
            # __name__, so fabricate one for the tutorial file.
            def scratch(): pass        # stub out a "module", see below
            scratch.__name__ = filename
            results.append((scratch, tut_results))

        if dots:
            sys.stderr.write("\n")
            sys.stderr.flush()

    # Report results
    all_success = True

    for module, mod_results in results:
        success = all(x[1] for x in mod_results)
        all_success = all_success and success

        if success and args.verbose == 0:
            continue

        print("")
        print("=" * len(module.__name__))
        print(module.__name__)
        print("=" * len(module.__name__))
        print("")

        for name, success, output in mod_results:
            if name is None:
                # Module-level summary coming from check_items.
                if not success or args.verbose >= 1:
                    print(output.strip())
                    print("")
            elif not success or (args.verbose >= 2 and output.strip()):
                print(name)
                print("-"*len(name))
                print("")
                print(output.strip())
                print("")

    if all_success:
        print("\nOK: refguide and doctests checks passed!")
        sys.exit(0)
    else:
        print("\nERROR: refguide or doctests have errors")
        sys.exit(1)
+
+
# Script entry point: pass through everything after the program name.
if __name__ == '__main__':
    main(argv=sys.argv[1:])
diff --git a/tools/travis-test.sh b/tools/travis-test.sh
index fa83606..3533620 100755
--- a/tools/travis-test.sh
+++ b/tools/travis-test.sh
@@ -25,8 +25,7 @@
 fi
 
 # make some warnings fatal, mostly to match windows compilers
-werrors="-Werror=declaration-after-statement -Werror=vla "
-werrors+="-Werror=nonnull -Werror=pointer-arith"
+werrors="-Werror=vla -Werror=nonnull -Werror=pointer-arith"
 
 # build with c99 by default
 
@@ -63,48 +62,6 @@
   fi
 }
 
-setup_chroot()
-{
-  # this can all be replaced with:
-  # apt-get install libpython2.7-dev:i386
-  # CC="gcc -m32" LDSHARED="gcc -m32 -shared" LDFLAGS="-m32 -shared" \
-  #   linux32 python setup.py build
-  # when travis updates to ubuntu 14.04
-  #
-  # NumPy may not distinguish between 64 and 32 bit ATLAS in the
-  # configuration stage.
-  DIR=$1
-  set -u
-  sudo debootstrap --variant=buildd --include=fakeroot,build-essential \
-    --arch=$ARCH --foreign $DIST $DIR
-  sudo chroot $DIR ./debootstrap/debootstrap --second-stage
-
-  # put the numpy repo in the chroot directory
-  sudo rsync -a $TRAVIS_BUILD_DIR $DIR/
-
-  # set up repos in the chroot directory for installing packages
-  echo deb http://archive.ubuntu.com/ubuntu/ \
-    $DIST main restricted universe multiverse \
-    | sudo tee -a $DIR/etc/apt/sources.list
-  echo deb http://archive.ubuntu.com/ubuntu/ \
-    $DIST-updates main restricted universe multiverse \
-    | sudo tee -a $DIR/etc/apt/sources.list
-  echo deb http://security.ubuntu.com/ubuntu \
-    $DIST-security  main restricted universe multiverse \
-    | sudo tee -a $DIR/etc/apt/sources.list
-
-  sudo chroot $DIR bash -c "apt-get update"
-  # faster operation with preloaded eatmydata
-  sudo chroot $DIR bash -c "apt-get install -qq -y eatmydata"
-  echo '/usr/$LIB/libeatmydata.so' | \
-    sudo tee -a $DIR/etc/ld.so.preload
-
-  # install needed packages
-  sudo chroot $DIR bash -c "apt-get install -qq -y \
-    libatlas-base-dev gfortran python3-dev python3-pip \
-    cython  python3-pytest"
-}
-
 run_test()
 {
   if [ -n "$USE_DEBUG" ]; then
@@ -113,7 +70,7 @@
 
   if [ -n "$RUN_COVERAGE" ]; then
     $PIP install pytest-cov
-    NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
+    export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
     COVERAGE_FLAG=--coverage
   fi
 
@@ -224,15 +181,6 @@
 
   popd
   run_test
-elif [ -n "$USE_CHROOT" ] && [ $# -eq 0 ]; then
-  DIR=/chroot
-  setup_chroot $DIR
-  # the chroot'ed environment will not have the current locale,
-  # avoid any warnings which may disturb testing
-  export LANG=C LC_ALL=C
-  # run again in chroot with this time testing with python3
-  sudo linux32 chroot $DIR bash -c \
-    "cd numpy && PYTHON=python3 PIP=pip3 IN_CHROOT=1 $0 test"
 else
   setup_base
   run_test