Merge pull request #136 from Cyan4973/dev

xxHash v0.6.5
diff --git a/.travis.yml b/.travis.yml
index 4adeb39..895da85 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,5 +7,3 @@
   - sudo apt-get install -qq clang
   - sudo apt-get install -qq g++-multilib
   - sudo apt-get install -qq gcc-multilib
-  - sudo apt-get install -qq valgrind
-
diff --git a/Makefile b/Makefile
index c352b51..6dd738f 100644
--- a/Makefile
+++ b/Makefile
@@ -33,10 +33,19 @@
 LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
 LIBVER := $(LIBVER_MAJOR).$(LIBVER_MINOR).$(LIBVER_PATCH)
 
-CFLAGS ?= -O3
+# SSE4 detection : HAVE_SSE4 is 1 when the predefined macros of $(CC) mention "SSE4"; -mno-sse4 then joins the default CFLAGS
+HAVE_SSE4 := $(shell $(CC) -dM -E - < /dev/null | grep "SSE4" > /dev/null && echo 1 || echo 0)
+ifeq ($(HAVE_SSE4), 1)
+NOSSE4 := -mno-sse4
+else
+NOSSE4 :=
+endif
+
+CFLAGS ?= -O2 $(NOSSE4)   # disables potential auto-vectorization
 CFLAGS += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
           -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
-		  -Wstrict-prototypes -Wundef
+          -Wstrict-prototypes -Wundef
+
 FLAGS   = $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MOREFLAGS)
 XXHSUM_VERSION=$(LIBVER)
 MD2ROFF = ronn
@@ -67,16 +76,19 @@
 
 
 .PHONY: default
-default: lib xxhsum
+default: lib xxhsum_and_links
 
 .PHONY: all
-all: lib xxhsum xxhsum32 xxhsum_inlinedXXH
+all: lib xxhsum xxhsum_inlinedXXH
 
 xxhsum32: CFLAGS += -m32
 xxhsum xxhsum32: xxhash.c xxhsum.c
 	$(CC) $(FLAGS) $^ -o $@$(EXT)
-	ln -sf $@ xxh32sum
-	ln -sf $@ xxh64sum
+
+.PHONY: xxhsum_and_links
+xxhsum_and_links: xxhsum   # builds xxhsum, then (re)creates the xxh32sum and xxh64sum symlinks pointing to it
+	ln -sf xxhsum xxh32sum
+	ln -sf xxhsum xxh64sum
 
 xxhsum_inlinedXXH: xxhsum.c
 	$(CC) $(FLAGS) -DXXH_PRIVATE_API $^ -o $@$(EXT)
@@ -89,7 +101,10 @@
 	@echo compiling static library
 	@$(AR) $(ARFLAGS) $@ $^
 
-$(LIBXXH): LDFLAGS += -shared -fPIC
+$(LIBXXH): LDFLAGS += -shared
+ifeq (,$(filter Windows%,$(OS)))
+$(LIBXXH): LDFLAGS += -fPIC
+endif
 $(LIBXXH): xxhash.c
 	@echo compiling dynamic library $(LIBVER)
 	@$(CC) $(FLAGS) $^ $(LDFLAGS) $(SONAME_FLAGS) -o $@
@@ -104,12 +119,12 @@
 
 # tests
 
-.PHONY: test
-test: xxhsum
+.PHONY: check
+check: xxhsum
 	# stdin
 	./xxhsum < xxhash.c
 	# multiple files
-	./xxhsum *
+	./xxhsum xxhash.* xxhsum.*
 	# internal bench
 	./xxhsum -bi1
 	# file bench
@@ -119,21 +134,21 @@
 test-mem: xxhsum
 	# memory tests
 	valgrind --leak-check=yes --error-exitcode=1 ./xxhsum -bi1 xxhash.c
-	valgrind --leak-check=yes --error-exitcode=1 ./xxhsum -H0 xxhash.c
-	valgrind --leak-check=yes --error-exitcode=1 ./xxhsum -H1 xxhash.c
+	valgrind --leak-check=yes --error-exitcode=1 ./xxhsum -H0  xxhash.c
+	valgrind --leak-check=yes --error-exitcode=1 ./xxhsum -H1  xxhash.c
 
 .PHONY: test32
 test32: clean xxhsum32
-	@echo ---- test 32-bits ----
+	@echo ---- test 32-bit ----
 	./xxhsum32 -bi1 xxhash.c
 
 test-xxhsum-c: xxhsum
 	# xxhsum to/from pipe
-	./xxhsum * | ./xxhsum -c -
-	./xxhsum -H0 * | ./xxhsum -c -
+	./xxhsum lib* | ./xxhsum -c -
+	./xxhsum -H0 lib* | ./xxhsum -c -
 	# xxhsum to/from file, shell redirection
-	./xxhsum * > .test.xxh64
-	./xxhsum -H0 * > .test.xxh32
+	./xxhsum lib* > .test.xxh64
+	./xxhsum -H0 lib* > .test.xxh32
 	./xxhsum -c .test.xxh64
 	./xxhsum -c .test.xxh32
 	./xxhsum -c < .test.xxh64
@@ -147,8 +162,6 @@
 	# Expects "FAILED open or read"
 	echo "0000000000000000  test-expects-file-not-found" | ./xxhsum -c -; test $$? -eq 1
 	echo "00000000  test-expects-file-not-found" | ./xxhsum -c -; test $$? -eq 1
-
-clean-xxhsum-c:
 	@$(RM) -f .test.xxh32 .test.xxh64
 
 armtest: clean
@@ -168,9 +181,10 @@
 	$(CC) -std=c90 -Werror -pedantic -DXXH_NO_LONG_LONG -c xxhash.c
 	$(RM) xxhash.o
 
+usan: CC=clang
 usan: clean
 	@echo ---- check undefined behavior - sanitize ----
-	$(MAKE) clean test CC=clang MOREFLAGS="-g -fsanitize=undefined"
+	$(MAKE) clean test CC=$(CC) MOREFLAGS="-g -fsanitize=undefined -fno-sanitize-recover=all"
 
 staticAnalyze: clean
 	@echo ---- static analyzer - scan-build ----
@@ -193,15 +207,21 @@
 preview-man: clean-man man
 	man ./xxhsum.1
 
-test-all: clean all namespaceTest test test32 test-xxhsum-c clean-xxhsum-c \
-	armtest clangtest gpptest c90test test-mem usan staticAnalyze
+test: all namespaceTest check test-xxhsum-c c90test
+
+test-all: test test32 armtest clangtest gpptest usan listL120 trailingWhitespace staticAnalyze
 
 .PHONY: listL120
 listL120:  # extract lines >= 120 characters in *.{c,h}, by Takayuki Matsuoka (note : $$, for Makefile compatibility)
 	find . -type f -name '*.c' -o -name '*.h' | while read -r filename; do awk 'length > 120 {print FILENAME "(" FNR "): " $$0}' $$filename; done
 
+.PHONY: trailingWhitespace
+trailingWhitespace:   # fails when any of the listed files contains a line ending with a space or a tab
+	! grep -E "`printf '[ \\t]$$'`" *.1 *.c *.h LICENSE Makefile cmake_unofficial/CMakeLists.txt
+
 .PHONY: clean
-clean: clean-xxhsum-c
+clean:
+	@$(RM) -r *.dSYM   # Mac OS-X specific
 	@$(RM) core *.o libxxhash.*
 	@$(RM) xxhsum$(EXT) xxhsum32$(EXT) xxhsum_inlinedXXH$(EXT) xxh32sum xxh64sum
 	@echo cleaning completed
@@ -212,6 +232,10 @@
 #-----------------------------------------------------------------------------
 ifneq (,$(filter $(shell uname),Linux Darwin GNU/kFreeBSD GNU OpenBSD FreeBSD NetBSD DragonFly SunOS))
 
+.PHONY: list
+list:   # prints the target names extracted from make's database dump (-p) of this Makefile, sorted, excluding `list` itself
+	@$(MAKE) -pRrq -f $(lastword $(MAKEFILE_LIST)) : 2>/dev/null | awk -v RS= -F: '/^# File/,/^# Finished Make data base/ {if ($$1 !~ "^[#.]") {print $$1}}' | sort | egrep -v -e '^[^[:alnum:]]' -e '^$@$$' | xargs
+
 DESTDIR     ?=
 # directory variables : GNU conventions prefer lowercase
 # see https://www.gnu.org/prep/standards/html_node/Makefile-Conventions.html
diff --git a/README.md b/README.md
index 5be77c1..30318a9 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 Benchmarks
 -------------------------
 
-The benchmark uses SMHasher speed test, compiled with Visual 2010 on a Windows Seven 32-bits box.
+The benchmark uses SMHasher speed test, compiled with Visual 2010 on a Windows Seven 32-bit box.
 The reference system uses a Core 2 Duo @3GHz
 
 
@@ -40,13 +40,13 @@
 Algorithms with a score < 5 are not listed on this table.
 
 A more recent version, XXH64, has been created thanks to [Mathias Westerdahl](https://github.com/JCash),
-which offers superior speed and dispersion for 64-bits systems.
-Note however that 32-bits applications will still run faster using the 32-bits version.
+which offers superior speed and dispersion for 64-bit systems.
+Note however that 32-bit applications will still run faster using the 32-bit version.
 
-SMHasher speed test, compiled using GCC 4.8.2, on Linux Mint 64-bits.
+SMHasher speed test, compiled using GCC 4.8.2, on Linux Mint 64-bit.
 The reference system uses a Core i5-3340M @2.7GHz
 
-| Version    | Speed on 64-bits | Speed on 32-bits |
+| Version    | Speed on 64-bit  | Speed on 32-bit  |
 |------------|------------------|------------------|
 | XXH64      | 13.8 GB/s        |  1.9 GB/s        |
 | XXH32      |  6.8 GB/s        |  6.0 GB/s        |
@@ -66,9 +66,15 @@
 The following macros can be set at compilation time,
 they modify xxhash behavior. They are all disabled by default.
 
+- `XXH_INLINE_ALL` : Make all functions `inline`, with bodies directly included within `xxhash.h`.
+                     There is no need for an `xxhash.o` module in this case.
+                     Inlining functions is generally beneficial for speed on small keys.
+                     It's especially effective when key length is a compile time constant,
+                     with observed performance improvement in the +200% range.
+                     See [this article](https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html) for details.
 - `XXH_ACCEPT_NULL_INPUT_POINTER` : if set to `1`, when input is a null-pointer,
-                                    xxhash result is the same as a null-length key,
-                                    instead of a dereference segfault.
+                                    xxhash result is the same as a zero-length key
+                                    (instead of a dereference segfault).
 - `XXH_FORCE_MEMORY_ACCESS` : default method `0` uses a portable `memcpy()` notation.
                               Method `1` uses a gcc-specific `packed` attribute, which can provide better performance for some targets.
                               Method `2` forces unaligned reads, which is not standard compliant, but might sometimes be the only way to extract better performance.
@@ -77,21 +83,22 @@
                             Setting it to 0 forces big-endian.
 - `XXH_FORCE_NATIVE_FORMAT` : on big-endian systems : use native number representation.
                               Breaks consistency with little-endian results.
+- `XXH_PRIVATE_API` : same impact as `XXH_INLINE_ALL`.
+                      The name underlines that symbols will not be published in the library's public interface.
 - `XXH_NAMESPACE` : prefix all symbols with the value of `XXH_NAMESPACE`.
                     Useful to evade symbol naming collisions,
-                    in case of multiple inclusions of xxHash library.
-                    Client programs can still use regular function name, symbols are automatically translated through `xxhash.h`.
-- `XXH_STATIC_LINKING_ONLY` : gives access to state definition for static allocation.
+                    in case of multiple inclusions of xxHash source code.
+                    Client applications can still use regular function name,
+                    symbols are automatically translated through `xxhash.h`.
+- `XXH_STATIC_LINKING_ONLY` : gives access to state declaration for static allocation.
                               Incompatible with dynamic linking, due to risks of ABI changes.
-- `XXH_PRIVATE_API` : Make all functions `static`, directly accessible through `#include xxhash.h`, for inlining.
-                      Do not compile `xxhash.c` as a separate module in this case.
 - `XXH_NO_LONG_LONG` : removes support for XXH64,
-                       useful for targets without 64-bits support.
+                       for targets without 64-bit support.
 
 
 ### Example
 
-Calling xxhash 64-bits variant from a C program :
+Calling xxhash 64-bit variant from a C program :
 
 ```
 #include "xxhash.h"
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..aa71222
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,70 @@
+version: 1.0.{build}
+environment:
+  matrix:
+  - COMPILER: "gcc"
+    PLATFORM: "mingw64"
+  - COMPILER: "gcc"
+    PLATFORM: "mingw32"
+
+install:
+  - ECHO Installing %COMPILER% %PLATFORM% %CONFIGURATION%
+  - MKDIR bin
+  - if [%COMPILER%]==[gcc] SET PATH_ORIGINAL=%PATH%
+  - if [%COMPILER%]==[gcc] (
+      SET "PATH_MINGW32=c:\MinGW\bin;c:\MinGW\usr\bin" &&
+      SET "PATH_MINGW64=c:\msys64\mingw64\bin;c:\msys64\usr\bin" &&
+      COPY C:\MinGW\bin\mingw32-make.exe C:\MinGW\bin\make.exe &&
+      COPY C:\MinGW\bin\gcc.exe C:\MinGW\bin\cc.exe
+    ) else (
+      IF [%PLATFORM%]==[x64] (SET ADDITIONALPARAM=/p:LibraryPath="C:\Program Files\Microsoft SDKs\Windows\v7.1\lib\x64;c:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\lib\amd64;C:\Program Files (x86)\Microsoft Visual Studio 10.0\;C:\Program Files (x86)\Microsoft Visual Studio 10.0\lib\amd64;")
+    )
+
+build_script:
+  - if [%PLATFORM%]==[mingw32] SET PATH=%PATH_MINGW32%;%PATH_ORIGINAL%
+  - if [%PLATFORM%]==[mingw64] SET PATH=%PATH_MINGW64%;%PATH_ORIGINAL%
+  - if [%PLATFORM%]==[clang] SET PATH=%PATH_MINGW64%;%PATH_ORIGINAL%
+  - ECHO *** &&
+      ECHO Building %COMPILER% %PLATFORM% %CONFIGURATION% &&
+      ECHO ***
+  - if [%PLATFORM%]==[clang] (clang -v)
+  - if [%COMPILER%]==[gcc] (gcc -v)
+  - if [%COMPILER%]==[gcc] (
+      echo ----- &&
+      make -v &&
+      echo ----- &&
+      if not [%PLATFORM%]==[clang] (
+        make -B clean test MOREFLAGS=-Werror
+      ) ELSE (
+        make -B clean test CC=clang MOREFLAGS="--target=x86_64-w64-mingw32 -Werror -Wconversion -Wno-sign-conversion"
+      )
+    )
+  - if [%COMPILER%]==[visual] (
+      ECHO *** &&
+      ECHO *** Building Visual Studio 2010 %PLATFORM%\%CONFIGURATION% &&
+      ECHO *** &&
+      msbuild "visual\VS2010\lz4.sln" %ADDITIONALPARAM% /m /verbosity:minimal /property:PlatformToolset=v100 /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /p:EnableWholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" &&
+      ECHO *** &&
+      ECHO *** Building Visual Studio 2012 %PLATFORM%\%CONFIGURATION% &&
+      ECHO *** &&
+      msbuild "visual\VS2010\lz4.sln" /m /verbosity:minimal /property:PlatformToolset=v110 /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" &&
+      ECHO *** &&
+      ECHO *** Building Visual Studio 2013 %PLATFORM%\%CONFIGURATION% &&
+      ECHO *** &&
+      msbuild "visual\VS2010\lz4.sln" /m /verbosity:minimal /property:PlatformToolset=v120 /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" &&
+      ECHO *** &&
+      ECHO *** Building Visual Studio 2015 %PLATFORM%\%CONFIGURATION% &&
+      ECHO *** &&
+      msbuild "visual\VS2010\lz4.sln" /m /verbosity:minimal /property:PlatformToolset=v140 /t:Clean,Build /p:Platform=%PLATFORM% /p:Configuration=%CONFIGURATION% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" &&
+      COPY visual\VS2010\bin\%PLATFORM%_%CONFIGURATION%\*.exe programs\
+    )
+
+test_script:
+  - ECHO *** &&
+      ECHO Testing %COMPILER% %PLATFORM% %CONFIGURATION% &&
+      ECHO ***
+  - if not [%COMPILER%]==[unknown] (
+      xxhsum -h &&
+      xxhsum xxhsum.exe &&
+      xxhsum -bi1 &&
+      echo ------- xxhsum tested -------
+    )
diff --git a/cmake_unofficial/CMakeLists.txt b/cmake_unofficial/CMakeLists.txt
index 82b32ff..1ca7a06 100644
--- a/cmake_unofficial/CMakeLists.txt
+++ b/cmake_unofficial/CMakeLists.txt
@@ -1,41 +1,100 @@
-cmake_minimum_required(VERSION 2.6)
-cmake_policy(VERSION 2.6)
+# To the extent possible under law, the author(s) have dedicated all
+# copyright and related and neighboring rights to this software to
+# the public domain worldwide. This software is distributed without
+# any warranty.
+#
+# For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 
-project(xxhash)
+set(XXHASH_DIR "${CMAKE_CURRENT_SOURCE_DIR}/..")
 
-set(XXHASH_LIB_VERSION "0.6.3")
-set(XXHASH_LIB_SOVERSION "0")
+file(STRINGS "${XXHASH_DIR}/xxhash.h" XXHASH_VERSION_MAJOR REGEX "^#define XXH_VERSION_MAJOR +([0-9]+) *$")
+string(REGEX REPLACE "^#define XXH_VERSION_MAJOR +([0-9]+) *$" "\\1" XXHASH_VERSION_MAJOR "${XXHASH_VERSION_MAJOR}")
+file(STRINGS "${XXHASH_DIR}/xxhash.h" XXHASH_VERSION_MINOR REGEX "^#define XXH_VERSION_MINOR +([0-9]+) *$")
+string(REGEX REPLACE "^#define XXH_VERSION_MINOR +([0-9]+) *$" "\\1" XXHASH_VERSION_MINOR "${XXHASH_VERSION_MINOR}")
+file(STRINGS "${XXHASH_DIR}/xxhash.h" XXHASH_VERSION_RELEASE REGEX "^#define XXH_VERSION_RELEASE +([0-9]+) *$")
+string(REGEX REPLACE "^#define XXH_VERSION_RELEASE +([0-9]+) *$" "\\1" XXHASH_VERSION_RELEASE "${XXHASH_VERSION_RELEASE}")
+set(XXHASH_VERSION_STRING "${XXHASH_VERSION_MAJOR}.${XXHASH_VERSION_MINOR}.${XXHASH_VERSION_RELEASE}")
+set(XXHASH_LIB_VERSION ${XXHASH_VERSION_STRING})
+set(XXHASH_LIB_SOVERSION "${XXHASH_VERSION_MAJOR}")
+mark_as_advanced(XXHASH_VERSION_MAJOR XXHASH_VERSION_MINOR XXHASH_VERSION_RELEASE XXHASH_VERSION_STRING XXHASH_LIB_VERSION XXHASH_LIB_SOVERSION)
 
 option(BUILD_XXHSUM "Build the xxhsum binary" ON)
 option(BUILD_SHARED_LIBS "Build shared library" ON)
 
-# Make CMake's RPATH handling not be insane. This suff has cmake set rpaths appropriately for
-# where things end up in the install tree. For some reason that's not the default:
-# https://cmake.org/Wiki/CMake_RPATH_handling
-SET(CMAKE_SKIP_BUILD_RPATH FALSE)
-SET(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
-
-# Where we search for shared libraries
-SET(CMAKE_INSTALL_RPATH "./lib")
-
-# add the automatically determined parts of the RPATH
-# which point to directories outside the build tree to the install RPATH
-SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-
-add_library(xxhash ../xxhash.c)
-set_target_properties(xxhash PROPERTIES COMPILE_DEFINITIONS "XXHASH_EXPORT"
-                       VERSION "${XXHASH_LIB_VERSION}"
-                       SOVERSION "${XXHASH_LIB_SOVERSION}")
-
-if (BUILD_XXHSUM)
-    add_executable(xxhsum ../xxhsum.c)
-    target_link_libraries(xxhsum xxhash)
+if("${CMAKE_VERSION}" VERSION_LESS "3.0")
+  project(XXHASH C)
+else()
+  cmake_policy (SET CMP0048 NEW)
+  project(XXHASH
+    VERSION ${XXHASH_VERSION_STRING}
+    LANGUAGES C)
 endif()
 
-INSTALL(FILES ../xxhash.h DESTINATION include)
-INSTALL(
-    TARGETS xxhash xxhsum
-    RUNTIME DESTINATION bin
-    ARCHIVE DESTINATION lib
-    LIBRARY DESTINATION lib
-)
+cmake_minimum_required (VERSION 2.8.12)
+
+# If XXHASH is being bundled in another project, we don't want to
+# install anything.  However, we want to let people override this, so
+# we'll use the XXHASH_BUNDLED_MODE variable to let them do that; just
+# set it to OFF in your project before you add_subdirectory(xxhash/cmake_unofficial).
+if(NOT DEFINED XXHASH_BUNDLED_MODE)
+  # Bundled mode hasn't been set one way or the other, set the default
+  # depending on whether or not we are the top-level project.
+  if("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
+    set(XXHASH_BUNDLED_MODE OFF)
+  else()
+    set(XXHASH_BUNDLED_MODE ON)
+  endif()
+endif()
+mark_as_advanced(XXHASH_BUNDLED_MODE)
+
+# Allow people to choose whether to build shared or static libraries
+# via the BUILD_SHARED_LIBS option unless we are in bundled mode, in
+# which case we always use static libraries.
+include(CMakeDependentOption)
+CMAKE_DEPENDENT_OPTION(BUILD_SHARED_LIBS "Build shared libraries" ON "NOT XXHASH_BUNDLED_MODE" OFF)
+
+include_directories("${XXHASH_DIR}")
+
+# libxxhash : SOVERSION carries only the major version (XXHASH_LIB_SOVERSION), VERSION the full release number
+add_library(xxhash "${XXHASH_DIR}/xxhash.c")
+set_target_properties(xxhash PROPERTIES
+  SOVERSION "${XXHASH_LIB_SOVERSION}"
+  VERSION "${XXHASH_VERSION_STRING}")
+
+# xxhsum
+add_executable(xxhsum "${XXHASH_DIR}/xxhsum.c")
+target_link_libraries(xxhsum xxhash)
+
+# Extra warning flags : each flag is prepended to CMAKE_C_FLAGS only when the C compiler accepts it
+include (CheckCCompilerFlag)
+foreach (flag
+    -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow
+    -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement
+    -Wstrict-prototypes -Wundef)
+  # A -Wno-xxx flag is tested in its positive -Wxxx form, because https://gcc.gnu.org/wiki/FAQ#wnowarning
+  string(REGEX REPLACE "\\-Wno\\-(.+)" "-W\\1" flag_to_test "${flag}")
+  string(REGEX REPLACE "[^a-zA-Z0-9]+" "_" test_name "CFLAG_${flag_to_test}")
+
+  check_c_compiler_flag("${ADD_COMPILER_FLAGS_PREPEND} ${flag_to_test}" ${test_name})
+
+  if(${test_name})
+    set(CMAKE_C_FLAGS "${flag} ${CMAKE_C_FLAGS}")
+  endif()
+
+  unset(test_name)
+  unset(flag_to_test)
+endforeach (flag)
+
+if(NOT XXHASH_BUNDLED_MODE)
+  include(GNUInstallDirs)
+
+  install(TARGETS xxhsum
+    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
+  install(TARGETS xxhash
+    LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+  install(FILES "${XXHASH_DIR}/xxhash.h"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+  install(FILES "${XXHASH_DIR}/xxhsum.1"
+    DESTINATION "${CMAKE_INSTALL_MANDIR}/man1")
+endif(NOT XXHASH_BUNDLED_MODE)
diff --git a/cmake_unofficial/README.md b/cmake_unofficial/README.md
index fb93042..4fca58d 100644
--- a/cmake_unofficial/README.md
+++ b/cmake_unofficial/README.md
@@ -3,4 +3,4 @@
 The `cmake` script present in this directory offers the following options :
 
 - `BUILD_XXHSUM` : build the command line binary. ON by default
-- `BUILD_SHARED_LIBS` : build a dynamic library. OFF by default, builds static library instead.
+- `BUILD_SHARED_LIBS` : build dynamic library. ON by default.
diff --git a/doc/xxhash_spec.md b/doc/xxhash_spec.md
new file mode 100644
index 0000000..e673334
--- /dev/null
+++ b/doc/xxhash_spec.md
@@ -0,0 +1,311 @@
+xxHash fast digest algorithm
+======================
+
+### Notices
+
+Copyright (c) Yann Collet
+
+Permission is granted to copy and distribute this document
+for any purpose and without charge,
+including translations into other languages
+and incorporation into compilations,
+provided that the copyright notice and this notice are preserved,
+and that any substantive changes or deletions from the original
+are clearly marked.
+Distribution of this document is unlimited.
+
+### Version
+
+0.1.0 (15/01/18)
+
+
+Table of Contents
+---------------------
+- [Introduction](#introduction)
+- [XXH32 algorithm description](#xxh32-algorithm-description)
+- [XXH64 algorithm description](#xxh64-algorithm-description)
+- [Performance considerations](#performance-considerations)
+- [Reference Implementation](#reference-implementation)
+
+
+Introduction
+----------------
+
+This document describes the xxHash digest algorithm, for both its 32-bit and 64-bit variants, named `XXH32` and `XXH64`. The algorithm takes as input a message of arbitrary length and an optional seed value, and produces a 32-bit or 64-bit output called a "fingerprint" or "digest".
+
+xxHash is primarily designed for speed. It is labelled non-cryptographic, and is not meant to avoid intentional collisions (same digest for 2 different messages), or to prevent producing a message with predefined digest.
+
+XXH32 is designed to be fast on 32-bit machines.
+XXH64 is designed to be fast on 64-bit machines.
+Both variants produce different output.
+However, a given variant shall produce exactly the same output, irrespective of the cpu / os used. In particular, the result remains identical whatever the endianness and width of the cpu.
+
+### Operation notations
+
+All operations are performed modulo {32,64} bits. Arithmetic overflows are expected.
+`XXH32` uses 32-bit modular operations. `XXH64` uses 64-bit modular operations.
+
+- `+` : denote modular addition
+- `*` : denote modular multiplication
+- `X <<< s` : denote the value obtained by circularly shifting (rotating) `X` left by `s` bit positions.  
+- `X >> s` : denote the value obtained by shifting `X` right by s bit positions. Upper `s` bits become `0`.  
+- `X xor Y` : denote the bit-wise XOR of `X` and `Y` (same width).
+
+
+XXH32 Algorithm Description
+-------------------------------------
+
+### Overview
+
+We begin by supposing that we have a message of any length `L` as input, and that we wish to find its digest. Here `L` is an arbitrary nonnegative integer; `L` may be zero. The following steps are performed to compute the digest of the message.
+
+The algorithm collects and transforms input in _stripes_ of 16 bytes. The transforms are stored inside 4 "accumulators", each one storing an unsigned 32-bit value. Each accumulator can be processed independently in parallel, speeding up processing for cpu with multiple execution units.
+
+The algorithm uses 32-bit addition, multiplication, rotate, shift and xor operations. Many operations require some 32-bit prime number constants, all defined below :
+
+    static const u32 PRIME32_1 = 2654435761U;
+    static const u32 PRIME32_2 = 2246822519U;
+    static const u32 PRIME32_3 = 3266489917U;
+    static const u32 PRIME32_4 =  668265263U;
+    static const u32 PRIME32_5 =  374761393U;
+
+### Step 1. Initialise internal accumulators
+
+Each accumulator gets an initial value based on optional `seed` input. Since the `seed` is optional, it can be `0`.
+
+        u32 acc1 = seed + PRIME32_1 + PRIME32_2;
+        u32 acc2 = seed + PRIME32_2;
+        u32 acc3 = seed + 0;
+        u32 acc4 = seed - PRIME32_1;
+
+#### Special case : input is less than 16 bytes
+
+When input is too small (< 16 bytes), the algorithm will not process any stripe. Consequently, it will not make use of parallel accumulators.
+
+In which case, a simplified initialization is performed, using a single accumulator :
+
+      u32 acc  = seed + PRIME32_5;
+
+The algorithm then proceeds directly to step 4.
+
+### Step 2. Process stripes
+
+A stripe is a contiguous segment of 16 bytes.
+It is evenly divided into 4 _lanes_, of 4 bytes each.
+The first lane is used to update accumulator 1, the second lane is used to update accumulator 2, and so on.
+
+Each lane reads its associated 32-bit value using __little-endian__ convention.
+
+For each {lane, accumulator}, the update process is called a _round_, and applies the following formula :
+
+    accN = accN + (laneN * PRIME32_2);
+    accN = accN <<< 13;
+    accN = accN * PRIME32_1;
+
+This shuffles the bits so that any bit from input _lane_ impacts several bits in output _accumulator_. All operations are performed modulo 2^32.
+
+Input is consumed one full stripe at a time. Step 2 is looped as many times as necessary to consume the whole input, except the last remaining bytes which cannot form a stripe (< 16 bytes).
+When that happens, move to step 3.
+
+### Step 3. Accumulator convergence
+
+All 4 lane accumulators from previous steps are merged to produce a single remaining accumulator of same width (32-bit). The associated formula is as follows :
+
+    acc = (acc1 <<< 1) + (acc2 <<< 7) + (acc3 <<< 12) + (acc4 <<< 18);
+
+### Step 4. Add input length
+
+The input total length is presumed known at this stage. This step is just about adding the length to the accumulator, so that it participates in the final mixing.
+
+    acc = acc + (u32)inputLength;
+
+Note that, if input length is so large that it requires more than 32 bits, only the lower 32 bits are added to the accumulator.
+
+### Step 5. Consume remaining input
+
+There may be up to 15 bytes remaining to consume from the input.
+The final stage will digest them according to following pseudo-code :
+
+    while (remainingLength >= 4) {
+        lane = read_32bit_little_endian(input_ptr);
+        acc = acc + lane * PRIME32_3;
+        acc = (acc <<< 17) * PRIME32_4;
+        input_ptr += 4; remainingLength -= 4;
+    }
+
+    while (remainingLength >= 1) {
+        lane = read_byte(input_ptr);
+        acc = acc + lane * PRIME32_5;
+        acc = (acc <<< 11) * PRIME32_1;
+        input_ptr += 1; remainingLength -= 1;
+    }
+
+This process ensures that all input bytes are present in the final mix.
+
+### Step 6. Final mix (avalanche)
+
+The final mix ensures that all input bits have a chance to impact any bit in the output digest, resulting in an unbiased distribution. This is also called avalanche effect.
+
+    acc = acc xor (acc >> 15);
+    acc = acc * PRIME32_2;
+    acc = acc xor (acc >> 13);
+    acc = acc * PRIME32_3;
+    acc = acc xor (acc >> 16);
+
+### Step 7. Output
+
+The `XXH32()` function produces an unsigned 32-bit value as output.
+
+For systems which require to store and/or display the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence follows __big-endian__ convention (most significant byte first).
+
+
+XXH64 Algorithm Description
+-------------------------------------
+
+### Overview
+
+The `XXH64` algorithm structure is very similar to that of `XXH32`. The major difference is that `XXH64` uses 64-bit arithmetic, speeding up memory transfer for 64-bit compliant systems, but also relying on cpu capability to efficiently perform 64-bit operations.
+
+The algorithm collects and transforms input in _stripes_ of 32 bytes. The transforms are stored inside 4 "accumulators", each one storing an unsigned 64-bit value. Each accumulator can be processed independently in parallel, speeding up processing for cpu with multiple execution units.
+
+The algorithm uses 64-bit addition, multiplication, rotate, shift and xor operations. Many operations require some 64-bit prime number constants, all defined below :
+
+    static const u64 PRIME64_1 = 11400714785074694791ULL;
+    static const u64 PRIME64_2 = 14029467366897019727ULL;
+    static const u64 PRIME64_3 =  1609587929392839161ULL;
+    static const u64 PRIME64_4 =  9650029242287828579ULL;
+    static const u64 PRIME64_5 =  2870177450012600261ULL;
+
+### Step 1. Initialise internal accumulators
+
+Each accumulator gets an initial value based on optional `seed` input. Since the `seed` is optional, it can be `0`.
+
+        u64 acc1 = seed + PRIME64_1 + PRIME64_2;
+        u64 acc2 = seed + PRIME64_2;
+        u64 acc3 = seed + 0;
+        u64 acc4 = seed - PRIME64_1;
+
+#### Special case : input is less than 32 bytes
+
+When input is too small (< 32 bytes), the algorithm will not process any stripe. Consequently, it will not make use of parallel accumulators.
+
+In which case, a simplified initialization is performed, using a single accumulator :
+
+      u64 acc  = seed + PRIME64_5;
+
+The algorithm then proceeds directly to step 4.
+
+### Step 2. Process stripes
+
+A stripe is a contiguous segment of 32 bytes.
+It is evenly divided into 4 _lanes_, of 8 bytes each.
+The first lane is used to update accumulator 1, the second lane is used to update accumulator 2, and so on.
+
+Each lane reads its associated 64-bit value using __little-endian__ convention.
+
+For each {lane, accumulator}, the update process is called a _round_, and applies the following formula :
+
+    round(accN,laneN):
+    accN = accN + (laneN * PRIME64_2);
+    accN = accN <<< 31;
+    return accN * PRIME64_1;
+
+This shuffles the bits so that any bit from input _lane_ impacts several bits in output _accumulator_. All operations are performed modulo 2^64.
+
+Input is consumed one full stripe at a time. Step 2 is looped as many times as necessary to consume the whole input, except the last remaining bytes which cannot form a stripe (< 32 bytes).
+When that happens, move to step 3.
+
+### Step 3. Accumulator convergence
+
+All 4 lane accumulators from previous steps are merged to produce a single remaining accumulator of same width (64-bit). The associated formula is as follows.
+
+Note that accumulator convergence is more complex than in the 32-bit variant, and requires defining another function called _mergeAccumulator()_ :
+
+    mergeAccumulator(acc,accN):
+    acc  = acc xor round(0, accN);
+    acc  = acc * PRIME64_1;
+    return acc + PRIME64_4;
+
+which is then used in the convergence formula :
+
+    acc = (acc1 <<< 1) + (acc2 <<< 7) + (acc3 <<< 12) + (acc4 <<< 18);
+    acc = mergeAccumulator(acc, acc1);
+    acc = mergeAccumulator(acc, acc2);
+    acc = mergeAccumulator(acc, acc3);
+    acc = mergeAccumulator(acc, acc4);
+
+### Step 4. Add input length
+
+The input total length is presumed known at this stage. This step is just about adding the length to the accumulator, so that it participates in the final mixing.
+
+    acc = acc + inputLength;
+
+### Step 5. Consume remaining input
+
+There may be up to 31 bytes remaining to consume from the input.
+The final stage will digest them according to following pseudo-code :
+
+    while (remainingLength >= 8) {
+        lane = read_64bit_little_endian(input_ptr);
+        acc = acc xor round(0, lane);
+        acc = (acc <<< 27) * PRIME64_1;
+        acc = acc + PRIME64_4;
+        input_ptr += 8; remainingLength -= 8;
+    }
+
+    if (remainingLength >= 4) {
+        lane = read_32bit_little_endian(input_ptr);
+        acc = acc xor (lane * PRIME64_1);
+        acc = (acc <<< 23) * PRIME64_2;
+        acc = acc + PRIME64_3;
+        input_ptr += 4; remainingLength -= 4;
+    }
+
+    while (remainingLength >= 1) {
+        lane = read_byte(input_ptr);
+        acc = acc xor (lane * PRIME64_5);
+        acc = (acc <<< 11) * PRIME64_1;
+        input_ptr += 1; remainingLength -= 1;
+    }
+
+This process ensures that all input bytes are present in the final mix.
+
+### Step 6. Final mix (avalanche)
+
+The final mix ensures that all input bits have a chance to impact any bit in the output digest, resulting in an unbiased distribution. This is also called avalanche effect.
+
+    acc = acc xor (acc >> 33);
+    acc = acc * PRIME64_2;
+    acc = acc xor (acc >> 29);
+    acc = acc * PRIME64_3;
+    acc = acc xor (acc >> 32);
+
+### Step 7. Output
+
+The `XXH64()` function produces an unsigned 64-bit value as output.
+
+For systems which require to store and/or display the result in binary or hexadecimal format, the canonical format is defined to reproduce the same value as the natural decimal format, hence follows __big-endian__ convention (most significant byte first).
+
+Performance considerations
+----------------------------------
+
+The xxHash algorithms are simple and compact to implement. They provide a system independent "fingerprint" or digest of a message of arbitrary length.
+
+The algorithm allows input to be streamed and processed in multiple steps. In such case, an internal buffer is needed to ensure data is presented to the algorithm in full stripes.
+
+On 64-bit systems, the 64-bit variant `XXH64` is generally faster to compute, so it is a recommended variant, even when only 32 bits are needed.
+
+On 32-bit systems though, positions are reversed : `XXH64` performance is reduced, due to its usage of 64-bit arithmetic. `XXH32` becomes a faster variant.
+
+
+Reference Implementation
+----------------------------------------
+
+A reference library written in C is available at http://www.xxhash.com .
+The web page also links to multiple other implementations written in many different languages.
+It links to the [github project page](https://github.com/Cyan4973/xxHash) where an [issue board](https://github.com/Cyan4973/xxHash/issues) can be used for further public discussions on the topic.
+
+
+Version changes
+--------------------
+v0.1.0 : initial release
diff --git a/xxhash.c b/xxhash.c
index 63a1171..da06ea7 100644
--- a/xxhash.c
+++ b/xxhash.c
@@ -54,7 +54,7 @@
                         || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
                         || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
 #    define XXH_FORCE_MEMORY_ACCESS 2
-#  elif defined(__INTEL_COMPILER) || \
+#  elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
   (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
                     || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
                     || defined(__ARM_ARCH_7S__) ))
@@ -111,6 +111,8 @@
 #include <string.h>
 static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
 
+#include <assert.h>   /* assert */
+
 #define XXH_STATIC_LINKING_ONLY
 #include "xxhash.h"
 
@@ -215,8 +217,12 @@
 
 /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
 #ifndef XXH_CPU_LITTLE_ENDIAN
-    static const int g_one = 1;
-#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
+static int XXH_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
 #endif
 
 
@@ -252,7 +258,7 @@
 
 
 /* *******************************************************************
-*  32-bits hash functions
+*  32-bit hash functions
 *********************************************************************/
 static const U32 PRIME32_1 = 2654435761U;
 static const U32 PRIME32_2 = 2246822519U;
@@ -268,12 +274,87 @@
     return seed;
 }
 
-FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+/* XXH32_avalanche() : final mix (xor-shift / multiply rounds), giving every bit of h32 a chance to affect any bit of the returned hash */
+static U32 XXH32_avalanche(U32 h32)
+{
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+    return(h32);
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+static U32
+XXH32_finalize(U32 h32, const void* ptr, size_t len,
+                XXH_endianess endian, XXH_alignment align)
+
+{
+    const BYTE* p = (const BYTE*)ptr;
+#define PROCESS1             \
+    h32 += (*p) * PRIME32_5; \
+    p++;                     \
+    h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+
+#define PROCESS4                         \
+    h32 += XXH_get32bits(p) * PRIME32_3; \
+    p+=4;                                \
+    h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+
+    switch(len&15)  /* or switch(bEnd - p) */
+    {
+      case 12:      PROCESS4;
+                    /* fallthrough */
+      case 8:       PROCESS4;
+                    /* fallthrough */
+      case 4:       PROCESS4;
+                    return XXH32_avalanche(h32);
+
+      case 13:      PROCESS4;
+                    /* fallthrough */
+      case 9:       PROCESS4;
+                    /* fallthrough */
+      case 5:       PROCESS4;
+                    PROCESS1;
+                    return XXH32_avalanche(h32);
+
+      case 14:      PROCESS4;
+                    /* fallthrough */
+      case 10:      PROCESS4;
+                    /* fallthrough */
+      case 6:       PROCESS4;
+                    PROCESS1;
+                    PROCESS1;
+                    return XXH32_avalanche(h32);
+
+      case 15:      PROCESS4;
+                    /* fallthrough */
+      case 11:      PROCESS4;
+                    /* fallthrough */
+      case 7:       PROCESS4;
+                    /* fallthrough */
+      case 3:       PROCESS1;
+                    /* fallthrough */
+      case 2:       PROCESS1;
+                    /* fallthrough */
+      case 1:       PROCESS1;
+                    /* fallthrough */
+      case 0:       return XXH32_avalanche(h32);
+    }
+    assert(0);
+    return h32;   /* reaching this point is deemed impossible */
+}
+
+
+FORCE_INLINE U32
+XXH32_endian_align(const void* input, size_t len, U32 seed,
+                    XXH_endianess endian, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
     U32 h32;
-#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
 
 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
     if (p==NULL) {
@@ -283,7 +364,7 @@
 #endif
 
     if (len>=16) {
-        const BYTE* const limit = bEnd - 16;
+        const BYTE* const limit = bEnd - 15;
         U32 v1 = seed + PRIME32_1 + PRIME32_2;
         U32 v2 = seed + PRIME32_2;
         U32 v3 = seed + 0;
@@ -294,34 +375,17 @@
             v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
             v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
             v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
-        } while (p<=limit);
+        } while (p < limit);
 
-        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
     } else {
         h32  = seed + PRIME32_5;
     }
 
-    h32 += (U32) len;
+    h32 += (U32)len;
 
-    while (p+4<=bEnd) {
-        h32 += XXH_get32bits(p) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h32 += (*p) * PRIME32_5;
-        h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
-
-    return h32;
+    return XXH32_finalize(h32, p, len&15, endian, align);
 }
 
 
@@ -446,6 +510,7 @@
     return XXH_OK;
 }
 
+
 XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
 {
     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
@@ -457,11 +522,9 @@
 }
 
 
-
-FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+FORCE_INLINE U32
+XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
 {
-    const BYTE * p = (const BYTE*)state->mem32;
-    const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
     U32 h32;
 
     if (state->large_len) {
@@ -475,25 +538,7 @@
 
     h32 += state->total_len_32;
 
-    while (p+4<=bEnd) {
-        h32 += XXH_readLE32(p, endian) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h32 += (*p) * PRIME32_5;
-        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
-
-    return h32;
+    return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned);
 }
 
 
@@ -532,7 +577,7 @@
 #ifndef XXH_NO_LONG_LONG
 
 /* *******************************************************************
-*  64-bits hash functions
+*  64-bit hash functions
 *********************************************************************/
 
 /*======   Memory access   ======*/
@@ -639,12 +684,136 @@
     return acc;
 }
 
-FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+static U64 XXH64_avalanche(U64 h64)   /* final mix (avalanche) : gives every bit of h64 a chance to affect any bit of the returned hash */
+{
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+    return h64;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+static U64
+XXH64_finalize(U64 h64, const void* ptr, size_t len,
+               XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)ptr;
+
+#define PROCESS1_64          \
+    h64 ^= (*p) * PRIME64_5; \
+    p++;                     \
+    h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+
+#define PROCESS4_64          \
+    h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \
+    p+=4;                    \
+    h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+
+#define PROCESS8_64 {        \
+    U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \
+    p+=8;                    \
+    h64 ^= k1;               \
+    h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \
+}
+
+    switch(len&31) {
+      case 24: PROCESS8_64;
+                    /* fallthrough */
+      case 16: PROCESS8_64;
+                    /* fallthrough */
+      case  8: PROCESS8_64;
+               return XXH64_avalanche(h64);
+
+      case 28: PROCESS8_64;
+                    /* fallthrough */
+      case 20: PROCESS8_64;
+                    /* fallthrough */
+      case 12: PROCESS8_64;
+                    /* fallthrough */
+      case  4: PROCESS4_64;
+               return XXH64_avalanche(h64);
+
+      case 25: PROCESS8_64;
+                    /* fallthrough */
+      case 17: PROCESS8_64;
+                    /* fallthrough */
+      case  9: PROCESS8_64;
+               PROCESS1_64;
+               return XXH64_avalanche(h64);
+
+      case 29: PROCESS8_64;
+                    /* fallthrough */
+      case 21: PROCESS8_64;
+                    /* fallthrough */
+      case 13: PROCESS8_64;
+                    /* fallthrough */
+      case  5: PROCESS4_64;
+               PROCESS1_64;
+               return XXH64_avalanche(h64);
+
+      case 26: PROCESS8_64;
+                    /* fallthrough */
+      case 18: PROCESS8_64;
+                    /* fallthrough */
+      case 10: PROCESS8_64;
+               PROCESS1_64;
+               PROCESS1_64;
+               return XXH64_avalanche(h64);
+
+      case 30: PROCESS8_64;
+                    /* fallthrough */
+      case 22: PROCESS8_64;
+                    /* fallthrough */
+      case 14: PROCESS8_64;
+                    /* fallthrough */
+      case  6: PROCESS4_64;
+               PROCESS1_64;
+               PROCESS1_64;
+               return XXH64_avalanche(h64);
+
+      case 27: PROCESS8_64;
+                    /* fallthrough */
+      case 19: PROCESS8_64;
+                    /* fallthrough */
+      case 11: PROCESS8_64;
+               PROCESS1_64;
+               PROCESS1_64;
+               PROCESS1_64;
+               return XXH64_avalanche(h64);
+
+      case 31: PROCESS8_64;
+                    /* fallthrough */
+      case 23: PROCESS8_64;
+                    /* fallthrough */
+      case 15: PROCESS8_64;
+                    /* fallthrough */
+      case  7: PROCESS4_64;
+                    /* fallthrough */
+      case  3: PROCESS1_64;
+                    /* fallthrough */
+      case  2: PROCESS1_64;
+                    /* fallthrough */
+      case  1: PROCESS1_64;
+                    /* fallthrough */
+      case  0: return XXH64_avalanche(h64);
+    }
+
+    /* impossible to reach */
+    assert(0);
+    return 0;  /* unreachable, but some compilers complain without it */
+}
+
+FORCE_INLINE U64
+XXH64_endian_align(const void* input, size_t len, U64 seed,
+                XXH_endianess endian, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
     U64 h64;
-#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
 
 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
     if (p==NULL) {
@@ -679,32 +848,7 @@
 
     h64 += (U64) len;
 
-    while (p+8<=bEnd) {
-        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
-        h64 ^= k1;
-        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
-
-    if (p+4<=bEnd) {
-        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
-        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h64 ^= (*p) * PRIME64_5;
-        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
-
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
-
-    return h64;
+    return XXH64_finalize(h64, p, len, endian, align);
 }
 
 
@@ -835,8 +979,6 @@
 
 FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
 {
-    const BYTE * p = (const BYTE*)state->mem64;
-    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
     U64 h64;
 
     if (state->total_len >= 32) {
@@ -851,37 +993,12 @@
         h64 = XXH64_mergeRound(h64, v3);
         h64 = XXH64_mergeRound(h64, v4);
     } else {
-        h64  = state->v3 + PRIME64_5;
+        h64  = state->v3 /*seed*/ + PRIME64_5;
     }
 
     h64 += (U64) state->total_len;
 
-    while (p+8<=bEnd) {
-        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
-        h64 ^= k1;
-        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
-
-    if (p+4<=bEnd) {
-        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
-        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h64 ^= (*p) * PRIME64_5;
-        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
-
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
-
-    return h64;
+    return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, XXH_aligned);
 }
 
 XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
diff --git a/xxhash.h b/xxhash.h
index 1313663..d6bad94 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -57,8 +57,8 @@
 It depends on successfully passing SMHasher test set.
 10 is a perfect score.
 
-A 64-bits version, named XXH64, is available since r35.
-It offers much better speed, but for 64-bits applications only.
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
 Name     Speed on 64 bits    Speed on 32 bits
 XXH64       13.8 GB/s            1.9 GB/s
 XXH32        6.8 GB/s            6.0 GB/s
@@ -80,18 +80,19 @@
 
 
 /* ****************************
-*  API modifier
-******************************/
-/** XXH_PRIVATE_API
-*   This is useful to include xxhash functions in `static` mode
-*   in order to inline them, and remove their symbol from the public list.
-*   Methodology :
-*     #define XXH_PRIVATE_API
-*     #include "xxhash.h"
-*   `xxhash.c` is automatically included.
-*   It's not useful to compile and link it as a separate module.
-*/
-#ifdef XXH_PRIVATE_API
+ *  API modifier
+ ******************************/
+/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ *  This is useful to include xxhash functions in `static` mode
+ *  in order to inline them, and remove their symbol from the public list.
+ *  Inlining can offer dramatic performance improvement on small keys.
+ *  Methodology :
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.c` is automatically included.
+ *  It's not useful to compile and link it as a separate module.
+ */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
 #  ifndef XXH_STATIC_LINKING_ONLY
 #    define XXH_STATIC_LINKING_ONLY
 #  endif
@@ -107,19 +108,19 @@
 #  endif
 #else
 #  define XXH_PUBLIC_API   /* do nothing */
-#endif /* XXH_PRIVATE_API */
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
 
-/*!XXH_NAMESPACE, aka Namespace Emulation :
-
-If you want to include _and expose_ xxHash functions from within your own library,
-but also want to avoid symbol collisions with other libraries which may also include xxHash,
-
-you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
-with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
-
-Note that no change is required within the calling program as long as it includes `xxhash.h` :
-regular symbol name will be automatically translated by this header.
-*/
+/*! XXH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
 #ifdef XXH_NAMESPACE
 #  define XXH_CAT(A,B) A##B
 #  define XXH_NAME2(A,B) XXH_CAT(A,B)
@@ -150,18 +151,18 @@
 ***************************************/
 #define XXH_VERSION_MAJOR    0
 #define XXH_VERSION_MINOR    6
-#define XXH_VERSION_RELEASE  4
+#define XXH_VERSION_RELEASE  5
 #define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
 XXH_PUBLIC_API unsigned XXH_versionNumber (void);
 
 
 /*-**********************************************************************
-*  32-bits hash
+*  32-bit hash
 ************************************************************************/
 typedef unsigned int XXH32_hash_t;
 
 /*! XXH32() :
-    Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input".
+    Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
     The memory between input & input+length must be valid (allocated and read-accessible).
     "seed" can be used to alter the result predictably.
     Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
@@ -178,26 +179,25 @@
 XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
 
 /*
-These functions generate the xxHash of an input provided in multiple segments.
-Note that, for small input, they are slower than single-call functions, due to state management.
-For small input, prefer `XXH32()` and `XXH64()` .
-
-XXH state must first be allocated, using XXH*_createState() .
-
-Start a new hash by initializing state with a seed, using XXH*_reset().
-
-Then, feed the hash state by calling XXH*_update() as many times as necessary.
-Obviously, input must be allocated and read accessible.
-The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
-
-Finally, a hash value can be produced anytime, by using XXH*_digest().
-This function returns the nn-bits hash as an int or long long.
-
-It's still possible to continue inserting input into the hash state after a digest,
-and generate some new hashes later on, by calling again XXH*_digest().
-
-When done, free XXH state space if it was allocated dynamically.
-*/
+ * Streaming functions generate the xxHash of an input provided in multiple segments.
+ * Note that, for small input, they are slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * XXH state must first be allocated, using XXH*_createState() .
+ *
+ * Start a new hash by initializing state with a seed, using XXH*_reset().
+ *
+ * Then, feed the hash state by calling XXH*_update() as many times as necessary.
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using XXH*_digest().
+ * This function returns the nn-bit hash as an unsigned int or unsigned long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a digest,
+ * and generate some new hashes later on, by calling again XXH*_digest().
+ *
+ * When done, free XXH state space if it was allocated dynamically.
+ */
 
 /*======   Canonical representation   ======*/
 
@@ -206,22 +206,22 @@
 XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
 
 /* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
-*  The canonical representation uses human-readable write convention, aka big-endian (large digits first).
-*  These functions allow transformation of hash result into and from its canonical format.
-*  This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
-*/
+ * The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+ * These functions allow transformation of hash result into and from its canonical format.
+ * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+ */
 
 
 #ifndef XXH_NO_LONG_LONG
 /*-**********************************************************************
-*  64-bits hash
+*  64-bit hash
 ************************************************************************/
 typedef unsigned long long XXH64_hash_t;
 
 /*! XXH64() :
-    Calculate the 64-bits hash of sequence of length "len" stored at memory address "input".
+    Calculate the 64-bit hash of sequence of length "len" stored at memory address "input".
     "seed" can be used to alter the result predictably.
-    This function runs faster on 64-bits systems, but slower on 32-bits systems (see benchmark).
+    This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
 */
 XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
 
@@ -242,6 +242,7 @@
 #endif  /* XXH_NO_LONG_LONG */
 
 
+
 #ifdef XXH_STATIC_LINKING_ONLY
 
 /* ================================================================================================
@@ -251,9 +252,39 @@
    Never use them in association with dynamic linking !
 =================================================================================================== */
 
-/* These definitions are only meant to make possible
-   static allocation of XXH state, on stack or in a struct for example.
-   Never use members directly. */
+/* These definitions are only present to allow
+ * static allocation of XXH state, on stack or in a struct for example.
+ * Never **ever** use members directly. */
+
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+
+struct XXH32_state_s {
+   uint32_t total_len_32;
+   uint32_t large_len;
+   uint32_t v1;
+   uint32_t v2;
+   uint32_t v3;
+   uint32_t v4;
+   uint32_t mem32[4];
+   uint32_t memsize;
+   uint32_t reserved;   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH32_state_t */
+
+struct XXH64_state_s {
+   uint64_t total_len;
+   uint64_t v1;
+   uint64_t v2;
+   uint64_t v3;
+   uint64_t v4;
+   uint64_t mem64[4];
+   uint32_t memsize;
+   uint32_t reserved[2];          /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH64_state_t */
+
+# else
 
 struct XXH32_state_s {
    unsigned total_len_32;
@@ -262,25 +293,28 @@
    unsigned v2;
    unsigned v3;
    unsigned v4;
-   unsigned mem32[4];   /* buffer defined as U32 for alignment */
+   unsigned mem32[4];
    unsigned memsize;
-   unsigned reserved;   /* never read nor write, will be removed in a future version */
+   unsigned reserved;   /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH32_state_t */
 
-#ifndef XXH_NO_LONG_LONG   /* remove 64-bits support */
+#   ifndef XXH_NO_LONG_LONG  /* remove 64-bit support */
 struct XXH64_state_s {
    unsigned long long total_len;
    unsigned long long v1;
    unsigned long long v2;
    unsigned long long v3;
    unsigned long long v4;
-   unsigned long long mem64[4];   /* buffer defined as U64 for alignment */
+   unsigned long long mem64[4];
    unsigned memsize;
-   unsigned reserved[2];          /* never read nor write, will be removed in a future version */
+   unsigned reserved[2];     /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH64_state_t */
-#endif
+#    endif
 
-#ifdef XXH_PRIVATE_API
+# endif
+
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
 #  include "xxhash.c"   /* include xxhash function bodies as `static`, for inlining */
 #endif
 
diff --git a/xxhsum.c b/xxhsum.c
index 656d9b1..69931f7 100644
--- a/xxhsum.c
+++ b/xxhsum.c
@@ -32,8 +32,8 @@
 #define XXHASH_C_2097394837
 
 /* ************************************
-*  Compiler Options
-**************************************/
+ *  Compiler Options
+ **************************************/
 /* MS Visual */
 #if defined(_MSC_VER) || defined(_WIN32)
 #  define _CRT_SECURE_NO_WARNINGS   /* removes visual warnings */
@@ -46,28 +46,26 @@
 
 
 /* ************************************
-*  Includes
-**************************************/
+ *  Includes
+ **************************************/
 #include <stdlib.h>     /* malloc, calloc, free, exit */
-#include <stdio.h>      /* fprintf, fopen, ftello64, fread, stdin, stdout; when present : _fileno */
+#include <stdio.h>      /* fprintf, fopen, ftello64, fread, stdin, stdout, _fileno (when present) */
 #include <string.h>     /* strcmp */
-#include <sys/types.h>  /* stat64 */
-#include <sys/stat.h>   /* stat64 */
+#include <sys/types.h>  /* stat, stat64, _stat64 */
+#include <sys/stat.h>   /* stat, stat64, _stat64 */
 #include <time.h>       /* clock_t, clock, CLOCKS_PER_SEC */
+#include <assert.h>     /* assert */
 
 #define XXH_STATIC_LINKING_ONLY   /* *_state_t */
 #include "xxhash.h"
 
 
-/*-************************************
-*  OS-Specific Includes
-**************************************/
+/* ************************************
+ *  OS-Specific Includes
+ **************************************/
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>    /* _O_BINARY */
 #  include <io.h>       /* _setmode, _isatty */
-#  ifdef __MINGW32__
-   int _fileno(FILE *stream);   /* MINGW somehow forgets to include this windows declaration into <stdio.h> */
-#  endif
 #  define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
 #  define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
 #else
@@ -110,8 +108,8 @@
 
 
 /* *************************************
-*  Constants
-***************************************/
+ *  Constants
+ ***************************************/
 #define LIB_VERSION XXH_VERSION_MAJOR.XXH_VERSION_MINOR.XXH_VERSION_RELEASE
 #define QUOTE(str) #str
 #define EXPAND_AND_QUOTE(str) QUOTE(str)
@@ -124,16 +122,17 @@
 #define WELCOME_MESSAGE(exename) "%s %s (%i-bits %s), by %s \n", \
                     exename, PROGRAM_VERSION, g_nbBits, ENDIAN_NAME, author
 
+#define KB *( 1<<10)
+#define MB *( 1<<20)
+#define GB *(1U<<30)
+
+static size_t XXH_DEFAULT_SAMPLE_SIZE = 100 KB;
 #define NBLOOPS    3                              /* Default number of benchmark iterations */
 #define TIMELOOP_S 1
 #define TIMELOOP  (TIMELOOP_S * CLOCKS_PER_SEC)   /* Minimum timing per iteration */
 #define XXHSUM32_DEFAULT_SEED 0                   /* Default seed for algo_xxh32 */
 #define XXHSUM64_DEFAULT_SEED 0                   /* Default seed for algo_xxh64 */
 
-#define KB *( 1<<10)
-#define MB *( 1<<20)
-#define GB *(1U<<30)
-
 #define MAX_MEM    (2 GB - 64 MB)
 
 static const char stdinName[] = "-";
@@ -149,24 +148,23 @@
 
 
 /* ************************************
-*  Display macros
-**************************************/
+ *  Display macros
+ **************************************/
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
 #define DISPLAYRESULT(...)   fprintf(stdout, __VA_ARGS__)
-#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) DISPLAY(__VA_ARGS__);
-static U32 g_displayLevel = 1;
+#define DISPLAYLEVEL(l, ...) do { if (g_displayLevel>=l) DISPLAY(__VA_ARGS__); } while (0)
+static int g_displayLevel = 2;
 
 
 /* ************************************
-*  Local variables
-**************************************/
-static size_t g_sampleSize = 100 KB;
+ *  Local variables
+ **************************************/
 static U32 g_nbIterations = NBLOOPS;
 
 
 /* ************************************
-*  Benchmark Functions
-**************************************/
+ *  Benchmark Functions
+ **************************************/
 static clock_t BMK_clockSpan( clock_t start )
 {
     return clock() - start;   /* works even if overflow; Typical max span ~ 30 mn */
@@ -219,52 +217,74 @@
 
 static void BMK_benchHash(hashFunction h, const char* hName, const void* buffer, size_t bufferSize)
 {
-    static const U32 nbh_perloop = 100;
+    U32 nbh_perIteration = ((300 MB) / (bufferSize+1)) + 1;  /* first loop conservatively aims for 300 MB/s */
     U32 iterationNb;
     double fastestH = 100000000.;
 
-    DISPLAY("\r%79s\r", "");       /* Clean display line */
+    DISPLAYLEVEL(2, "\r%70s\r", "");       /* Clean display line */
     if (g_nbIterations<1) g_nbIterations=1;
     for (iterationNb = 1; iterationNb <= g_nbIterations; iterationNb++) {
-        U32 nbHashes = 0, r=0;
+        U32 r=0;
         clock_t cStart;
 
-        DISPLAY("%1i-%-17.17s : %10u ->\r", iterationNb, hName, (U32)bufferSize);
+        DISPLAYLEVEL(2, "%1i-%-17.17s : %10u ->\r", iterationNb, hName, (U32)bufferSize);
         cStart = clock();
         while (clock() == cStart);   /* starts clock() at its exact beginning */
         cStart = clock();
 
-        while (BMK_clockSpan(cStart) < TIMELOOP) {
-            U32 i;
-            for (i=0; i<nbh_perloop; i++)
+        {   U32 i;
+            for (i=0; i<nbh_perIteration; i++)
                 r += h(buffer, bufferSize, i);
-            nbHashes += nbh_perloop;
         }
-        if (r==0) DISPLAY(".\r");  /* need to do something with r to avoid compiler "optimizing" away hash function */
-        {   double const timeS = ((double)BMK_clockSpan(cStart) / CLOCKS_PER_SEC) / nbHashes;
+        if (r==0) DISPLAYLEVEL(3,".\r");  /* do something with r to avoid compiler "optimizing" away hash function */
+        {   double const timeS = ((double)BMK_clockSpan(cStart) / CLOCKS_PER_SEC) / nbh_perIteration;
             if (timeS < fastestH) fastestH = timeS;
-            DISPLAY("%1i-%-17.17s : %10u -> %7.1f MB/s\r",
-                    iterationNb, hName, (U32)bufferSize, ((double)bufferSize / (1<<20)) / fastestH );
+            DISPLAYLEVEL(2, "%1i-%-17.17s : %10u -> %8.0f it/s (%7.1f MB/s) \r",
+                    iterationNb, hName, (U32)bufferSize,
+                    (double)1 / fastestH,
+                    ((double)bufferSize / (1<<20)) / fastestH );
         }
+        assert(fastestH > 1./2000000000);  /* avoid U32 overflow */
+        nbh_perIteration = (U32)(1 / fastestH) + 1;  /* adjust nbh_perIteration to last roughly one second */
     }
-    DISPLAY("%-19.19s : %10u -> %7.1f MB/s  \n", hName, (U32)bufferSize, ((double)bufferSize / (1<<20)) / fastestH);
+    DISPLAYLEVEL(1, "%-19.19s : %10u -> %8.0f it/s (%7.1f MB/s) \n", hName, (U32)bufferSize,
+        (double)1 / fastestH,
+        ((double)bufferSize / (1<<20)) / fastestH);
+    if (g_displayLevel<1)
+        DISPLAYLEVEL(0, "%u, ", (U32)((double)1 / fastestH));
 }
 
 
-/* Note : buffer is supposed malloc'ed, hence aligned */
-static void BMK_benchMem(const void* buffer, size_t bufferSize)
+/* BMK_benchMem():
+ * specificTest : 0 == run all tests; 1-4 == run only that test (XXH32, XXH32 unaligned, XXH64, XXH64 unaligned)
+ * buffer : must be 8-byte aligned (checked by assert; if malloc'ed, it should be)
+ * the real allocated size of buffer must be >= (bufferSize+3) : unaligned tests start at buffer+1 and buffer+3.
+ * @return : 0 on success, 1 if error (specificTest > 4 : invalid mode selected, nothing is benchmarked) */
+static int BMK_benchMem(const void* buffer, size_t bufferSize, U32 specificTest)
 {
+    assert((((size_t)buffer) & 7) == 0);  /* ensure 8-byte alignment : the 3 low address bits must be zero */
+
     /* XXH32 bench */
-    BMK_benchHash(localXXH32, "XXH32", buffer, bufferSize);
+    if ((specificTest==0) | (specificTest==1))
+        BMK_benchHash(localXXH32, "XXH32", buffer, bufferSize);
 
     /* Bench XXH32 on Unaligned input */
-    BMK_benchHash(localXXH32, "XXH32 unaligned", ((const char*)buffer)+1, bufferSize);
+    if ((specificTest==0) | (specificTest==2))
+        BMK_benchHash(localXXH32, "XXH32 unaligned", ((const char*)buffer)+1, bufferSize);
 
     /* Bench XXH64 */
-    BMK_benchHash(localXXH64, "XXH64", buffer, bufferSize);
+    if ((specificTest==0) | (specificTest==3))
+        BMK_benchHash(localXXH64, "XXH64", buffer, bufferSize);
 
     /* Bench XXH64 on Unaligned input */
-    BMK_benchHash(localXXH64, "XXH64 unaligned", ((const char*)buffer)+3, bufferSize);
+    if ((specificTest==0) | (specificTest==4))
+        BMK_benchHash(localXXH64, "XXH64 unaligned", ((const char*)buffer)+3, bufferSize);
+
+    if (specificTest > 4) {
+        DISPLAY("benchmark mode invalid \n");
+        return 1;
+    }
+    return 0;
 }
 
 
@@ -279,9 +299,11 @@
 }
 
 
-static int BMK_benchFiles(const char** fileNamesTable, int nbFiles)
+static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specificTest)
 {
+    int result = 0;
     int fileIdx;
+
     for (fileIdx=0; fileIdx<nbFiles; fileIdx++) {
         const char* const inFileName = fileNamesTable[fileIdx];
         FILE* const inFile = fopen( inFileName, "rb" );
@@ -291,7 +313,7 @@
 
         /* Checks */
         if ((inFile==NULL) || (inFileName==NULL)) {
-            DISPLAY( "Pb opening %s\n", inFileName);
+            DISPLAY("Pb opening %s\n", inFileName);
             free(buffer);
             return 11;
         }
@@ -302,7 +324,7 @@
         }
 
         /* Fill input buffer */
-        DISPLAY("\rLoading %s...        \n", inFileName);
+        DISPLAYLEVEL(1, "\rLoading %s...        \n", inFileName);
         {   size_t const readSize = fread(alignedBuffer, 1, benchedSize, inFile);
             fclose(inFile);
             if(readSize != benchedSize) {
@@ -312,39 +334,47 @@
         }   }
 
         /* bench */
-        BMK_benchMem(alignedBuffer, benchedSize);
+        result |= BMK_benchMem(alignedBuffer, benchedSize, specificTest);
 
         free(buffer);
     }
 
-    return 0;
+    return result;
 }
 
 
 
-static int BMK_benchInternal(void)
+static int BMK_benchInternal(size_t keySize, U32 specificTest)
 {
-    size_t const benchedSize = g_sampleSize;
-    void* const buffer = calloc(benchedSize+3, 1);
+    void* const buffer = calloc(keySize+16+3, 1);   /* +16 : slack to align on 16 bytes ; +3 : slack for the unaligned tests */
+    void* const alignedBuffer = buffer ? ((char*)buffer+15) - (((size_t)((char*)buffer+15)) & 0xF) : NULL;  /* align on next 16 bytes ; no pointer arithmetic when allocation failed */
     if(!buffer) {
         DISPLAY("\nError: not enough memory!\n");
         return 12;
     }
 
     /* bench */
-    DISPLAY("\rSample of %u KB...        \n", (U32)(benchedSize >> 10));
-    BMK_benchMem(buffer, benchedSize);
+    DISPLAYLEVEL(1, "Sample of ");
+    if (keySize > 10 KB) {
+        DISPLAYLEVEL(1, "%u KB", (U32)(keySize >> 10));
+    } else {
+        DISPLAYLEVEL(1, "%u bytes", (U32)keySize);
+    }
+    DISPLAYLEVEL(1, "...        \n");
 
-    free(buffer);
-    return 0;
+    {   int const result = BMK_benchMem(alignedBuffer, keySize, specificTest);
+        free(buffer);
+        return result;
+    }
 }
 
 
 static void BMK_checkResult(U32 r1, U32 r2)
 {
     static int nbTests = 1;
-    if (r1==r2) DISPLAY("\rTest%3i : %08X == %08X   ok   ", nbTests, r1, r2);
-    else {
+    if (r1==r2) {
+        DISPLAYLEVEL(3, "\rTest%3i : %08X == %08X   ok   ", nbTests, r1, r2);
+    } else {
         DISPLAY("\rERROR : Test%3i : %08X <> %08X   !!!!!   \n", nbTests, r1, r2);
         exit(1);
     }
@@ -356,7 +386,7 @@
 {
     static int nbTests = 1;
     if (r1!=r2) {
-        DISPLAY("\rERROR : Test%3i : 64-bits values non equals   !!!!!   \n", nbTests);
+        DISPLAY("\rERROR : Test%3i : 64-bit values non equals   !!!!!   \n", nbTests);
         DISPLAY("\r %08X%08X != %08X%08X \n", (U32)(r1>>32), (U32)r1, (U32)(r2>>32), (U32)r2);
         exit(1);
     }
@@ -379,7 +409,8 @@
     BMK_checkResult64(Dresult, Nresult);
 
     XXH64_reset(&state, seed);
-    for (pos=0; pos<len; pos++) XXH64_update(&state, ((char*)sentence)+pos, 1);
+    for (pos=0; pos<len; pos++)
+        XXH64_update(&state, ((char*)sentence)+pos, 1);
     Dresult = XXH64_digest(&state);
     BMK_checkResult64(Dresult, Nresult);
 }
@@ -400,7 +431,8 @@
     BMK_checkResult(Dresult, Nresult);
 
     XXH32_reset(&state, seed);
-    for (pos=0; pos<len; pos++) XXH32_update(&state, ((const char*)sequence)+pos, 1);
+    for (pos=0; pos<len; pos++)
+        XXH32_update(&state, ((const char*)sequence)+pos, 1);
     Dresult = XXH32_digest(&state);
     BMK_checkResult(Dresult, Nresult);
 }
@@ -437,8 +469,8 @@
     BMK_testSequence64(sanityBuffer, SANITY_BUFFER_SIZE, 0,     0x0EAB543384F878ADULL);
     BMK_testSequence64(sanityBuffer, SANITY_BUFFER_SIZE, prime, 0xCAA65939306F1E21ULL);
 
-    DISPLAY("\r%79s\r", "");       /* Clean display line */
-    DISPLAYLEVEL(2, "Sanity check -- all tests ok\n");
+    DISPLAYLEVEL(3, "\r%70s\r", "");       /* Clean display line */
+    DISPLAYLEVEL(3, "Sanity check -- all tests ok\n");
 }
 
 
@@ -542,11 +574,11 @@
     /* loading notification */
     {   const size_t fileNameSize = strlen(fileName);
         const char* const fileNameEnd = fileName + fileNameSize;
-        const size_t maxInfoFilenameSize = fileNameSize > 30 ? 30 : fileNameSize;
-        size_t infoFilenameSize = 1;
-        while ( (infoFilenameSize < maxInfoFilenameSize)
-              &&(fileNameEnd[-1-infoFilenameSize] != '/')
-              &&(fileNameEnd[-1-infoFilenameSize] != '\\') )
+        const int maxInfoFilenameSize = (int)(fileNameSize > 30 ? 30 : fileNameSize);
+        int infoFilenameSize = 1;
+        while ((infoFilenameSize < maxInfoFilenameSize)
+            && (fileNameEnd[-1-infoFilenameSize] != '/')
+            && (fileNameEnd[-1-infoFilenameSize] != '\\') )
               infoFilenameSize++;
         DISPLAY("\rLoading %s...  \r", fileNameEnd - infoFilenameSize);
 
@@ -1125,10 +1157,30 @@
     return 1;
 }
 
+/*! readU32FromChar() :
+    Parses the run of decimal digits found at `*stringPtr`, then an optional
+    K, KB, KiB, M, MB or MiB suffix (K : value x 1024 ; M : value x 1024 x 1024).
+    `*stringPtr` is advanced to the first character left unread.
+    @return : the parsed value, 0 if no digit is present. Wraps around if the result exceeds MAX_UINT. */
+static unsigned readU32FromChar(const char** stringPtr)
+{
+    const char* cursor = *stringPtr;
+    unsigned value = 0;
+    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++)
+        value = (value * 10) + (unsigned)(*cursor - '0');
+    if ((*cursor == 'K') || (*cursor == 'M')) {
+        value <<= (*cursor == 'M') ? 20 : 10;
+        cursor++;
+        if (*cursor == 'i') cursor++;
+        if (*cursor == 'B') cursor++;
+    }
+    *stringPtr = cursor;
+    return value;
+}
 
 int main(int argc, const char** argv)
 {
-    int i, filenamesStart=0;
+    int i, filenamesStart = 0;
     const char* const exename = argv[0];
     U32 benchmarkMode = 0;
     U32 fileCheckMode = 0;
@@ -1136,7 +1188,9 @@
     U32 statusOnly    = 0;
     U32 warn          = 0;
     U32 quiet         = 0;
-    algoType algo = g_defaultAlgo;
+    U32 specificTest  = 0;
+    size_t keySize    = XXH_DEFAULT_SAMPLE_SIZE;
+    algoType algo     = g_defaultAlgo;
     endianess displayEndianess = big_endian;
 
     /* special case : xxh32sum default to 32 bits checksum */
@@ -1196,21 +1250,26 @@
             /* Trigger benchmark mode */
             case 'b':
                 argument++;
-                benchmarkMode=1;
+                benchmarkMode = 1;
+                specificTest = readU32FromChar(&argument);   /* select one specific test (hidden option) */
                 break;
 
             /* Modify Nb Iterations (benchmark only) */
             case 'i':
-                g_nbIterations = argument[1] - '0';
-                argument+=2;
+                argument++;
+                g_nbIterations = readU32FromChar(&argument);
                 break;
 
             /* Modify Block size (benchmark only) */
             case 'B':
                 argument++;
-                g_sampleSize = 0;
-                while (argument[0]>='0' && argument[0]<='9')
-                    g_sampleSize *= 10, g_sampleSize += argument[0]-'0', argument++;
+                keySize = readU32FromChar(&argument);
+                break;
+
+            /* Modify verbosity of benchmark output (hidden option) */
+            case 'q':
+                argument++;
+                g_displayLevel--;
                 break;
 
             default:
@@ -1221,10 +1280,10 @@
 
     /* Check benchmark mode */
     if (benchmarkMode) {
-        DISPLAY( WELCOME_MESSAGE(exename) );
+        DISPLAYLEVEL(2, WELCOME_MESSAGE(exename) );
         BMK_sanityCheck();
-        if (filenamesStart==0) return BMK_benchInternal();
-        return BMK_benchFiles(argv+filenamesStart, argc-filenamesStart);
+        if (filenamesStart==0) return BMK_benchInternal(keySize, specificTest);
+        return BMK_benchFiles(argv+filenamesStart, argc-filenamesStart, specificTest);
     }
 
     /* Check if input is defined as console; trigger an error in this case */