Update FLAC to 1.3.1

This includes fixes for CVE-2014-9028 and CVE-2014-8962
(which shouldn't affect Chromium since we only use FLAC for
encoding).

It also reduces the number of downstream patches.

BUG=https://bugs.chromium.org/p/chromium/issues/detail?id=610125
R=felt@chromium.org, tommi@chromium.org

Review URL: https://codereview.chromium.org/1961133002 .
diff --git a/AUTHORS b/AUTHORS
index 217baf4..ef22b50 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,8 +1,9 @@
 /* FLAC - Free Lossless Audio Codec
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * This file is part the FLAC project.  FLAC is comprised of several
- * components distributed under difference licenses.  The codec libraries
+ * components distributed under different licenses.  The codec libraries
  * are distributed under Xiph.Org's BSD-like license (see the file
  * COPYING.Xiph in this distribution).  All other programs, libraries, and
  * plugins are distributed under the GPL (see COPYING.GPL).  The documentation
@@ -16,11 +17,27 @@
  * distribution.
  */
 
+Current FLAC maintainer: Erik de Castro Lopo <erikd@mega-nerd.com>
 
-FLAC (http://flac.sourceforge.net/) is an Open Source lossless audio
-codec developed by Josh Coalson <jcoalson@users.sourceforge.net>.
+Original author: Josh Coalson <jcoalson@users.sourceforge.net>
+
+Website : https://www.xiph.org/flac/
+
+FLAC is an Open Source lossless audio codec originally developed by Josh Coalson
+between 2001 and 2009. From 2009 to 2012 FLAC was basically unmaintained. In
+2012 the Erik de Castro Lopo became the chief maintainer as part of the
+Xiph.Org Foundation.
 
 Other major contributors and their contributions:
+
+"lvqcl" <lvqcl@users.sourceforge.net>
+* Visual Studio build system.
+* Optimisations in the encoder and decoder.
+
+"Janne Hyvärinen" <cse@sci.fi>
+* Visual Studio build system.
+* Unicode handling on Windows.
+
 "Andrey Astafiev" <andrei@tvcell.ru>
 * Russian translation of the HTML documentation
 
diff --git a/BUILD.gn b/BUILD.gn
index 68719eb..ae81550 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -34,6 +34,9 @@
     "include/FLAC/stream_decoder.h",
     "include/FLAC/stream_encoder.h",
     "include/share/alloc.h",
+    "include/share/compat.h",
+    "include/share/endswap.h",
+    "include/share/private.h",
     "src/libFLAC/alloc.c",
     "src/libFLAC/bitmath.c",
     "src/libFLAC/bitreader.c",
@@ -60,9 +63,11 @@
     "src/libFLAC/include/private/float.h",
     "src/libFLAC/include/private/format.h",
     "src/libFLAC/include/private/lpc.h",
+    "src/libFLAC/include/private/macros.h",
     "src/libFLAC/include/private/md5.h",
     "src/libFLAC/include/private/memory.h",
     "src/libFLAC/include/private/metadata.h",
+    "src/libFLAC/include/private/stream_encoder.h",
     "src/libFLAC/include/private/stream_encoder_framing.h",
     "src/libFLAC/include/private/window.h",
     "src/libFLAC/include/protected/all.h",
@@ -85,6 +90,24 @@
 
   defines = [
     "FLAC__OVERFLOW_DETECT",
-    "VERSION=\"1.2.1\"",
+    "VERSION=\"1.3.1\"",
+    "HAVE_LROUND",
   ]
+
+  if (is_win) {
+    sources += [
+      "include/share/win_utf8_io.h",
+      "src/share/win_utf8_io/win_utf8_io.c",
+    ]
+    # win_utf8_io.c defines this itself.
+    configs -= [ "//build/config/win:lean_and_mean" ]
+    cflags = [
+      "/wd4334",  # 32-bit shift converted to 64 bits.
+      "/wd4267"   # Converting from size_t to unsigned on 64-bit.
+    ]
+  } else {
+    defines += [
+      "HAVE_INTTYPES_H",
+    ]
+  }
 }
diff --git a/COPYING.GPL b/COPYING.GPL
index c3c7a9e..d159169 100644
--- a/COPYING.GPL
+++ b/COPYING.GPL
@@ -1,12 +1,12 @@
-		    GNU GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
 
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
- 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  Everyone is permitted to copy and distribute verbatim copies
  of this license document, but changing it is not allowed.
 
-			    Preamble
+                            Preamble
 
   The licenses for most software are designed to take away your
 freedom to share and change it.  By contrast, the GNU General Public
@@ -15,7 +15,7 @@
 General Public License applies to most of the Free Software
 Foundation's software and to any other program whose authors commit to
 using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
+the GNU Lesser General Public License instead.)  You can apply it to
 your programs, too.
 
   When we speak of free software, we are referring to freedom, not
@@ -55,8 +55,8 @@
 
   The precise terms and conditions for copying, distribution and
 modification follow.
-
-		    GNU GENERAL PUBLIC LICENSE
+
+                    GNU GENERAL PUBLIC LICENSE
    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 
   0. This License applies to any program or other work which contains
@@ -110,7 +110,7 @@
     License.  (Exception: if the Program itself is interactive but
     does not normally print such an announcement, your work based on
     the Program is not required to print an announcement.)
-
+
 These requirements apply to the modified work as a whole.  If
 identifiable sections of that work are not derived from the Program,
 and can be reasonably considered independent and separate works in
@@ -168,7 +168,7 @@
 access to copy the source code from the same place counts as
 distribution of the source code, even though third parties are not
 compelled to copy the source along with the object code.
-
+
   4. You may not copy, modify, sublicense, or distribute the Program
 except as expressly provided under this License.  Any attempt
 otherwise to copy, modify, sublicense or distribute the Program is
@@ -225,7 +225,7 @@
 
 This section is intended to make thoroughly clear what is believed to
 be a consequence of the rest of this License.
-
+
   8. If the distribution and/or use of the Program is restricted in
 certain countries either by patents or by copyrighted interfaces, the
 original copyright holder who places the Program under this License
@@ -255,7 +255,7 @@
 of preserving the free status of all derivatives of our free software and
 of promoting the sharing and reuse of software generally.
 
-			    NO WARRANTY
+                            NO WARRANTY
 
   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
 FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
@@ -277,9 +277,9 @@
 PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGES.
 
-		     END OF TERMS AND CONDITIONS
-
-	    How to Apply These Terms to Your New Programs
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
 
   If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
@@ -291,7 +291,7 @@
 the "copyright" line and a pointer to where the full notice is found.
 
     <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) 19yy  <name of author>
+    Copyright (C) <year>  <name of author>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -303,17 +303,16 @@
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     GNU General Public License for more details.
 
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 
 Also add information on how to contact you by electronic and paper mail.
 
 If the program is interactive, make it output a short notice like this
 when it starts in an interactive mode:
 
-    Gnomovision version 69, Copyright (C) 19yy name of author
+    Gnomovision version 69, Copyright (C) year name of author
     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
     This is free software, and you are welcome to redistribute it
     under certain conditions; type `show c' for details.
@@ -336,5 +335,5 @@
 This General Public License does not permit incorporating your program into
 proprietary programs.  If your program is a subroutine library, you may
 consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
+library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.
diff --git a/COPYING.Xiph b/COPYING.Xiph
index 0a104a9..c0361fd 100644
--- a/COPYING.Xiph
+++ b/COPYING.Xiph
@@ -1,4 +1,5 @@
-Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+Copyright (C) 2000-2009  Josh Coalson
+Copyright (C) 2011-2014  Xiph.Org Foundation
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
diff --git a/README b/README
index f4a461b..9178882 100644
--- a/README
+++ b/README
@@ -1,8 +1,9 @@
 /* FLAC - Free Lossless Audio Codec
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * This file is part the FLAC project.  FLAC is comprised of several
- * components distributed under difference licenses.  The codec libraries
+ * components distributed under different licenses.  The codec libraries
  * are distributed under Xiph.Org's BSD-like license (see the file
  * COPYING.Xiph in this distribution).  All other programs, libraries, and
  * plugins are distributed under the LGPL or GPL (see COPYING.LGPL and
@@ -17,8 +18,11 @@
  */
 
 
-FLAC (http://flac.sourceforge.net/) is an Open Source lossless audio
-codec developed by Josh Coalson.
+FLAC is an Open Source lossless audio codec developed by Josh Coalson from 2001
+to 2009.
+
+From January 2012 FLAC is being maintained by Erik de Castro Lopo under the
+auspices of the Xiph.org Foundation.
 
 FLAC is comprised of
   * `libFLAC', a library which implements reference encoders and
@@ -27,7 +31,7 @@
   * `flac', a command-line program for encoding and decoding files
   * `metaflac', a command-line program for viewing and editing FLAC
     metadata
-  * player plugins for XMMS and Winamp
+  * player plugin for XMMS
   * user and API documentation
 
 The libraries (libFLAC, libFLAC++) are
@@ -38,7 +42,7 @@
 
 
 ===============================================================================
-FLAC - 1.2.1 - Contents
+FLAC - 1.3.1 - Contents
 ===============================================================================
 
 - Introduction
@@ -63,11 +67,17 @@
 A brief description of the directory tree:
 
 	doc/          the HTML documentation
+	examples/     example programs demonstrating the use of libFLAC and libFLAC++
 	include/      public include files for libFLAC and libFLAC++
-	man/          the man page for `flac'
+	man/          the man pages for `flac' and `metaflac'
 	src/          the source code and private headers
 	test/         the test scripts
 
+If you have questions about building FLAC that this document does not answer,
+please submit them at the following tracker so this document can be improved:
+
+	https://sourceforge.net/p/flac/support-requests/
+
 
 ===============================================================================
 Prerequisites
@@ -89,7 +99,7 @@
 libFLAC has grown larger over time as more functionality has been
 included, but much of it may be unnecessary for a particular embedded
 implementation.  Unused parts may be pruned by some simple editing of
-configure.in and src/libFLAC/Makefile.am; the following dependency
+configure.ac and src/libFLAC/Makefile.am; the following dependency
 graph shows which modules may be pruned without breaking things
 further down:
 
@@ -148,7 +158,7 @@
 assembly routines.  Many routines have assembly versions for
 speed and `configure' is pretty good about knowing what is
 supported, but you can use this option to build only from the
-C sources.  May be necessary for building on OS X (Intel)
+C sources.  May be necessary for building on OS X (Intel).
 
 --enable-sse : If you are building for an x86 CPU that supports
 SSE instructions, you can enable some of the faster routines
@@ -170,7 +180,7 @@
 Use these if you have these packages but configure can't find them.
 
 If you want to build completely from scratch (i.e. starting with just
-configure.in and Makefile.am) you should be able to just run 'autogen.sh'
+configure.ac and Makefile.am) you should be able to just run 'autogen.sh'
 but make sure and read the comments in that file first.
 
 
@@ -200,55 +210,45 @@
 Building with MSVC
 ===============================================================================
 
-There are .dsp projects and a master FLAC.dsw workspace to build all
-the libraries and executables with MSVC6.  There are also .vcproj
-projects and a master FLAC.sln solution to build all the libraries and
-executables with VC++ 2005.
+There are .vcproj projects and a master FLAC.sln solution to build all
+the libraries and executables with MSVC 2005 or newer.
 
 Prerequisite: you must have the Ogg libraries installed as described
 later.
 
-Prerequisite: you must have nasm installed, and nasmw.exe must be in
-your PATH, or the path to nasmw.exe must be added to the list of
+Prerequisite: you must have nasm installed, and nasm.exe must be in
+your PATH, or the path to nasm.exe must be added to the list of
 directories for executable files in the MSVC global options.
 
-MSVC6:
-To build everything, run Developer Studio, do File|Open Workspace,
-and open FLAC.dsw.  Select "Build | Set active configuration..."
-from the menu, then in the dialog, select "All - Win32 Release" (or
-Debug if you prefer).  Click "Ok" then hit F7 to build.
-
-VC++ 2005:
 To build everything, run Visual Studio, do File|Open and open FLAC.sln.
 From the dropdown in the toolbar, select "Release" instead of "Debug",
-then hit F7 to build.
+then do Build|Build Solution.
 
-Either way, this will build all libraries both statically (e.g.
-obj\release\lib\libFLAC_static.lib) and as DLLs (e.g.
-obj\release\lib\libFLAC.dll), and it will build all binaries, statically
-linked (e.g. obj\release\bin\flac.exe).
+This will build all libraries both statically (e.g.
+objs\release\lib\libFLAC_static.lib) and as DLLs (e.g.
+objs\release\lib\libFLAC.dll), and it will build all binaries, statically
+linked (e.g. objs\release\bin\flac.exe).
 
-Everything will end up in the "obj" directory.  DLLs and .exe files
+Everything will end up in the "objs" directory.  DLLs and .exe files
 are all that are needed and can be copied to an installation area and
-added to the PATH.  The plugins have to be copied to their appropriate
-place in the player area.  For Winamp2 this is <winamp2-dir>\Plugins.
+added to the PATH.
 
-By default the code is configured with Ogg support.  Before building FLAC
+By default the code is configured with Ogg support. Before building FLAC
 you will need to get the Ogg source distribution
-(see http://xiph.org/ogg/vorbis/download/), build ogg_static.lib (load and
-build win32\ogg_static.dsp), copy ogg_static.lib into FLAC's
-'obj\release\lib' directory, and copy the entire include\ogg tree into
-FLAC's 'include' directory (so that there is an 'ogg' directory in FLAC's
+(see http://xiph.org/downloads/), build libogg_static.lib (load
+win32\libogg_static.sln, change solution configuration to "Release" and
+code generation to "Multi-threaded (/MT)", then build), copy libogg_static.lib
+into FLAC's 'objs\release\lib' directory, and copy the entire include\ogg tree
+into FLAC's 'include' directory (so that there is an 'ogg' directory in FLAC's
 'include' directory with the files ogg.h, os_types.h and config_types.h).
 
-If you want to build without Ogg support, instead edit all .dsp or
-.vcproj files and remove any occurrences of "/D FLAC__HAS_OGG".
+If you want to build without Ogg support, instead edit all .vcproj files
+and remove any "FLAC__HAS_OGG" definitions.
 
 
 ===============================================================================
 Building on Mac OS X
 ===============================================================================
 
-If you have Fink or a recent version of OS X with the proper autotooles,
-the GNU flow above should work.  The Project Builder project has been
-deprecated but we are working on replacing it with an Xcode equivalent.
+If you have Fink or a recent version of OS X with the proper autotools,
+the GNU flow above should work.
diff --git a/README.chromium b/README.chromium
index b297704..a6cf89b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,21 +1,19 @@
-Name: flac
-URL: https://xiph.org/flac/
-Version: 1.2.1
+URL: http://downloads.xiph.org/releases/flac/flac-1.3.1.tar.xz
+Version: 1.3.1
 License: BSD
 License File: COPYING.Xiph
 Security Critical: yes
 
 Description:
-This contains a copy of flac-1.2.1
-http://sourceforge.net/projects/flac/files/flac-src/flac-1.2.1-src/flac-1.2.1.tar.gz/download
+This contains a copy of flac-1.3.1
 
 This library is required for the browser to compress and encode recorded audio
 before sending to Google servers for speech recognition. This is a straight dump
-of flac-1.2.1 with all the unused files removed and the following changes:
+of flac-1.3.1 with all the unused files removed and the following changes:
 
-- Replaced include/share/alloc.h with a new file
 - Added flac.gyp, flac.h, README.chromium and src/libFLAC/alloc.c
-- Fixed memset(ctx, 0, sizeof(ctx)); in src/libFLAC/md5.c:266
-- Add #define for fseeko/ftello for VS2012 and above in
-  src/libFLAC/stream_decoder.c and src/libFLAC/stream_encoder.c
-- Fixed Win64 compilation issues: https://codereview.chromium.org/11567032/
+- Replaced include/share/alloc.h with a new file.
+- include/libFLAC/private/macros.h: don't conditionalize on GCC version >= 4.3;
+  Clang looks like GCC 4.2, but we want these defines.
+- src/share/win_utf8_io/win_utf8_io.c: call LoadLibraryA instead of just
+  LoadLibrary since the argument is a non-wide string.
diff --git a/flac.gyp b/flac.gyp
index 95935ce..cea6e98 100644
--- a/flac.gyp
+++ b/flac.gyp
@@ -19,6 +19,9 @@
         'include/FLAC/stream_decoder.h',
         'include/FLAC/stream_encoder.h',
         'include/share/alloc.h',
+        'include/share/compat.h',
+        'include/share/endswap.h',
+        'include/share/private.h',
         'src/libFLAC/alloc.c',
         'src/libFLAC/bitmath.c',
         'src/libFLAC/bitreader.c',
@@ -45,9 +48,11 @@
         'src/libFLAC/include/private/float.h',
         'src/libFLAC/include/private/format.h',
         'src/libFLAC/include/private/lpc.h',
+        'src/libFLAC/include/private/macros.h',
         'src/libFLAC/include/private/md5.h',
         'src/libFLAC/include/private/memory.h',
         'src/libFLAC/include/private/metadata.h',
+        'src/libFLAC/include/private/stream_encoder.h',
         'src/libFLAC/include/private/stream_encoder_framing.h',
         'src/libFLAC/include/private/window.h',
         'src/libFLAC/include/protected/all.h',
@@ -57,7 +62,31 @@
       'defines': [
         'FLAC__NO_DLL',
         'FLAC__OVERFLOW_DETECT',
-        'VERSION="1.2.1"',
+        'VERSION="1.3.1"',
+        'HAVE_LROUND',
+      ],
+      'conditions': [
+        ['OS=="win"', {
+          'sources': [
+            'include/share/win_utf8_io.h',
+            'src/share/win_utf8_io/win_utf8_io.c',
+          ],
+          'defines!': [
+            'WIN32_LEAN_AND_MEAN',  # win_utf8_io.c defines this itself.
+          ],
+          'msvs_settings': {
+            'VCCLCompilerTool': {
+              'AdditionalOptions': [
+                '/wd4334',  # 32-bit shift converted to 64 bits.
+                '/wd4267',  # Converting from size_t to unsigned on 64-bit.
+              ],
+            },
+          },
+        }, {
+          'defines': [
+            'HAVE_INTTYPES_H',
+          ],
+        }],
       ],
       'include_dirs': [
         'include',
diff --git a/include/FLAC/all.h b/include/FLAC/all.h
index c542c0d..2851cf5 100644
--- a/include/FLAC/all.h
+++ b/include/FLAC/all.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -161,7 +162,7 @@
  * in FLAC 1.1.3 is a set of \c #defines in \c export.h of each
  * library's includes (e.g. \c include/FLAC/export.h).  The
  * \c #defines mirror the libraries'
- * <A HREF="http://www.gnu.org/software/libtool/manual.html#Libtool-versioning">libtool version numbers</A>,
+ * <A HREF="http://www.gnu.org/software/libtool/manual/libtool.html#Libtool-versioning">libtool version numbers</A>,
  * e.g. in libFLAC there are \c FLAC_API_VERSION_CURRENT,
  * \c FLAC_API_VERSION_REVISION, and \c FLAC_API_VERSION_AGE.
  * These can be used to support multiple versions of an API during the
diff --git a/include/FLAC/assert.h b/include/FLAC/assert.h
index 3fc03f3..dc9bcef 100644
--- a/include/FLAC/assert.h
+++ b/include/FLAC/assert.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/callback.h b/include/FLAC/callback.h
index c954121..ce8787f 100644
--- a/include/FLAC/callback.h
+++ b/include/FLAC/callback.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2004-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/export.h b/include/FLAC/export.h
index a525f29..9cc9e13 100644
--- a/include/FLAC/export.h
+++ b/include/FLAC/export.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -55,25 +56,30 @@
  * \{
  */
 
-#if defined(FLAC__NO_DLL) || !defined(_MSC_VER)
+#if defined(FLAC__NO_DLL)
 #define FLAC_API
 
-#else
-
+#elif defined(_MSC_VER)
 #ifdef FLAC_API_EXPORTS
-#define	FLAC_API	_declspec(dllexport)
+#define	FLAC_API __declspec(dllexport)
 #else
-#define FLAC_API	_declspec(dllimport)
-
+#define FLAC_API __declspec(dllimport)
 #endif
+
+#elif defined(FLAC__USE_VISIBILITY_ATTR)
+#define FLAC_API __attribute__ ((visibility ("default")))
+
+#else
+#define FLAC_API
+
 #endif
 
 /** These #defines will mirror the libtool-based library version number, see
- * http://www.gnu.org/software/libtool/manual.html#Libtool-versioning
+ * http://www.gnu.org/software/libtool/manual/libtool.html#Libtool-versioning
  */
-#define FLAC_API_VERSION_CURRENT 10
+#define FLAC_API_VERSION_CURRENT 11
 #define FLAC_API_VERSION_REVISION 0 /**< see above */
-#define FLAC_API_VERSION_AGE 2 /**< see above */
+#define FLAC_API_VERSION_AGE 3 /**< see above */
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/FLAC/format.h b/include/FLAC/format.h
index 77e2d01..7424565 100644
--- a/include/FLAC/format.h
+++ b/include/FLAC/format.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -508,9 +509,11 @@
 	FLAC__METADATA_TYPE_PICTURE = 6,
 	/**< <A HREF="../format.html#metadata_block_picture">PICTURE</A> block */
 
-	FLAC__METADATA_TYPE_UNDEFINED = 7
+	FLAC__METADATA_TYPE_UNDEFINED = 7,
 	/**< marker to denote beginning of undefined type range; this number will increase as new metadata types are added */
 
+        FLAC__MAX_METADATA_TYPE = FLAC__MAX_METADATA_TYPE_CODE,
+        /**< No type will ever be greater than this. There is not enough room in the protocol block. */
 } FLAC__MetadataType;
 
 /** Maps a FLAC__MetadataType to a C string.
@@ -879,6 +882,18 @@
  */
 FLAC_API FLAC__bool FLAC__format_sample_rate_is_valid(unsigned sample_rate);
 
+/** Tests that a blocksize at the given sample rate is valid for the FLAC
+ *  subset.
+ *
+ * \param blocksize    The blocksize to test for compliance.
+ * \param sample_rate  The sample rate is needed, since the valid subset
+ *                     blocksize depends on the sample rate.
+ * \retval FLAC__bool
+ *    \c true if the given blocksize conforms to the specification for the
+ *    subset at the given sample rate, else \c false.
+ */
+FLAC_API FLAC__bool FLAC__format_blocksize_is_subset(unsigned blocksize, unsigned sample_rate);
+
 /** Tests that a sample rate is valid for the FLAC subset.  The subset rules
  *  for valid sample rates are slightly more complex since the rate has to
  *  be expressible completely in the frame header.
diff --git a/include/FLAC/metadata.h b/include/FLAC/metadata.h
index fff90b0..8951532 100644
--- a/include/FLAC/metadata.h
+++ b/include/FLAC/metadata.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -499,7 +500,7 @@
  * \retval unsigned
  *    The length of the metadata block at the current iterator position.
  *    The is same length as that in the
- *    <a href="http://flac.sourceforge.net/format.html#metadata_block_header">metadata block header</a>,
+ *    <a href="http://xiph.org/flac/format.html#metadata_block_header">metadata block header</a>,
  *    i.e. the length of the metadata body that follows the header.
  */
 FLAC_API unsigned FLAC__metadata_simple_iterator_get_block_length(const FLAC__Metadata_SimpleIterator *iterator);
@@ -1986,7 +1987,7 @@
  *    \code object->type == FLAC__METADATA_TYPE_CUESHEET \endcode
  *    \code track_num < object->data.cue_sheet.num_tracks \endcode
  *    \code (track->indices != NULL && track->num_indices > 0) ||
- * (track->indices == NULL && track->num_indices == 0)
+ * (track->indices == NULL && track->num_indices == 0) \endcode
  * \retval FLAC__bool
  *    \c false if \a copy is \c true and malloc() fails, else \c true.
  */
diff --git a/include/FLAC/ordinals.h b/include/FLAC/ordinals.h
index a7a5cd9..b1e1acf 100644
--- a/include/FLAC/ordinals.h
+++ b/include/FLAC/ordinals.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -32,40 +33,45 @@
 #ifndef FLAC__ORDINALS_H
 #define FLAC__ORDINALS_H
 
-#if !(defined(_MSC_VER) || defined(__BORLANDC__) || defined(__EMX__))
-#include <inttypes.h>
-#endif
+#if defined(_MSC_VER) && _MSC_VER < 1600
 
-typedef signed char FLAC__int8;
-typedef unsigned char FLAC__uint8;
+/* Microsoft Visual Studio earlier than the 2010 version did not provide
+ * the 1999 ISO C Standard header file <stdint.h>.
+ */
 
-#if defined(_MSC_VER) || defined(__BORLANDC__)
+typedef __int8 FLAC__int8;
+typedef unsigned __int8 FLAC__uint8;
+
 typedef __int16 FLAC__int16;
 typedef __int32 FLAC__int32;
 typedef __int64 FLAC__int64;
 typedef unsigned __int16 FLAC__uint16;
 typedef unsigned __int32 FLAC__uint32;
 typedef unsigned __int64 FLAC__uint64;
-#elif defined(__EMX__)
-typedef short FLAC__int16;
-typedef long FLAC__int32;
-typedef long long FLAC__int64;
-typedef unsigned short FLAC__uint16;
-typedef unsigned long FLAC__uint32;
-typedef unsigned long long FLAC__uint64;
+
 #else
+
+/* For MSVC 2010 and everything else which provides <stdint.h>. */
+
+#include <stdint.h>
+
+typedef int8_t FLAC__int8;
+typedef uint8_t FLAC__uint8;
+
 typedef int16_t FLAC__int16;
 typedef int32_t FLAC__int32;
 typedef int64_t FLAC__int64;
 typedef uint16_t FLAC__uint16;
 typedef uint32_t FLAC__uint32;
 typedef uint64_t FLAC__uint64;
+
 #endif
 
 typedef int FLAC__bool;
 
 typedef FLAC__uint8 FLAC__byte;
 
+
 #ifdef true
 #undef true
 #endif
diff --git a/include/FLAC/stream_decoder.h b/include/FLAC/stream_decoder.h
index 9ac1594..9bfdd1f 100644
--- a/include/FLAC/stream_decoder.h
+++ b/include/FLAC/stream_decoder.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/include/FLAC/stream_encoder.h b/include/FLAC/stream_encoder.h
index dbbbb23..efc213a 100644
--- a/include/FLAC/stream_encoder.h
+++ b/include/FLAC/stream_encoder.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -128,7 +129,7 @@
  * Unlike the decoders, the stream encoder has many options that can
  * affect the speed and compression ratio.  When setting these parameters
  * you should have some basic knowledge of the format (see the
- * <A HREF="../documentation.html#format">user-level documentation</A>
+ * <A HREF="../documentation_format_overview.html">user-level documentation</A>
  * or the <A HREF="../format.html">formal description</A>).  The
  * FLAC__stream_encoder_set_*() functions themselves do not validate the
  * values as many are interdependent.  The FLAC__stream_encoder_init_*()
@@ -829,28 +830,28 @@
  * The actual values set for each level are:
  * <table>
  * <tr>
- *  <td><b>level</b><td>
- *  <td>do mid-side stereo<td>
- *  <td>loose mid-side stereo<td>
- *  <td>apodization<td>
- *  <td>max lpc order<td>
- *  <td>qlp coeff precision<td>
- *  <td>qlp coeff prec search<td>
- *  <td>escape coding<td>
- *  <td>exhaustive model search<td>
- *  <td>min residual partition order<td>
- *  <td>max residual partition order<td>
- *  <td>rice parameter search dist<td>
+ *  <td><b>level</b></td>
+ *  <td>do mid-side stereo</td>
+ *  <td>loose mid-side stereo</td>
+ *  <td>apodization</td>
+ *  <td>max lpc order</td>
+ *  <td>qlp coeff precision</td>
+ *  <td>qlp coeff prec search</td>
+ *  <td>escape coding</td>
+ *  <td>exhaustive model search</td>
+ *  <td>min residual partition order</td>
+ *  <td>max residual partition order</td>
+ *  <td>rice parameter search dist</td>
  * </tr>
- * <tr>  <td><b>0</b><td>  <td>false<td>  <td>false<td>  <td>tukey(0.5)<td>  <td>0<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>3<td>  <td>0<td>  </tr>
- * <tr>  <td><b>1</b><td>  <td>true<td>   <td>true<td>   <td>tukey(0.5)<td>  <td>0<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>3<td>  <td>0<td>  </tr>
- * <tr>  <td><b>2</b><td>  <td>true<td>   <td>false<td>  <td>tukey(0.5)<td>  <td>0<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>3<td>  <td>0<td>  </tr>
- * <tr>  <td><b>3</b><td>  <td>false<td>  <td>false<td>  <td>tukey(0.5)<td>  <td>6<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>4<td>  <td>0<td>  </tr>
- * <tr>  <td><b>4</b><td>  <td>true<td>   <td>true<td>   <td>tukey(0.5)<td>  <td>8<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>4<td>  <td>0<td>  </tr>
- * <tr>  <td><b>5</b><td>  <td>true<td>   <td>false<td>  <td>tukey(0.5)<td>  <td>8<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>5<td>  <td>0<td>  </tr>
- * <tr>  <td><b>6</b><td>  <td>true<td>   <td>false<td>  <td>tukey(0.5)<td>  <td>8<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>false<td>  <td>0<td>  <td>6<td>  <td>0<td>  </tr>
- * <tr>  <td><b>7</b><td>  <td>true<td>   <td>false<td>  <td>tukey(0.5)<td>  <td>8<td>   <td>0<td>  <td>false<td>  <td>false<td>  <td>true<td>   <td>0<td>  <td>6<td>  <td>0<td>  </tr>
- * <tr>  <td><b>8</b><td>  <td>true<td>   <td>false<td>  <td>tukey(0.5)<td>  <td>12<td>  <td>0<td>  <td>false<td>  <td>false<td>  <td>true<td>   <td>0<td>  <td>6<td>  <td>0<td>  </tr>
+ * <tr>  <td><b>0</b></td> <td>false</td> <td>false</td> <td>tukey(0.5)<td>                                     <td>0</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>3</td> <td>0</td> </tr>
+ * <tr>  <td><b>1</b></td> <td>true</td>  <td>true</td>  <td>tukey(0.5)<td>                                     <td>0</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>3</td> <td>0</td> </tr>
+ * <tr>  <td><b>2</b></td> <td>true</td>  <td>false</td> <td>tukey(0.5)<td>                                     <td>0</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>3</td> <td>0</td> </tr>
+ * <tr>  <td><b>3</b></td> <td>false</td> <td>false</td> <td>tukey(0.5)<td>                                     <td>6</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>4</td> <td>0</td> </tr>
+ * <tr>  <td><b>4</b></td> <td>true</td>  <td>true</td>  <td>tukey(0.5)<td>                                     <td>8</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>4</td> <td>0</td> </tr>
+ * <tr>  <td><b>5</b></td> <td>true</td>  <td>false</td> <td>tukey(0.5)<td>                                     <td>8</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>5</td> <td>0</td> </tr>
+ * <tr>  <td><b>6</b></td> <td>true</td>  <td>false</td> <td>tukey(0.5);partial_tukey(2)<td>                    <td>8</td>  <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>6</td> <td>0</td> </tr>
+ * <tr>  <td><b>7</b></td> <td>true</td>  <td>false</td> <td>tukey(0.5);partial_tukey(2)<td>                    <td>12</td> <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>6</td> <td>0</td> </tr>
+ * <tr>  <td><b>8</b></td> <td>true</td>  <td>false</td> <td>tukey(0.5);partial_tukey(2);punchout_tukey(3)</td> <td>12</td> <td>0</td> <td>false</td> <td>false</td> <td>false</td> <td>0</td> <td>6</td> <td>0</td> </tr>
  * </table>
  *
  * \default \c 5
@@ -919,7 +920,8 @@
  * The available functions are \c bartlett, \c bartlett_hann,
  * \c blackman, \c blackman_harris_4term_92db, \c connes, \c flattop,
  * \c gauss(STDDEV), \c hamming, \c hann, \c kaiser_bessel, \c nuttall,
- * \c rectangle, \c triangle, \c tukey(P), \c welch.
+ * \c rectangle, \c triangle, \c tukey(P), \c partial_tukey(n[/ov[/P]]),
+ * \c punchout_tukey(n[/ov[/P]]), \c welch.
  *
  * For \c gauss(STDDEV), STDDEV specifies the standard deviation
  * (0<STDDEV<=0.5).
@@ -928,6 +930,24 @@
  * tapered (0<=P<=1).  P=0 corresponds to \c rectangle and P=1
  * corresponds to \c hann.
  *
+ * Specifying \c partial_tukey or \c punchout_tukey works a little
+ * different. These do not specify a single apodization function, but
+ * a series of them with some overlap. partial_tukey specifies a series
+ * of small windows (all treated separately) while punchout_tukey
+ * specifies a series of windows that have a hole in them. In this way,
+ * the predictor is constructed with only a part of the block, which
+ * helps in case a block consists of dissimilar parts.
+ *
+ * The three parameters that can be specified for the functions are
+ * n, ov and P. n is the number of functions to add, ov is the overlap
+ * of the windows in case of partial_tukey and the overlap in the gaps
+ * in case of punchout_tukey. P is the fraction of the window that is
+ * tapered, like with a regular tukey window. The function can be
+ * specified with only a number, a number and an overlap, or a number
+ * an overlap and a P, for example, partial_tukey(3), partial_tukey(3/0.3)
+ * and partial_tukey(3/0.3/0.5) are all valid. ov should be smaller than 1
+ * and can be negative.
+ *
  * Example specifications are \c "blackman" or
  * \c "hann;triangle;tukey(0.5);tukey(0.25);tukey(0.125)"
  *
@@ -940,7 +960,9 @@
  * results in the smallest compressed subframe.
  *
  * Note that each function specified causes the encoder to occupy a
- * floating point array in which to store the window.
+ * floating point array in which to store the window. Also note that the
+ * values of P, STDDEV and ov are locale-specific, so if the comma
+ * separator specified by the locale is a comma, a comma should be used.
  *
  * \default \c "tukey(0.5)"
  * \param  encoder        An encoder instance to set.
diff --git a/include/share/compat.h b/include/share/compat.h
new file mode 100644
index 0000000..0dc5673
--- /dev/null
+++ b/include/share/compat.h
@@ -0,0 +1,201 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2012-2014  Xiph.org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This is the prefered location of all CPP hackery to make $random_compiler
+ * work like something approaching a C99 (or maybe more accurately GNU99)
+ * compiler.
+ *
+ * It is assumed that this header will be included after "config.h".
+ */
+
+#ifndef FLAC__SHARE__COMPAT_H
+#define FLAC__SHARE__COMPAT_H
+
+#if defined _WIN32 && !defined __CYGWIN__
+/* where MSVC puts unlink() */
+# include <io.h>
+#else
+# include <unistd.h>
+#endif
+
+#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__
+#include <sys/types.h> /* for off_t */
+#define FLAC__off_t __int64 /* use this instead of off_t to fix the 2 GB limit */
+#if !defined __MINGW32__
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#else /* MinGW */
+#if !defined(HAVE_FSEEKO)
+#define fseeko fseeko64
+#define ftello ftello64
+#endif
+#endif
+#else
+#define FLAC__off_t off_t
+#endif
+
+#if HAVE_INTTYPES_H
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#endif
+
+#if defined(_MSC_VER)
+#define strtoll _strtoi64
+#define strtoull _strtoui64
+#endif
+
+#if defined(_MSC_VER)
+#define inline __inline
+#endif
+
+#if defined __INTEL_COMPILER || (defined _MSC_VER && defined _WIN64)
+/* MSVS generates VERY slow 32-bit code with __restrict */
+#define flac_restrict __restrict
+#elif defined __GNUC__
+#define flac_restrict __restrict__
+#else
+#define flac_restrict
+#endif
+
+#define FLAC__U64L(x) x##ULL
+
+#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__
+#define FLAC__STRCASECMP stricmp
+#define FLAC__STRNCASECMP strnicmp
+#else
+#define FLAC__STRCASECMP strcasecmp
+#define FLAC__STRNCASECMP strncasecmp
+#endif
+
+#if defined _MSC_VER || defined __MINGW32__ || defined __CYGWIN__ || defined __EMX__
+#include <io.h> /* for _setmode(), chmod() */
+#include <fcntl.h> /* for _O_BINARY */
+#else
+#include <unistd.h> /* for chown(), unlink() */
+#endif
+
+#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__
+#if defined __BORLANDC__
+#include <utime.h> /* for utime() */
+#else
+#include <sys/utime.h> /* for utime() */
+#endif
+#else
+#include <sys/types.h> /* some flavors of BSD (like OS X) require this to get time_t */
+#include <utime.h> /* for utime() */
+#endif
+
+#if defined _MSC_VER
+#  if _MSC_VER >= 1600
+/* Visual Studio 2010 has decent C99 support */
+#    include <stdint.h>
+#    define PRIu64 "llu"
+#    define PRId64 "lld"
+#    define PRIx64 "llx"
+#  else
+#    include <limits.h>
+#    ifndef UINT32_MAX
+#      define UINT32_MAX _UI32_MAX
+#    endif
+     typedef unsigned __int64 uint64_t;
+     typedef unsigned __int32 uint32_t;
+     typedef unsigned __int16 uint16_t;
+     typedef unsigned __int8 uint8_t;
+     typedef __int64 int64_t;
+     typedef __int32 int32_t;
+     typedef __int16 int16_t;
+     typedef __int8  int8_t;
+#    define PRIu64 "I64u"
+#    define PRId64 "I64d"
+#    define PRIx64 "I64x"
+#  endif
+#endif /* defined _MSC_VER */
+
+#ifdef _WIN32
+/* All char* strings are in UTF-8 format. Added to support Unicode files on Windows */
+#include "share/win_utf8_io.h"
+
+#define flac_printf printf_utf8
+#define flac_fprintf fprintf_utf8
+#define flac_vfprintf vfprintf_utf8
+#define flac_fopen fopen_utf8
+#define flac_chmod chmod_utf8
+#define flac_utime utime_utf8
+#define flac_unlink unlink_utf8
+#define flac_rename rename_utf8
+#define flac_stat _stat64_utf8
+
+#else
+
+#define flac_printf printf
+#define flac_fprintf fprintf
+#define flac_vfprintf vfprintf
+#define flac_fopen fopen
+#define flac_chmod chmod
+#define flac_utime utime
+#define flac_unlink unlink
+#define flac_rename rename
+#define flac_stat stat
+
+#endif
+
+#ifdef _WIN32
+#define flac_stat_s __stat64 /* stat struct */
+#define flac_fstat _fstat64
+#else
+#define flac_stat_s stat /* stat struct */
+#define flac_fstat fstat
+#endif
+
+#ifndef M_LN2
+#define M_LN2 0.69314718055994530942
+#endif
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+/* FLAC needs to compile and work correctly on systems with a normal ISO C99
+ * snprintf as well as Microsoft Visual Studio which has an non-standards
+ * conformant snprint_s function.
+ *
+ * This function wraps the MS version to behave more like the the ISO version.
+ */
+#include <stdarg.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+int flac_snprintf(char *str, size_t size, const char *fmt, ...);
+int flac_vsnprintf(char *str, size_t size, const char *fmt, va_list va);
+#ifdef __cplusplus
+};
+#endif
+
+#endif /* FLAC__SHARE__COMPAT_H */
diff --git a/include/share/endswap.h b/include/share/endswap.h
new file mode 100644
index 0000000..4fde4c1
--- /dev/null
+++ b/include/share/endswap.h
@@ -0,0 +1,78 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2012-2014  Xiph.org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* It is assumed that this header will be included after "config.h". */
+
+#if HAVE_BSWAP32			/* GCC and Clang */
+
+/* GCC prior to 4.8 didn't provide bswap16 on x86_64 */
+#if ! HAVE_BSWAP16
+static inline unsigned short __builtin_bswap16(unsigned short a)
+{
+	return (a<<8)|(a>>8);
+}
+#endif
+
+#define	ENDSWAP_16(x)		(__builtin_bswap16 (x))
+#define	ENDSWAP_32(x)		(__builtin_bswap32 (x))
+
+#elif defined _MSC_VER		/* Windows. Apparently in <stdlib.h>. */
+
+#define	ENDSWAP_16(x)		(_byteswap_ushort (x))
+#define	ENDSWAP_32(x)		(_byteswap_ulong (x))
+
+#elif defined HAVE_BYTESWAP_H		/* Linux */
+
+#include <byteswap.h>
+
+#define	ENDSWAP_16(x)		(bswap_16 (x))
+#define	ENDSWAP_32(x)		(bswap_32 (x))
+
+#else
+
+#define	ENDSWAP_16(x)		((((x) >> 8) & 0xFF) | (((x) & 0xFF) << 8))
+#define	ENDSWAP_32(x)		((((x) >> 24) & 0xFF) | (((x) >> 8) & 0xFF00) | (((x) & 0xFF00) << 8) | (((x) & 0xFF) << 24))
+
+#endif
+
+
+/* Host to little-endian byte swapping. */
+#if CPU_IS_BIG_ENDIAN
+
+#define H2LE_16(x)		ENDSWAP_16 (x)
+#define H2LE_32(x)		ENDSWAP_32 (x)
+
+#else
+
+#define H2LE_16(x)		(x)
+#define H2LE_32(x)		(x)
+
+#endif
diff --git a/include/share/private.h b/include/share/private.h
new file mode 100644
index 0000000..3a500d3
--- /dev/null
+++ b/include/share/private.h
@@ -0,0 +1,45 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2014  Xiph.org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLAC__SHARE__PRIVATE_H
+#define FLAC__SHARE__PRIVATE_H
+
+/*
+ * Unpublished debug routines from libFLAC> This should not be used from any
+ * client code other than code shipped with the FLAC sources.
+ */
+FLAC_API FLAC__bool FLAC__stream_encoder_disable_constant_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value);
+FLAC_API FLAC__bool FLAC__stream_encoder_disable_fixed_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value);
+FLAC_API FLAC__bool FLAC__stream_encoder_disable_verbatim_subframes(FLAC__StreamEncoder *encoder, FLAC__bool value);
+FLAC_API FLAC__bool FLAC__stream_encoder_set_do_md5(FLAC__StreamEncoder *encoder, FLAC__bool value);
+FLAC_API FLAC__bool FLAC__stream_encoder_get_do_md5(const FLAC__StreamEncoder *encoder);
+
+#endif /* FLAC__SHARE__PRIVATE_H */
diff --git a/include/share/win_utf8_io.h b/include/share/win_utf8_io.h
new file mode 100644
index 0000000..1d15339
--- /dev/null
+++ b/include/share/win_utf8_io.h
@@ -0,0 +1,69 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef _WIN32
+
+#ifndef flac__win_utf8_io_h
+#define flac__win_utf8_io_h
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <stdarg.h>
+#include <windows.h>
+
+int get_utf8_argv(int *argc, char ***argv);
+
+int printf_utf8(const char *format, ...);
+int fprintf_utf8(FILE *stream, const char *format, ...);
+int vfprintf_utf8(FILE *stream, const char *format, va_list argptr);
+
+FILE *fopen_utf8(const char *filename, const char *mode);
+int stat_utf8(const char *path, struct stat *buffer);
+int _stat64_utf8(const char *path, struct __stat64 *buffer);
+int chmod_utf8(const char *filename, int pmode);
+int utime_utf8(const char *filename, struct utimbuf *times);
+int unlink_utf8(const char *filename);
+int rename_utf8(const char *oldname, const char *newname);
+size_t strlen_utf8(const char *str);
+int win_get_console_width(void);
+int print_console(FILE *stream, const wchar_t *text, size_t len);
+HANDLE WINAPI CreateFile_utf8(const char *lpFileName, DWORD dwDesiredAccess, DWORD dwShareMode, LPSECURITY_ATTRIBUTES lpSecurityAttributes, DWORD dwCreationDisposition, DWORD dwFlagsAndAttributes, HANDLE hTemplateFile);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
+#endif
diff --git a/src/libFLAC/bitmath.c b/src/libFLAC/bitmath.c
index 27e25f0..5b58ca9 100644
--- a/src/libFLAC/bitmath.c
+++ b/src/libFLAC/bitmath.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,52 +30,11 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
 #include "private/bitmath.h"
-#include "FLAC/assert.h"
-
-/* An example of what FLAC__bitmath_ilog2() computes:
- *
- * ilog2( 0) = assertion failure
- * ilog2( 1) = 0
- * ilog2( 2) = 1
- * ilog2( 3) = 1
- * ilog2( 4) = 2
- * ilog2( 5) = 2
- * ilog2( 6) = 2
- * ilog2( 7) = 2
- * ilog2( 8) = 3
- * ilog2( 9) = 3
- * ilog2(10) = 3
- * ilog2(11) = 3
- * ilog2(12) = 3
- * ilog2(13) = 3
- * ilog2(14) = 3
- * ilog2(15) = 3
- * ilog2(16) = 4
- * ilog2(17) = 4
- * ilog2(18) = 4
- */
-unsigned FLAC__bitmath_ilog2(FLAC__uint32 v)
-{
-	unsigned l = 0;
-	FLAC__ASSERT(v > 0);
-	while(v >>= 1)
-		l++;
-	return l;
-}
-
-unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v)
-{
-	unsigned l = 0;
-	FLAC__ASSERT(v > 0);
-	while(v >>= 1)
-		l++;
-	return l;
-}
 
 /* An example of what FLAC__bitmath_silog2() computes:
  *
diff --git a/src/libFLAC/bitreader.c b/src/libFLAC/bitreader.c
index d496379..f61229b 100644
--- a/src/libFLAC/bitreader.c
+++ b/src/libFLAC/bitreader.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,53 +30,33 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
-#include <stdlib.h> /* for malloc() */
-#include <string.h> /* for memcpy(), memset() */
-#ifdef _MSC_VER
-#include <winsock.h> /* for ntohl() */
-#elif defined FLAC__SYS_DARWIN
-#include <machine/endian.h> /* for ntohl() */
-#elif defined __MINGW32__
-#include <winsock.h> /* for ntohl() */
-#else
-#include <netinet/in.h> /* for ntohl() */
-#endif
+#include <stdlib.h>
+#include <string.h>
 #include "private/bitmath.h"
 #include "private/bitreader.h"
 #include "private/crc.h"
+#include "private/macros.h"
 #include "FLAC/assert.h"
+#include "share/compat.h"
+#include "share/endswap.h"
 
 /* Things should be fastest when this matches the machine word size */
-/* WATCHOUT: if you change this you must also change the following #defines down to COUNT_ZERO_MSBS below to match */
-/* WATCHOUT: there are a few places where the code will not work unless brword is >= 32 bits wide */
+/* WATCHOUT: if you change this you must also change the following #defines down to FLAC__clz_uint32 below to match */
+/* WATCHOUT: there are a few places where the code will not work unless uint32_t is >= 32 bits wide */
 /*           also, some sections currently only have fast versions for 4 or 8 bytes per word */
-typedef FLAC__uint32 brword;
-#define FLAC__BYTES_PER_WORD 4
-#define FLAC__BITS_PER_WORD 32
+#define FLAC__BYTES_PER_WORD 4		/* sizeof uint32_t */
+#define FLAC__BITS_PER_WORD (8 * FLAC__BYTES_PER_WORD)
 #define FLAC__WORD_ALL_ONES ((FLAC__uint32)0xffffffff)
-/* SWAP_BE_WORD_TO_HOST swaps bytes in a brword (which is always big-endian) if necessary to match host byte order */
+/* SWAP_BE_WORD_TO_HOST swaps bytes in a uint32_t (which is always big-endian) if necessary to match host byte order */
 #if WORDS_BIGENDIAN
 #define SWAP_BE_WORD_TO_HOST(x) (x)
 #else
-#ifdef _MSC_VER
-#define SWAP_BE_WORD_TO_HOST(x) local_swap32_(x)
-#else
-#define SWAP_BE_WORD_TO_HOST(x) ntohl(x)
+#define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_32(x)
 #endif
-#endif
-/* counts the # of zero MSBs in a word */
-#define COUNT_ZERO_MSBS(word) ( \
-	(word) <= 0xffff ? \
-		( (word) <= 0xff? byte_to_unary_table[word] + 24 : byte_to_unary_table[(word) >> 8] + 16 ) : \
-		( (word) <= 0xffffff? byte_to_unary_table[word >> 16] + 8 : byte_to_unary_table[(word) >> 24] ) \
-)
-/* this alternate might be slightly faster on some systems/compilers: */
-#define COUNT_ZERO_MSBS2(word) ( (word) <= 0xff ? byte_to_unary_table[word] + 24 : ((word) <= 0xffff ? byte_to_unary_table[(word) >> 8] + 16 : ((word) <= 0xffffff ? byte_to_unary_table[(word) >> 16] + 8 : byte_to_unary_table[(word) >> 24])) )
-
 
 /*
  * This should be at least twice as large as the largest number of words
@@ -93,50 +74,10 @@
  */
 static const unsigned FLAC__BITREADER_DEFAULT_CAPACITY = 65536u / FLAC__BITS_PER_WORD; /* in words */
 
-static const unsigned char byte_to_unary_table[] = {
-	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-
-#ifdef min
-#undef min
-#endif
-#define min(x,y) ((x)<(y)?(x):(y))
-#ifdef max
-#undef max
-#endif
-#define max(x,y) ((x)>(y)?(x):(y))
-
-/* adjust for compilers that can't understand using LLU suffix for uint64_t literals */
-#ifdef _MSC_VER
-#define FLAC__U64L(x) x##ULL
-#else
-#define FLAC__U64L(x) x##LLU
-#endif
-
-#ifndef FLaC__INLINE
-#define FLaC__INLINE
-#endif
-
-/* WATCHOUT: assembly routines rely on the order in which these fields are declared */
 struct FLAC__BitReader {
 	/* any partially-consumed word at the head will stay right-justified as bits are consumed from the left */
 	/* any incomplete word at the tail will be left-justified, and bytes from the read callback are added on the right */
-	brword *buffer;
+	uint32_t *buffer;
 	unsigned capacity; /* in words */
 	unsigned words; /* # of completed words in buffer */
 	unsigned bytes; /* # of bytes in incomplete word at buffer[words] */
@@ -146,26 +87,9 @@
 	unsigned crc16_align; /* the number of bits in the current consumed word that should not be CRC'd */
 	FLAC__BitReaderReadCallback read_callback;
 	void *client_data;
-	FLAC__CPUInfo cpu_info;
 };
 
-#ifdef _MSC_VER
-/* OPT: an MSVC built-in would be better */
-static _inline FLAC__uint32 local_swap32_(FLAC__uint32 x)
-{
-	return _byteswap_ulong(x);
-}
-
-static void local_swap32_block_(FLAC__uint32 *start, FLAC__uint32 len)
-{
-	FLAC__uint32 *end;
-	for(end = start + len; start < end; ++start) {
-		*start = local_swap32_(*start);
-	}
-}
-#endif
-
-static FLaC__INLINE void crc16_update_word_(FLAC__BitReader *br, brword word)
+static inline void crc16_update_word_(FLAC__BitReader *br, uint32_t word)
 {
 	register unsigned crc = br->read_crc16;
 #if FLAC__BYTES_PER_WORD == 4
@@ -194,8 +118,7 @@
 	br->crc16_align = 0;
 }
 
-/* would be static except it needs to be called by asm routines */
-FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br)
+static FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br)
 {
 	unsigned start, end;
 	size_t bytes;
@@ -219,7 +142,7 @@
 		return false; /* no space left, buffer is too small; see note for FLAC__BITREADER_DEFAULT_CAPACITY  */
 	target = ((FLAC__byte*)(br->buffer+br->words)) + br->bytes;
 
-	/* before reading, if the existing reader looks like this (say brword is 32 bits wide)
+	/* before reading, if the existing reader looks like this (say uint32_t is 32 bits wide)
 	 *   bitstream :  11 22 33 44 55            br->words=1 br->bytes=1 (partial tail word is left-justified)
 	 *   buffer[BE]:  11 22 33 44 55 ?? ?? ??   (shown layed out as bytes sequentially in memory)
 	 *   buffer[LE]:  44 33 22 11 ?? ?? ?? 55   (?? being don't-care)
@@ -252,14 +175,7 @@
 	 */
 #if WORDS_BIGENDIAN
 #else
-	end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + (unsigned)bytes + (FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
-# if defined(_MSC_VER) && (FLAC__BYTES_PER_WORD == 4)
-	if(br->cpu_info.type == FLAC__CPUINFO_TYPE_IA32 && br->cpu_info.data.ia32.bswap) {
-		start = br->words;
-		local_swap32_block_(br->buffer + start, end - start);
-	}
-	else
-# endif
+	end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes + (FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
 	for(start = br->words; start < end; start++)
 		br->buffer[start] = SWAP_BE_WORD_TO_HOST(br->buffer[start]);
 #endif
@@ -270,7 +186,7 @@
 	 *   buffer[LE]:  44 33 22 11 88 77 66 55 CC BB AA 99 ?? FF EE DD
 	 * finally we'll update the reader values:
 	 */
-	end = br->words*FLAC__BYTES_PER_WORD + br->bytes + (unsigned)bytes;
+	end = br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes;
 	br->words = end / FLAC__BYTES_PER_WORD;
 	br->bytes = end % FLAC__BYTES_PER_WORD;
 
@@ -285,7 +201,7 @@
 
 FLAC__BitReader *FLAC__bitreader_new(void)
 {
-	FLAC__BitReader *br = (FLAC__BitReader*)calloc(1, sizeof(FLAC__BitReader));
+	FLAC__BitReader *br = calloc(1, sizeof(FLAC__BitReader));
 
 	/* calloc() implies:
 		memset(br, 0, sizeof(FLAC__BitReader));
@@ -313,19 +229,18 @@
  *
  ***********************************************************************/
 
-FLAC__bool FLAC__bitreader_init(FLAC__BitReader *br, FLAC__CPUInfo cpu, FLAC__BitReaderReadCallback rcb, void *cd)
+FLAC__bool FLAC__bitreader_init(FLAC__BitReader *br, FLAC__BitReaderReadCallback rcb, void *cd)
 {
 	FLAC__ASSERT(0 != br);
 
 	br->words = br->bytes = 0;
 	br->consumed_words = br->consumed_bits = 0;
 	br->capacity = FLAC__BITREADER_DEFAULT_CAPACITY;
-	br->buffer = (brword*)malloc(sizeof(brword) * br->capacity);
+	br->buffer = malloc(sizeof(uint32_t) * br->capacity);
 	if(br->buffer == 0)
 		return false;
 	br->read_callback = rcb;
 	br->client_data = cd;
-	br->cpu_info = cpu;
 
 	return true;
 }
@@ -400,29 +315,29 @@
 
 	/* CRC any tail bytes in a partially-consumed word */
 	if(br->consumed_bits) {
-		const brword tail = br->buffer[br->consumed_words];
+		const uint32_t tail = br->buffer[br->consumed_words];
 		for( ; br->crc16_align < br->consumed_bits; br->crc16_align += 8)
 			br->read_crc16 = FLAC__CRC16_UPDATE((unsigned)((tail >> (FLAC__BITS_PER_WORD-8-br->crc16_align)) & 0xff), br->read_crc16);
 	}
 	return br->read_crc16;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitreader_is_consumed_byte_aligned(const FLAC__BitReader *br)
+inline FLAC__bool FLAC__bitreader_is_consumed_byte_aligned(const FLAC__BitReader *br)
 {
 	return ((br->consumed_bits & 7) == 0);
 }
 
-FLaC__INLINE unsigned FLAC__bitreader_bits_left_for_byte_alignment(const FLAC__BitReader *br)
+inline unsigned FLAC__bitreader_bits_left_for_byte_alignment(const FLAC__BitReader *br)
 {
 	return 8 - (br->consumed_bits & 7);
 }
 
-FLaC__INLINE unsigned FLAC__bitreader_get_input_bits_unconsumed(const FLAC__BitReader *br)
+inline unsigned FLAC__bitreader_get_input_bits_unconsumed(const FLAC__BitReader *br)
 {
 	return (br->words-br->consumed_words)*FLAC__BITS_PER_WORD + br->bytes*8 - br->consumed_bits;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitreader_read_raw_uint32(FLAC__BitReader *br, FLAC__uint32 *val, unsigned bits)
+FLAC__bool FLAC__bitreader_read_raw_uint32(FLAC__BitReader *br, FLAC__uint32 *val, unsigned bits)
 {
 	FLAC__ASSERT(0 != br);
 	FLAC__ASSERT(0 != br->buffer);
@@ -448,7 +363,7 @@
 		if(br->consumed_bits) {
 			/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
 			const unsigned n = FLAC__BITS_PER_WORD - br->consumed_bits;
-			const brword word = br->buffer[br->consumed_words];
+			const uint32_t word = br->buffer[br->consumed_words];
 			if(bits < n) {
 				*val = (word & (FLAC__WORD_ALL_ONES >> br->consumed_bits)) >> (n-bits);
 				br->consumed_bits += bits;
@@ -467,7 +382,7 @@
 			return true;
 		}
 		else {
-			const brword word = br->buffer[br->consumed_words];
+			const uint32_t word = br->buffer[br->consumed_words];
 			if(bits < FLAC__BITS_PER_WORD) {
 				*val = word >> (FLAC__BITS_PER_WORD-bits);
 				br->consumed_bits = bits;
@@ -533,7 +448,7 @@
 	return true;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitreader_read_uint32_little_endian(FLAC__BitReader *br, FLAC__uint32 *val)
+inline FLAC__bool FLAC__bitreader_read_uint32_little_endian(FLAC__BitReader *br, FLAC__uint32 *val)
 {
 	FLAC__uint32 x8, x32 = 0;
 
@@ -573,7 +488,7 @@
 		FLAC__uint32 x;
 
 		if(n != 0) {
-			m = min(8-n, bits);
+			m = flac_min(8-n, bits);
 			if(!FLAC__bitreader_read_raw_uint32(br, &x, m))
 				return false;
 			bits -= m;
@@ -648,7 +563,7 @@
 	/* step 2: read whole words in chunks */
 	while(nvals >= FLAC__BYTES_PER_WORD) {
 		if(br->consumed_words < br->words) {
-			const brword word = br->buffer[br->consumed_words++];
+			const uint32_t word = br->buffer[br->consumed_words++];
 #if FLAC__BYTES_PER_WORD == 4
 			val[0] = (FLAC__byte)(word >> 24);
 			val[1] = (FLAC__byte)(word >> 16);
@@ -684,7 +599,7 @@
 	return true;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *val)
+FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *val)
 #if 0 /* slow but readable version */
 {
 	unsigned bit;
@@ -713,9 +628,9 @@
 	*val = 0;
 	while(1) {
 		while(br->consumed_words < br->words) { /* if we've not consumed up to a partial tail word... */
-			brword b = br->buffer[br->consumed_words] << br->consumed_bits;
+			uint32_t b = br->buffer[br->consumed_words] << br->consumed_bits;
 			if(b) {
-				i = COUNT_ZERO_MSBS(b);
+				i = FLAC__clz_uint32(b);
 				*val += i;
 				i++;
 				br->consumed_bits += i;
@@ -741,11 +656,11 @@
 		 * us data a byte at a time (unlikely), br->consumed_bits may not
 		 * be zero.
 		 */
-		if(br->bytes) {
+		if(br->bytes*8 > br->consumed_bits) {
 			const unsigned end = br->bytes * 8;
-			brword b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << br->consumed_bits;
+			uint32_t b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << br->consumed_bits;
 			if(b) {
-				i = COUNT_ZERO_MSBS(b);
+				i = FLAC__clz_uint32(b);
 				*val += i;
 				i++;
 				br->consumed_bits += i;
@@ -754,7 +669,7 @@
 			}
 			else {
 				*val += end - br->consumed_bits;
-				br->consumed_bits += end;
+				br->consumed_bits = end;
 				FLAC__ASSERT(br->consumed_bits < FLAC__BITS_PER_WORD);
 				/* didn't find stop bit yet, have to keep going... */
 			}
@@ -793,202 +708,15 @@
 }
 
 /* this is by far the most heavily used reader call.  it ain't pretty but it's fast */
-/* a lot of the logic is copied, then adapted, from FLAC__bitreader_read_unary_unsigned() and FLAC__bitreader_read_raw_uint32() */
 FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
-/* OPT: possibly faster version for use with MSVC */
-#ifdef _MSC_VER
 {
-	unsigned i;
-	unsigned uval = 0;
-	unsigned bits; /* the # of binary LSBs left to read to finish a rice codeword */
-
 	/* try and get br->consumed_words and br->consumed_bits into register;
 	 * must remember to flush them back to *br before calling other
-	 * bitwriter functions that use them, and before returning */
-	register unsigned cwords;
-	register unsigned cbits;
-
-	FLAC__ASSERT(0 != br);
-	FLAC__ASSERT(0 != br->buffer);
-	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
-	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
-	FLAC__ASSERT(parameter < 32);
-	/* the above two asserts also guarantee that the binary part never straddles more that 2 words, so we don't have to loop to read it */
-
-	if(nvals == 0)
-		return true;
-
-	cbits = br->consumed_bits;
-	cwords = br->consumed_words;
-
-	while(1) {
-
-		/* read unary part */
-		while(1) {
-			while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				brword b = br->buffer[cwords] << cbits;
-				if(b) {
-#if 0 /* slower, probably due to bad register allocation... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32
-					__asm {
-						bsr eax, b
-						not eax
-						and eax, 31
-						mov i, eax
-					}
-#else
-					i = COUNT_ZERO_MSBS(b);
-#endif
-					uval += i;
-					bits = parameter;
-					i++;
-					cbits += i;
-					if(cbits == FLAC__BITS_PER_WORD) {
-						crc16_update_word_(br, br->buffer[cwords]);
-						cwords++;
-						cbits = 0;
-					}
-					goto break1;
-				}
-				else {
-					uval += FLAC__BITS_PER_WORD - cbits;
-					crc16_update_word_(br, br->buffer[cwords]);
-					cwords++;
-					cbits = 0;
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* at this point we've eaten up all the whole words; have to try
-			 * reading through any tail bytes before calling the read callback.
-			 * this is a repeat of the above logic adjusted for the fact we
-			 * don't have a whole word.  note though if the client is feeding
-			 * us data a byte at a time (unlikely), br->consumed_bits may not
-			 * be zero.
-			 */
-			if(br->bytes) {
-				const unsigned end = br->bytes * 8;
-				brword b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << cbits;
-				if(b) {
-					i = COUNT_ZERO_MSBS(b);
-					uval += i;
-					bits = parameter;
-					i++;
-					cbits += i;
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					goto break1;
-				}
-				else {
-					uval += end - cbits;
-					cbits += end;
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* flush registers and read; bitreader_read_from_client_() does
-			 * not touch br->consumed_bits at all but we still need to set
-			 * it in case it fails and we have to return false.
-			 */
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			if(!bitreader_read_from_client_(br))
-				return false;
-			cwords = br->consumed_words;
-		}
-break1:
-		/* read binary part */
-		FLAC__ASSERT(cwords <= br->words);
-
-		if(bits) {
-			while((br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits < bits) {
-				/* flush registers and read; bitreader_read_from_client_() does
-				 * not touch br->consumed_bits at all but we still need to set
-				 * it in case it fails and we have to return false.
-				 */
-				br->consumed_bits = cbits;
-				br->consumed_words = cwords;
-				if(!bitreader_read_from_client_(br))
-					return false;
-				cwords = br->consumed_words;
-			}
-			if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-					const unsigned n = FLAC__BITS_PER_WORD - cbits;
-					const brword word = br->buffer[cwords];
-					if(bits < n) {
-						uval <<= bits;
-						uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-bits);
-						cbits += bits;
-						goto break2;
-					}
-					uval <<= n;
-					uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
-					bits -= n;
-					crc16_update_word_(br, word);
-					cwords++;
-					cbits = 0;
-					if(bits) { /* if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
-						uval <<= bits;
-						uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits));
-						cbits = bits;
-					}
-					goto break2;
-				}
-				else {
-					FLAC__ASSERT(bits < FLAC__BITS_PER_WORD);
-					uval <<= bits;
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
-					cbits = bits;
-					goto break2;
-				}
-			}
-			else {
-				/* in this case we're starting our read at a partial tail word;
-				 * the reader has guaranteed that we have at least 'bits' bits
-				 * available to read, which makes this case simpler.
-				 */
-				uval <<= bits;
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-					FLAC__ASSERT(cbits + bits <= br->bytes*8);
-					uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-bits);
-					cbits += bits;
-					goto break2;
-				}
-				else {
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
-					cbits += bits;
-					goto break2;
-				}
-			}
-		}
-break2:
-		/* compose the value */
-		*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
-
-		/* are we done? */
-		--nvals;
-		if(nvals == 0) {
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			return true;
-		}
-
-		uval = 0;
-		++vals;
-
-	}
-}
-#else
-{
-	unsigned i;
-	unsigned uval = 0;
-
-	/* try and get br->consumed_words and br->consumed_bits into register;
-	 * must remember to flush them back to *br before calling other
-	 * bitwriter functions that use them, and before returning */
-	register unsigned cwords;
-	register unsigned cbits;
-	unsigned ucbits; /* keep track of the number of unconsumed bits in the buffer */
+	 * bitreader functions that use them, and before returning */
+	unsigned cwords, words, lsbs, msbs, x, y;
+	unsigned ucbits; /* keep track of the number of unconsumed bits in word */
+	uint32_t b;
+	int *val, *end;
 
 	FLAC__ASSERT(0 != br);
 	FLAC__ASSERT(0 != br->buffer);
@@ -997,175 +725,127 @@
 	FLAC__ASSERT(parameter < 32);
 	/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
 
-	if(nvals == 0)
-		return true;
+	val = vals;
+	end = vals + nvals;
 
-	cbits = br->consumed_bits;
-	cwords = br->consumed_words;
-	ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
-
-	while(1) {
-
-		/* read unary part */
-		while(1) {
-			while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				brword b = br->buffer[cwords] << cbits;
-				if(b) {
-#if 0 /* is not discernably faster... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32 && defined __GNUC__
-					asm volatile (
-						"bsrl %1, %0;"
-						"notl %0;"
-						"andl $31, %0;"
-						: "=r"(i)
-						: "r"(b)
-					);
-#else
-					i = COUNT_ZERO_MSBS(b);
-#endif
-					uval += i;
-					cbits += i;
-					cbits++; /* skip over stop bit */
-					if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
-						crc16_update_word_(br, br->buffer[cwords]);
-						cwords++;
-						cbits = 0;
-					}
-					goto break1;
-				}
-				else {
-					uval += FLAC__BITS_PER_WORD - cbits;
-					crc16_update_word_(br, br->buffer[cwords]);
-					cwords++;
-					cbits = 0;
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* at this point we've eaten up all the whole words; have to try
-			 * reading through any tail bytes before calling the read callback.
-			 * this is a repeat of the above logic adjusted for the fact we
-			 * don't have a whole word.  note though if the client is feeding
-			 * us data a byte at a time (unlikely), br->consumed_bits may not
-			 * be zero.
-			 */
-			if(br->bytes) {
-				const unsigned end = br->bytes * 8;
-				brword b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
-				if(b) {
-					i = COUNT_ZERO_MSBS(b);
-					uval += i;
-					cbits += i;
-					cbits++; /* skip over stop bit */
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					goto break1;
-				}
-				else {
-					uval += end - cbits;
-					cbits += end;
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* flush registers and read; bitreader_read_from_client_() does
-			 * not touch br->consumed_bits at all but we still need to set
-			 * it in case it fails and we have to return false.
-			 */
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			if(!bitreader_read_from_client_(br))
+	if(parameter == 0) {
+		while(val < end) {
+			/* read the unary MSBs and end bit */
+			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
 				return false;
-			cwords = br->consumed_words;
-			ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval;
-			/* + uval to offset our count by the # of unary bits already
-			 * consumed before the read, because we will add these back
-			 * in all at once at break1
-			 */
-		}
-break1:
-		ucbits -= uval;
-		ucbits--; /* account for stop bit */
 
-		/* read binary part */
-		FLAC__ASSERT(cwords <= br->words);
-
-		if(parameter) {
-			while(ucbits < parameter) {
-				/* flush registers and read; bitreader_read_from_client_() does
-				 * not touch br->consumed_bits at all but we still need to set
-				 * it in case it fails and we have to return false.
-				 */
-				br->consumed_bits = cbits;
-				br->consumed_words = cwords;
-				if(!bitreader_read_from_client_(br))
-					return false;
-				cwords = br->consumed_words;
-				ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
-			}
-			if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just slower than necessary for that case */
-					const unsigned n = FLAC__BITS_PER_WORD - cbits;
-					const brword word = br->buffer[cwords];
-					if(parameter < n) {
-						uval <<= parameter;
-						uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
-						cbits += parameter;
-					}
-					else {
-						uval <<= n;
-						uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
-						crc16_update_word_(br, word);
-						cwords++;
-						cbits = parameter - n;
-						if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
-							uval <<= cbits;
-							uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
-						}
-					}
-				}
-				else {
-					cbits = parameter;
-					uval <<= parameter;
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
-				}
-			}
-			else {
-				/* in this case we're starting our read at a partial tail word;
-				 * the reader has guaranteed that we have at least 'parameter'
-				 * bits available to read, which makes this case simpler.
-				 */
-				uval <<= parameter;
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-					FLAC__ASSERT(cbits + parameter <= br->bytes*8);
-					uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
-					cbits += parameter;
-				}
-				else {
-					cbits = parameter;
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
-				}
-			}
+			*val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
 		}
 
-		ucbits -= parameter;
+		return true;
+	}
+
+	FLAC__ASSERT(parameter > 0);
+
+	cwords = br->consumed_words;
+	words = br->words;
+
+	/* if we've not consumed up to a partial tail word... */
+	if(cwords >= words) {
+		x = 0;
+		goto process_tail;
+	}
+
+	ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+	b = br->buffer[cwords] << br->consumed_bits;  /* keep unconsumed bits aligned to left */
+
+	while(val < end) {
+		/* read the unary MSBs and end bit */
+		x = y = FLAC__clz2_uint32(b);
+		if(x == FLAC__BITS_PER_WORD) {
+			x = ucbits;
+			do {
+				/* didn't find stop bit yet, have to keep going... */
+				crc16_update_word_(br, br->buffer[cwords++]);
+				if (cwords >= words)
+					goto incomplete_msbs;
+				b = br->buffer[cwords];
+				y = FLAC__clz2_uint32(b);
+				x += y;
+			} while(y == FLAC__BITS_PER_WORD);
+		}
+		b <<= y;
+		b <<= 1; /* account for stop bit */
+		ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
+		msbs = x;
+
+		/* read the binary LSBs */
+		x = b >> (FLAC__BITS_PER_WORD - parameter);
+		if(parameter <= ucbits) {
+			ucbits -= parameter;
+			b <<= parameter;
+		} else {
+			/* there are still bits left to read, they will all be in the next word */
+			crc16_update_word_(br, br->buffer[cwords++]);
+			if (cwords >= words)
+				goto incomplete_lsbs;
+			b = br->buffer[cwords];
+			ucbits += FLAC__BITS_PER_WORD - parameter;
+			x |= b >> ucbits;
+			b <<= FLAC__BITS_PER_WORD - ucbits;
+		}
+		lsbs = x;
 
 		/* compose the value */
-		*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+		x = (msbs << parameter) | lsbs;
+		*val++ = (int)(x >> 1) ^ -(int)(x & 1);
 
-		/* are we done? */
-		--nvals;
-		if(nvals == 0) {
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			return true;
-		}
+		continue;
 
-		uval = 0;
-		++vals;
+		/* at this point we've eaten up all the whole words */
+process_tail:
+		do {
+			if(0) {
+incomplete_msbs:
+				br->consumed_bits = 0;
+				br->consumed_words = cwords;
+			}
 
+			/* read the unary MSBs and end bit */
+			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
+				return false;
+			msbs += x;
+			x = ucbits = 0;
+
+			if(0) {
+incomplete_lsbs:
+				br->consumed_bits = 0;
+				br->consumed_words = cwords;
+			}
+
+			/* read the binary LSBs */
+			if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
+				return false;
+			lsbs = x | lsbs;
+
+			/* compose the value */
+			x = (msbs << parameter) | lsbs;
+			*val++ = (int)(x >> 1) ^ -(int)(x & 1);
+			x = 0;
+
+			cwords = br->consumed_words;
+			words = br->words;
+			ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+			b = br->buffer[cwords] << br->consumed_bits;
+		} while(cwords >= words && val < end);
 	}
+
+	if(ucbits == 0 && cwords < words) {
+		/* don't leave the head word with no unconsumed bits */
+		crc16_update_word_(br, br->buffer[cwords++]);
+		ucbits = FLAC__BITS_PER_WORD;
+	}
+
+	br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
+	br->consumed_words = cwords;
+
+	return true;
 }
-#endif
 
 #if 0 /* UNUSED */
 FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter)
@@ -1364,3 +1044,16 @@
 	*val = v;
 	return true;
 }
+
+/* These functions are declared inline in this file but are also callable as
+ * externs from elsewhere.
+ * According to the C99 spec, section 6.7.4, simply providing a function
+ * prototype in a header file without 'inline' and making the function inline
+ * in this file should be sufficient.
+ * Unfortunately, the Microsoft VS compiler doesn't pick them up externally. To
+ * fix that we add extern declarations here.
+ */
+extern FLAC__bool FLAC__bitreader_is_consumed_byte_aligned(const FLAC__BitReader *br);
+extern unsigned FLAC__bitreader_bits_left_for_byte_alignment(const FLAC__BitReader *br);
+extern unsigned FLAC__bitreader_get_input_bits_unconsumed(const FLAC__BitReader *br);
+extern FLAC__bool FLAC__bitreader_read_uint32_little_endian(FLAC__BitReader *br, FLAC__uint32 *val);
diff --git a/src/libFLAC/bitwriter.c b/src/libFLAC/bitwriter.c
index e3e7ad1..76be1bc 100644
--- a/src/libFLAC/bitwriter.c
+++ b/src/libFLAC/bitwriter.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,45 +30,31 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
-#include <stdlib.h> /* for malloc() */
-#include <string.h> /* for memcpy(), memset() */
-#ifdef _MSC_VER
-#include <winsock.h> /* for ntohl() */
-#elif defined FLAC__SYS_DARWIN
-#include <machine/endian.h> /* for ntohl() */
-#elif defined __MINGW32__
-#include <winsock.h> /* for ntohl() */
-#else
-#include <netinet/in.h> /* for ntohl() */
-#endif
-#if 0 /* UNUSED */
-#include "private/bitmath.h"
-#endif
+#include <stdlib.h>
+#include <string.h>
 #include "private/bitwriter.h"
 #include "private/crc.h"
+#include "private/macros.h"
 #include "FLAC/assert.h"
 #include "share/alloc.h"
+#include "share/compat.h"
+#include "share/endswap.h"
 
 /* Things should be fastest when this matches the machine word size */
 /* WATCHOUT: if you change this you must also change the following #defines down to SWAP_BE_WORD_TO_HOST below to match */
-/* WATCHOUT: there are a few places where the code will not work unless bwword is >= 32 bits wide */
-typedef FLAC__uint32 bwword;
+/* WATCHOUT: there are a few places where the code will not work unless uint32_t is >= 32 bits wide */
 #define FLAC__BYTES_PER_WORD 4
 #define FLAC__BITS_PER_WORD 32
 #define FLAC__WORD_ALL_ONES ((FLAC__uint32)0xffffffff)
-/* SWAP_BE_WORD_TO_HOST swaps bytes in a bwword (which is always big-endian) if necessary to match host byte order */
+/* SWAP_BE_WORD_TO_HOST swaps bytes in a uint32_t (which is always big-endian) if necessary to match host byte order */
 #if WORDS_BIGENDIAN
 #define SWAP_BE_WORD_TO_HOST(x) (x)
 #else
-#ifdef _MSC_VER
-#define SWAP_BE_WORD_TO_HOST(x) local_swap32_(x)
-#else
-#define SWAP_BE_WORD_TO_HOST(x) ntohl(x)
-#endif
+#define SWAP_BE_WORD_TO_HOST(x) ENDSWAP_32(x)
 #endif
 
 /*
@@ -76,51 +63,29 @@
  * a frame or metadata block, then write that out and clear the buffer for the
  * next one.
  */
-static const unsigned FLAC__BITWRITER_DEFAULT_CAPACITY = 32768u / sizeof(bwword); /* size in words */
+static const unsigned FLAC__BITWRITER_DEFAULT_CAPACITY = 32768u / sizeof(uint32_t); /* size in words */
 /* When growing, increment 4K at a time */
-static const unsigned FLAC__BITWRITER_DEFAULT_INCREMENT = 4096u / sizeof(bwword); /* size in words */
+static const unsigned FLAC__BITWRITER_DEFAULT_INCREMENT = 4096u / sizeof(uint32_t); /* size in words */
 
 #define FLAC__WORDS_TO_BITS(words) ((words) * FLAC__BITS_PER_WORD)
 #define FLAC__TOTAL_BITS(bw) (FLAC__WORDS_TO_BITS((bw)->words) + (bw)->bits)
 
-#ifdef min
-#undef min
-#endif
-#define min(x,y) ((x)<(y)?(x):(y))
-
-/* adjust for compilers that can't understand using LLU suffix for uint64_t literals */
-#ifdef _MSC_VER
-#define FLAC__U64L(x) x
-#else
-#define FLAC__U64L(x) x##LLU
-#endif
-
-#ifndef FLaC__INLINE
-#define FLaC__INLINE
-#endif
-
 struct FLAC__BitWriter {
-	bwword *buffer;
-	bwword accum; /* accumulator; bits are right-justified; when full, accum is appended to buffer */
+	uint32_t *buffer;
+	uint32_t accum; /* accumulator; bits are right-justified; when full, accum is appended to buffer */
 	unsigned capacity; /* capacity of buffer in words */
 	unsigned words; /* # of complete words in buffer */
 	unsigned bits; /* # of used bits in accum */
 };
 
-#ifdef _MSC_VER
-/* OPT: an MSVC built-in would be better */
-static _inline FLAC__uint32 local_swap32_(FLAC__uint32 x)
-{
-	x = ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
-	return (x>>16) | (x<<16);
-}
-#endif
-
 /* * WATCHOUT: The current implementation only grows the buffer. */
-static FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, unsigned bits_to_add)
+#ifndef __SUNPRO_C
+static
+#endif
+FLAC__bool bitwriter_grow_(FLAC__BitWriter *bw, unsigned bits_to_add)
 {
 	unsigned new_capacity;
-	bwword *new_buffer;
+	uint32_t *new_buffer;
 
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
@@ -142,7 +107,7 @@
 	FLAC__ASSERT(new_capacity > bw->capacity);
 	FLAC__ASSERT(new_capacity >= bw->words + ((bw->bits + bits_to_add + FLAC__BITS_PER_WORD - 1) / FLAC__BITS_PER_WORD));
 
-	new_buffer = (bwword*)safe_realloc_mul_2op_(bw->buffer, sizeof(bwword), /*times*/new_capacity);
+	new_buffer = safe_realloc_mul_2op_(bw->buffer, sizeof(uint32_t), /*times*/new_capacity);
 	if(new_buffer == 0)
 		return false;
 	bw->buffer = new_buffer;
@@ -159,7 +124,7 @@
 
 FLAC__BitWriter *FLAC__bitwriter_new(void)
 {
-	FLAC__BitWriter *bw = (FLAC__BitWriter*)calloc(1, sizeof(FLAC__BitWriter));
+	FLAC__BitWriter *bw = calloc(1, sizeof(FLAC__BitWriter));
 	/* note that calloc() sets all members to 0 for us */
 	return bw;
 }
@@ -184,7 +149,7 @@
 
 	bw->words = bw->bits = 0;
 	bw->capacity = FLAC__BITWRITER_DEFAULT_CAPACITY;
-	bw->buffer = (bwword*)malloc(sizeof(bwword) * bw->capacity);
+	bw->buffer = malloc(sizeof(uint32_t) * bw->capacity);
 	if(bw->buffer == 0)
 		return false;
 
@@ -299,7 +264,7 @@
 	(void)bw;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitwriter_write_zeroes(FLAC__BitWriter *bw, unsigned bits)
+inline FLAC__bool FLAC__bitwriter_write_zeroes(FLAC__BitWriter *bw, unsigned bits)
 {
 	unsigned n;
 
@@ -313,7 +278,7 @@
 		return false;
 	/* first part gets to word alignment */
 	if(bw->bits) {
-		n = min(FLAC__BITS_PER_WORD - bw->bits, bits);
+		n = flac_min(FLAC__BITS_PER_WORD - bw->bits, bits);
 		bw->accum <<= n;
 		bits -= n;
 		bw->bits += n;
@@ -337,7 +302,7 @@
 	return true;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitwriter_write_raw_uint32(FLAC__BitWriter *bw, FLAC__uint32 val, unsigned bits)
+inline FLAC__bool FLAC__bitwriter_write_raw_uint32(FLAC__BitWriter *bw, FLAC__uint32 val, unsigned bits)
 {
 	register unsigned left;
 
@@ -376,7 +341,7 @@
 	return true;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitwriter_write_raw_int32(FLAC__BitWriter *bw, FLAC__int32 val, unsigned bits)
+inline FLAC__bool FLAC__bitwriter_write_raw_int32(FLAC__BitWriter *bw, FLAC__int32 val, unsigned bits)
 {
 	/* zero-out unused bits */
 	if(bits < 32)
@@ -385,7 +350,7 @@
 	return FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)val, bits);
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitwriter_write_raw_uint64(FLAC__BitWriter *bw, FLAC__uint64 val, unsigned bits)
+inline FLAC__bool FLAC__bitwriter_write_raw_uint64(FLAC__BitWriter *bw, FLAC__uint64 val, unsigned bits)
 {
 	/* this could be a little faster but it's not used for much */
 	if(bits > 32) {
@@ -397,7 +362,7 @@
 		return FLAC__bitwriter_write_raw_uint32(bw, (FLAC__uint32)val, bits);
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitwriter_write_raw_uint32_little_endian(FLAC__BitWriter *bw, FLAC__uint32 val)
+inline FLAC__bool FLAC__bitwriter_write_raw_uint32_little_endian(FLAC__BitWriter *bw, FLAC__uint32 val)
 {
 	/* this doesn't need to be that fast as currently it is only used for vorbis comments */
 
@@ -413,7 +378,7 @@
 	return true;
 }
 
-FLaC__INLINE FLAC__bool FLAC__bitwriter_write_byte_block(FLAC__BitWriter *bw, const FLAC__byte vals[], unsigned nvals)
+inline FLAC__bool FLAC__bitwriter_write_byte_block(FLAC__BitWriter *bw, const FLAC__byte vals[], unsigned nvals)
 {
 	unsigned i;
 
@@ -549,7 +514,7 @@
 
 	FLAC__ASSERT(0 != bw);
 	FLAC__ASSERT(0 != bw->buffer);
-	FLAC__ASSERT(parameter < 8*sizeof(bwword)-1);
+	FLAC__ASSERT(parameter < 8*sizeof(uint32_t)-1);
 	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
 	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
 
@@ -559,30 +524,8 @@
 
 		msbits = uval >> parameter;
 
-#if 0 /* OPT: can remove this special case if it doesn't make up for the extra compare (doesn't make a statistically significant difference with msvc or gcc/x86) */
-		if(bw->bits && bw->bits + msbits + lsbits <= FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current bwword */
-			/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free bwword to work in */
-			bw->bits = bw->bits + msbits + lsbits;
-			uval |= mask1; /* set stop bit */
-			uval &= mask2; /* mask off unused top bits */
-			/* NOT: bw->accum <<= msbits + lsbits because msbits+lsbits could be 32, then the shift would be a NOP */
-			bw->accum <<= msbits;
-			bw->accum <<= lsbits;
-			bw->accum |= uval;
-			if(bw->bits == FLAC__BITS_PER_WORD) {
-				bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum);
-				bw->bits = 0;
-				/* burying the capacity check down here means we have to grow the buffer a little if there are more vals to do */
-				if(bw->capacity <= bw->words && nvals > 1 && !bitwriter_grow_(bw, 1)) {
-					FLAC__ASSERT(bw->capacity == bw->words);
-					return false;
-				}
-			}
-		}
-		else {
-#elif 1 /*@@@@@@ OPT: try this version with MSVC6 to see if better, not much difference for gcc-4 */
-		if(bw->bits && bw->bits + msbits + lsbits < FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current bwword */
-			/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free bwword to work in */
+		if(bw->bits && bw->bits + msbits + lsbits < FLAC__BITS_PER_WORD) { /* i.e. if the whole thing fits in the current uint32_t */
+			/* ^^^ if bw->bits is 0 then we may have filled the buffer and have no free uint32_t to work in */
 			bw->bits = bw->bits + msbits + lsbits;
 			uval |= mask1; /* set stop bit */
 			uval &= mask2; /* mask off unused top bits */
@@ -590,10 +533,9 @@
 			bw->accum |= uval;
 		}
 		else {
-#endif
 			/* slightly pessimistic size check but faster than "<= bw->words + (bw->bits+msbits+lsbits+FLAC__BITS_PER_WORD-1)/FLAC__BITS_PER_WORD" */
 			/* OPT: pessimism may cause flurry of false calls to grow_ which eat up all savings before it */
-			if(bw->capacity <= bw->words + bw->bits + msbits + 1/*lsbits always fit in 1 bwword*/ && !bitwriter_grow_(bw, msbits+lsbits))
+			if(bw->capacity <= bw->words + bw->bits + msbits + 1/*lsbits always fit in 1 uint32_t*/ && !bitwriter_grow_(bw, msbits+lsbits))
 				return false;
 
 			if(msbits) {
@@ -645,9 +587,7 @@
 				bw->buffer[bw->words++] = SWAP_BE_WORD_TO_HOST(bw->accum);
 				bw->accum = uval;
 			}
-#if 1
 		}
-#endif
 		vals++;
 		nvals--;
 	}
@@ -887,3 +827,17 @@
 	else
 		return true;
 }
+
+/* These functions are declared inline in this file but are also callable as
+ * externs from elsewhere.
+ * According to the C99 spec, section 6.7.4, simply providing a function
+ * prototype in a header file without 'inline' and making the function inline
+ * in this file should be sufficient.
+ * Unfortunately, the Microsoft VS compiler doesn't pick them up externally. To
+ * fix that we add extern declarations here.
+ */
+extern FLAC__bool FLAC__bitwriter_write_zeroes(FLAC__BitWriter *bw, unsigned bits);
+extern FLAC__bool FLAC__bitwriter_write_raw_int32(FLAC__BitWriter *bw, FLAC__int32 val, unsigned bits);
+extern FLAC__bool FLAC__bitwriter_write_raw_uint64(FLAC__BitWriter *bw, FLAC__uint64 val, unsigned bits);
+extern FLAC__bool FLAC__bitwriter_write_raw_uint32_little_endian(FLAC__BitWriter *bw, FLAC__uint32 val);
+extern FLAC__bool FLAC__bitwriter_write_byte_block(FLAC__BitWriter *bw, const FLAC__byte vals[], unsigned nvals);
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index 60b73bf..bb09506 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,47 +30,46 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
 #include "private/cpu.h"
 #include <stdlib.h>
-#include <stdio.h>
+#include <memory.h>
+#ifdef DEBUG
+# include <stdio.h>
+#endif
 
 #if defined FLAC__CPU_IA32
 # include <signal.h>
-#elif defined FLAC__CPU_PPC
-# if !defined FLAC__NO_ASM
-#  if defined FLAC__SYS_DARWIN
-#   include <sys/sysctl.h>
-#   include <mach/mach.h>
-#   include <mach/mach_host.h>
-#   include <mach/host_info.h>
-#   include <mach/machine.h>
-#   ifndef CPU_SUBTYPE_POWERPC_970
-#    define CPU_SUBTYPE_POWERPC_970 ((cpu_subtype_t) 100)
-#   endif
-#  else /* FLAC__SYS_DARWIN */
 
-#   include <signal.h>
-#   include <setjmp.h>
-
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler (int sig)
+static void disable_sse(FLAC__CPUInfo *info)
 {
-	if (!canjump) {
-		signal (sig, SIG_DFL);
-		raise (sig);
-	}
-	canjump = 0;
-	siglongjmp (jmpbuf, 1);
+	info->ia32.sse   = false;
+	info->ia32.sse2  = false;
+	info->ia32.sse3  = false;
+	info->ia32.ssse3 = false;
+	info->ia32.sse41 = false;
+	info->ia32.sse42 = false;
 }
-#  endif /* FLAC__SYS_DARWIN */
-# endif /* FLAC__NO_ASM */
-#endif /* FLAC__CPU_PPC */
+
+static void disable_avx(FLAC__CPUInfo *info)
+{
+	info->ia32.avx     = false;
+	info->ia32.avx2    = false;
+	info->ia32.fma     = false;
+}
+
+#elif defined FLAC__CPU_X86_64
+
+static void disable_avx(FLAC__CPUInfo *info)
+{
+	info->x86.avx     = false;
+	info->x86.avx2    = false;
+	info->x86.fma     = false;
+}
+#endif
 
 #if defined (__NetBSD__) || defined(__OpenBSD__)
 #include <sys/param.h>
@@ -86,25 +86,34 @@
 /* how to get sysctlbyname()? */
 #endif
 
+#ifdef FLAC__CPU_IA32
 /* these are flags in EDX of CPUID AX=00000001 */
 static const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV = 0x00008000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_MMX = 0x00800000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE = 0x02000000;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2 = 0x04000000;
+#endif
+
 /* these are flags in ECX of CPUID AX=00000001 */
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE3 = 0x00000001;
 static const unsigned FLAC__CPUINFO_IA32_CPUID_SSSE3 = 0x00000200;
-/* these are flags in EDX of CPUID AX=80000001 */
-static const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW = 0x80000000;
-static const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW = 0x40000000;
-static const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX = 0x00400000;
+static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE41 = 0x00080000;
+static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE42 = 0x00100000;
 
+#if defined FLAC__AVX_SUPPORTED
+/* these are flags in ECX of CPUID AX=00000001 */
+static const unsigned FLAC__CPUINFO_IA32_CPUID_OSXSAVE = 0x08000000;
+static const unsigned FLAC__CPUINFO_IA32_CPUID_AVX = 0x10000000;
+static const unsigned FLAC__CPUINFO_IA32_CPUID_FMA = 0x00001000;
+/* these are flags in EBX of CPUID AX=00000007 */
+static const unsigned FLAC__CPUINFO_IA32_CPUID_AVX2 = 0x00000020;
+#endif
 
 /*
  * Extra stuff needed for detection of OS support for SSE on IA-32
  */
-#if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && defined FLAC__HAS_NASM && !defined FLAC__NO_SSE_OS && !defined FLAC__SSE_OS
+#if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && (defined FLAC__HAS_NASM || defined FLAC__HAS_X86INTRIN) && !defined FLAC__NO_SSE_OS && !defined FLAC__SSE_OS
 # if defined(__linux__)
 /*
  * If the OS doesn't support SSE, we will get here with a SIGILL.  We
@@ -119,35 +128,14 @@
  *   6 bytes extra in case our estimate is wrong
  * 12 bytes puts us in the NOP "landing zone"
  */
-#  undef USE_OBSOLETE_SIGCONTEXT_FLAVOR /* #define this to use the older signal handler method */
-#  ifdef USE_OBSOLETE_SIGCONTEXT_FLAVOR
-	static void sigill_handler_sse_os(int signal, struct sigcontext sc)
-	{
-		(void)signal;
-		sc.eip += 3 + 3 + 6;
-	}
-#  else
 #   include <sys/ucontext.h>
 	static void sigill_handler_sse_os(int signal, siginfo_t *si, void *uc)
 	{
 		(void)signal, (void)si;
 		((ucontext_t*)uc)->uc_mcontext.gregs[14/*REG_EIP*/] += 3 + 3 + 6;
 	}
-#  endif
 # elif defined(_MSC_VER)
 #  include <windows.h>
-#  undef USE_TRY_CATCH_FLAVOR /* #define this to use the try/catch method for catching illegal opcode exception */
-#  ifdef USE_TRY_CATCH_FLAVOR
-#  else
-	LONG CALLBACK sigill_handler_sse_os(EXCEPTION_POINTERS *ep)
-	{
-		if(ep->ExceptionRecord->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION) {
-			ep->ContextRecord->Eip += 3 + 3 + 6;
-			return EXCEPTION_CONTINUE_EXECUTION;
-		}
-		return EXCEPTION_CONTINUE_SEARCH;
-	}
-#  endif
 # endif
 #endif
 
@@ -158,261 +146,342 @@
  * IA32-specific
  */
 #ifdef FLAC__CPU_IA32
+	FLAC__bool ia32_fxsr = false;
+	FLAC__bool ia32_osxsave = false;
+	(void) ia32_fxsr; (void) ia32_osxsave; /* to avoid warnings about unused variables */
+	memset(info, 0, sizeof(*info));
 	info->type = FLAC__CPUINFO_TYPE_IA32;
-#if !defined FLAC__NO_ASM && defined FLAC__HAS_NASM
+#if !defined FLAC__NO_ASM && (defined FLAC__HAS_NASM || defined FLAC__HAS_X86INTRIN)
 	info->use_asm = true; /* we assume a minimum of 80386 with FLAC__CPU_IA32 */
-	info->data.ia32.cpuid = FLAC__cpu_have_cpuid_asm_ia32()? true : false;
-	info->data.ia32.bswap = info->data.ia32.cpuid; /* CPUID => BSWAP since it came after */
-	info->data.ia32.cmov = false;
-	info->data.ia32.mmx = false;
-	info->data.ia32.fxsr = false;
-	info->data.ia32.sse = false;
-	info->data.ia32.sse2 = false;
-	info->data.ia32.sse3 = false;
-	info->data.ia32.ssse3 = false;
-	info->data.ia32._3dnow = false;
-	info->data.ia32.ext3dnow = false;
-	info->data.ia32.extmmx = false;
-	if(info->data.ia32.cpuid) {
-		/* http://www.sandpile.org/ia32/cpuid.htm */
-		FLAC__uint32 flags_edx, flags_ecx;
-		FLAC__cpu_info_asm_ia32(&flags_edx, &flags_ecx);
-		info->data.ia32.cmov  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_CMOV )? true : false;
-		info->data.ia32.mmx   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_MMX  )? true : false;
-		info->data.ia32.fxsr  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_FXSR )? true : false;
-		info->data.ia32.sse   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE  )? true : false;
-		info->data.ia32.sse2  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE2 )? true : false;
-		info->data.ia32.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false;
-		info->data.ia32.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false;
-
-#ifdef FLAC__USE_3DNOW
-		flags_edx = FLAC__cpu_info_extended_amd_asm_ia32();
-		info->data.ia32._3dnow   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW   )? true : false;
-		info->data.ia32.ext3dnow = (flags_edx & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW)? true : false;
-		info->data.ia32.extmmx   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX  )? true : false;
+#ifdef FLAC__HAS_X86INTRIN
+	if(!FLAC__cpu_have_cpuid_x86())
+		return;
 #else
-		info->data.ia32._3dnow = info->data.ia32.ext3dnow = info->data.ia32.extmmx = false;
+	if(!FLAC__cpu_have_cpuid_asm_ia32())
+		return;
 #endif
+	{
+		/* http://www.sandpile.org/x86/cpuid.htm */
+#ifdef FLAC__HAS_X86INTRIN
+		FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
+		FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+#else
+		FLAC__uint32 flags_ecx, flags_edx;
+		FLAC__cpu_info_asm_ia32(&flags_edx, &flags_ecx);
+#endif
+		info->ia32.cmov  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_CMOV )? true : false;
+		info->ia32.mmx   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_MMX  )? true : false;
+		      ia32_fxsr  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_FXSR )? true : false;
+		info->ia32.sse   = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE  )? true : false;
+		info->ia32.sse2  = (flags_edx & FLAC__CPUINFO_IA32_CPUID_SSE2 )? true : false;
+		info->ia32.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false;
+		info->ia32.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false;
+		info->ia32.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41)? true : false;
+		info->ia32.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42)? true : false;
+#if defined FLAC__HAS_X86INTRIN && defined FLAC__AVX_SUPPORTED
+		    ia32_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE)? true : false;
+		info->ia32.avx   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX    )? true : false;
+		info->ia32.fma   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA    )? true : false;
+		FLAC__cpu_info_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+		info->ia32.avx2  = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2   )? true : false;
+#endif
+	}
 
 #ifdef DEBUG
-		fprintf(stderr, "CPU info (IA-32):\n");
-		fprintf(stderr, "  CPUID ...... %c\n", info->data.ia32.cpuid   ? 'Y' : 'n');
-		fprintf(stderr, "  BSWAP ...... %c\n", info->data.ia32.bswap   ? 'Y' : 'n');
-		fprintf(stderr, "  CMOV ....... %c\n", info->data.ia32.cmov    ? 'Y' : 'n');
-		fprintf(stderr, "  MMX ........ %c\n", info->data.ia32.mmx     ? 'Y' : 'n');
-		fprintf(stderr, "  FXSR ....... %c\n", info->data.ia32.fxsr    ? 'Y' : 'n');
-		fprintf(stderr, "  SSE ........ %c\n", info->data.ia32.sse     ? 'Y' : 'n');
-		fprintf(stderr, "  SSE2 ....... %c\n", info->data.ia32.sse2    ? 'Y' : 'n');
-		fprintf(stderr, "  SSE3 ....... %c\n", info->data.ia32.sse3    ? 'Y' : 'n');
-		fprintf(stderr, "  SSSE3 ...... %c\n", info->data.ia32.ssse3   ? 'Y' : 'n');
-		fprintf(stderr, "  3DNow! ..... %c\n", info->data.ia32._3dnow  ? 'Y' : 'n');
-		fprintf(stderr, "  3DNow!-ext . %c\n", info->data.ia32.ext3dnow? 'Y' : 'n');
-		fprintf(stderr, "  3DNow!-MMX . %c\n", info->data.ia32.extmmx  ? 'Y' : 'n');
+	fprintf(stderr, "CPU info (IA-32):\n");
+	fprintf(stderr, "  CMOV ....... %c\n", info->ia32.cmov    ? 'Y' : 'n');
+	fprintf(stderr, "  MMX ........ %c\n", info->ia32.mmx     ? 'Y' : 'n');
+	fprintf(stderr, "  SSE ........ %c\n", info->ia32.sse     ? 'Y' : 'n');
+	fprintf(stderr, "  SSE2 ....... %c\n", info->ia32.sse2    ? 'Y' : 'n');
+	fprintf(stderr, "  SSE3 ....... %c\n", info->ia32.sse3    ? 'Y' : 'n');
+	fprintf(stderr, "  SSSE3 ...... %c\n", info->ia32.ssse3   ? 'Y' : 'n');
+	fprintf(stderr, "  SSE41 ...... %c\n", info->ia32.sse41   ? 'Y' : 'n');
+	fprintf(stderr, "  SSE42 ...... %c\n", info->ia32.sse42   ? 'Y' : 'n');
+# if defined FLAC__HAS_X86INTRIN && defined FLAC__AVX_SUPPORTED
+	fprintf(stderr, "  AVX ........ %c\n", info->ia32.avx     ? 'Y' : 'n');
+	fprintf(stderr, "  FMA ........ %c\n", info->ia32.fma     ? 'Y' : 'n');
+	fprintf(stderr, "  AVX2 ....... %c\n", info->ia32.avx2    ? 'Y' : 'n');
+# endif
 #endif
 
-		/*
-		 * now have to check for OS support of SSE/SSE2
-		 */
-		if(info->data.ia32.fxsr || info->data.ia32.sse || info->data.ia32.sse2) {
+	/*
+	 * now have to check for OS support of SSE instructions
+	 */
+	if(info->ia32.sse) {
 #if defined FLAC__NO_SSE_OS
-			/* assume user knows better than us; turn it off */
-			info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
+		/* assume user knows better than us; turn it off */
+		disable_sse(info);
 #elif defined FLAC__SSE_OS
-			/* assume user knows better than us; leave as detected above */
+		/* assume user knows better than us; leave as detected above */
 #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__) || defined(__APPLE__)
-			int sse = 0;
-			size_t len;
-			/* at least one of these must work: */
-			len = sizeof(sse); sse = sse || (sysctlbyname("hw.instruction_sse", &sse, &len, NULL, 0) == 0 && sse);
-			len = sizeof(sse); sse = sse || (sysctlbyname("hw.optional.sse"   , &sse, &len, NULL, 0) == 0 && sse); /* __APPLE__ ? */
-			if(!sse)
-				info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
+		int sse = 0;
+		size_t len;
+		/* at least one of these must work: */
+		len = sizeof(sse); sse = sse || (sysctlbyname("hw.instruction_sse", &sse, &len, NULL, 0) == 0 && sse);
+		len = sizeof(sse); sse = sse || (sysctlbyname("hw.optional.sse"   , &sse, &len, NULL, 0) == 0 && sse); /* __APPLE__ ? */
+		if(!sse)
+			disable_sse(info);
 #elif defined(__NetBSD__) || defined (__OpenBSD__)
 # if __NetBSD_Version__ >= 105250000 || (defined __OpenBSD__)
-			int val = 0, mib[2] = { CTL_MACHDEP, CPU_SSE };
-			size_t len = sizeof(val);
-			if(sysctl(mib, 2, &val, &len, NULL, 0) < 0 || !val)
-				info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
-			else { /* double-check SSE2 */
-				mib[1] = CPU_SSE2;
-				len = sizeof(val);
-				if(sysctl(mib, 2, &val, &len, NULL, 0) < 0 || !val)
-					info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
+		int val = 0, mib[2] = { CTL_MACHDEP, CPU_SSE };
+		size_t len = sizeof(val);
+		if(sysctl(mib, 2, &val, &len, NULL, 0) < 0 || !val)
+			disable_sse(info);
+		else { /* double-check SSE2 */
+			mib[1] = CPU_SSE2;
+			len = sizeof(val);
+			if(sysctl(mib, 2, &val, &len, NULL, 0) < 0 || !val) {
+				disable_sse(info);
+				info->ia32.sse = true;
 			}
+		}
 # else
-			info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
+		disable_sse(info);
 # endif
 #elif defined(__linux__)
-			int sse = 0;
-			struct sigaction sigill_save;
-#ifdef USE_OBSOLETE_SIGCONTEXT_FLAVOR
-			if(0 == sigaction(SIGILL, NULL, &sigill_save) && signal(SIGILL, (void (*)(int))sigill_handler_sse_os) != SIG_ERR)
-#else
-			struct sigaction sigill_sse;
-			sigill_sse.sa_sigaction = sigill_handler_sse_os;
-			__sigemptyset(&sigill_sse.sa_mask);
-			sigill_sse.sa_flags = SA_SIGINFO | SA_RESETHAND; /* SA_RESETHAND just in case our SIGILL return jump breaks, so we don't get stuck in a loop */
-			if(0 == sigaction(SIGILL, &sigill_sse, &sigill_save))
-#endif
-			{
-				/* http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html */
-				/* see sigill_handler_sse_os() for an explanation of the following: */
-				asm volatile (
-					"xorl %0,%0\n\t"          /* for some reason, still need to do this to clear 'sse' var */
-					"xorps %%xmm0,%%xmm0\n\t" /* will cause SIGILL if unsupported by OS */
-					"incl %0\n\t"             /* SIGILL handler will jump over this */
-					/* landing zone */
-					"nop\n\t" /* SIGILL jump lands here if "inc" is 9 bytes */
-					"nop\n\t"
-					"nop\n\t"
-					"nop\n\t"
-					"nop\n\t"
-					"nop\n\t"
-					"nop\n\t" /* SIGILL jump lands here if "inc" is 3 bytes (expected) */
-					"nop\n\t"
-					"nop"     /* SIGILL jump lands here if "inc" is 1 byte */
-					: "=r"(sse)
-					: "r"(sse)
-				);
-
-				sigaction(SIGILL, &sigill_save, NULL);
-			}
-
-			if(!sse)
-				info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
-#elif defined(_MSC_VER)
-# ifdef USE_TRY_CATCH_FLAVOR
-			_try {
-				__asm {
-#  if _MSC_VER <= 1200
-					/* VC6 assembler doesn't know SSE, have to emit bytecode instead */
-					_emit 0x0F
-					_emit 0x57
-					_emit 0xC0
-#  else
-					xorps xmm0,xmm0
-#  endif
-				}
-			}
-			_except(EXCEPTION_EXECUTE_HANDLER) {
-				if (_exception_code() == STATUS_ILLEGAL_INSTRUCTION)
-					info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
-			}
-# else
-			int sse = 0;
-			LPTOP_LEVEL_EXCEPTION_FILTER save = SetUnhandledExceptionFilter(sigill_handler_sse_os);
-			/* see GCC version above for explanation */
-			/*  http://msdn2.microsoft.com/en-us/library/4ks26t93.aspx */
-			/*  http://www.codeproject.com/cpp/gccasm.asp */
-			/*  http://www.hick.org/~mmiller/msvc_inline_asm.html */
-			__asm {
-#  if _MSC_VER <= 1200
-				/* VC6 assembler doesn't know SSE, have to emit bytecode instead */
-				_emit 0x0F
-				_emit 0x57
-				_emit 0xC0
-#  else
-				xorps xmm0,xmm0
-#  endif
-				inc sse
-				nop
-				nop
-				nop
-				nop
-				nop
-				nop
-				nop
-				nop
-				nop
-			}
-			SetUnhandledExceptionFilter(save);
-			if(!sse)
-				info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
-# endif
-#else
-			/* no way to test, disable to be safe */
-			info->data.ia32.fxsr = info->data.ia32.sse = info->data.ia32.sse2 = info->data.ia32.sse3 = info->data.ia32.ssse3 = false;
-#endif
-#ifdef DEBUG
-		fprintf(stderr, "  SSE OS sup . %c\n", info->data.ia32.sse     ? 'Y' : 'n');
-#endif
-
-		}
-	}
-#else
-	info->use_asm = false;
-#endif
-
-/*
- * PPC-specific
- */
-#elif defined FLAC__CPU_PPC
-	info->type = FLAC__CPUINFO_TYPE_PPC;
-# if !defined FLAC__NO_ASM
-	info->use_asm = true;
-#  ifdef FLAC__USE_ALTIVEC
-#   if defined FLAC__SYS_DARWIN
-	{
-		int val = 0, mib[2] = { CTL_HW, HW_VECTORUNIT };
-		size_t len = sizeof(val);
-		info->data.ppc.altivec = !(sysctl(mib, 2, &val, &len, NULL, 0) || !val);
-	}
-	{
-		host_basic_info_data_t hostInfo;
-		mach_msg_type_number_t infoCount;
-
-		infoCount = HOST_BASIC_INFO_COUNT;
-		host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);
-
-		info->data.ppc.ppc64 = (hostInfo.cpu_type == CPU_TYPE_POWERPC) && (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970);
-	}
-#   else /* FLAC__USE_ALTIVEC && !FLAC__SYS_DARWIN */
-	{
-		/* no Darwin, do it the brute-force way */
-		/* @@@@@@ this is not thread-safe; replace with SSE OS method above or remove */
-		info->data.ppc.altivec = 0;
-		info->data.ppc.ppc64 = 0;
-
-		signal (SIGILL, sigill_handler);
-		canjump = 0;
-		if (!sigsetjmp (jmpbuf, 1)) {
-			canjump = 1;
-
+		int sse = 0;
+		struct sigaction sigill_save;
+		struct sigaction sigill_sse;
+		sigill_sse.sa_sigaction = sigill_handler_sse_os;
+		__sigemptyset(&sigill_sse.sa_mask);
+		sigill_sse.sa_flags = SA_SIGINFO | SA_RESETHAND; /* SA_RESETHAND just in case our SIGILL return jump breaks, so we don't get stuck in a loop */
+		if(0 == sigaction(SIGILL, &sigill_sse, &sigill_save))
+		{
+			/* http://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html */
+			/* see sigill_handler_sse_os() for an explanation of the following: */
 			asm volatile (
-				"mtspr 256, %0\n\t"
-				"vand %%v0, %%v0, %%v0"
-				:
-				: "r" (-1)
+				"xorps %%xmm0,%%xmm0\n\t" /* will cause SIGILL if unsupported by OS */
+				"incl %0\n\t"             /* SIGILL handler will jump over this */
+				/* landing zone */
+				"nop\n\t" /* SIGILL jump lands here if "inc" is 9 bytes */
+				"nop\n\t"
+				"nop\n\t"
+				"nop\n\t"
+				"nop\n\t"
+				"nop\n\t"
+				"nop\n\t" /* SIGILL jump lands here if "inc" is 3 bytes (expected) */
+				"nop\n\t"
+				"nop"     /* SIGILL jump lands here if "inc" is 1 byte */
+				: "=r"(sse)
+				: "0"(sse)
 			);
 
-			info->data.ppc.altivec = 1;
+			sigaction(SIGILL, &sigill_save, NULL);
 		}
-		canjump = 0;
-		if (!sigsetjmp (jmpbuf, 1)) {
-			int x = 0;
-			canjump = 1;
 
-			/* PPC64 hardware implements the cntlzd instruction */
-			asm volatile ("cntlzd %0, %1" : "=r" (x) : "r" (x) );
-
-			info->data.ppc.ppc64 = 1;
+		if(!sse)
+			disable_sse(info);
+#elif defined(_MSC_VER)
+		__try {
+			__asm {
+				xorps xmm0,xmm0
+			}
 		}
-		signal (SIGILL, SIG_DFL); /*@@@@@@ should save and restore old signal */
+		__except(EXCEPTION_EXECUTE_HANDLER) {
+			if (_exception_code() == STATUS_ILLEGAL_INSTRUCTION)
+				disable_sse(info);
+		}
+#elif defined(__GNUC__) /* MinGW goes here */
+		int sse = 0;
+		/* Based on the idea described in Agner Fog's manual "Optimizing subroutines in assembly language" */
+		/* In theory, not guaranteed to detect lack of OS SSE support on some future Intel CPUs, but in practice works (see the aforementioned manual) */
+		if (ia32_fxsr) {
+			struct {
+				FLAC__uint32 buff[128];
+			} __attribute__((aligned(16))) fxsr;
+			FLAC__uint32 old_val, new_val;
+
+			asm volatile ("fxsave %0"  : "=m" (fxsr) : "m" (fxsr));
+			old_val = fxsr.buff[50];
+			fxsr.buff[50] ^= 0x0013c0de;                             /* change value in the buffer */
+			asm volatile ("fxrstor %0" : "=m" (fxsr) : "m" (fxsr));  /* try to change SSE register */
+			fxsr.buff[50] = old_val;                                 /* restore old value in the buffer */
+			asm volatile ("fxsave %0 " : "=m" (fxsr) : "m" (fxsr));  /* old value will be overwritten if SSE register was changed */
+			new_val = fxsr.buff[50];                                 /* == old_val if FXRSTOR didn't change SSE register and (old_val ^ 0x0013c0de) otherwise */
+			fxsr.buff[50] = old_val;                                 /* again restore old value in the buffer */
+			asm volatile ("fxrstor %0" : "=m" (fxsr) : "m" (fxsr));  /* restore old values of registers */
+
+			if ((old_val^new_val) == 0x0013c0de)
+				sse = 1;
+		}
+		if(!sse)
+			disable_sse(info);
+#else
+		/* no way to test, disable to be safe */
+		disable_sse(info);
+#endif
+#ifdef DEBUG
+		fprintf(stderr, "  SSE OS sup . %c\n", info->ia32.sse ? 'Y' : 'n');
+#endif
 	}
-#   endif
-#  else /* !FLAC__USE_ALTIVEC */
-	info->data.ppc.altivec = 0;
-	info->data.ppc.ppc64 = 0;
-#  endif
-# else
+	else /* info->ia32.sse == false */
+		disable_sse(info);
+
+	/*
+	 * now have to check for OS support of AVX instructions
+	 */
+	if(info->ia32.avx && ia32_osxsave) {
+		FLAC__uint32 ecr = FLAC__cpu_xgetbv_x86();
+		if ((ecr & 0x6) != 0x6)
+			disable_avx(info);
+#ifdef DEBUG
+		fprintf(stderr, "  AVX OS sup . %c\n", info->ia32.avx ? 'Y' : 'n');
+#endif
+	}
+	else /* no OS AVX support*/
+		disable_avx(info);
+#else
 	info->use_asm = false;
-# endif
+#endif
 
 /*
- * unknown CPI
+ * x86-64-specific
+ */
+#elif defined FLAC__CPU_X86_64
+	FLAC__bool x86_osxsave = false;
+	(void) x86_osxsave; /* to avoid warnings about unused variables */
+	memset(info, 0, sizeof(*info));
+	info->type = FLAC__CPUINFO_TYPE_X86_64;
+#if !defined FLAC__NO_ASM && defined FLAC__HAS_X86INTRIN
+	info->use_asm = true;
+	{
+		/* http://www.sandpile.org/x86/cpuid.htm */
+		FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
+		FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+		info->x86.sse3  = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false;
+		info->x86.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false;
+		info->x86.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41)? true : false;
+		info->x86.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42)? true : false;
+#if defined FLAC__AVX_SUPPORTED
+		    x86_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE)? true : false;
+		info->x86.avx   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX    )? true : false;
+		info->x86.fma   = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA    )? true : false;
+		FLAC__cpu_info_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
+		info->x86.avx2  = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2   )? true : false;
+#endif
+	}
+#ifdef DEBUG
+	fprintf(stderr, "CPU info (x86-64):\n");
+	fprintf(stderr, "  SSE3 ....... %c\n", info->x86.sse3  ? 'Y' : 'n');
+	fprintf(stderr, "  SSSE3 ...... %c\n", info->x86.ssse3 ? 'Y' : 'n');
+	fprintf(stderr, "  SSE41 ...... %c\n", info->x86.sse41 ? 'Y' : 'n');
+	fprintf(stderr, "  SSE42 ...... %c\n", info->x86.sse42 ? 'Y' : 'n');
+# if defined FLAC__AVX_SUPPORTED
+	fprintf(stderr, "  AVX ........ %c\n", info->x86.avx   ? 'Y' : 'n');
+	fprintf(stderr, "  FMA ........ %c\n", info->x86.fma   ? 'Y' : 'n');
+	fprintf(stderr, "  AVX2 ....... %c\n", info->x86.avx2  ? 'Y' : 'n');
+# endif
+#endif
+
+	/*
+	 * now have to check for OS support of AVX instructions
+	 */
+	if(info->x86.avx && x86_osxsave) {
+		FLAC__uint32 ecr = FLAC__cpu_xgetbv_x86();
+		if ((ecr & 0x6) != 0x6)
+			disable_avx(info);
+#ifdef DEBUG
+		fprintf(stderr, "  AVX OS sup . %c\n", info->x86.avx ? 'Y' : 'n');
+#endif
+	}
+	else /* no OS AVX support*/
+		disable_avx(info);
+#else
+	info->use_asm = false;
+#endif
+
+/*
+ * unknown CPU
  */
 #else
 	info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
 	info->use_asm = false;
 #endif
 }
+
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+
+#if defined _MSC_VER
+#include <intrin.h> /* for __cpuid() and _xgetbv() */
+#elif defined __GNUC__ && defined HAVE_CPUID_H
+#include <cpuid.h> /* for __get_cpuid() and __get_cpuid_max() */
+#endif
+
+FLAC__uint32 FLAC__cpu_have_cpuid_x86(void)
+{
+#ifdef FLAC__CPU_X86_64
+	return 1;
+#else
+# if defined _MSC_VER || defined __INTEL_COMPILER /* Do they support CPUs w/o CPUID support (or OSes that work on those CPUs)? */
+	FLAC__uint32 flags1, flags2;
+	__asm {
+		pushfd
+		pushfd
+		pop		eax
+		mov		flags1, eax
+		xor		eax, 0x200000
+		push	eax
+		popfd
+		pushfd
+		pop		eax
+		mov		flags2, eax
+		popfd
+	}
+	if (((flags1^flags2) & 0x200000) != 0)
+		return 1;
+	else
+		return 0;
+# elif defined __GNUC__ && defined HAVE_CPUID_H
+	if (__get_cpuid_max(0, 0) != 0)
+		return 1;
+	else
+		return 0;
+# else
+	return 0;
+# endif
+#endif
+}
+
+void FLAC__cpu_info_x86(FLAC__uint32 level, FLAC__uint32 *eax, FLAC__uint32 *ebx, FLAC__uint32 *ecx, FLAC__uint32 *edx)
+{
+#if defined _MSC_VER || defined __INTEL_COMPILER
+	int cpuinfo[4];
+	int ext = level & 0x80000000;
+	__cpuid(cpuinfo, ext);
+	if((unsigned)cpuinfo[0] < level) {
+		*eax = *ebx = *ecx = *edx = 0;
+		return;
+	}
+#if defined FLAC__AVX_SUPPORTED
+	__cpuidex(cpuinfo, level, 0); /* for AVX2 detection */
+#else
+	__cpuid(cpuinfo, level); /* some old compilers don't support __cpuidex */
+#endif
+	*eax = cpuinfo[0]; *ebx = cpuinfo[1]; *ecx = cpuinfo[2]; *edx = cpuinfo[3];
+#elif defined __GNUC__ && defined HAVE_CPUID_H
+	FLAC__uint32 ext = level & 0x80000000;
+	__cpuid(ext, *eax, *ebx, *ecx, *edx);
+	if (*eax < level) {
+		*eax = *ebx = *ecx = *edx = 0;
+		return;
+	}
+	__cpuid_count(level, 0, *eax, *ebx, *ecx, *edx);
+#else
+	*eax = *ebx = *ecx = *edx = 0;
+#endif
+}
+
+FLAC__uint32 FLAC__cpu_xgetbv_x86(void)
+{
+#if (defined _MSC_VER || defined __INTEL_COMPILER) && defined FLAC__AVX_SUPPORTED
+	return (FLAC__uint32)_xgetbv(0);
+#elif defined __GNUC__
+	FLAC__uint32 lo, hi;
+	asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c" (0));
+	return lo;
+#else
+	return 0;
+#endif
+}
+
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
diff --git a/src/libFLAC/crc.c b/src/libFLAC/crc.c
index 15eeff8..de2cb18 100644
--- a/src/libFLAC/crc.c
+++ b/src/libFLAC/crc.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,7 +30,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
@@ -74,7 +75,7 @@
 
 /* CRC-16, poly = x^16 + x^15 + x^2 + x^0, init = 0 */
 
-unsigned FLAC__crc16_table[256] = {
+unsigned const FLAC__crc16_table[256] = {
 	0x0000,  0x8005,  0x800f,  0x000a,  0x801b,  0x001e,  0x0014,  0x8011,
 	0x8033,  0x0036,  0x003c,  0x8039,  0x0028,  0x802d,  0x8027,  0x0022,
 	0x8063,  0x0066,  0x006c,  0x8069,  0x0078,  0x807d,  0x8077,  0x0072,
@@ -115,13 +116,13 @@
 	*crc = FLAC__crc8_table[*crc ^ data];
 }
 
-void FLAC__crc8_update_block(const FLAC__byte *data, size_t len, FLAC__uint8 *crc)
+void FLAC__crc8_update_block(const FLAC__byte *data, unsigned len, FLAC__uint8 *crc)
 {
 	while(len--)
 		*crc = FLAC__crc8_table[*crc ^ *data++];
 }
 
-FLAC__uint8 FLAC__crc8(const FLAC__byte *data, size_t len)
+FLAC__uint8 FLAC__crc8(const FLAC__byte *data, unsigned len)
 {
 	FLAC__uint8 crc = 0;
 
@@ -131,7 +132,7 @@
 	return crc;
 }
 
-unsigned FLAC__crc16(const FLAC__byte *data, size_t len)
+unsigned FLAC__crc16(const FLAC__byte *data, unsigned len)
 {
 	unsigned crc = 0;
 
diff --git a/src/libFLAC/fixed.c b/src/libFLAC/fixed.c
index 1a3aac0..74b31b3 100644
--- a/src/libFLAC/fixed.c
+++ b/src/libFLAC/fixed.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,26 +30,18 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
 #include <math.h>
 #include <string.h>
+#include "share/compat.h"
 #include "private/bitmath.h"
 #include "private/fixed.h"
+#include "private/macros.h"
 #include "FLAC/assert.h"
 
-#ifndef M_LN2
-/* math.h in VC++ doesn't seem to have this (how Microsoft is that?) */
-#define M_LN2 0.69314718055994530942
-#endif
-
-#ifdef min
-#undef min
-#endif
-#define min(x,y) ((x) < (y)? (x) : (y))
-
 #ifdef local_abs
 #undef local_abs
 #endif
@@ -242,11 +235,11 @@
 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
 	}
 
-	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
 		order = 0;
-	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
 		order = 1;
-	else if(total_error_2 < min(total_error_3, total_error_4))
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
 		order = 2;
 	else if(total_error_3 < total_error_4)
 		order = 3;
@@ -304,11 +297,11 @@
 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
 	}
 
-	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
 		order = 0;
-	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
 		order = 1;
-	else if(total_error_2 < min(total_error_3, total_error_4))
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
 		order = 2;
 	else if(total_error_3 < total_error_4)
 		order = 3;
@@ -324,20 +317,11 @@
 	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
 	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-#if defined _MSC_VER || defined __MINGW32__
-	/* with MSVC you have to spoon feed it the casting */
-	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)(FLAC__int64)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)(FLAC__int64)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)(FLAC__int64)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)(FLAC__int64)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
-	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)(FLAC__int64)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
-#else
 	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
 	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
 	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
 	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
 	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
-#endif
 #else
 	residual_bits_per_sample[0] = (total_error_0 > 0) ? local__compute_rbps_wide_integerized(total_error_0, data_len) : 0;
 	residual_bits_per_sample[1] = (total_error_1 > 0) ? local__compute_rbps_wide_integerized(total_error_1, data_len) : 0;
diff --git a/src/libFLAC/fixed_intrin_sse2.c b/src/libFLAC/fixed_intrin_sse2.c
new file mode 100644
index 0000000..6785f92
--- /dev/null
+++ b/src/libFLAC/fixed_intrin_sse2.c
@@ -0,0 +1,253 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/fixed.h"
+#ifdef FLAC__SSE2_SUPPORTED
+
+#include <emmintrin.h> /* SSE2 */
+#include <math.h>
+#include "private/macros.h"
+#include "share/compat.h"
+#include "FLAC/assert.h"
+
+#ifdef FLAC__CPU_IA32
+#define m128i_to_i64(dest, src) _mm_storel_epi64((__m128i*)&dest, src)
+#else
+#define m128i_to_i64(dest, src) dest = _mm_cvtsi128_si64(src)
+#endif
+
+FLAC__SSE_TARGET("sse2")
+unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err2;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error;
+
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1, tmp;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			tmp = _mm_slli_si128(err0, 12);									// e0   0   0   0
+			last_error = _mm_srli_si128(err1, 4);							//  0  e1  e2  e3
+			last_error = _mm_or_si128(last_error, tmp);						// e0  e1  e2  e3
+
+			tmp = _mm_srai_epi32(err0, 31);
+			err0 = _mm_xor_si128(err0, tmp);
+			err0 = _mm_sub_epi32(err0, tmp);
+			tmp = _mm_srai_epi32(err1, 31);
+			err1 = _mm_xor_si128(err1, tmp);
+			err1 = _mm_sub_epi32(err1, tmp);
+
+			total_err0 = _mm_add_epi32(total_err0, err0);					// 0   0   0   te0
+			total_err1 = _mm_add_epi32(total_err1, err1);					// te1 te2 te3 te4
+		}
+	}
+	
+	total_error_0 = _mm_cvtsi128_si32(total_err0);
+	total_err2 = total_err1;											// te1  te2  te3  te4
+	total_err1 = _mm_srli_si128(total_err1, 8);							//  0    0   te1  te2
+	total_error_4 = _mm_cvtsi128_si32(total_err2);
+	total_error_2 = _mm_cvtsi128_si32(total_err1);
+	total_err2 = _mm_srli_si128(total_err2,	4);							//  0   te1  te2  te3
+	total_err1 = _mm_srli_si128(total_err1, 4);							//  0    0    0   te1
+	total_error_3 = _mm_cvtsi128_si32(total_err2);
+	total_error_1 = _mm_cvtsi128_si32(total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+FLAC__SSE_TARGET("sse2")
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err3;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error, zero = _mm_setzero_si128();
+		
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1, tmp;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			tmp = _mm_slli_si128(err0, 12);									// e0   0   0   0
+			last_error = _mm_srli_si128(err1, 4);							//  0  e1  e2  e3
+			last_error = _mm_or_si128(last_error, tmp);						// e0  e1  e2  e3
+
+			tmp = _mm_srai_epi32(err0, 31);
+			err0 = _mm_xor_si128(err0, tmp);
+			err0 = _mm_sub_epi32(err0, tmp);
+			tmp = _mm_srai_epi32(err1, 31);
+			err1 = _mm_xor_si128(err1, tmp);
+			err1 = _mm_sub_epi32(err1, tmp);
+
+			total_err0 = _mm_add_epi64(total_err0, err0);					//        0       te0
+			err0 = _mm_unpacklo_epi32(err1, zero);							//   0  |e3|   0  |e4|
+			err1 = _mm_unpackhi_epi32(err1, zero);							//   0  |e1|   0  |e2|
+			total_err3 = _mm_add_epi64(total_err3, err0);					//       te3      te4
+			total_err1 = _mm_add_epi64(total_err1, err1);					//       te1      te2
+		}
+	}
+	
+	m128i_to_i64(total_error_0, total_err0);
+	m128i_to_i64(total_error_4, total_err3);
+	m128i_to_i64(total_error_2, total_err1);
+	total_err3 = _mm_srli_si128(total_err3,	8);							//         0      te3
+	total_err1 = _mm_srli_si128(total_err1, 8);							//         0      te1
+	m128i_to_i64(total_error_3, total_err3);
+	m128i_to_i64(total_error_1, total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+#endif /* FLAC__SSE2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/fixed_intrin_ssse3.c b/src/libFLAC/fixed_intrin_ssse3.c
new file mode 100644
index 0000000..50c663d
--- /dev/null
+++ b/src/libFLAC/fixed_intrin_ssse3.c
@@ -0,0 +1,241 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/fixed.h"
+#ifdef FLAC__SSSE3_SUPPORTED
+
+#include <tmmintrin.h> /* SSSE3 */
+#include <math.h>
+#include "private/macros.h"
+#include "share/compat.h"
+#include "FLAC/assert.h"
+
+#ifdef FLAC__CPU_IA32
+#define m128i_to_i64(dest, src) _mm_storel_epi64((__m128i*)&dest, src)
+#else
+#define m128i_to_i64(dest, src) dest = _mm_cvtsi128_si64(src)
+#endif
+
+FLAC__SSE_TARGET("ssse3")
+unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err2;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error;
+
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			last_error = _mm_alignr_epi8(err0, err1, 4);					// e0  e1  e2  e3
+
+			err0 = _mm_abs_epi32(err0);
+			err1 = _mm_abs_epi32(err1);
+
+			total_err0 = _mm_add_epi32(total_err0, err0);					// 0   0   0   te0
+			total_err1 = _mm_add_epi32(total_err1, err1);					// te1 te2 te3 te4
+		}
+	}
+	
+	total_error_0 = _mm_cvtsi128_si32(total_err0);
+	total_err2 = total_err1;											// te1  te2  te3  te4
+	total_err1 = _mm_srli_si128(total_err1, 8);							//  0    0   te1  te2
+	total_error_4 = _mm_cvtsi128_si32(total_err2);
+	total_error_2 = _mm_cvtsi128_si32(total_err1);
+	total_err2 = _mm_srli_si128(total_err2,	4);							//  0   te1  te2  te3
+	total_err1 = _mm_srli_si128(total_err1, 4);							//  0    0    0   te1
+	total_error_3 = _mm_cvtsi128_si32(total_err2);
+	total_error_1 = _mm_cvtsi128_si32(total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+FLAC__SSE_TARGET("ssse3")
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
+{
+	FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
+	unsigned i, order;
+
+	__m128i total_err0, total_err1, total_err3;
+
+	{
+		FLAC__int32 itmp;
+		__m128i last_error, zero = _mm_setzero_si128();
+		
+		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
+		itmp = data[-2];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
+		itmp -= data[-3];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
+		itmp -= data[-3] - data[-4];
+		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
+		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3
+
+		total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
+		for(i = 0; i < data_len; i++) {
+			__m128i err0, err1;
+			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
+			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
+#if 1 /* OPT_SSE */
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
+			err1 = _mm_sub_epi32(err1, last_error);
+			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#else
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
+			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
+			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
+#endif
+			last_error = _mm_alignr_epi8(err0, err1, 4);					// e0  e1  e2  e3
+			
+			err0 = _mm_abs_epi32(err0);
+			err1 = _mm_abs_epi32(err1);										// |e1| |e2| |e3| |e4|
+
+			total_err0 = _mm_add_epi64(total_err0, err0);					//        0       te0
+			err0 = _mm_unpacklo_epi32(err1, zero);							//   0  |e3|   0  |e4|
+			err1 = _mm_unpackhi_epi32(err1, zero);							//   0  |e1|   0  |e2|
+			total_err3 = _mm_add_epi64(total_err3, err0);					//       te3      te4
+			total_err1 = _mm_add_epi64(total_err1, err1);					//       te1      te2
+		}
+	}
+	
+	m128i_to_i64(total_error_0, total_err0);
+	m128i_to_i64(total_error_4, total_err3);
+	m128i_to_i64(total_error_2, total_err1);
+	total_err3 = _mm_srli_si128(total_err3,	8);							//         0      te3
+	total_err1 = _mm_srli_si128(total_err1, 8);							//         0      te1
+	m128i_to_i64(total_error_3, total_err3);
+	m128i_to_i64(total_error_1, total_err1);
+
+	/* prefer higher order */
+	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
+		order = 0;
+	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
+		order = 1;
+	else if(total_error_2 < flac_min(total_error_3, total_error_4))
+		order = 2;
+	else if(total_error_3 < total_error_4)
+		order = 3;
+	else
+		order = 4;
+
+	/* Estimate the expected number of bits per residual signal sample. */
+	/* 'total_error*' is linearly related to the variance of the residual */
+	/* signal, so we use it directly to compute E(|x|) */
+	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
+	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
+
+	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+
+	return order;
+}
+
+#endif /* FLAC__SSSE3_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/float.c b/src/libFLAC/float.c
index 8e19280..068069f 100644
--- a/src/libFLAC/float.c
+++ b/src/libFLAC/float.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2004-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,23 +30,16 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
 #include "FLAC/assert.h"
-
+#include "share/compat.h"
 #include "private/float.h"
 
 #ifdef FLAC__INTEGER_ONLY_LIBRARY
 
-/* adjust for compilers that can't understand using LLU suffix for uint64_t literals */
-#ifdef _MSC_VER
-#define FLAC__U64L(x) x
-#else
-#define FLAC__U64L(x) x##LLU
-#endif
-
 const FLAC__fixedpoint FLAC__FP_ZERO = 0;
 const FLAC__fixedpoint FLAC__FP_ONE_HALF = 0x00008000;
 const FLAC__fixedpoint FLAC__FP_ONE = 0x00010000;
@@ -282,7 +276,7 @@
 
 	if(x < ONE)
 		return 0;
-	
+
 	if(precision > LOG2_LOOKUP_PRECISION)
 		precision = LOG2_LOOKUP_PRECISION;
 
diff --git a/src/libFLAC/format.c b/src/libFLAC/format.c
index 4c1bae9..4d0d832 100644
--- a/src/libFLAC/format.c
+++ b/src/libFLAC/format.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,7 +30,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
@@ -38,33 +39,14 @@
 #include <string.h> /* for memset() */
 #include "FLAC/assert.h"
 #include "FLAC/format.h"
+#include "share/compat.h"
 #include "private/format.h"
-
-#ifndef FLaC__INLINE
-#define FLaC__INLINE
-#endif
-
-#ifdef min
-#undef min
-#endif
-#define min(a,b) ((a)<(b)?(a):(b))
-
-/* adjust for compilers that can't understand using LLU suffix for uint64_t literals */
-#ifdef _MSC_VER
-#define FLAC__U64L(x) x
-#else
-#define FLAC__U64L(x) x##LLU
-#endif
+#include "private/macros.h"
 
 /* VERSION should come from configure */
 FLAC_API const char *FLAC__VERSION_STRING = VERSION;
 
-#if defined _MSC_VER || defined __BORLANDC__ || defined __MINW32__
-/* yet one more hack because of MSVC6: */
-FLAC_API const char *FLAC__VENDOR_STRING = "reference libFLAC 1.2.1 20070917";
-#else
-FLAC_API const char *FLAC__VENDOR_STRING = "reference libFLAC " VERSION " 20070917";
-#endif
+FLAC_API const char *FLAC__VENDOR_STRING = "reference libFLAC " VERSION " 20141125";
 
 FLAC_API const FLAC__byte FLAC__STREAM_SYNC_STRING[4] = { 'f','L','a','C' };
 FLAC_API const unsigned FLAC__STREAM_SYNC = 0x664C6143;
@@ -223,6 +205,16 @@
 		return true;
 }
 
+FLAC_API FLAC__bool FLAC__format_blocksize_is_subset(unsigned blocksize, unsigned sample_rate)
+{
+	if(blocksize > 16384)
+		return false;
+	else if(sample_rate <= 48000 && blocksize > 4608)
+		return false;
+	else
+		return true;
+}
+
 FLAC_API FLAC__bool FLAC__format_sample_rate_is_subset(unsigned sample_rate)
 {
 	if(
@@ -313,7 +305,7 @@
  * and a more clear explanation at the end of this section:
  *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
  */
-static FLaC__INLINE unsigned utf8len_(const FLAC__byte *utf8)
+static unsigned utf8len_(const FLAC__byte *utf8)
 {
 	FLAC__ASSERT(0 != utf8);
 	if ((utf8[0] & 0x80) == 0) {
@@ -536,7 +528,7 @@
 		max_rice_partition_order++;
 		blocksize >>= 1;
 	}
-	return min(FLAC__MAX_RICE_PARTITION_ORDER, max_rice_partition_order);
+	return flac_min(FLAC__MAX_RICE_PARTITION_ORDER, max_rice_partition_order);
 }
 
 unsigned FLAC__format_get_max_rice_partition_order_from_blocksize_limited_max_and_predictor_order(unsigned limit, unsigned blocksize, unsigned predictor_order)
@@ -581,11 +573,11 @@
 	FLAC__ASSERT(object->capacity_by_order > 0 || (0 == object->parameters && 0 == object->raw_bits));
 
 	if(object->capacity_by_order < max_partition_order) {
-		if(0 == (object->parameters = (unsigned*)realloc(object->parameters, sizeof(unsigned)*((size_t)1 << max_partition_order))))
+		if(0 == (object->parameters = realloc(object->parameters, sizeof(unsigned)*(1 << max_partition_order))))
 			return false;
-		if(0 == (object->raw_bits = (unsigned*)realloc(object->raw_bits, sizeof(unsigned)*((size_t)1 << max_partition_order))))
+		if(0 == (object->raw_bits = realloc(object->raw_bits, sizeof(unsigned)*(1 << max_partition_order))))
 			return false;
-		memset(object->raw_bits, 0, sizeof(unsigned)*((size_t)1 << max_partition_order));
+		memset(object->raw_bits, 0, sizeof(unsigned)*(1 << max_partition_order));
 		object->capacity_by_order = max_partition_order;
 	}
 
diff --git a/src/libFLAC/ia32/cpu_asm.nasm b/src/libFLAC/ia32/cpu_asm.nasm
new file mode 100644
index 0000000..036e865
--- /dev/null
+++ b/src/libFLAC/ia32/cpu_asm.nasm
@@ -0,0 +1,98 @@
+;  vim:filetype=nasm ts=8
+
+;  libFLAC - Free Lossless Audio Codec library
+;  Copyright (C) 2001-2009  Josh Coalson
+;  Copyright (C) 2011-2014  Xiph.Org Foundation
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;
+;  - Redistributions of source code must retain the above copyright
+;  notice, this list of conditions and the following disclaimer.
+;
+;  - Redistributions in binary form must reproduce the above copyright
+;  notice, this list of conditions and the following disclaimer in the
+;  documentation and/or other materials provided with the distribution.
+;
+;  - Neither the name of the Xiph.org Foundation nor the names of its
+;  contributors may be used to endorse or promote products derived from
+;  this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "nasm.h"
+
+	data_section
+
+cglobal FLAC__cpu_have_cpuid_asm_ia32
+cglobal FLAC__cpu_info_asm_ia32
+
+	code_section
+
+; **********************************************************************
+;
+; FLAC__uint32 FLAC__cpu_have_cpuid_asm_ia32()
+;
+
+cident FLAC__cpu_have_cpuid_asm_ia32
+	pushfd
+	pop	eax
+	mov	edx, eax
+	xor	eax, 0x00200000
+	push	eax
+	popfd
+	pushfd
+	pop	eax
+	xor	eax, edx
+	and	eax, 0x00200000
+	shr	eax, 0x15
+	push	edx
+	popfd
+	ret
+
+; **********************************************************************
+;
+; void FLAC__cpu_info_asm_ia32(FLAC__uint32 *flags_edx, FLAC__uint32 *flags_ecx)
+;
+
+cident FLAC__cpu_info_asm_ia32
+	;[esp + 8] == flags_edx
+	;[esp + 12] == flags_ecx
+
+	push	ebx
+	call	FLAC__cpu_have_cpuid_asm_ia32
+	test	eax, eax
+	jz	.no_cpuid
+	mov	eax, 0
+	cpuid
+	cmp	eax, 1
+	jb	.no_cpuid
+	mov	eax, 1
+	cpuid
+	mov	ebx, [esp + 8]
+	mov	[ebx], edx
+	mov	ebx, [esp + 12]
+	mov	[ebx], ecx
+	jmp	.end
+.no_cpuid:
+	xor	eax, eax
+	mov	ebx, [esp + 8]
+	mov	[ebx], eax
+	mov	ebx, [esp + 12]
+	mov	[ebx], eax
+.end:
+	pop	ebx
+	ret
+
+; end
diff --git a/src/libFLAC/ia32/fixed_asm.nasm b/src/libFLAC/ia32/fixed_asm.nasm
new file mode 100644
index 0000000..402c02a
--- /dev/null
+++ b/src/libFLAC/ia32/fixed_asm.nasm
@@ -0,0 +1,309 @@
+;  vim:filetype=nasm ts=8
+
+;  libFLAC - Free Lossless Audio Codec library
+;  Copyright (C) 2001-2009  Josh Coalson
+;  Copyright (C) 2011-2014  Xiph.Org Foundation
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;
+;  - Redistributions of source code must retain the above copyright
+;  notice, this list of conditions and the following disclaimer.
+;
+;  - Redistributions in binary form must reproduce the above copyright
+;  notice, this list of conditions and the following disclaimer in the
+;  documentation and/or other materials provided with the distribution.
+;
+;  - Neither the name of the Xiph.org Foundation nor the names of its
+;  contributors may be used to endorse or promote products derived from
+;  this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "nasm.h"
+
+	data_section
+
+cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
+
+	code_section
+
+; **********************************************************************
+;
+; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
+; {
+; 	FLAC__int32 last_error_0 = data[-1];
+; 	FLAC__int32 last_error_1 = data[-1] - data[-2];
+; 	FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
+; 	FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
+; 	FLAC__int32 error, save;
+; 	FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
+; 	unsigned i, order;
+;
+; 	for(i = 0; i < data_len; i++) {
+; 		error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
+; 		error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
+; 		error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
+; 		error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
+; 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
+; 	}
+;
+; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
+; 		order = 0;
+; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
+; 		order = 1;
+; 	else if(total_error_2 < min(total_error_3, total_error_4))
+; 		order = 2;
+; 	else if(total_error_3 < total_error_4)
+; 		order = 3;
+; 	else
+; 		order = 4;
+;
+; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
+; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
+; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
+; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
+; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+;
+; 	return order;
+; }
+	ALIGN 16
+cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
+
+	; esp + 36 == data[]
+	; esp + 40 == data_len
+	; esp + 44 == residual_bits_per_sample[]
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	sub	esp, byte 16
+	; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
+
+	; ebx == &data[i]
+	; ecx == loop counter (i)
+	; ebp == order
+	; mm0 == total_error_1:total_error_0
+	; mm1 == total_error_2:total_error_3
+	; mm2 == :total_error_4
+	; mm3 == last_error_1:last_error_0
+	; mm4 == last_error_2:last_error_3
+
+	mov	ecx, [esp + 40]			; ecx = data_len
+	test	ecx, ecx
+	jz	near .data_len_is_0
+
+	mov	ebx, [esp + 36]			; ebx = data[]
+	movd	mm3, [ebx - 4]			; mm3 = 0:last_error_0
+	movd	mm2, [ebx - 8]			; mm2 = 0:data[-2]
+	movd	mm1, [ebx - 12]			; mm1 = 0:data[-3]
+	movd	mm0, [ebx - 16]			; mm0 = 0:data[-4]
+	movq	mm5, mm3			; mm5 = 0:last_error_0
+	psubd	mm5, mm2			; mm5 = 0:last_error_1
+	punpckldq	mm3, mm5		; mm3 = last_error_1:last_error_0
+	psubd	mm2, mm1			; mm2 = 0:data[-2] - data[-3]
+	psubd	mm5, mm2			; mm5 = 0:last_error_2
+	movq	mm4, mm5			; mm4 = 0:last_error_2
+	psubd	mm4, mm2			; mm4 = 0:last_error_2 - (data[-2] - data[-3])
+	paddd	mm4, mm1			; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
+	psubd	mm4, mm0			; mm4 = 0:last_error_3
+	punpckldq	mm4, mm5		; mm4 = last_error_2:last_error_3
+	pxor	mm0, mm0			; mm0 = total_error_1:total_error_0
+	pxor	mm1, mm1			; mm1 = total_error_2:total_error_3
+	pxor	mm2, mm2			; mm2 = 0:total_error_4
+
+	ALIGN 16
+.loop:
+	movd	mm7, [ebx]			; mm7 = 0:error_0
+	add	ebx, byte 4
+	movq	mm6, mm7			; mm6 = 0:error_0
+	psubd	mm7, mm3			; mm7 = :error_1
+	punpckldq	mm6, mm7		; mm6 = error_1:error_0
+	movq	mm5, mm6			; mm5 = error_1:error_0
+	movq	mm7, mm6			; mm7 = error_1:error_0
+	psubd	mm5, mm3			; mm5 = error_2:
+	movq	mm3, mm6			; mm3 = error_1:error_0	
+	psrad	mm6, 31
+	pxor	mm7, mm6
+	psubd	mm7, mm6			; mm7 = abs(error_1):abs(error_0)
+	paddd	mm0, mm7			; mm0 = total_error_1:total_error_0
+	movq	mm6, mm5			; mm6 = error_2:
+	psubd	mm5, mm4			; mm5 = error_3:
+	punpckhdq	mm5, mm6		; mm5 = error_2:error_3
+	movq	mm7, mm5			; mm7 = error_2:error_3
+	movq	mm6, mm5			; mm6 = error_2:error_3
+	psubd	mm5, mm4			; mm5 = :error_4
+	movq	mm4, mm6			; mm4 = error_2:error_3
+	psrad	mm6, 31
+	pxor	mm7, mm6
+	psubd	mm7, mm6			; mm7 = abs(error_2):abs(error_3)
+	paddd	mm1, mm7			; mm1 = total_error_2:total_error_3
+	movq	mm6, mm5			; mm6 = :error_4
+	psrad	mm5, 31
+	pxor	mm6, mm5
+	psubd	mm6, mm5			; mm6 = :abs(error_4)
+	paddd	mm2, mm6			; mm2 = :total_error_4
+	
+	dec	ecx
+	jnz	short .loop
+
+; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
+; 		order = 0;
+; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
+; 		order = 1;
+; 	else if(total_error_2 < min(total_error_3, total_error_4))
+; 		order = 2;
+; 	else if(total_error_3 < total_error_4)
+; 		order = 3;
+; 	else
+; 		order = 4;
+	movq	mm3, mm0			; mm3 = total_error_1:total_error_0
+	movd	edi, mm2			; edi = total_error_4
+	movd	esi, mm1			; esi = total_error_3
+	movd	eax, mm0			; eax = total_error_0
+	punpckhdq	mm1, mm1		; mm1 = total_error_2:total_error_2
+	punpckhdq	mm3, mm3		; mm3 = total_error_1:total_error_1
+	movd	edx, mm1			; edx = total_error_2
+	movd	ecx, mm3			; ecx = total_error_1
+
+	xor	ebx, ebx
+	xor	ebp, ebp
+	inc	ebx
+	cmp	ecx, eax
+	cmovb	eax, ecx			; eax = min(total_error_0, total_error_1)
+	cmovbe	ebp, ebx
+	inc	ebx
+	cmp	edx, eax
+	cmovb	eax, edx			; eax = min(total_error_0, total_error_1, total_error_2)
+	cmovbe	ebp, ebx
+	inc	ebx
+	cmp	esi, eax
+	cmovb	eax, esi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
+	cmovbe	ebp, ebx
+	inc	ebx
+	cmp	edi, eax
+	cmovb	eax, edi			; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
+	cmovbe	ebp, ebx
+	movd	ebx, mm0			; ebx = total_error_0
+	emms
+
+	; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
+	xor	eax, eax
+	fild	dword [esp + 40]		; ST = data_len (NOTE: assumes data_len is <2gigs)
+.rbps_0:
+	test	ebx, ebx
+	jz	.total_error_0_is_0
+	fld1					; ST = 1.0 data_len
+	mov	[esp], ebx
+	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_0
+	mov	ebx, [esp + 44]
+	fild	qword [esp]			; ST = total_error_0 1.0 data_len
+	fdiv	st2				; ST = total_error_0/data_len 1.0 data_len
+	fldln2					; ST = ln2 total_error_0/data_len 1.0 data_len
+	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 data_len
+	fyl2x					; ST = log2(ln2*total_error_0/data_len) data_len
+	fstp	dword [ebx]			; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len)   ST = data_len
+	jmp	short .rbps_1
+.total_error_0_is_0:
+	mov	ebx, [esp + 44]
+	mov	[ebx], eax			; residual_bits_per_sample[0] = 0.0
+.rbps_1:
+	test	ecx, ecx
+	jz	.total_error_1_is_0
+	fld1					; ST = 1.0 data_len
+	mov	[esp], ecx
+	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_1
+	fild	qword [esp]			; ST = total_error_1 1.0 data_len
+	fdiv	st2				; ST = total_error_1/data_len 1.0 data_len
+	fldln2					; ST = ln2 total_error_1/data_len 1.0 data_len
+	fmulp	st1				; ST = ln2*total_error_1/data_len 1.0 data_len
+	fyl2x					; ST = log2(ln2*total_error_1/data_len) data_len
+	fstp	dword [ebx + 4]			; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len)   ST = data_len
+	jmp	short .rbps_2
+.total_error_1_is_0:
+	mov	[ebx + 4], eax			; residual_bits_per_sample[1] = 0.0
+.rbps_2:
+	test	edx, edx
+	jz	.total_error_2_is_0
+	fld1					; ST = 1.0 data_len
+	mov	[esp], edx
+	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_2
+	fild	qword [esp]			; ST = total_error_2 1.0 data_len
+	fdiv	st2				; ST = total_error_2/data_len 1.0 data_len
+	fldln2					; ST = ln2 total_error_2/data_len 1.0 data_len
+	fmulp	st1				; ST = ln2*total_error_2/data_len 1.0 data_len
+	fyl2x					; ST = log2(ln2*total_error_2/data_len) data_len
+	fstp	dword [ebx + 8]			; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len)   ST = data_len
+	jmp	short .rbps_3
+.total_error_2_is_0:
+	mov	[ebx + 8], eax			; residual_bits_per_sample[2] = 0.0
+.rbps_3:
+	test	esi, esi
+	jz	.total_error_3_is_0
+	fld1					; ST = 1.0 data_len
+	mov	[esp], esi
+	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_3
+	fild	qword [esp]			; ST = total_error_3 1.0 data_len
+	fdiv	st2				; ST = total_error_3/data_len 1.0 data_len
+	fldln2					; ST = ln2 total_error_3/data_len 1.0 data_len
+	fmulp	st1				; ST = ln2*total_error_3/data_len 1.0 data_len
+	fyl2x					; ST = log2(ln2*total_error_3/data_len) data_len
+	fstp	dword [ebx + 12]		; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len)   ST = data_len
+	jmp	short .rbps_4
+.total_error_3_is_0:
+	mov	[ebx + 12], eax			; residual_bits_per_sample[3] = 0.0
+.rbps_4:
+	test	edi, edi
+	jz	.total_error_4_is_0
+	fld1					; ST = 1.0 data_len
+	mov	[esp], edi
+	mov	[esp + 4], eax			; [esp] = (FLAC__uint64)total_error_4
+	fild	qword [esp]			; ST = total_error_4 1.0 data_len
+	fdiv	st2				; ST = total_error_4/data_len 1.0 data_len
+	fldln2					; ST = ln2 total_error_4/data_len 1.0 data_len
+	fmulp	st1				; ST = ln2*total_error_4/data_len 1.0 data_len
+	fyl2x					; ST = log2(ln2*total_error_4/data_len) data_len
+	fstp	dword [ebx + 16]		; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len)   ST = data_len
+	jmp	short .rbps_end
+.total_error_4_is_0:
+	mov	[ebx + 16], eax			; residual_bits_per_sample[4] = 0.0
+.rbps_end:
+	fstp	st0				; ST = [empty]
+	jmp	short .end
+.data_len_is_0:
+	; data_len == 0, so residual_bits_per_sample[*] = 0.0
+	xor	ebp, ebp
+	mov	edi, [esp + 44]
+	mov	[edi], ebp
+	mov	[edi + 4], ebp
+	mov	[edi + 8], ebp
+	mov	[edi + 12], ebp
+	mov	[edi + 16], ebp
+	add	ebp, byte 4			; order = 4
+
+.end:
+	mov	eax, ebp			; return order
+	add	esp, byte 16
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+; end
diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm
new file mode 100644
index 0000000..bf65032
--- /dev/null
+++ b/src/libFLAC/ia32/lpc_asm.nasm
@@ -0,0 +1,2049 @@
+;  vim:filetype=nasm ts=8
+
+;  libFLAC - Free Lossless Audio Codec library
+;  Copyright (C) 2001-2009  Josh Coalson
+;  Copyright (C) 2011-2014  Xiph.Org Foundation
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;
+;  - Redistributions of source code must retain the above copyright
+;  notice, this list of conditions and the following disclaimer.
+;
+;  - Redistributions in binary form must reproduce the above copyright
+;  notice, this list of conditions and the following disclaimer in the
+;  documentation and/or other materials provided with the distribution.
+;
+;  - Neither the name of the Xiph.org Foundation nor the names of its
+;  contributors may be used to endorse or promote products derived from
+;  this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "nasm.h"
+
+	data_section
+
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
+cglobal FLAC__lpc_restore_signal_asm_ia32
+cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
+cglobal FLAC__lpc_restore_signal_wide_asm_ia32
+
+	code_section
+
+; **********************************************************************
+;
+; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+; {
+;	FLAC__real d;
+;	unsigned sample, coeff;
+;	const unsigned limit = data_len - lag;
+;
+;	FLAC__ASSERT(lag > 0);
+;	FLAC__ASSERT(lag <= data_len);
+;
+;	for(coeff = 0; coeff < lag; coeff++)
+;		autoc[coeff] = 0.0;
+;	for(sample = 0; sample <= limit; sample++) {
+;		d = data[sample];
+;		for(coeff = 0; coeff < lag; coeff++)
+;			autoc[coeff] += d * data[sample+coeff];
+;	}
+;	for(; sample < data_len; sample++) {
+;		d = data[sample];
+;		for(coeff = 0; coeff < data_len - sample; coeff++)
+;			autoc[coeff] += d * data[sample+coeff];
+;	}
+; }
+;
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32
+	;[esp + 28] == autoc[]
+	;[esp + 24] == lag
+	;[esp + 20] == data_len
+	;[esp + 16] == data[]
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 33)
+	;ASSERT(lag <= data_len)
+
+.begin:
+	push	esi
+	push	edi
+	push	ebx
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	mov	edi, [esp + 28]			; edi == autoc
+	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
+	xor	eax, eax
+	rep	stosd
+
+	;	const unsigned limit = data_len - lag;
+	mov	eax, [esp + 24]			; eax == lag
+	mov	ecx, [esp + 20]
+	sub	ecx, eax			; ecx == limit
+
+	mov	edi, [esp + 28]			; edi == autoc
+	mov	esi, [esp + 16]			; esi == data
+	inc	ecx				; we are looping <= limit so we add one to the counter
+
+	;	for(sample = 0; sample <= limit; sample++) {
+	;		d = data[sample];
+	;		for(coeff = 0; coeff < lag; coeff++)
+	;			autoc[coeff] += d * data[sample+coeff];
+	;	}
+	fld	dword [esi]			; ST = d <- data[sample]
+	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
+	lea	edx, [eax + eax*2]
+	neg	edx
+	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
+	call	.mov_eip_to_ebx
+.get_eip1:
+	add	edx, ebx
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	cmp	eax, 33
+	jne	.loop1_start
+	sub	edx, byte 9			; compensate for the longer opcodes on the first iteration
+.loop1_start:
+	jmp	edx
+
+.mov_eip_to_ebx:
+	mov	ebx, [esp]
+	ret
+
+	fld	st0				; ST = d d
+	fmul	dword [esi + (32*4)]		; ST = d*data[sample+32] d		WATCHOUT: not a byte displacement here!
+	fadd	dword [edi + (32*4)]		; ST = autoc[32]+d*data[sample+32] d	WATCHOUT: not a byte displacement here!
+	fstp	dword [edi + (32*4)]		; autoc[32]+=d*data[sample+32]  ST = d	WATCHOUT: not a byte displacement here!
+	fld	st0				; ST = d d
+	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
+	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
+	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
+	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
+	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
+	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
+	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
+	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
+	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
+	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
+	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
+	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
+	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
+	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
+	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
+	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
+	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
+	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
+	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
+	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
+	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
+	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
+	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
+	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
+	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
+	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
+	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
+	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
+	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
+	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
+	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
+	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
+	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
+	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
+	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
+	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
+	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
+	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
+	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
+	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
+	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
+	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
+	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
+	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
+	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
+	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
+	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
+	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
+	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
+	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
+	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
+	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
+	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
+	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
+	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
+	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
+	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
+	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
+	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
+	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
+	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
+	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
+	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
+	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
+	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
+.jumper1_0:
+
+	fstp	st0				; pop d, ST = empty
+	add	esi, byte 4			; sample++
+	dec	ecx
+	jz	.loop1_end
+	fld	dword [esi]			; ST = d <- data[sample]
+	jmp	edx
+.loop1_end:
+
+	;	for(; sample < data_len; sample++) {
+	;		d = data[sample];
+	;		for(coeff = 0; coeff < data_len - sample; coeff++)
+	;			autoc[coeff] += d * data[sample+coeff];
+	;	}
+	mov	ecx, [esp + 24]			; ecx <- lag
+	dec	ecx				; ecx <- lag - 1
+	jz	near .end			; skip loop if 0 (i.e. lag == 1)
+
+	fld	dword [esi]			; ST = d <- data[sample]
+	mov	eax, ecx			; eax <- lag - 1 == data_len - sample the first time through
+	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
+	lea	edx, [eax + eax*2]
+	neg	edx
+	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
+	call	.mov_eip_to_ebx
+.get_eip2:
+	add	edx, ebx
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	jmp	edx
+
+	fld	st0				; ST = d d
+	fmul	dword [esi + (31*4)]		; ST = d*data[sample+31] d
+	fadd	dword [edi + (31*4)]		; ST = autoc[31]+d*data[sample+31] d
+	fstp	dword [edi + (31*4)]		; autoc[31]+=d*data[sample+31]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (30*4)]		; ST = d*data[sample+30] d
+	fadd	dword [edi + (30*4)]		; ST = autoc[30]+d*data[sample+30] d
+	fstp	dword [edi + (30*4)]		; autoc[30]+=d*data[sample+30]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (29*4)]		; ST = d*data[sample+29] d
+	fadd	dword [edi + (29*4)]		; ST = autoc[29]+d*data[sample+29] d
+	fstp	dword [edi + (29*4)]		; autoc[29]+=d*data[sample+29]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (28*4)]		; ST = d*data[sample+28] d
+	fadd	dword [edi + (28*4)]		; ST = autoc[28]+d*data[sample+28] d
+	fstp	dword [edi + (28*4)]		; autoc[28]+=d*data[sample+28]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (27*4)]		; ST = d*data[sample+27] d
+	fadd	dword [edi + (27*4)]		; ST = autoc[27]+d*data[sample+27] d
+	fstp	dword [edi + (27*4)]		; autoc[27]+=d*data[sample+27]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (26*4)]		; ST = d*data[sample+26] d
+	fadd	dword [edi + (26*4)]		; ST = autoc[26]+d*data[sample+26] d
+	fstp	dword [edi + (26*4)]		; autoc[26]+=d*data[sample+26]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (25*4)]		; ST = d*data[sample+25] d
+	fadd	dword [edi + (25*4)]		; ST = autoc[25]+d*data[sample+25] d
+	fstp	dword [edi + (25*4)]		; autoc[25]+=d*data[sample+25]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (24*4)]		; ST = d*data[sample+24] d
+	fadd	dword [edi + (24*4)]		; ST = autoc[24]+d*data[sample+24] d
+	fstp	dword [edi + (24*4)]		; autoc[24]+=d*data[sample+24]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (23*4)]		; ST = d*data[sample+23] d
+	fadd	dword [edi + (23*4)]		; ST = autoc[23]+d*data[sample+23] d
+	fstp	dword [edi + (23*4)]		; autoc[23]+=d*data[sample+23]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (22*4)]		; ST = d*data[sample+22] d
+	fadd	dword [edi + (22*4)]		; ST = autoc[22]+d*data[sample+22] d
+	fstp	dword [edi + (22*4)]		; autoc[22]+=d*data[sample+22]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (21*4)]		; ST = d*data[sample+21] d
+	fadd	dword [edi + (21*4)]		; ST = autoc[21]+d*data[sample+21] d
+	fstp	dword [edi + (21*4)]		; autoc[21]+=d*data[sample+21]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (20*4)]		; ST = d*data[sample+20] d
+	fadd	dword [edi + (20*4)]		; ST = autoc[20]+d*data[sample+20] d
+	fstp	dword [edi + (20*4)]		; autoc[20]+=d*data[sample+20]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (19*4)]		; ST = d*data[sample+19] d
+	fadd	dword [edi + (19*4)]		; ST = autoc[19]+d*data[sample+19] d
+	fstp	dword [edi + (19*4)]		; autoc[19]+=d*data[sample+19]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (18*4)]		; ST = d*data[sample+18] d
+	fadd	dword [edi + (18*4)]		; ST = autoc[18]+d*data[sample+18] d
+	fstp	dword [edi + (18*4)]		; autoc[18]+=d*data[sample+18]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (17*4)]		; ST = d*data[sample+17] d
+	fadd	dword [edi + (17*4)]		; ST = autoc[17]+d*data[sample+17] d
+	fstp	dword [edi + (17*4)]		; autoc[17]+=d*data[sample+17]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (16*4)]		; ST = d*data[sample+16] d
+	fadd	dword [edi + (16*4)]		; ST = autoc[16]+d*data[sample+16] d
+	fstp	dword [edi + (16*4)]		; autoc[16]+=d*data[sample+16]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (15*4)]		; ST = d*data[sample+15] d
+	fadd	dword [edi + (15*4)]		; ST = autoc[15]+d*data[sample+15] d
+	fstp	dword [edi + (15*4)]		; autoc[15]+=d*data[sample+15]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (14*4)]		; ST = d*data[sample+14] d
+	fadd	dword [edi + (14*4)]		; ST = autoc[14]+d*data[sample+14] d
+	fstp	dword [edi + (14*4)]		; autoc[14]+=d*data[sample+14]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (13*4)]		; ST = d*data[sample+13] d
+	fadd	dword [edi + (13*4)]		; ST = autoc[13]+d*data[sample+13] d
+	fstp	dword [edi + (13*4)]		; autoc[13]+=d*data[sample+13]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (12*4)]		; ST = d*data[sample+12] d
+	fadd	dword [edi + (12*4)]		; ST = autoc[12]+d*data[sample+12] d
+	fstp	dword [edi + (12*4)]		; autoc[12]+=d*data[sample+12]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (11*4)]		; ST = d*data[sample+11] d
+	fadd	dword [edi + (11*4)]		; ST = autoc[11]+d*data[sample+11] d
+	fstp	dword [edi + (11*4)]		; autoc[11]+=d*data[sample+11]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + (10*4)]		; ST = d*data[sample+10] d
+	fadd	dword [edi + (10*4)]		; ST = autoc[10]+d*data[sample+10] d
+	fstp	dword [edi + (10*4)]		; autoc[10]+=d*data[sample+10]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 9*4)]		; ST = d*data[sample+9] d
+	fadd	dword [edi + ( 9*4)]		; ST = autoc[9]+d*data[sample+9] d
+	fstp	dword [edi + ( 9*4)]		; autoc[9]+=d*data[sample+9]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 8*4)]		; ST = d*data[sample+8] d
+	fadd	dword [edi + ( 8*4)]		; ST = autoc[8]+d*data[sample+8] d
+	fstp	dword [edi + ( 8*4)]		; autoc[8]+=d*data[sample+8]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 7*4)]		; ST = d*data[sample+7] d
+	fadd	dword [edi + ( 7*4)]		; ST = autoc[7]+d*data[sample+7] d
+	fstp	dword [edi + ( 7*4)]		; autoc[7]+=d*data[sample+7]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 6*4)]		; ST = d*data[sample+6] d
+	fadd	dword [edi + ( 6*4)]		; ST = autoc[6]+d*data[sample+6] d
+	fstp	dword [edi + ( 6*4)]		; autoc[6]+=d*data[sample+6]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 5*4)]		; ST = d*data[sample+4] d
+	fadd	dword [edi + ( 5*4)]		; ST = autoc[4]+d*data[sample+4] d
+	fstp	dword [edi + ( 5*4)]		; autoc[4]+=d*data[sample+4]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 4*4)]		; ST = d*data[sample+4] d
+	fadd	dword [edi + ( 4*4)]		; ST = autoc[4]+d*data[sample+4] d
+	fstp	dword [edi + ( 4*4)]		; autoc[4]+=d*data[sample+4]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 3*4)]		; ST = d*data[sample+3] d
+	fadd	dword [edi + ( 3*4)]		; ST = autoc[3]+d*data[sample+3] d
+	fstp	dword [edi + ( 3*4)]		; autoc[3]+=d*data[sample+3]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 2*4)]		; ST = d*data[sample+2] d
+	fadd	dword [edi + ( 2*4)]		; ST = autoc[2]+d*data[sample+2] d
+	fstp	dword [edi + ( 2*4)]		; autoc[2]+=d*data[sample+2]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi + ( 1*4)]		; ST = d*data[sample+1] d
+	fadd	dword [edi + ( 1*4)]		; ST = autoc[1]+d*data[sample+1] d
+	fstp	dword [edi + ( 1*4)]		; autoc[1]+=d*data[sample+1]  ST = d
+	fld	st0				; ST = d d
+	fmul	dword [esi]			; ST = d*data[sample] d			WATCHOUT: no displacement byte here!
+	fadd	dword [edi]			; ST = autoc[0]+d*data[sample] d	WATCHOUT: no displacement byte here!
+	fstp	dword [edi]			; autoc[0]+=d*data[sample]  ST = d	WATCHOUT: no displacement byte here!
+.jumper2_0:
+
+	fstp	st0				; pop d, ST = empty
+	add	esi, byte 4			; sample++
+	dec	ecx
+	jz	.loop2_end
+	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
+	fld	dword [esi]			; ST = d <- data[sample]
+	jmp	edx
+.loop2_end:
+
+.end:
+	pop	ebx
+	pop	edi
+	pop	esi
+	ret
+
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
+	;[esp + 16] == autoc[]
+	;[esp + 12] == lag
+	;[esp + 8] == data_len
+	;[esp + 4] == data[]
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 4)
+	;ASSERT(lag <= data_len)
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5
+
+	mov	edx, [esp + 8]			; edx == data_len
+	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
+
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
+	add	eax, 4
+	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
+	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+.warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
+	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
+	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
+	dec	edx
+	jz	.loop_end
+	ALIGN 16
+.loop_start:
+	; start by reading the next sample
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
+	add	eax, 4
+	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
+	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+	movss	xmm2, xmm0
+	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
+	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
+	dec	edx
+	jnz	.loop_start
+.loop_end:
+	; store autoc
+	mov	edx, [esp + 16]			; edx == autoc
+	movups	[edx], xmm5
+
+.end:
+	ret
+
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
+	;[esp + 16] == autoc[]
+	;[esp + 12] == lag
+	;[esp + 8] == data_len
+	;[esp + 4] == data[]
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 8)
+	;ASSERT(lag <= data_len)
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5
+	xorps	xmm6, xmm6
+
+	mov	edx, [esp + 8]			; edx == data_len
+	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
+
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
+	add	eax, 4
+	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
+	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
+.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
+	mulps	xmm0, xmm2
+	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
+	addps	xmm5, xmm0
+	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
+	dec	edx
+	jz	.loop_end
+	ALIGN 16
+.loop_start:
+	; start by reading the next sample
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
+	; here we reorder the instructions; see the (#) indexes for a logical order
+	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
+	add	eax, 4				; (0)
+	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
+	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
+	movss	xmm3, xmm2			; (5)
+	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
+	movss	xmm2, xmm0			; (6)
+	mulps	xmm1, xmm3			; (8)
+	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
+	addps	xmm6, xmm1			; (10)
+	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
+	dec	edx
+	jnz	.loop_start
+.loop_end:
+	; store autoc
+	mov	edx, [esp + 16]			; edx == autoc
+	movups	[edx], xmm5
+	movups	[edx + 16], xmm6
+
+.end:
+	ret
+
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
+	;[esp + 16] == autoc[]
+	;[esp + 12] == lag
+	;[esp + 8] == data_len
+	;[esp + 4] == data[]
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 12)
+	;ASSERT(lag <= data_len)
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5
+	xorps	xmm6, xmm6
+	xorps	xmm7, xmm7
+
+	mov	edx, [esp + 8]			; edx == data_len
+	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]
+
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
+	add	eax, 4
+	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
+	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
+	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
+.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm2
+	addps	xmm5, xmm1
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm3
+	addps	xmm6, xmm1
+	mulps	xmm0, xmm4
+	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+	dec	edx
+	jz	.loop_end
+	ALIGN 16
+.loop_start:
+	; start by reading the next sample
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
+	add	eax, 4
+	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
+
+	; shift xmm4:xmm3:xmm2 left by one float
+	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
+	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
+	movss	xmm4, xmm3
+	movss	xmm3, xmm2
+	movss	xmm2, xmm0
+
+	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm2
+	addps	xmm5, xmm1
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm3
+	addps	xmm6, xmm1
+	mulps	xmm0, xmm4
+	addps	xmm7, xmm0
+
+	dec	edx
+	jnz	.loop_start
+.loop_end:
+	; store autoc
+	mov	edx, [esp + 16]			; edx == autoc
+	movups	[edx], xmm5
+	movups	[edx + 16], xmm6
+	movups	[edx + 32], xmm7
+
+.end:
+	ret
+
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
+	;[ebp + 20] == autoc[]
+	;[ebp + 16] == lag
+	;[ebp + 12] == data_len
+	;[ebp +  8] == data[]
+	;[esp] == __m128
+	;[esp + 16] == __m128
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps'
+	sub	esp, 32
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 12)
+	;ASSERT(lag <= data_len)
+	;ASSERT(data_len > 0)
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5
+	xorps	xmm6, xmm6
+	movaps	[esp], xmm5
+	movaps	[esp + 16], xmm6
+
+	mov	edx, [ebp + 12]			; edx == data_len
+	mov	eax, [ebp +  8]			; eax == &data[sample] <- &data[0]
+
+	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
+	add	eax, 4
+	movaps	xmm1, xmm0			; xmm1 = 0,0,0,data[0]
+	shufps	xmm0, xmm0, 0		; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+	xorps	xmm2, xmm2			; xmm2 = 0,0,0,0
+	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
+	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
+	movaps	xmm7, xmm0
+	mulps	xmm7, xmm1
+	addps	xmm5, xmm7
+	dec	edx
+	jz	.loop_end
+	ALIGN 16
+.loop_start:
+	; start by reading the next sample
+	movss	xmm0, [eax]				; xmm0 = 0,0,0,data[sample]
+	add	eax, 4
+	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
+
+	; shift xmm4:xmm3:xmm2:xmm1 left by one float
+	shufps	xmm1, xmm1, 93h
+	shufps	xmm2, xmm2, 93h
+	shufps	xmm3, xmm3, 93h
+	shufps	xmm4, xmm4, 93h
+	movss	xmm4, xmm3
+	movss	xmm3, xmm2
+	movss	xmm2, xmm1
+	movss	xmm1, xmm0
+
+	; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
+	movaps	xmm7, xmm0
+	mulps	xmm7, xmm1
+	addps	xmm5, xmm7
+	movaps	xmm7, xmm0
+	mulps	xmm7, xmm2
+	addps	xmm6, xmm7
+	movaps	xmm7, xmm0
+	mulps	xmm7, xmm3
+	mulps	xmm0, xmm4
+	addps	xmm7, [esp]
+	addps	xmm0, [esp + 16]
+	movaps	[esp], xmm7
+	movaps	[esp + 16], xmm0
+
+	dec	edx
+	jnz	.loop_start
+.loop_end:
+	; store autoc
+	mov	edx, [ebp + 20]				; edx == autoc
+	movups	[edx], xmm5
+	movups	[edx + 16], xmm6
+	movaps	xmm5, [esp]
+	movaps	xmm6, [esp + 16]
+	movups	[edx + 32], xmm5
+	movups	[edx + 48], xmm6
+.end:
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+;
+;	for(i = 0; i < data_len; i++) {
+;		sum = 0;
+;		for(j = 0; j < order; j++)
+;			sum += qlp_coeff[j] * data[i-j-1];
+;		residual[i] = data[i] - (sum >> lp_quantization);
+;	}
+;
+	ALIGN	16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
+	;[esp + 40]	residual[]
+	;[esp + 36]	lp_quantization
+	;[esp + 32]	order
+	;[esp + 28]	qlp_coeff[]
+	;[esp + 24]	data_len
+	;[esp + 20]	data[]
+
+	;ASSERT(order > 0)
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	mov	esi, [esp + 20]			; esi = data[]
+	mov	edi, [esp + 40]			; edi = residual[]
+	mov	eax, [esp + 32]			; eax = order
+	mov	ebx, [esp + 24]			; ebx = data_len
+
+	test	ebx, ebx
+	jz	near .end			; do nothing if data_len == 0
+.begin:
+	cmp	eax, byte 1
+	jg	short .i_1more
+
+	mov	ecx, [esp + 28]
+	mov	edx, [ecx]			; edx = qlp_coeff[0]
+	mov	eax, [esi - 4]			; eax = data[-1]
+	mov	ecx, [esp + 36]			; cl = lp_quantization
+	ALIGN	16
+.i_1_loop_i:
+	imul	eax, edx
+	sar	eax, cl
+	neg	eax
+	add	eax, [esi]
+	mov	[edi], eax
+	mov	eax, [esi]
+	add	edi, byte 4
+	add	esi, byte 4
+	dec	ebx
+	jnz	.i_1_loop_i
+
+	jmp	.end
+
+.i_1more:
+	cmp	eax, byte 32			; for order <= 32 there is a faster routine
+	jbe	short .i_32
+
+	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
+	ALIGN 16
+.i_32more_loop_i:
+	xor	ebp, ebp
+	mov	ecx, [esp + 32]
+	mov	edx, ecx
+	shl	edx, 2
+	add	edx, [esp + 28]
+	neg	ecx
+	ALIGN	16
+.i_32more_loop_j:
+	sub	edx, byte 4
+	mov	eax, [edx]
+	imul	eax, [esi + 4 * ecx]
+	add	ebp, eax
+	inc	ecx
+	jnz	short .i_32more_loop_j
+
+	mov	ecx, [esp + 36]
+	sar	ebp, cl
+	neg	ebp
+	add	ebp, [esi]
+	mov	[edi], ebp
+	add	esi, byte 4
+	add	edi, byte 4
+
+	dec	ebx
+	jnz	.i_32more_loop_i
+
+	jmp	.end
+
+.mov_eip_to_eax:
+	mov	eax, [esp]
+	ret
+
+.i_32:
+	sub	edi, esi
+	neg	eax
+	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
+	call	.mov_eip_to_eax
+.get_eip0:
+	add	edx, eax
+	inc	edx
+	mov	eax, [esp + 28]			; eax = qlp_coeff[]
+	xor	ebp, ebp
+	jmp	edx
+
+	mov	ecx, [eax + 124]
+	imul	ecx, [esi - 128]
+	add	ebp, ecx
+	mov	ecx, [eax + 120]
+	imul	ecx, [esi - 124]
+	add	ebp, ecx
+	mov	ecx, [eax + 116]
+	imul	ecx, [esi - 120]
+	add	ebp, ecx
+	mov	ecx, [eax + 112]
+	imul	ecx, [esi - 116]
+	add	ebp, ecx
+	mov	ecx, [eax + 108]
+	imul	ecx, [esi - 112]
+	add	ebp, ecx
+	mov	ecx, [eax + 104]
+	imul	ecx, [esi - 108]
+	add	ebp, ecx
+	mov	ecx, [eax + 100]
+	imul	ecx, [esi - 104]
+	add	ebp, ecx
+	mov	ecx, [eax + 96]
+	imul	ecx, [esi - 100]
+	add	ebp, ecx
+	mov	ecx, [eax + 92]
+	imul	ecx, [esi - 96]
+	add	ebp, ecx
+	mov	ecx, [eax + 88]
+	imul	ecx, [esi - 92]
+	add	ebp, ecx
+	mov	ecx, [eax + 84]
+	imul	ecx, [esi - 88]
+	add	ebp, ecx
+	mov	ecx, [eax + 80]
+	imul	ecx, [esi - 84]
+	add	ebp, ecx
+	mov	ecx, [eax + 76]
+	imul	ecx, [esi - 80]
+	add	ebp, ecx
+	mov	ecx, [eax + 72]
+	imul	ecx, [esi - 76]
+	add	ebp, ecx
+	mov	ecx, [eax + 68]
+	imul	ecx, [esi - 72]
+	add	ebp, ecx
+	mov	ecx, [eax + 64]
+	imul	ecx, [esi - 68]
+	add	ebp, ecx
+	mov	ecx, [eax + 60]
+	imul	ecx, [esi - 64]
+	add	ebp, ecx
+	mov	ecx, [eax + 56]
+	imul	ecx, [esi - 60]
+	add	ebp, ecx
+	mov	ecx, [eax + 52]
+	imul	ecx, [esi - 56]
+	add	ebp, ecx
+	mov	ecx, [eax + 48]
+	imul	ecx, [esi - 52]
+	add	ebp, ecx
+	mov	ecx, [eax + 44]
+	imul	ecx, [esi - 48]
+	add	ebp, ecx
+	mov	ecx, [eax + 40]
+	imul	ecx, [esi - 44]
+	add	ebp, ecx
+	mov	ecx, [eax + 36]
+	imul	ecx, [esi - 40]
+	add	ebp, ecx
+	mov	ecx, [eax + 32]
+	imul	ecx, [esi - 36]
+	add	ebp, ecx
+	mov	ecx, [eax + 28]
+	imul	ecx, [esi - 32]
+	add	ebp, ecx
+	mov	ecx, [eax + 24]
+	imul	ecx, [esi - 28]
+	add	ebp, ecx
+	mov	ecx, [eax + 20]
+	imul	ecx, [esi - 24]
+	add	ebp, ecx
+	mov	ecx, [eax + 16]
+	imul	ecx, [esi - 20]
+	add	ebp, ecx
+	mov	ecx, [eax + 12]
+	imul	ecx, [esi - 16]
+	add	ebp, ecx
+	mov	ecx, [eax + 8]
+	imul	ecx, [esi - 12]
+	add	ebp, ecx
+	mov	ecx, [eax + 4]
+	imul	ecx, [esi - 8]
+	add	ebp, ecx
+	mov	ecx, [eax]			; there is one byte missing
+	imul	ecx, [esi - 4]
+	add	ebp, ecx
+.jumper_0:
+
+	mov	ecx, [esp + 36]
+	sar	ebp, cl
+	neg	ebp
+	add	ebp, [esi]
+	mov	[edi + esi], ebp
+	add	esi, byte 4
+
+	dec	ebx
+	jz	short .end
+	xor	ebp, ebp
+	jmp	edx
+
+.end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
+; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
+; cannot be used for side-channel coded 16bps channels since the effective bps
+; is 17.
+	ALIGN	16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
+	;[esp + 40]	residual[]
+	;[esp + 36]	lp_quantization
+	;[esp + 32]	order
+	;[esp + 28]	qlp_coeff[]
+	;[esp + 24]	data_len
+	;[esp + 20]	data[]
+
+	;ASSERT(order > 0)
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	mov	esi, [esp + 20]			; esi = data[]
+	mov	edi, [esp + 40]			; edi = residual[]
+	mov	eax, [esp + 32]			; eax = order
+	mov	ebx, [esp + 24]			; ebx = data_len
+
+	test	ebx, ebx
+	jz	near .end			; do nothing if data_len == 0
+	dec	ebx
+	test	ebx, ebx
+	jz	near .last_one
+
+	mov	edx, [esp + 28]			; edx = qlp_coeff[]
+	movd	mm6, [esp + 36]			; mm6 = 0:lp_quantization
+	mov	ebp, esp
+
+	and	esp, 0xfffffff8
+
+	xor	ecx, ecx
+.copy_qlp_loop:
+	push	word [edx + 4 * ecx]
+	inc	ecx
+	cmp	ecx, eax
+	jnz	short .copy_qlp_loop
+
+	and	ecx, 0x3
+	test	ecx, ecx
+	je	short .za_end
+	sub	ecx, byte 4
+.za_loop:
+	push	word 0
+	inc	eax
+	inc	ecx
+	jnz	short .za_loop
+.za_end:
+
+	movq	mm5, [esp + 2 * eax - 8]
+	movd	mm4, [esi - 16]
+	punpckldq	mm4, [esi - 12]
+	movd	mm0, [esi - 8]
+	punpckldq	mm0, [esi - 4]
+	packssdw	mm4, mm0
+
+	cmp	eax, byte 4
+	jnbe	short .mmx_4more
+
+	ALIGN	16
+.mmx_4_loop_i:
+	movd	mm1, [esi]
+	movq	mm3, mm4
+	punpckldq	mm1, [esi + 4]
+	psrlq	mm4, 16
+	movq	mm0, mm1
+	psllq	mm0, 48
+	por	mm4, mm0
+	movq	mm2, mm4
+	psrlq	mm4, 16
+	pxor	mm0, mm0
+	punpckhdq	mm0, mm1
+	pmaddwd	mm3, mm5
+	pmaddwd	mm2, mm5
+	psllq	mm0, 16
+	por	mm4, mm0
+	movq	mm0, mm3
+	punpckldq	mm3, mm2
+	punpckhdq	mm0, mm2
+	paddd	mm3, mm0
+	psrad	mm3, mm6
+	psubd	mm1, mm3
+	movd	[edi], mm1
+	punpckhdq	mm1, mm1
+	movd	[edi + 4], mm1
+
+	add	edi, byte 8
+	add	esi, byte 8
+
+	sub	ebx, 2
+	jg	.mmx_4_loop_i
+	jmp	.mmx_end
+
+.mmx_4more:
+	shl	eax, 2
+	neg	eax
+	add	eax, byte 16
+
+	ALIGN	16
+.mmx_4more_loop_i:
+	movd	mm1, [esi]
+	punpckldq	mm1, [esi + 4]
+	movq	mm3, mm4
+	psrlq	mm4, 16
+	movq	mm0, mm1
+	psllq	mm0, 48
+	por	mm4, mm0
+	movq	mm2, mm4
+	psrlq	mm4, 16
+	pxor	mm0, mm0
+	punpckhdq	mm0, mm1
+	pmaddwd	mm3, mm5
+	pmaddwd	mm2, mm5
+	psllq	mm0, 16
+	por	mm4, mm0
+
+	mov	ecx, esi
+	add	ecx, eax
+	mov	edx, esp
+
+	ALIGN	16
+.mmx_4more_loop_j:
+	movd	mm0, [ecx - 16]
+	movd	mm7, [ecx - 8]
+	punpckldq	mm0, [ecx - 12]
+	punpckldq	mm7, [ecx - 4]
+	packssdw	mm0, mm7
+	pmaddwd	mm0, [edx]
+	punpckhdq	mm7, mm7
+	paddd	mm3, mm0
+	movd	mm0, [ecx - 12]
+	punpckldq	mm0, [ecx - 8]
+	punpckldq	mm7, [ecx]
+	packssdw	mm0, mm7
+	pmaddwd	mm0, [edx]
+	paddd	mm2, mm0
+
+	add	edx, byte 8
+	add	ecx, byte 16
+	cmp	ecx, esi
+	jnz	.mmx_4more_loop_j
+
+	movq	mm0, mm3
+	punpckldq	mm3, mm2
+	punpckhdq	mm0, mm2
+	paddd	mm3, mm0
+	psrad	mm3, mm6
+	psubd	mm1, mm3
+	movd	[edi], mm1
+	punpckhdq	mm1, mm1
+	movd	[edi + 4], mm1
+
+	add	edi, byte 8
+	add	esi, byte 8
+
+	sub	ebx, 2
+	jg	near .mmx_4more_loop_i
+
+.mmx_end:
+	emms
+	mov	esp, ebp
+.last_one:
+	mov	eax, [esp + 32]
+	inc	ebx
+	jnz	near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
+
+.end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+; **********************************************************************
+;
+; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+; {
+; 	unsigned i, j;
+; 	FLAC__int32 sum;
+;
+; 	FLAC__ASSERT(order > 0);
+;
+; 	for(i = 0; i < data_len; i++) {
+; 		sum = 0;
+; 		for(j = 0; j < order; j++)
+; 			sum += qlp_coeff[j] * data[i-j-1];
+; 		data[i] = residual[i] + (sum >> lp_quantization);
+; 	}
+; }
+	ALIGN	16
+cident FLAC__lpc_restore_signal_asm_ia32
+	;[esp + 40]	data[]
+	;[esp + 36]	lp_quantization
+	;[esp + 32]	order
+	;[esp + 28]	qlp_coeff[]
+	;[esp + 24]	data_len
+	;[esp + 20]	residual[]
+
+	;ASSERT(order > 0)
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	mov	esi, [esp + 20]			; esi = residual[]
+	mov	edi, [esp + 40]			; edi = data[]
+	mov	eax, [esp + 32]			; eax = order
+	mov	ebx, [esp + 24]			; ebx = data_len
+
+	test	ebx, ebx
+	jz	near .end			; do nothing if data_len == 0
+
+.begin:
+	cmp	eax, byte 1
+	jg	short .x87_1more
+
+	mov	ecx, [esp + 28]
+	mov	edx, [ecx]
+	mov	eax, [edi - 4]
+	mov	ecx, [esp + 36]
+	ALIGN	16
+.x87_1_loop_i:
+	imul	eax, edx
+	sar	eax, cl
+	add	eax, [esi]
+	mov	[edi], eax
+	add	esi, byte 4
+	add	edi, byte 4
+	dec	ebx
+	jnz	.x87_1_loop_i
+
+	jmp	.end
+
+.x87_1more:
+	cmp	eax, byte 32			; for order <= 32 there is a faster routine
+	jbe	short .x87_32
+
+	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
+	ALIGN 16
+.x87_32more_loop_i:
+	xor	ebp, ebp
+	mov	ecx, [esp + 32]
+	mov	edx, ecx
+	shl	edx, 2
+	add	edx, [esp + 28]
+	neg	ecx
+	ALIGN	16
+.x87_32more_loop_j:
+	sub	edx, byte 4
+	mov	eax, [edx]
+	imul	eax, [edi + 4 * ecx]
+	add	ebp, eax
+	inc	ecx
+	jnz	short .x87_32more_loop_j
+
+	mov	ecx, [esp + 36]
+	sar	ebp, cl
+	add	ebp, [esi]
+	mov	[edi], ebp
+	add	edi, byte 4
+	add	esi, byte 4
+
+	dec	ebx
+	jnz	.x87_32more_loop_i
+
+	jmp	.end
+
+.mov_eip_to_eax:
+	mov	eax, [esp]
+	ret
+
+.x87_32:
+	sub	esi, edi
+	neg	eax
+	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
+	call	.mov_eip_to_eax
+.get_eip0:
+	add	edx, eax
+	inc	edx				; compensate for the shorter opcode on the last iteration
+	mov	eax, [esp + 28]			; eax = qlp_coeff[]
+	xor	ebp, ebp
+	jmp	edx
+
+	mov	ecx, [eax + 124]		; ecx =  qlp_coeff[31]
+	imul	ecx, [edi - 128]		; ecx =  qlp_coeff[31] * data[i-32]
+	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
+	mov	ecx, [eax + 120]		; ecx =  qlp_coeff[30]
+	imul	ecx, [edi - 124]		; ecx =  qlp_coeff[30] * data[i-31]
+	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
+	mov	ecx, [eax + 116]		; ecx =  qlp_coeff[29]
+	imul	ecx, [edi - 120]		; ecx =  qlp_coeff[29] * data[i-30]
+	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
+	mov	ecx, [eax + 112]		; ecx =  qlp_coeff[28]
+	imul	ecx, [edi - 116]		; ecx =  qlp_coeff[28] * data[i-29]
+	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
+	mov	ecx, [eax + 108]		; ecx =  qlp_coeff[27]
+	imul	ecx, [edi - 112]		; ecx =  qlp_coeff[27] * data[i-28]
+	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
+	mov	ecx, [eax + 104]		; ecx =  qlp_coeff[26]
+	imul	ecx, [edi - 108]		; ecx =  qlp_coeff[26] * data[i-27]
+	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
+	mov	ecx, [eax + 100]		; ecx =  qlp_coeff[25]
+	imul	ecx, [edi - 104]		; ecx =  qlp_coeff[25] * data[i-26]
+	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
+	mov	ecx, [eax + 96]			; ecx =  qlp_coeff[24]
+	imul	ecx, [edi - 100]		; ecx =  qlp_coeff[24] * data[i-25]
+	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
+	mov	ecx, [eax + 92]			; ecx =  qlp_coeff[23]
+	imul	ecx, [edi - 96]			; ecx =  qlp_coeff[23] * data[i-24]
+	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
+	mov	ecx, [eax + 88]			; ecx =  qlp_coeff[22]
+	imul	ecx, [edi - 92]			; ecx =  qlp_coeff[22] * data[i-23]
+	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
+	mov	ecx, [eax + 84]			; ecx =  qlp_coeff[21]
+	imul	ecx, [edi - 88]			; ecx =  qlp_coeff[21] * data[i-22]
+	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
+	mov	ecx, [eax + 80]			; ecx =  qlp_coeff[20]
+	imul	ecx, [edi - 84]			; ecx =  qlp_coeff[20] * data[i-21]
+	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
+	mov	ecx, [eax + 76]			; ecx =  qlp_coeff[19]
+	imul	ecx, [edi - 80]			; ecx =  qlp_coeff[19] * data[i-20]
+	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
+	mov	ecx, [eax + 72]			; ecx =  qlp_coeff[18]
+	imul	ecx, [edi - 76]			; ecx =  qlp_coeff[18] * data[i-19]
+	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
+	mov	ecx, [eax + 68]			; ecx =  qlp_coeff[17]
+	imul	ecx, [edi - 72]			; ecx =  qlp_coeff[17] * data[i-18]
+	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
+	mov	ecx, [eax + 64]			; ecx =  qlp_coeff[16]
+	imul	ecx, [edi - 68]			; ecx =  qlp_coeff[16] * data[i-17]
+	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
+	mov	ecx, [eax + 60]			; ecx =  qlp_coeff[15]
+	imul	ecx, [edi - 64]			; ecx =  qlp_coeff[15] * data[i-16]
+	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
+	mov	ecx, [eax + 56]			; ecx =  qlp_coeff[14]
+	imul	ecx, [edi - 60]			; ecx =  qlp_coeff[14] * data[i-15]
+	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
+	mov	ecx, [eax + 52]			; ecx =  qlp_coeff[13]
+	imul	ecx, [edi - 56]			; ecx =  qlp_coeff[13] * data[i-14]
+	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
+	mov	ecx, [eax + 48]			; ecx =  qlp_coeff[12]
+	imul	ecx, [edi - 52]			; ecx =  qlp_coeff[12] * data[i-13]
+	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
+	mov	ecx, [eax + 44]			; ecx =  qlp_coeff[11]
+	imul	ecx, [edi - 48]			; ecx =  qlp_coeff[11] * data[i-12]
+	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
+	mov	ecx, [eax + 40]			; ecx =  qlp_coeff[10]
+	imul	ecx, [edi - 44]			; ecx =  qlp_coeff[10] * data[i-11]
+	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
+	mov	ecx, [eax + 36]			; ecx =  qlp_coeff[ 9]
+	imul	ecx, [edi - 40]			; ecx =  qlp_coeff[ 9] * data[i-10]
+	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
+	mov	ecx, [eax + 32]			; ecx =  qlp_coeff[ 8]
+	imul	ecx, [edi - 36]			; ecx =  qlp_coeff[ 8] * data[i- 9]
+	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
+	mov	ecx, [eax + 28]			; ecx =  qlp_coeff[ 7]
+	imul	ecx, [edi - 32]			; ecx =  qlp_coeff[ 7] * data[i- 8]
+	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
+	mov	ecx, [eax + 24]			; ecx =  qlp_coeff[ 6]
+	imul	ecx, [edi - 28]			; ecx =  qlp_coeff[ 6] * data[i- 7]
+	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
+	mov	ecx, [eax + 20]			; ecx =  qlp_coeff[ 5]
+	imul	ecx, [edi - 24]			; ecx =  qlp_coeff[ 5] * data[i- 6]
+	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
+	mov	ecx, [eax + 16]			; ecx =  qlp_coeff[ 4]
+	imul	ecx, [edi - 20]			; ecx =  qlp_coeff[ 4] * data[i- 5]
+	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
+	mov	ecx, [eax + 12]			; ecx =  qlp_coeff[ 3]
+	imul	ecx, [edi - 16]			; ecx =  qlp_coeff[ 3] * data[i- 4]
+	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
+	mov	ecx, [eax + 8]			; ecx =  qlp_coeff[ 2]
+	imul	ecx, [edi - 12]			; ecx =  qlp_coeff[ 2] * data[i- 3]
+	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
+	mov	ecx, [eax + 4]			; ecx =  qlp_coeff[ 1]
+	imul	ecx, [edi - 8]			; ecx =  qlp_coeff[ 1] * data[i- 2]
+	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
+	mov	ecx, [eax]			; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
+	imul	ecx, [edi - 4]			; ecx =  qlp_coeff[ 0] * data[i- 1]
+	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
+.jumper_0:
+
+	mov	ecx, [esp + 36]
+	sar	ebp, cl				; ebp = (sum >> lp_quantization)
+	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
+	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
+	add	edi, byte 4
+
+	dec	ebx
+	jz	short .end
+	xor	ebp, ebp
+	jmp	edx
+
+.end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
+; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
+; cannot be used for side-channel coded 16bps channels since the effective bps
+; is 17.
+; WATCHOUT: this routine requires that each data array have a buffer of up to
+; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
+; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
+	ALIGN	16
+cident FLAC__lpc_restore_signal_asm_ia32_mmx
+	;[esp + 40]	data[]
+	;[esp + 36]	lp_quantization
+	;[esp + 32]	order
+	;[esp + 28]	qlp_coeff[]
+	;[esp + 24]	data_len
+	;[esp + 20]	residual[]
+
+	;ASSERT(order > 0)
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	mov	esi, [esp + 20]
+	mov	edi, [esp + 40]
+	mov	eax, [esp + 32]
+	mov	ebx, [esp + 24]
+
+	test	ebx, ebx
+	jz	near .end			; do nothing if data_len == 0
+	cmp	eax, byte 4
+	jb	near FLAC__lpc_restore_signal_asm_ia32.begin
+
+	mov	edx, [esp + 28]
+	movd	mm6, [esp + 36]
+	mov	ebp, esp
+
+	and	esp, 0xfffffff8
+
+	xor	ecx, ecx
+.copy_qlp_loop:
+	push	word [edx + 4 * ecx]
+	inc	ecx
+	cmp	ecx, eax
+	jnz	short .copy_qlp_loop
+
+	and	ecx, 0x3
+	test	ecx, ecx
+	je	short .za_end
+	sub	ecx, byte 4
+.za_loop:
+	push	word 0
+	inc	eax
+	inc	ecx
+	jnz	short .za_loop
+.za_end:
+
+	movq	mm5, [esp + 2 * eax - 8]
+	movd	mm4, [edi - 16]
+	punpckldq	mm4, [edi - 12]
+	movd	mm0, [edi - 8]
+	punpckldq	mm0, [edi - 4]
+	packssdw	mm4, mm0
+
+	cmp	eax, byte 4
+	jnbe	short .mmx_4more
+
+	ALIGN	16
+.mmx_4_loop_i:
+	movq	mm7, mm4
+	pmaddwd	mm7, mm5
+	movq	mm0, mm7
+	punpckhdq	mm7, mm7
+	paddd	mm7, mm0
+	psrad	mm7, mm6
+	movd	mm1, [esi]
+	paddd	mm7, mm1
+	movd	[edi], mm7
+	psllq	mm7, 48
+	psrlq	mm4, 16
+	por	mm4, mm7
+
+	add	esi, byte 4
+	add	edi, byte 4
+
+	dec	ebx
+	jnz	.mmx_4_loop_i
+	jmp	.mmx_end
+.mmx_4more:
+	shl	eax, 2
+	neg	eax
+	add	eax, byte 16
+	ALIGN	16
+.mmx_4more_loop_i:
+	mov	ecx, edi
+	add	ecx, eax
+	mov	edx, esp
+
+	movq	mm7, mm4
+	pmaddwd	mm7, mm5
+
+	ALIGN	16
+.mmx_4more_loop_j:
+	movd	mm0, [ecx - 16]
+	punpckldq	mm0, [ecx - 12]
+	movd	mm1, [ecx - 8]
+	punpckldq	mm1, [ecx - 4]
+	packssdw	mm0, mm1
+	pmaddwd	mm0, [edx]
+	paddd	mm7, mm0
+
+	add	edx, byte 8
+	add	ecx, byte 16
+	cmp	ecx, edi
+	jnz	.mmx_4more_loop_j
+
+	movq	mm0, mm7
+	punpckhdq	mm7, mm7
+	paddd	mm7, mm0
+	psrad	mm7, mm6
+	movd	mm1, [esi]
+	paddd	mm7, mm1
+	movd	[edi], mm7
+	psllq	mm7, 48
+	psrlq	mm4, 16
+	por	mm4, mm7
+
+	add	esi, byte 4
+	add	edi, byte 4
+
+	dec	ebx
+	jnz	short .mmx_4more_loop_i
+.mmx_end:
+	emms
+	mov	esp, ebp
+
+.end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+
+; **********************************************************************
+;
+;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+; {
+; 	unsigned i, j;
+; 	FLAC__int64 sum;
+;
+; 	FLAC__ASSERT(order > 0);
+;
+;	for(i = 0; i < data_len; i++) {
+;		sum = 0;
+;		for(j = 0; j < order; j++)
+;			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
+;		residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+;	}
+; }
+	ALIGN	16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
+	;[esp + 40]	residual[]
+	;[esp + 36]	lp_quantization
+	;[esp + 32]	order
+	;[esp + 28]	qlp_coeff[]
+	;[esp + 24]	data_len
+	;[esp + 20]	data[]
+
+	;ASSERT(order > 0)
+	;ASSERT(order <= 32)
+	;ASSERT(lp_quantization <= 31)
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	mov	ebx, [esp + 24]			; ebx = data_len
+	test	ebx, ebx
+	jz	near .end				; do nothing if data_len == 0
+
+.begin:
+	mov	eax, [esp + 32]			; eax = order
+	cmp	eax, 1
+	jg	short .i_32
+
+	mov	esi, [esp + 40]			; esi = residual[]
+	mov	edi, [esp + 20]			; edi = data[]
+	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
+	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
+	mov	eax, [edi - 4]			; eax = data[-1]
+	mov	ecx, [esp + 36]			; cl = lp_quantization
+	ALIGN	16
+.i_1_loop_i:
+	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
+	shrd	eax, edx, cl		; 0 <= lp_quantization <= 15
+	neg	eax
+	add	eax, [edi]
+	mov	[esi], eax
+	mov	eax, [edi]
+	add	esi, 4
+	add	edi, 4
+	dec	ebx
+	jnz	.i_1_loop_i
+	jmp	.end
+
+.mov_eip_to_eax:
+	mov	eax, [esp]
+	ret
+
+.i_32:	; eax = order
+	neg	eax
+	add	eax, eax
+	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
+	call	.mov_eip_to_eax
+.get_eip0:
+	add	ebp, eax
+	inc	ebp				; compensate for the shorter opcode on the last iteration
+
+	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
+	mov	edi, [esp + 20]			; edi = data[]
+	sub	[esp + 40], edi			; residual[] -= data[]
+
+	xor	ecx, ecx
+	xor	esi, esi
+	jmp	ebp
+
+;eax = --
+;edx = --
+;ecx = 0
+;esi = 0
+;
+;ebx = qlp_coeff[]
+;edi = data[]
+;ebp = @address
+
+	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
+	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
+	add	ecx, eax
+	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]
+
+	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
+	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
+	add	ecx, eax
+	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]
+
+	mov	eax, [ebx + 116]
+	imul	dword [edi - 120]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 112]
+	imul	dword [edi - 116]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 108]
+	imul	dword [edi - 112]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 104]
+	imul	dword [edi - 108]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 100]
+	imul	dword [edi - 104]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 96]
+	imul	dword [edi - 100]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 92]
+	imul	dword [edi - 96]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 88]
+	imul	dword [edi - 92]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 84]
+	imul	dword [edi - 88]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 80]
+	imul	dword [edi - 84]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 76]
+	imul	dword [edi - 80]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 72]
+	imul	dword [edi - 76]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 68]
+	imul	dword [edi - 72]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 64]
+	imul	dword [edi - 68]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 60]
+	imul	dword [edi - 64]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 56]
+	imul	dword [edi - 60]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 52]
+	imul	dword [edi - 56]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 48]
+	imul	dword [edi - 52]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 44]
+	imul	dword [edi - 48]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 40]
+	imul	dword [edi - 44]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 36]
+	imul	dword [edi - 40]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 32]
+	imul	dword [edi - 36]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 28]
+	imul	dword [edi - 32]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 24]
+	imul	dword [edi - 28]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 20]
+	imul	dword [edi - 24]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 16]
+	imul	dword [edi - 20]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 12]
+	imul	dword [edi - 16]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 8]
+	imul	dword [edi - 12]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 4]
+	imul	dword [edi - 8]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
+	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
+	add	ecx, eax
+	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]
+
+.jumper_0:
+	mov	edx, ecx
+;esi:edx = sum
+	mov	ecx, [esp + 36]			; cl = lp_quantization
+	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
+;eax = --
+;ecx = --
+;edx = sum >> lp_q
+;esi = --
+	neg	edx						; edx = -(sum >> lp_quantization)
+	mov	eax, [esp + 40]			; residual[] - data[]
+	add	edx, [edi]				; edx = data[i] - (sum >> lp_quantization)
+	mov	[edi + eax], edx
+	add	edi, 4
+
+	dec	dword [esp + 24]
+	jz	short .end
+	xor	ecx, ecx
+	xor	esi, esi
+	jmp	ebp
+
+.end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+; **********************************************************************
+;
+; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+; {
+; 	unsigned i, j;
+; 	FLAC__int64 sum;
+;
+; 	FLAC__ASSERT(order > 0);
+;
+; 	for(i = 0; i < data_len; i++) {
+; 		sum = 0;
+; 		for(j = 0; j < order; j++)
+; 			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
+; 		data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
+; 	}
+; }
+	ALIGN	16
+cident FLAC__lpc_restore_signal_wide_asm_ia32
+	;[esp + 40]	data[]
+	;[esp + 36]	lp_quantization
+	;[esp + 32]	order
+	;[esp + 28]	qlp_coeff[]
+	;[esp + 24]	data_len
+	;[esp + 20]	residual[]
+
+	;ASSERT(order > 0)
+	;ASSERT(order <= 32)
+	;ASSERT(lp_quantization <= 31)
+
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+
+	mov	ebx, [esp + 24]			; ebx = data_len
+	test	ebx, ebx
+	jz	near .end				; do nothing if data_len == 0
+
+.begin:
+	mov	eax, [esp + 32]			; eax = order
+	cmp	eax, 1
+	jg	short .x87_32
+
+	mov	esi, [esp + 20]			; esi = residual[]
+	mov	edi, [esp + 40]			; edi = data[]
+	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
+	mov	ebp, [ecx]				; ebp = qlp_coeff[0]
+	mov	eax, [edi - 4]			; eax = data[-1]
+	mov	ecx, [esp + 36]			; cl = lp_quantization
+	ALIGN	16
+.x87_1_loop_i:
+	imul	ebp					; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
+	shrd	eax, edx, cl		; 0 <= lp_quantization <= 15
+;
+	add	eax, [esi]
+	mov	[edi], eax
+;
+	add	esi, 4
+	add	edi, 4
+	dec	ebx
+	jnz	.x87_1_loop_i
+	jmp	.end
+
+.mov_eip_to_eax:
+	mov	eax, [esp]
+	ret
+
+.x87_32:	; eax = order
+	neg	eax
+	add	eax, eax
+	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
+	call	.mov_eip_to_eax
+.get_eip0:
+	add	ebp, eax
+	inc	ebp				; compensate for the shorter opcode on the last iteration
+
+	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
+	mov	edi, [esp + 40]			; esi = data[]
+	sub	[esp + 20], edi			; residual[] -= data[]
+
+	xor	ecx, ecx
+	xor	esi, esi
+	jmp	ebp
+
+;eax = --
+;edx = --
+;ecx = 0
+;esi = 0
+;
+;ebx = qlp_coeff[]
+;edi = data[]
+;ebp = @address
+
+	mov	eax, [ebx + 124]			; eax =  qlp_coeff[31]
+	imul	dword [edi - 128]		; edx:eax =  qlp_coeff[31] * data[i-32]
+	add	ecx, eax
+	adc	esi, edx					; sum += qlp_coeff[31] * data[i-32]
+
+	mov	eax, [ebx + 120]			; eax =  qlp_coeff[30]
+	imul	dword [edi - 124]		; edx:eax =  qlp_coeff[30] * data[i-31]
+	add	ecx, eax
+	adc	esi, edx					; sum += qlp_coeff[30] * data[i-31]
+
+	mov	eax, [ebx + 116]
+	imul	dword [edi - 120]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 112]
+	imul	dword [edi - 116]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 108]
+	imul	dword [edi - 112]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 104]
+	imul	dword [edi - 108]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 100]
+	imul	dword [edi - 104]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 96]
+	imul	dword [edi - 100]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 92]
+	imul	dword [edi - 96]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 88]
+	imul	dword [edi - 92]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 84]
+	imul	dword [edi - 88]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 80]
+	imul	dword [edi - 84]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 76]
+	imul	dword [edi - 80]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 72]
+	imul	dword [edi - 76]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 68]
+	imul	dword [edi - 72]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 64]
+	imul	dword [edi - 68]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 60]
+	imul	dword [edi - 64]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 56]
+	imul	dword [edi - 60]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 52]
+	imul	dword [edi - 56]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 48]
+	imul	dword [edi - 52]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 44]
+	imul	dword [edi - 48]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 40]
+	imul	dword [edi - 44]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 36]
+	imul	dword [edi - 40]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 32]
+	imul	dword [edi - 36]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 28]
+	imul	dword [edi - 32]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 24]
+	imul	dword [edi - 28]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 20]
+	imul	dword [edi - 24]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 16]
+	imul	dword [edi - 20]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 12]
+	imul	dword [edi - 16]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 8]
+	imul	dword [edi - 12]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx + 4]
+	imul	dword [edi - 8]
+	add	ecx, eax
+	adc	esi, edx
+
+	mov	eax, [ebx]					; eax =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
+	imul	dword [edi - 4]			; edx:eax =  qlp_coeff[ 0] * data[i- 1]
+	add	ecx, eax
+	adc	esi, edx					; sum += qlp_coeff[ 0] * data[i- 1]
+
+.jumper_0:
+	mov	edx, ecx
+;esi:edx = sum
+	mov	ecx, [esp + 36]			; cl = lp_quantization
+	shrd	edx, esi, cl		; edx = (sum >> lp_quantization)
+;eax = --
+;ecx = --
+;edx = sum >> lp_q
+;esi = --
+;
+	mov	eax, [esp + 20]			; residual[] - data[]
+	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization)
+	mov	[edi], edx				; data[i] = residual[i] + (sum >> lp_quantization)
+	add	edi, 4
+
+	dec	dword [esp + 24]
+	jz	short .end
+	xor	ecx, ecx
+	xor	esi, esi
+	jmp	ebp
+
+.end:
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+; end
diff --git a/src/libFLAC/ia32/nasm.h b/src/libFLAC/ia32/nasm.h
new file mode 100644
index 0000000..efa6808
--- /dev/null
+++ b/src/libFLAC/ia32/nasm.h
@@ -0,0 +1,85 @@
+;  libFLAC - Free Lossless Audio Codec library
+;  Copyright (C) 2001-2009  Josh Coalson
+;  Copyright (C) 2011-2014  Xiph.Org Foundation
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;
+;  - Redistributions of source code must retain the above copyright
+;  notice, this list of conditions and the following disclaimer.
+;
+;  - Redistributions in binary form must reproduce the above copyright
+;  notice, this list of conditions and the following disclaimer in the
+;  documentation and/or other materials provided with the distribution.
+;
+;  - Neither the name of the Xiph.org Foundation nor the names of its
+;  contributors may be used to endorse or promote products derived from
+;  this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+	bits 32
+
+%ifdef OBJ_FORMAT_win32
+	%define FLAC__PUBLIC_NEEDS_UNDERSCORE
+	%idefine code_section section .text align=16 class=CODE use32
+	%idefine data_section section .data align=32 class=DATA use32
+	%idefine bss_section  section .bss  align=32 class=DATA use32
+%elifdef OBJ_FORMAT_aout
+	%define FLAC__PUBLIC_NEEDS_UNDERSCORE
+	%idefine code_section section .text
+	%idefine data_section section .data
+	%idefine bss_section  section .bss
+%elifdef OBJ_FORMAT_aoutb
+	%define FLAC__PUBLIC_NEEDS_UNDERSCORE
+	%idefine code_section section .text
+	%idefine data_section section .data
+	%idefine bss_section  section .bss
+%elifdef OBJ_FORMAT_elf
+	%idefine code_section section .text align=16
+	%idefine data_section section .data align=32
+	%idefine bss_section  section .bss  align=32
+%else
+	%error unsupported object format!
+%endif
+
+%imacro cglobal 1
+	%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
+		global _%1
+	%else
+		%if __NASM_MAJOR__ >= 2
+			global %1:function hidden
+		%else
+			global %1
+		%endif
+	%endif
+%endmacro
+
+%imacro cextern 1
+	%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
+		extern _%1
+	%else
+		extern %1
+	%endif
+%endmacro
+
+%imacro cident 1
+_%1:
+%1:
+%endmacro
+
+%ifdef OBJ_FORMAT_elf
+section .note.GNU-stack progbits noalloc noexec nowrite align=1
+%endif
+
diff --git a/src/libFLAC/include/private/all.h b/src/libFLAC/include/private/all.h
index 304c471..a4463d2 100644
--- a/src/libFLAC/include/private/all.h
+++ b/src/libFLAC/include/private/all.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/private/bitmath.h b/src/libFLAC/include/private/bitmath.h
index 87fa0fa..22d894f 100644
--- a/src/libFLAC/include/private/bitmath.h
+++ b/src/libFLAC/include/private/bitmath.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -33,9 +34,152 @@
 #define FLAC__PRIVATE__BITMATH_H
 
 #include "FLAC/ordinals.h"
+#include "FLAC/assert.h"
 
-unsigned FLAC__bitmath_ilog2(FLAC__uint32 v);
-unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v);
+/* for CHAR_BIT */
+#include <limits.h>
+#include "share/compat.h"
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+#include <intrin.h> /* for _BitScanReverse* */
+#endif
+
+/* Will never be emitted for MSVC, GCC, Intel compilers */
+static inline unsigned int FLAC__clz_soft_uint32(unsigned int word)
+{
+    static const unsigned char byte_to_unary_table[] = {
+    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+
+    return (word) > 0xffffff ? byte_to_unary_table[(word) >> 24] :
+    (word) > 0xffff ? byte_to_unary_table[(word) >> 16] + 8 :
+    (word) > 0xff ? byte_to_unary_table[(word) >> 8] + 16 :
+    byte_to_unary_table[(word)] + 24;
+}
+
+static inline unsigned int FLAC__clz_uint32(FLAC__uint32 v)
+{
+/* Never used with input 0 */
+    FLAC__ASSERT(v > 0);
+#if defined(__INTEL_COMPILER)
+    return _bit_scan_reverse(v) ^ 31U;
+#elif defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+/* This will translate either to (bsr ^ 31U), clz , ctlz, cntlz, lzcnt depending on
+ * -march= setting or to a software routine in exotic machines. */
+    return __builtin_clz(v);
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+    {
+        unsigned long idx;
+        _BitScanReverse(&idx, v);
+        return idx ^ 31U;
+    }
+#else
+    return FLAC__clz_soft_uint32(v);
+#endif
+}
+
+/* This one works with input 0 */
+static inline unsigned int FLAC__clz2_uint32(FLAC__uint32 v)
+{
+    if (!v)
+        return 32;
+    return FLAC__clz_uint32(v);
+}
+
+/* An example of what FLAC__bitmath_ilog2() computes:
+ *
+ * ilog2( 0) = assertion failure
+ * ilog2( 1) = 0
+ * ilog2( 2) = 1
+ * ilog2( 3) = 1
+ * ilog2( 4) = 2
+ * ilog2( 5) = 2
+ * ilog2( 6) = 2
+ * ilog2( 7) = 2
+ * ilog2( 8) = 3
+ * ilog2( 9) = 3
+ * ilog2(10) = 3
+ * ilog2(11) = 3
+ * ilog2(12) = 3
+ * ilog2(13) = 3
+ * ilog2(14) = 3
+ * ilog2(15) = 3
+ * ilog2(16) = 4
+ * ilog2(17) = 4
+ * ilog2(18) = 4
+ */
+
+static inline unsigned FLAC__bitmath_ilog2(FLAC__uint32 v)
+{
+    FLAC__ASSERT(v > 0);
+#if defined(__INTEL_COMPILER)
+    return _bit_scan_reverse(v);
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+    {
+        unsigned long idx;
+        _BitScanReverse(&idx, v);
+        return idx;
+    }
+#else
+    return sizeof(FLAC__uint32) * CHAR_BIT  - 1 - FLAC__clz_uint32(v);
+#endif
+}
+
+
+#ifdef FLAC__INTEGER_ONLY_LIBRARY /* Unused otherwise */
+
+static inline unsigned FLAC__bitmath_ilog2_wide(FLAC__uint64 v)
+{
+    FLAC__ASSERT(v > 0);
+#if defined(__GNUC__) && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+    return sizeof(FLAC__uint64) * CHAR_BIT - 1 - __builtin_clzll(v);
+/* Sorry, only supported in x64/Itanium.. and both have fast FPU which makes integer-only encoder pointless */
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1400)) && (defined(_M_IA64) || defined(_M_X64))
+    {
+        unsigned long idx;
+        _BitScanReverse64(&idx, v);
+        return idx;
+    }
+#else
+/*  Brain-damaged compilers will use the fastest possible way that is,
+    de Bruijn sequences (http://supertech.csail.mit.edu/papers/debruijn.pdf)
+    (C) Timothy B. Terriberry (tterribe@xiph.org) 2001-2009 CC0 (Public domain).
+*/
+    {
+        static const unsigned char DEBRUIJN_IDX64[64]={
+            0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
+            5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
+            63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
+            62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
+        };
+        v|= v>>1;
+        v|= v>>2;
+        v|= v>>4;
+        v|= v>>8;
+        v|= v>>16;
+        v|= v>>32;
+        v= (v>>1)+1;
+        return DEBRUIJN_IDX64[v*0x218A392CD3D5DBF>>58&0x3F];
+    }
+#endif
+}
+#endif
+
 unsigned FLAC__bitmath_silog2(int v);
 unsigned FLAC__bitmath_silog2_wide(FLAC__int64 v);
 
diff --git a/src/libFLAC/include/private/bitreader.h b/src/libFLAC/include/private/bitreader.h
index fd0f6aa..4dea8d0 100644
--- a/src/libFLAC/include/private/bitreader.h
+++ b/src/libFLAC/include/private/bitreader.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -49,7 +50,7 @@
  */
 FLAC__BitReader *FLAC__bitreader_new(void);
 void FLAC__bitreader_delete(FLAC__BitReader *br);
-FLAC__bool FLAC__bitreader_init(FLAC__BitReader *br, FLAC__CPUInfo cpu, FLAC__BitReaderReadCallback rcb, void *cd);
+FLAC__bool FLAC__bitreader_init(FLAC__BitReader *br, FLAC__BitReaderReadCallback rcb, void *cd);
 void FLAC__bitreader_free(FLAC__BitReader *br); /* does not 'free(br)' */
 FLAC__bool FLAC__bitreader_clear(FLAC__BitReader *br);
 void FLAC__bitreader_dump(const FLAC__BitReader *br, FILE *out);
@@ -81,19 +82,10 @@
 FLAC__bool FLAC__bitreader_read_unary_unsigned(FLAC__BitReader *br, unsigned *val);
 FLAC__bool FLAC__bitreader_read_rice_signed(FLAC__BitReader *br, int *val, unsigned parameter);
 FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter);
-#ifndef FLAC__NO_ASM
-#  ifdef FLAC__CPU_IA32
-#    ifdef FLAC__HAS_NASM
-FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter);
-#    endif
-#  endif
-#endif
 #if 0 /* UNUSED */
 FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter);
 FLAC__bool FLAC__bitreader_read_golomb_unsigned(FLAC__BitReader *br, unsigned *val, unsigned parameter);
 #endif
 FLAC__bool FLAC__bitreader_read_utf8_uint32(FLAC__BitReader *br, FLAC__uint32 *val, FLAC__byte *raw, unsigned *rawlen);
 FLAC__bool FLAC__bitreader_read_utf8_uint64(FLAC__BitReader *br, FLAC__uint64 *val, FLAC__byte *raw, unsigned *rawlen);
-
-FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);
 #endif
diff --git a/src/libFLAC/include/private/bitwriter.h b/src/libFLAC/include/private/bitwriter.h
index aa5c4f7..3b1362d 100644
--- a/src/libFLAC/include/private/bitwriter.h
+++ b/src/libFLAC/include/private/bitwriter.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
index 651bb22..8927897 100644
--- a/src/libFLAC/include/private/cpu.h
+++ b/src/libFLAC/include/private/cpu.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -38,51 +39,130 @@
 #include <config.h>
 #endif
 
+#if defined FLAC__HAS_X86INTRIN
+/* SSE intrinsics support by ICC/MSVC/GCC */
+#if defined __INTEL_COMPILER
+  #define FLAC__SSE_TARGET(x)
+  #define FLAC__SSE_SUPPORTED 1
+  #define FLAC__SSE2_SUPPORTED 1
+  #if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
+    #define FLAC__SSSE3_SUPPORTED 1
+    #define FLAC__SSE4_1_SUPPORTED 1
+  #endif
+  #if (__INTEL_COMPILER >= 1110) /* Intel C++ Compiler 11.1 */
+    #define FLAC__AVX_SUPPORTED 1
+  #endif
+  #if (__INTEL_COMPILER >= 1300) /* Intel C++ Compiler 13.0 */
+    #define FLAC__AVX2_SUPPORTED 1
+    #define FLAC__FMA_SUPPORTED 1
+  #endif
+#elif defined _MSC_VER
+  #define FLAC__SSE_TARGET(x)
+  #define FLAC__SSE_SUPPORTED 1
+  #define FLAC__SSE2_SUPPORTED 1
+  #if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
+    #define FLAC__SSSE3_SUPPORTED 1
+    #define FLAC__SSE4_1_SUPPORTED 1
+  #endif
+  #if (_MSC_FULL_VER >= 160040219) /* MS Visual Studio 2010 SP1 */
+    #define FLAC__AVX_SUPPORTED 1
+  #endif
+  #if (_MSC_VER >= 1700) /* MS Visual Studio 2012 */
+    #define FLAC__AVX2_SUPPORTED 1
+    #define FLAC__FMA_SUPPORTED 1
+  #endif
+#elif defined __GNUC__
+  #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* since GCC 4.9 -msse.. compiler options aren't necessary */
+    #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
+    #define FLAC__SSE_SUPPORTED 1
+    #define FLAC__SSE2_SUPPORTED 1
+    #define FLAC__SSSE3_SUPPORTED 1
+    #define FLAC__SSE4_1_SUPPORTED 1
+    #define FLAC__AVX_SUPPORTED 1
+    #define FLAC__AVX2_SUPPORTED 1
+    #define FLAC__FMA_SUPPORTED 1
+  #else /* for GCC older than 4.9 */
+    #define FLAC__SSE_TARGET(x)
+    #ifdef __SSE__
+      #define FLAC__SSE_SUPPORTED 1
+    #endif
+    #ifdef __SSE2__
+      #define FLAC__SSE2_SUPPORTED 1
+    #endif
+    #ifdef __SSSE3__
+      #define FLAC__SSSE3_SUPPORTED 1
+    #endif
+    #ifdef __SSE4_1__
+      #define FLAC__SSE4_1_SUPPORTED 1
+    #endif
+    #ifdef __AVX__
+      #define FLAC__AVX_SUPPORTED 1
+    #endif
+    #ifdef __AVX2__
+      #define FLAC__AVX2_SUPPORTED 1
+    #endif
+    #ifdef __FMA__
+      #define FLAC__FMA_SUPPORTED 1
+    #endif
+  #endif /* GCC version */
+#endif /* compiler version */
+#endif /* intrinsics support */
+
 typedef enum {
 	FLAC__CPUINFO_TYPE_IA32,
-	FLAC__CPUINFO_TYPE_PPC,
+	FLAC__CPUINFO_TYPE_X86_64,
 	FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
 
+#if defined FLAC__CPU_IA32
 typedef struct {
-	FLAC__bool cpuid;
-	FLAC__bool bswap;
 	FLAC__bool cmov;
 	FLAC__bool mmx;
-	FLAC__bool fxsr;
 	FLAC__bool sse;
 	FLAC__bool sse2;
+
 	FLAC__bool sse3;
 	FLAC__bool ssse3;
-	FLAC__bool _3dnow;
-	FLAC__bool ext3dnow;
-	FLAC__bool extmmx;
+	FLAC__bool sse41;
+	FLAC__bool sse42;
+	FLAC__bool avx;
+	FLAC__bool avx2;
+	FLAC__bool fma;
 } FLAC__CPUInfo_IA32;
-
+#elif defined FLAC__CPU_X86_64
 typedef struct {
-	FLAC__bool altivec;
-	FLAC__bool ppc64;
-} FLAC__CPUInfo_PPC;
+	FLAC__bool sse3;
+	FLAC__bool ssse3;
+	FLAC__bool sse41;
+	FLAC__bool sse42;
+	FLAC__bool avx;
+	FLAC__bool avx2;
+	FLAC__bool fma;
+} FLAC__CPUInfo_x86;
+#endif
 
 typedef struct {
 	FLAC__bool use_asm;
 	FLAC__CPUInfo_Type type;
-	union {
-		FLAC__CPUInfo_IA32 ia32;
-		FLAC__CPUInfo_PPC ppc;
-	} data;
+#if defined FLAC__CPU_IA32
+	FLAC__CPUInfo_IA32 ia32;
+#elif defined FLAC__CPU_X86_64
+	FLAC__CPUInfo_x86 x86;
+#endif
 } FLAC__CPUInfo;
 
 void FLAC__cpu_info(FLAC__CPUInfo *info);
 
 #ifndef FLAC__NO_ASM
-#ifdef FLAC__CPU_IA32
-#ifdef FLAC__HAS_NASM
+# if defined FLAC__CPU_IA32 && defined FLAC__HAS_NASM
 FLAC__uint32 FLAC__cpu_have_cpuid_asm_ia32(void);
 void         FLAC__cpu_info_asm_ia32(FLAC__uint32 *flags_edx, FLAC__uint32 *flags_ecx);
-FLAC__uint32 FLAC__cpu_info_extended_amd_asm_ia32(void);
-#endif
-#endif
+# endif
+# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+FLAC__uint32 FLAC__cpu_have_cpuid_x86(void);
+void         FLAC__cpu_info_x86(FLAC__uint32 level, FLAC__uint32 *eax, FLAC__uint32 *ebx, FLAC__uint32 *ecx, FLAC__uint32 *edx);
+FLAC__uint32 FLAC__cpu_xgetbv_x86(void);
+# endif
 #endif
 
 #endif
diff --git a/src/libFLAC/include/private/crc.h b/src/libFLAC/include/private/crc.h
index 7a224b4..29c512c 100644
--- a/src/libFLAC/include/private/crc.h
+++ b/src/libFLAC/include/private/crc.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -33,7 +34,6 @@
 #define FLAC__PRIVATE__CRC_H
 
 #include "FLAC/ordinals.h"
-#include <stdlib.h>
 
 /* 8 bit CRC generator, MSB shifted first
 ** polynomial = x^8 + x^2 + x^1 + x^0
@@ -42,21 +42,21 @@
 extern FLAC__byte const FLAC__crc8_table[256];
 #define FLAC__CRC8_UPDATE(data, crc) (crc) = FLAC__crc8_table[(crc) ^ (data)];
 void FLAC__crc8_update(const FLAC__byte data, FLAC__uint8 *crc);
-void FLAC__crc8_update_block(const FLAC__byte *data, size_t len, FLAC__uint8 *crc);
-FLAC__uint8 FLAC__crc8(const FLAC__byte *data, size_t len);
+void FLAC__crc8_update_block(const FLAC__byte *data, unsigned len, FLAC__uint8 *crc);
+FLAC__uint8 FLAC__crc8(const FLAC__byte *data, unsigned len);
 
 /* 16 bit CRC generator, MSB shifted first
 ** polynomial = x^16 + x^15 + x^2 + x^0
 ** init = 0
 */
-extern unsigned FLAC__crc16_table[256];
+extern unsigned const FLAC__crc16_table[256];
 
-#define FLAC__CRC16_UPDATE(data, crc) (((((crc)<<8) & 0xffff) ^ FLAC__crc16_table[((crc)>>8) ^ (data)]))
+#define FLAC__CRC16_UPDATE(data, crc) ((((crc)<<8) & 0xffff) ^ FLAC__crc16_table[((crc)>>8) ^ (data)])
 /* this alternate may be faster on some systems/compilers */
 #if 0
 #define FLAC__CRC16_UPDATE(data, crc) ((((crc)<<8) ^ FLAC__crc16_table[((crc)>>8) ^ (data)]) & 0xffff)
 #endif
 
-unsigned FLAC__crc16(const FLAC__byte *data, size_t len);
+unsigned FLAC__crc16(const FLAC__byte *data, unsigned len);
 
 #endif
diff --git a/src/libFLAC/include/private/fixed.h b/src/libFLAC/include/private/fixed.h
index 6656b79..dcc4715 100644
--- a/src/libFLAC/include/private/fixed.h
+++ b/src/libFLAC/include/private/fixed.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -36,6 +37,7 @@
 #include <config.h>
 #endif
 
+#include "private/cpu.h"
 #include "private/float.h"
 #include "FLAC/format.h"
 
@@ -53,14 +55,22 @@
  */
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 # ifndef FLAC__NO_ASM
-#  ifdef FLAC__CPU_IA32
-#   ifdef FLAC__HAS_NASM
-unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#   ifdef FLAC__SSE2_SUPPORTED
+unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
+#   endif
+#   ifdef FLAC__SSSE3_SUPPORTED
+unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]);
 #   endif
 #  endif
+#  if defined FLAC__CPU_IA32 && defined FLAC__HAS_NASM
+unsigned FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+#  endif
 # endif
-unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #else
 unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 unsigned FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
diff --git a/src/libFLAC/include/private/float.h b/src/libFLAC/include/private/float.h
index 73313f6..ab26432 100644
--- a/src/libFLAC/include/private/float.h
+++ b/src/libFLAC/include/private/float.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2004-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/private/format.h b/src/libFLAC/include/private/format.h
index 7f5cc93..2fd3460 100644
--- a/src/libFLAC/include/private/format.h
+++ b/src/libFLAC/include/private/format.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index 2cb139b..d36b30b 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -36,6 +37,7 @@
 #include <config.h>
 #endif
 
+#include "private/cpu.h"
 #include "private/float.h"
 #include "FLAC/format.h"
 
@@ -74,7 +76,15 @@
 void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
-void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+#    endif
+#  endif
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE_SUPPORTED
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
 #    endif
 #  endif
 #endif
@@ -144,6 +154,22 @@
 #    ifdef FLAC__HAS_NASM
 void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+#    endif
+#  endif
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE2_SUPPORTED
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+#    endif
+#    ifdef FLAC__SSE4_1_SUPPORTED
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+#    endif
+#    ifdef FLAC__AVX2_SUPPORTED
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
 #    endif
 #  endif
 #endif
@@ -172,11 +198,17 @@
 #    ifdef FLAC__HAS_NASM
 void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 #    endif /* FLAC__HAS_NASM */
-#  elif defined FLAC__CPU_PPC
-void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
-void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
-#  endif/* FLAC__CPU_IA32 || FLAC__CPU_PPC */
+#  endif /* FLAC__CPU_IA32 */
+#  if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE2_SUPPORTED
+void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+#    endif
+#    ifdef FLAC__SSE4_1_SUPPORTED
+void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+#    endif
+#  endif
 #endif /* FLAC__NO_ASM */
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
diff --git a/src/libFLAC/include/private/macros.h b/src/libFLAC/include/private/macros.h
new file mode 100644
index 0000000..0eed2d3
--- /dev/null
+++ b/src/libFLAC/include/private/macros.h
@@ -0,0 +1,72 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2012-2014  Xiph.org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLAC__PRIVATE__MACROS_H
+#define FLAC__PRIVATE__MACROS_H
+
+#if defined(__GNUC__)
+
+#define flac_max(a,b) \
+    ({ __typeof__ (a) _a = (a); \
+    __typeof__ (b) _b = (b); \
+    _a > _b ? _a : _b; })
+
+#define MIN_PASTE(A,B) A##B
+#define MIN_IMPL(A,B,L) ({ \
+    __typeof__(A) MIN_PASTE(__a,L) = (A); \
+    __typeof__(B) MIN_PASTE(__b,L) = (B); \
+    MIN_PASTE(__a,L) < MIN_PASTE(__b,L) ? MIN_PASTE(__a,L) : MIN_PASTE(__b,L); \
+    })
+
+#define flac_min(A,B) MIN_IMPL(A,B,__COUNTER__)
+
+/* Whatever other unix that has sys/param.h */
+#elif defined(HAVE_SYS_PARAM_H)
+#include <sys/param.h>
+#define flac_max(a,b) MAX(a,b)
+#define flac_min(a,b) MIN(a,b)
+
+/* Windows VS has them in stdlib.h.. XXX:Untested */
+#elif defined(_MSC_VER)
+#include <stdlib.h>
+#define flac_max(a,b) __max(a,b)
+#define flac_min(a,b) __min(a,b)
+#endif
+
+#ifndef MIN
+#define MIN(x,y)	((x) <= (y) ? (x) : (y))
+#endif
+
+#ifndef MAX
+#define MAX(x,y)	((x) >= (y) ? (x) : (y))
+#endif
+
+#endif
diff --git a/src/libFLAC/include/private/md5.h b/src/libFLAC/include/private/md5.h
index e5f675a..c665ab3 100644
--- a/src/libFLAC/include/private/md5.h
+++ b/src/libFLAC/include/private/md5.h
@@ -28,11 +28,17 @@
 
 #include "FLAC/ordinals.h"
 
+typedef union {
+	FLAC__byte *p8;
+	FLAC__int16 *p16;
+	FLAC__int32 *p32;
+} FLAC__multibyte;
+
 typedef struct {
 	FLAC__uint32 in[16];
 	FLAC__uint32 buf[4];
 	FLAC__uint32 bytes[2];
-	FLAC__byte *internal_buf;
+	FLAC__multibyte internal_buf;
 	size_t capacity;
 } FLAC__MD5Context;
 
diff --git a/src/libFLAC/include/private/memory.h b/src/libFLAC/include/private/memory.h
index 7852c81..c9f2712 100644
--- a/src/libFLAC/include/private/memory.h
+++ b/src/libFLAC/include/private/memory.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -45,12 +46,13 @@
  * Use free() on this address to deallocate.
  */
 void *FLAC__memory_alloc_aligned(size_t bytes, void **aligned_address);
-FLAC__bool FLAC__memory_alloc_aligned_int32_array(unsigned elements, FLAC__int32 **unaligned_pointer, FLAC__int32 **aligned_pointer);
-FLAC__bool FLAC__memory_alloc_aligned_uint32_array(unsigned elements, FLAC__uint32 **unaligned_pointer, FLAC__uint32 **aligned_pointer);
-FLAC__bool FLAC__memory_alloc_aligned_uint64_array(unsigned elements, FLAC__uint64 **unaligned_pointer, FLAC__uint64 **aligned_pointer);
-FLAC__bool FLAC__memory_alloc_aligned_unsigned_array(unsigned elements, unsigned **unaligned_pointer, unsigned **aligned_pointer);
+FLAC__bool FLAC__memory_alloc_aligned_int32_array(size_t elements, FLAC__int32 **unaligned_pointer, FLAC__int32 **aligned_pointer);
+FLAC__bool FLAC__memory_alloc_aligned_uint32_array(size_t elements, FLAC__uint32 **unaligned_pointer, FLAC__uint32 **aligned_pointer);
+FLAC__bool FLAC__memory_alloc_aligned_uint64_array(size_t elements, FLAC__uint64 **unaligned_pointer, FLAC__uint64 **aligned_pointer);
+FLAC__bool FLAC__memory_alloc_aligned_unsigned_array(size_t elements, unsigned **unaligned_pointer, unsigned **aligned_pointer);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-FLAC__bool FLAC__memory_alloc_aligned_real_array(unsigned elements, FLAC__real **unaligned_pointer, FLAC__real **aligned_pointer);
+FLAC__bool FLAC__memory_alloc_aligned_real_array(size_t elements, FLAC__real **unaligned_pointer, FLAC__real **aligned_pointer);
 #endif
+void *safe_malloc_mul_2op_p(size_t size1, size_t size2);
 
 #endif
diff --git a/src/libFLAC/include/private/metadata.h b/src/libFLAC/include/private/metadata.h
index b5268c9..092764c 100644
--- a/src/libFLAC/include/private/metadata.h
+++ b/src/libFLAC/include/private/metadata.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2002-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/private/stream_encoder.h b/src/libFLAC/include/private/stream_encoder.h
new file mode 100644
index 0000000..96d3135
--- /dev/null
+++ b/src/libFLAC/include/private/stream_encoder.h
@@ -0,0 +1,67 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FLAC__PRIVATE__STREAM_ENCODER_H
+#define FLAC__PRIVATE__STREAM_ENCODER_H
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/*
+ * This is used to avoid overflow with unusual signals in 32-bit
+ * accumulator in the *precompute_partition_info_sums_* functions.
+ */
+#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4
+
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/cpu.h"
+#include "FLAC/format.h"
+
+#ifdef FLAC__SSE2_SUPPORTED
+extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
+			unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
+#endif
+
+#ifdef FLAC__SSSE3_SUPPORTED
+extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
+			unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
+#endif
+
+#ifdef FLAC__AVX2_SUPPORTED
+extern void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
+			unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
+#endif
+
+#endif
+
+#endif
diff --git a/src/libFLAC/include/private/stream_encoder_framing.h b/src/libFLAC/include/private/stream_encoder_framing.h
index 4865c16..2b7387a 100644
--- a/src/libFLAC/include/private/stream_encoder_framing.h
+++ b/src/libFLAC/include/private/stream_encoder_framing.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/private/window.h b/src/libFLAC/include/private/window.h
index 01e0fc4..52c9262 100644
--- a/src/libFLAC/include/private/window.h
+++ b/src/libFLAC/include/private/window.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2006,2007  Josh Coalson
+ * Copyright (C) 2006-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -64,6 +65,8 @@
 void FLAC__window_rectangle(FLAC__real *window, const FLAC__int32 L);
 void FLAC__window_triangle(FLAC__real *window, const FLAC__int32 L);
 void FLAC__window_tukey(FLAC__real *window, const FLAC__int32 L, const FLAC__real p);
+void FLAC__window_partial_tukey(FLAC__real *window, const FLAC__int32 L, const FLAC__real p, const FLAC__real start, const FLAC__real end);
+void FLAC__window_punchout_tukey(FLAC__real *window, const FLAC__int32 L, const FLAC__real p, const FLAC__real start, const FLAC__real end);
 void FLAC__window_welch(FLAC__real *window, const FLAC__int32 L);
 
 #endif /* !defined FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/include/protected/all.h b/src/libFLAC/include/protected/all.h
index 2921092..90912af 100644
--- a/src/libFLAC/include/protected/all.h
+++ b/src/libFLAC/include/protected/all.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
diff --git a/src/libFLAC/include/protected/stream_decoder.h b/src/libFLAC/include/protected/stream_decoder.h
index 9108ca7..7be95af 100644
--- a/src/libFLAC/include/protected/stream_decoder.h
+++ b/src/libFLAC/include/protected/stream_decoder.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -39,6 +40,7 @@
 
 typedef struct FLAC__StreamDecoderProtected {
 	FLAC__StreamDecoderState state;
+	FLAC__StreamDecoderInitStatus initstate;
 	unsigned channels;
 	FLAC__ChannelAssignment channel_assignment;
 	unsigned bits_per_sample;
diff --git a/src/libFLAC/include/protected/stream_encoder.h b/src/libFLAC/include/protected/stream_encoder.h
index 4101ee5..6917f5d 100644
--- a/src/libFLAC/include/protected/stream_encoder.h
+++ b/src/libFLAC/include/protected/stream_encoder.h
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -58,6 +59,8 @@
 	FLAC__APODIZATION_RECTANGLE,
 	FLAC__APODIZATION_TRIANGLE,
 	FLAC__APODIZATION_TUKEY,
+	FLAC__APODIZATION_PARTIAL_TUKEY,
+	FLAC__APODIZATION_PUNCHOUT_TUKEY,
 	FLAC__APODIZATION_WELCH
 } FLAC__ApodizationFunction;
 
@@ -70,6 +73,11 @@
 		struct {
 			FLAC__real p;
 		} tukey;
+		struct {
+			FLAC__real p;
+			FLAC__real start;
+			FLAC__real end;
+		} multiple_tukey;
 	} parameters;
 } FLAC__ApodizationSpecification;
 
diff --git a/src/libFLAC/lpc.c b/src/libFLAC/lpc.c
index 7806348..ca98c5c 100644
--- a/src/libFLAC/lpc.c
+++ b/src/libFLAC/lpc.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,29 +30,39 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
 #include <math.h>
+
 #include "FLAC/assert.h"
 #include "FLAC/format.h"
+#include "share/compat.h"
 #include "private/bitmath.h"
 #include "private/lpc.h"
+#include "private/macros.h"
 #if defined DEBUG || defined FLAC__OVERFLOW_DETECT || defined FLAC__OVERFLOW_DETECT_VERBOSE
 #include <stdio.h>
 #endif
 
-#ifndef FLAC__INTEGER_ONLY_LIBRARY
-
-#ifndef M_LN2
-/* math.h in VC++ doesn't seem to have this (how Microsoft is that?) */
-#define M_LN2 0.69314718055994530942
-#endif
-
 /* OPT: #undef'ing this may improve the speed on some architectures */
 #define FLAC__LPC_UNROLLED_FILTER_LOOPS
 
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+
+#if !defined(HAVE_LROUND)
+#if defined(_MSC_VER)
+#include <float.h>
+#define copysign _copysign
+#elif defined(__GNUC__)
+#define copysign __builtin_copysign
+#endif
+static inline long int lround(double x) {
+    return (long)(x + copysign (0.5, x));
+}
+/* If this fails, we are in the presence of a mid 90's compiler, move along... */
+#endif
 
 void FLAC__lpc_window_data(const FLAC__int32 in[], const FLAC__real window[], FLAC__real out[], unsigned data_len)
 {
@@ -112,7 +123,7 @@
 void FLAC__lpc_compute_lp_coefficients(const FLAC__real autoc[], unsigned *max_order, FLAC__real lp_coeff[][FLAC__MAX_LPC_ORDER], FLAC__double error[])
 {
 	unsigned i, j;
-	FLAC__double r, err, ref[FLAC__MAX_LPC_ORDER], lpc[FLAC__MAX_LPC_ORDER];
+	FLAC__double r, err, lpc[FLAC__MAX_LPC_ORDER];
 
 	FLAC__ASSERT(0 != max_order);
 	FLAC__ASSERT(0 < *max_order);
@@ -126,7 +137,7 @@
 		r = -autoc[i+1];
 		for(j = 0; j < i; j++)
 			r -= lpc[j] * autoc[i-j];
-		ref[i] = (r/=err);
+		r /= err;
 
 		/* Update LPC coefficients and total error. */
 		lpc[i]=r;
@@ -145,7 +156,7 @@
 			lp_coeff[i][j] = (FLAC__real)(-lpc[j]); /* negate FIR filter coeff to get predictor coeff */
 		error[i] = err;
 
-		/* see SF bug #1601812 http://sourceforge.net/tracker/index.php?func=detail&aid=1601812&group_id=13478&atid=113478 */
+		/* see SF bug https://sourceforge.net/p/flac/bugs/234/ */
 		if(err == 0.0) {
 			*max_order = i+1;
 			return;
@@ -200,14 +211,8 @@
 		FLAC__int32 q;
 		for(i = 0; i < order; i++) {
 			error += lp_coeff[i] * (1 << *shift);
-#if 1 /* unfortunately lround() is C99 */
-			if(error >= 0.0)
-				q = (FLAC__int32)(error + 0.5);
-			else
-				q = (FLAC__int32)(error - 0.5);
-#else
 			q = lround(error);
-#endif
+
 #ifdef FLAC__OVERFLOW_DETECT
 			if(q > qmax+1) /* we expect q==qmax+1 occasionally due to rounding */
 				fprintf(stderr,"FLAC__lpc_quantize_coefficients: quantizer overflow: q>qmax %d>%d shift=%d cmax=%f precision=%u lpc[%u]=%f\n",q,qmax,*shift,cmax,precision+1,i,lp_coeff[i]);
@@ -235,14 +240,7 @@
 #endif
 		for(i = 0; i < order; i++) {
 			error += lp_coeff[i] / (1 << nshift);
-#if 1 /* unfortunately lround() is C99 */
-			if(error >= 0.0)
-				q = (FLAC__int32)(error + 0.5);
-			else
-				q = (FLAC__int32)(error - 0.5);
-#else
 			q = lround(error);
-#endif
 #ifdef FLAC__OVERFLOW_DETECT
 			if(q > qmax+1) /* we expect q==qmax+1 occasionally due to rounding */
 				fprintf(stderr,"FLAC__lpc_quantize_coefficients: quantizer overflow: q>qmax %d>%d shift=%d cmax=%f precision=%u lpc[%u]=%f\n",q,qmax,*shift,cmax,precision+1,i,lp_coeff[i]);
@@ -262,7 +260,12 @@
 	return 0;
 }
 
-void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+#if defined(_MSC_VER)
+// silence MSVC warnings about __restrict modifier
+#pragma warning ( disable : 4028 )
+#endif
+
+void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 * flac_restrict data, unsigned data_len, const FLAC__int32 * flac_restrict qlp_coeff, unsigned order, int lp_quantization, FLAC__int32 * flac_restrict residual)
 #if defined(FLAC__OVERFLOW_DETECT) || !defined(FLAC__LPC_UNROLLED_FILTER_LOOPS)
 {
 	FLAC__int64 sumo;
@@ -285,13 +288,7 @@
 		for(j = 0; j < order; j++) {
 			sum += qlp_coeff[j] * (*(--history));
 			sumo += (FLAC__int64)qlp_coeff[j] * (FLAC__int64)(*history);
-#if defined _MSC_VER
-			if(sumo > 2147483647I64 || sumo < -2147483648I64)
-				fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients: OVERFLOW, i=%u, j=%u, c=%d, d=%d, sumo=%I64d\n",i,j,qlp_coeff[j],*history,sumo);
-#else
-			if(sumo > 2147483647ll || sumo < -2147483648ll)
-				fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients: OVERFLOW, i=%u, j=%u, c=%d, d=%d, sumo=%lld\n",i,j,qlp_coeff[j],*history,(long long)sumo);
-#endif
+				fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients: OVERFLOW, i=%u, j=%u, c=%d, d=%d, sumo=%" PRId64 "\n",i,j,qlp_coeff[j],*history,sumo);
 		}
 		*(residual++) = *(data++) - (sum >> lp_quantization);
 	}
@@ -528,7 +525,7 @@
 }
 #endif
 
-void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 * flac_restrict data, unsigned data_len, const FLAC__int32 * flac_restrict qlp_coeff, unsigned order, int lp_quantization, FLAC__int32 * flac_restrict residual)
 #if defined(FLAC__OVERFLOW_DETECT) || !defined(FLAC__LPC_UNROLLED_FILTER_LOOPS)
 {
 	unsigned i, j;
@@ -549,19 +546,11 @@
 		for(j = 0; j < order; j++)
 			sum += (FLAC__int64)qlp_coeff[j] * (FLAC__int64)(*(--history));
 		if(FLAC__bitmath_silog2_wide(sum >> lp_quantization) > 32) {
-#if defined _MSC_VER
-			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, sum=%I64d\n", i, sum >> lp_quantization);
-#else
-			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, sum=%lld\n", i, (long long)(sum >> lp_quantization));
-#endif
+			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, sum=%" PRId64 "\n", i, (sum >> lp_quantization));
 			break;
 		}
 		if(FLAC__bitmath_silog2_wide((FLAC__int64)(*data) - (sum >> lp_quantization)) > 32) {
-#if defined _MSC_VER
-			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, data=%d, sum=%I64d, residual=%I64d\n", i, *data, sum >> lp_quantization, (FLAC__int64)(*data) - (sum >> lp_quantization));
-#else
-			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, data=%d, sum=%lld, residual=%lld\n", i, *data, (long long)(sum >> lp_quantization), (long long)((FLAC__int64)(*data) - (sum >> lp_quantization)));
-#endif
+			fprintf(stderr,"FLAC__lpc_compute_residual_from_qlp_coefficients_wide: OVERFLOW, i=%u, data=%d, sum=%" PRId64 ", residual=%" PRId64 "\n", i, *data, (int64_t)(sum >> lp_quantization), ((FLAC__int64)(*data) - (sum >> lp_quantization)));
 			break;
 		}
 		*(residual++) = *(data++) - (FLAC__int32)(sum >> lp_quantization);
@@ -792,7 +781,7 @@
 
 #endif /* !defined FLAC__INTEGER_ONLY_LIBRARY */
 
-void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+void FLAC__lpc_restore_signal(const FLAC__int32 * flac_restrict residual, unsigned data_len, const FLAC__int32 * flac_restrict qlp_coeff, unsigned order, int lp_quantization, FLAC__int32 * flac_restrict data)
 #if defined(FLAC__OVERFLOW_DETECT) || !defined(FLAC__LPC_UNROLLED_FILTER_LOOPS)
 {
 	FLAC__int64 sumo;
@@ -815,13 +804,8 @@
 		for(j = 0; j < order; j++) {
 			sum += qlp_coeff[j] * (*(--history));
 			sumo += (FLAC__int64)qlp_coeff[j] * (FLAC__int64)(*history);
-#if defined _MSC_VER
-			if(sumo > 2147483647I64 || sumo < -2147483648I64)
-				fprintf(stderr,"FLAC__lpc_restore_signal: OVERFLOW, i=%u, j=%u, c=%d, d=%d, sumo=%I64d\n",i,j,qlp_coeff[j],*history,sumo);
-#else
 			if(sumo > 2147483647ll || sumo < -2147483648ll)
-				fprintf(stderr,"FLAC__lpc_restore_signal: OVERFLOW, i=%u, j=%u, c=%d, d=%d, sumo=%lld\n",i,j,qlp_coeff[j],*history,(long long)sumo);
-#endif
+				fprintf(stderr,"FLAC__lpc_restore_signal: OVERFLOW, i=%u, j=%u, c=%d, d=%d, sumo=%" PRId64 "\n",i,j,qlp_coeff[j],*history,sumo);
 		}
 		*(data++) = *(r++) + (sum >> lp_quantization);
 	}
@@ -1058,7 +1042,7 @@
 }
 #endif
 
-void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+void FLAC__lpc_restore_signal_wide(const FLAC__int32 * flac_restrict residual, unsigned data_len, const FLAC__int32 * flac_restrict qlp_coeff, unsigned order, int lp_quantization, FLAC__int32 * flac_restrict data)
 #if defined(FLAC__OVERFLOW_DETECT) || !defined(FLAC__LPC_UNROLLED_FILTER_LOOPS)
 {
 	unsigned i, j;
@@ -1079,19 +1063,11 @@
 		for(j = 0; j < order; j++)
 			sum += (FLAC__int64)qlp_coeff[j] * (FLAC__int64)(*(--history));
 		if(FLAC__bitmath_silog2_wide(sum >> lp_quantization) > 32) {
-#ifdef _MSC_VER
-			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, sum=%I64d\n", i, sum >> lp_quantization);
-#else
-			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, sum=%lld\n", i, (long long)(sum >> lp_quantization));
-#endif
+			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, sum=%" PRId64 "\n", i, (sum >> lp_quantization));
 			break;
 		}
 		if(FLAC__bitmath_silog2_wide((FLAC__int64)(*r) + (sum >> lp_quantization)) > 32) {
-#ifdef _MSC_VER
-			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, residual=%d, sum=%I64d, data=%I64d\n", i, *r, sum >> lp_quantization, (FLAC__int64)(*r) + (sum >> lp_quantization));
-#else
-			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, residual=%d, sum=%lld, data=%lld\n", i, *r, (long long)(sum >> lp_quantization), (long long)((FLAC__int64)(*r) + (sum >> lp_quantization)));
-#endif
+			fprintf(stderr,"FLAC__lpc_restore_signal_wide: OVERFLOW, i=%u, residual=%d, sum=%" PRId64 ", data=%" PRId64 "\n", i, *r, (sum >> lp_quantization), ((FLAC__int64)(*r) + (sum >> lp_quantization)));
 			break;
 		}
 		*(data++) = *(r++) + (FLAC__int32)(sum >> lp_quantization);
@@ -1320,6 +1296,10 @@
 }
 #endif
 
+#if defined(_MSC_VER)
+#pragma warning ( default : 4028 )
+#endif
+
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 
 FLAC__double FLAC__lpc_compute_expected_bits_per_residual_sample(FLAC__double lpc_error, unsigned total_samples)
@@ -1352,7 +1332,7 @@
 
 unsigned FLAC__lpc_compute_best_order(const FLAC__double lpc_error[], unsigned max_order, unsigned total_samples, unsigned overhead_bits_per_order)
 {
-	unsigned order, index, best_index; /* 'index' the index into lpc_error; index==order-1 since lpc_error[0] is for order==1, lpc_error[1] is for order==2, etc */
+	unsigned order, indx, best_index; /* 'index' the index into lpc_error; index==order-1 since lpc_error[0] is for order==1, lpc_error[1] is for order==2, etc */
 	FLAC__double bits, best_bits, error_scale;
 
 	FLAC__ASSERT(max_order > 0);
@@ -1363,15 +1343,15 @@
 	best_index = 0;
 	best_bits = (unsigned)(-1);
 
-	for(index = 0, order = 1; index < max_order; index++, order++) {
-		bits = FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(lpc_error[index], error_scale) * (FLAC__double)(total_samples - order) + (FLAC__double)(order * overhead_bits_per_order);
+	for(indx = 0, order = 1; indx < max_order; indx++, order++) {
+		bits = FLAC__lpc_compute_expected_bits_per_residual_sample_with_error_scale(lpc_error[indx], error_scale) * (FLAC__double)(total_samples - order) + (FLAC__double)(order * overhead_bits_per_order);
 		if(bits < best_bits) {
-			best_index = index;
+			best_index = indx;
 			best_bits = bits;
 		}
 	}
 
-	return best_index+1; /* +1 since index of lpc_error[] is order-1 */
+	return best_index+1; /* +1 since indx of lpc_error[] is order-1 */
 }
 
 #endif /* !defined FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/lpc_intrin_avx2.c b/src/libFLAC/lpc_intrin_avx2.c
new file mode 100644
index 0000000..8eec85e
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_avx2.c
@@ -0,0 +1,1120 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__AVX2_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <immintrin.h> /* AVX2 */
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
+					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
+						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
+						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
+						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
+						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
+						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m256i q0, q1, q2, q3, q4, q5, q6;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
+						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m256i q0, q1, q2, q3, q4, q5;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
+						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m256i q0, q1, q2, q3, q4;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
+						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m256i q0, q1, q2, q3;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
+						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m256i q0, q1, q2;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
+						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m256i q0, q1;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
+						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m256i q0;
+					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ;
+						summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	_mm256_zeroupper();
+}
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(qlp_coeff[10]);
+					q11 = _mm256_set1_epi32(qlp_coeff[11]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
+						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
+					q10 = _mm256_set1_epi32(qlp_coeff[10]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
+						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
+						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
+						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
+						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m256i q0, q1, q2, q3, q4, q5, q6;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
+						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m256i q0, q1, q2, q3, q4, q5;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
+						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m256i q0, q1, q2, q3, q4;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
+						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m256i q0, q1, q2, q3;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
+						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m256i q0, q1, q2;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
+						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m256i q0, q1;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ, mull;
+						summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
+						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m256i q0;
+					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
+
+					for(i = 0; i < (int)data_len-7; i+=8) {
+						__m256i summ;
+						summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
+						summ = _mm256_sra_epi32(summ, cnt);
+						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	_mm256_zeroupper();
+}
+
+static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int64 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
+					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
+					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
+						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 11 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
+					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
+						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
+						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 9 */
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
+						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
+						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 7 */
+					__m256i q0, q1, q2, q3, q4, q5, q6;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
+						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m256i q0, q1, q2, q3, q4, q5;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
+						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 5 */
+					__m256i q0, q1, q2, q3, q4;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
+						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m256i q0, q1, q2, q3;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
+						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 3 */
+					__m256i q0, q1, q2;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
+						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m256i q0, q1;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ, mull;
+						summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
+						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+				else { /* order == 1 */
+					__m256i q0;
+					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m256i summ;
+						summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
+						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
+				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
+				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
+				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
+				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
+				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
+				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
+				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
+				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
+				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
+				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
+				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
+				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
+				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
+				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
+				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
+				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
+				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
+				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
+				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
+				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+	_mm256_zeroupper();
+}
+
+#endif /* FLAC__AVX2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/lpc_intrin_sse.c b/src/libFLAC/lpc_intrin_sse.c
new file mode 100644
index 0000000..81bf586
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_sse.c
@@ -0,0 +1,451 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <xmmintrin.h> /* SSE */
+
+#if 1
+/* Faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 4;
+	__m128 sum0;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 4);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0;
+		d0 = _mm_loadu_ps(data+i);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_move_ss(d0, d);
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 8;
+	__m128 sum0, sum1;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 8);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+	sum1 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0, d1;
+		d0 = _mm_loadu_ps(data+i);
+		d1 = _mm_loadu_ps(data+i+4);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		__m128 d1 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d1 = _mm_move_ss(d1, d0);
+			d0 = _mm_move_ss(d0, d);
+			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+	_mm_storeu_ps(autoc+4, sum1);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 12;
+	__m128 sum0, sum1, sum2;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 12);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+	sum1 = _mm_setzero_ps();
+	sum2 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0, d1, d2;
+		d0 = _mm_loadu_ps(data+i);
+		d1 = _mm_loadu_ps(data+i+4);
+		d2 = _mm_loadu_ps(data+i+8);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
+		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		__m128 d1 = _mm_setzero_ps();
+		__m128 d2 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
+			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d2 = _mm_move_ss(d2, d1);
+			d1 = _mm_move_ss(d1, d0);
+			d0 = _mm_move_ss(d0, d);
+			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
+			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+	_mm_storeu_ps(autoc+4, sum1);
+	_mm_storeu_ps(autoc+8, sum2);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	int i;
+	int limit = data_len - 16;
+	__m128 sum0, sum1, sum2, sum3;
+
+	(void) lag;
+	FLAC__ASSERT(lag <= 16);
+	FLAC__ASSERT(lag <= data_len);
+
+	sum0 = _mm_setzero_ps();
+	sum1 = _mm_setzero_ps();
+	sum2 = _mm_setzero_ps();
+	sum3 = _mm_setzero_ps();
+
+	for(i = 0; i <= limit; i++) {
+		__m128 d, d0, d1, d2, d3;
+		d0 = _mm_loadu_ps(data+i);
+		d1 = _mm_loadu_ps(data+i+4);
+		d2 = _mm_loadu_ps(data+i+8);
+		d3 = _mm_loadu_ps(data+i+12);
+		d = d0; d = _mm_shuffle_ps(d, d, 0);
+		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
+		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
+		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
+		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
+	}
+
+	{
+		__m128 d0 = _mm_setzero_ps();
+		__m128 d1 = _mm_setzero_ps();
+		__m128 d2 = _mm_setzero_ps();
+		__m128 d3 = _mm_setzero_ps();
+		limit++; if(limit < 0) limit = 0;
+
+		for(i = data_len-1; i >= limit; i--) {
+			__m128 d;
+			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
+			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
+			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
+			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
+			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
+			d3 = _mm_move_ss(d3, d2);
+			d2 = _mm_move_ss(d2, d1);
+			d1 = _mm_move_ss(d1, d0);
+			d0 = _mm_move_ss(d0, d);
+			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
+			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
+			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
+			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
+		}
+	}
+
+	_mm_storeu_ps(autoc,   sum0);
+	_mm_storeu_ps(autoc+4, sum1);
+	_mm_storeu_ps(autoc+8, sum2);
+	_mm_storeu_ps(autoc+12,sum3);
+}
+
+#else
+/* Faster on older Intel CPUs (up to Core 2) */
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm2, xmm5;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 4);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm5 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm5 = _mm_add_ps(xmm5, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+		xmm0 = _mm_mul_ps(xmm0, xmm2);
+		xmm5 = _mm_add_ps(xmm5, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc, xmm5);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 8);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm5 = _mm_setzero_ps();
+	xmm6 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+	xmm3 = _mm_setzero_ps();
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm5 = _mm_add_ps(xmm5, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_move_ss(xmm3, xmm2);
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm3);
+		xmm0 = _mm_mul_ps(xmm0, xmm2);
+		xmm6 = _mm_add_ps(xmm6, xmm1);
+		xmm5 = _mm_add_ps(xmm5, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc,   xmm5);
+	_mm_storeu_ps(autoc+4, xmm6);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 12);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm5 = _mm_setzero_ps();
+	xmm6 = _mm_setzero_ps();
+	xmm7 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+	xmm3 = _mm_setzero_ps();
+	xmm4 = _mm_setzero_ps();
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm5 = _mm_add_ps(xmm5, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
+		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
+		xmm4 = _mm_move_ss(xmm4, xmm3);
+		xmm3 = _mm_move_ss(xmm3, xmm2);
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm2);
+		xmm5 = _mm_add_ps(xmm5, xmm1);
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm3);
+		xmm6 = _mm_add_ps(xmm6, xmm1);
+		xmm0 = _mm_mul_ps(xmm0, xmm4);
+		xmm7 = _mm_add_ps(xmm7, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc,   xmm5);
+	_mm_storeu_ps(autoc+4, xmm6);
+	_mm_storeu_ps(autoc+8, xmm7);
+}
+
+FLAC__SSE_TARGET("sse")
+void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
+{
+	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
+
+	(void) lag;
+	FLAC__ASSERT(lag > 0);
+	FLAC__ASSERT(lag <= 16);
+	FLAC__ASSERT(lag <= data_len);
+	FLAC__ASSERT(data_len > 0);
+
+	xmm6 = _mm_setzero_ps();
+	xmm7 = _mm_setzero_ps();
+	xmm8 = _mm_setzero_ps();
+	xmm9 = _mm_setzero_ps();
+
+	xmm0 = _mm_load_ss(data++);
+	xmm2 = xmm0;
+	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
+	xmm3 = _mm_setzero_ps();
+	xmm4 = _mm_setzero_ps();
+	xmm5 = _mm_setzero_ps();
+
+	xmm0 = _mm_mul_ps(xmm0, xmm2);
+	xmm6 = _mm_add_ps(xmm6, xmm0);
+
+	data_len--;
+
+	while(data_len)
+	{
+		xmm0 = _mm_load1_ps(data++);
+
+		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
+		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
+		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
+		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
+		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
+		xmm5 = _mm_move_ss(xmm5, xmm4);
+		xmm4 = _mm_move_ss(xmm4, xmm3);
+		xmm3 = _mm_move_ss(xmm3, xmm2);
+		xmm2 = _mm_move_ss(xmm2, xmm0);
+
+		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm5);
+		xmm9 = _mm_add_ps(xmm9, xmm1);
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm4);
+		xmm8 = _mm_add_ps(xmm8, xmm1);
+		xmm1 = xmm0;
+		xmm1 = _mm_mul_ps(xmm1, xmm3);
+		xmm7 = _mm_add_ps(xmm7, xmm1);
+		xmm0 = _mm_mul_ps(xmm0, xmm2);
+		xmm6 = _mm_add_ps(xmm6, xmm0);
+
+		data_len--;
+	}
+
+	_mm_storeu_ps(autoc,   xmm6);
+	_mm_storeu_ps(autoc+4, xmm7);
+	_mm_storeu_ps(autoc+8, xmm8);
+	_mm_storeu_ps(autoc+12,xmm9);
+}
+#endif
+
+#endif /* FLAC__SSE_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/lpc_intrin_sse2.c b/src/libFLAC/lpc_intrin_sse2.c
new file mode 100644
index 0000000..e1908ed
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_sse2.c
@@ -0,0 +1,1088 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE2_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <emmintrin.h> /* SSE2 */
+
+#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
+
+#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
+						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
+						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
+						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
+						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
+						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m128i q0, q1, q2, q3, q4, q5, q6;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
+						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m128i q0, q1, q2, q3, q4, q5;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
+						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m128i q0, q1, q2, q3, q4;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
+						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m128i q0, q1, q2, q3;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
+						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m128i q0, q1, q2;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
+						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m128i q0, q1;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
+						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m128i q0;
+					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ;
+						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) { /* order == 9, 10, 11, 12 */
+			if(order > 10) { /* order == 11, 12 */
+				if(order == 12) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
+					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
+					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[11] * data[i-12];
+						//sum += qlp_coeff[10] * data[i-11];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
+						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
+
+						//sum += qlp_coeff[9] * data[i-10];
+						//sum += qlp_coeff[8] * data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm4);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 11 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[10] * data[i-11];
+						xmm7 = _mm_cvtsi32_si128(data[i-11]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm5);
+
+						//sum += qlp_coeff[9] * data[i-10];
+						//sum += qlp_coeff[8] * data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm4);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 9, 10 */
+				if(order == 10) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[9] * data[i-10];
+						//sum += qlp_coeff[8] * data[i-9];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 9 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[8] * data[i-9];
+						xmm7 = _mm_cvtsi32_si128(data[i-9]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm3);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else if(order > 4) { /* order == 5, 6, 7, 8 */
+			if(order > 6) { /* order == 7, 8 */
+				if(order == 8) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[7] * data[i-8];
+						//sum += qlp_coeff[6] * data[i-7];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 7 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[6] * data[i-7];
+						xmm7 = _mm_cvtsi32_si128(data[i-7]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm2);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 5, 6 */
+				if(order == 6) {
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[5] * data[i-6];
+						//sum += qlp_coeff[4] * data[i-5];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 5 */
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[4] * data[i-5];
+						xmm7 = _mm_cvtsi32_si128(data[i-5]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm1);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else { /* order == 1, 2, 3, 4 */
+			if(order > 2) { /* order == 3, 4 */
+				if(order == 4) {
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[3] * data[i-4];
+						//sum += qlp_coeff[2] * data[i-3];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 3 */
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[2] * data[i-3];
+						xmm7 = _mm_cvtsi32_si128(data[i-3]);
+						xmm7 = _mm_mul_epu32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epu32(xmm6, xmm0);
+						xmm7 = _mm_add_epi32(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 1, 2 */
+				if(order == 2) {
+					__m128i xmm0, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[1] * data[i-2];
+						//sum += qlp_coeff[0] * data[i-1];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epu32(xmm7, xmm0);
+
+						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL32_RESULT(xmm7);
+					}
+				}
+				else { /* order == 1 */
+					for(i = 0; i < (int)data_len; i++)
+						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
+				}
+			}
+		}
+	}
+	else { /* order > 12 */
+		FLAC__int32 sum;
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
+	if (order < 8 || order > 12) {
+		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
+		return;
+	}
+	if (data_len == 0)
+		return;
+
+	FLAC__ASSERT(order >= 8);
+	FLAC__ASSERT(order <= 12);
+
+	if(order > 8) { /* order == 9, 10, 11, 12 */
+		FLAC__int32 curr;
+		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
+		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
+		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
+		switch(order)                                          /* ...and zero them out */
+		{
+		case 9:
+			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
+		case 10:
+			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
+		case 11:
+			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
+		}
+		xmm2 = _mm_setzero_si128();
+		xmm0 = _mm_packs_epi32(xmm0, xmm6);
+		xmm1 = _mm_packs_epi32(xmm1, xmm2);
+
+		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
+		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
+		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
+		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
+		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
+		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
+		xmm4 = _mm_packs_epi32(xmm4, xmm2);
+		xmm3 = _mm_packs_epi32(xmm3, xmm5);
+
+		xmm7 = _mm_slli_si128(xmm1, 2);
+		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
+		xmm2 = _mm_slli_si128(xmm0, 2);
+
+		/* xmm0, xmm1: qlp_coeff
+			xmm2, xmm7: qlp_coeff << 16 bit
+			xmm3, xmm4: data */
+
+		xmm5 = _mm_madd_epi16(xmm4, xmm1);
+		xmm6 = _mm_madd_epi16(xmm3, xmm0);
+		xmm6 = _mm_add_epi32(xmm6, xmm5);
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+		DATA16_RESULT(xmm6);
+
+		data_len--;
+
+		if(data_len % 2) {
+			xmm6 = _mm_srli_si128(xmm3, 14);
+			xmm4 = _mm_slli_si128(xmm4, 2);
+			xmm3 = _mm_slli_si128(xmm3, 2);
+			xmm4 = _mm_or_si128(xmm4, xmm6);
+			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
+
+			xmm5 = _mm_madd_epi16(xmm4, xmm1);
+			xmm6 = _mm_madd_epi16(xmm3, xmm0);
+			xmm6 = _mm_add_epi32(xmm6, xmm5);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			data_len--;
+		}
+
+		while(data_len) { /* data_len is a multiple of 2 */
+			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
+			xmm6 = _mm_srli_si128(xmm3, 12);
+			xmm4 = _mm_slli_si128(xmm4, 4);
+			xmm3 = _mm_slli_si128(xmm3, 4);
+			xmm4 = _mm_or_si128(xmm4, xmm6);
+			xmm3 = _mm_insert_epi16(xmm3, curr, 1);
+
+			xmm5 = _mm_madd_epi16(xmm4, xmm7);
+			xmm6 = _mm_madd_epi16(xmm3, xmm2);
+			xmm6 = _mm_add_epi32(xmm6, xmm5);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
+
+			xmm5 = _mm_madd_epi16(xmm4, xmm1);
+			xmm6 = _mm_madd_epi16(xmm3, xmm0);
+			xmm6 = _mm_add_epi32(xmm6, xmm5);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			data_len-=2;
+		}
+	} /* endif(order > 8) */
+	else
+	{
+		FLAC__int32 curr;
+		__m128i xmm0, xmm1, xmm3, xmm6;
+		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
+		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
+		xmm0 = _mm_packs_epi32(xmm0, xmm1);
+
+		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
+		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
+		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
+		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
+		xmm3 = _mm_packs_epi32(xmm3, xmm1);
+
+		/* xmm0: qlp_coeff
+			xmm3: data */
+
+		xmm6 = _mm_madd_epi16(xmm3, xmm0);
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+		DATA16_RESULT(xmm6);
+
+		data_len--;
+
+		while(data_len) {
+			xmm3 = _mm_slli_si128(xmm3, 2);
+			xmm3 = _mm_insert_epi16(xmm3, curr, 0);
+
+			xmm6 = _mm_madd_epi16(xmm3, xmm0);
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
+			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
+
+			DATA16_RESULT(xmm6);
+
+			data_len--;
+		}
+	}
+}
+
+#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */
+
+#endif /* FLAC__SSE2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/lpc_intrin_sse41.c b/src/libFLAC/lpc_intrin_sse41.c
new file mode 100644
index 0000000..b6f4e5e
--- /dev/null
+++ b/src/libFLAC/lpc_intrin_sse41.c
@@ -0,0 +1,1312 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__INTEGER_ONLY_LIBRARY
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/lpc.h"
+#ifdef FLAC__SSE4_1_SUPPORTED
+
+#include "FLAC/assert.h"
+#include "FLAC/format.h"
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#if defined FLAC__CPU_IA32 /* unused for x64 */
+
+#define RESIDUAL64_RESULT(xmmN)  residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
+#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))
+
+FLAC__SSE_TARGET("sse4.1")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
+
+	if(order <= 12) {
+		if(order > 8) { /* order == 9, 10, 11, 12 */
+			if(order > 10) { /* order == 11, 12 */
+				if(order == 12) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
+					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
+					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
+						xmm7 = _mm_mul_epi32(xmm7, xmm5);
+
+						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
+						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm4);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT1(xmm7);
+					}
+				}
+				else { /* order == 11 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[10] * (FLAC__int64)data[i-11];
+						xmm7 = _mm_cvtsi32_si128(data[i-11]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm5);
+
+						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
+						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm4);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT1(xmm7);
+					}
+				}
+			}
+			else { /* order == 9, 10 */
+				if(order == 10) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
+						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 9 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[8] * (FLAC__int64)data[i-9];
+						xmm7 = _mm_cvtsi32_si128(data[i-9]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm4);
+
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm3);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else if(order > 4) { /* order == 5, 6, 7, 8 */
+			if(order > 6) { /* order == 7, 8 */
+				if(order == 8) {
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
+						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 7 */
+					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[6] * (FLAC__int64)data[i-7];
+						xmm7 = _mm_cvtsi32_si128(data[i-7]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm3);
+
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm2);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 5, 6 */
+				if(order == 6) {
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
+						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 5 */
+					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[4] * (FLAC__int64)data[i-5];
+						xmm7 = _mm_cvtsi32_si128(data[i-5]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm2);
+
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm1);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+		}
+		else { /* order == 1, 2, 3, 4 */
+			if(order > 2) { /* order == 3, 4 */
+				if(order == 4) {
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
+						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 3 */
+					__m128i xmm0, xmm1, xmm6, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
+
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum  = qlp_coeff[2] * (FLAC__int64)data[i-3];
+						xmm7 = _mm_cvtsi32_si128(data[i-3]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm1);
+
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
+						xmm6 = _mm_mul_epi32(xmm6, xmm0);
+						xmm7 = _mm_add_epi64(xmm7, xmm6);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+			else { /* order == 1, 2 */
+				if(order == 2) {
+					__m128i xmm0, xmm7;
+					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = 0;
+						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
+						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
+						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
+						xmm7 = _mm_mul_epi32(xmm7, xmm0);
+
+						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+				else { /* order == 1 */
+					__m128i xmm0, xmm7;
+					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);
+
+					for(i = 0; i < (int)data_len; i++) {
+						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
+						xmm7 = _mm_cvtsi32_si128(data[i-1]);
+						xmm7 = _mm_mul_epi32(xmm7, xmm0);
+						RESIDUAL64_RESULT(xmm7);
+					}
+				}
+			}
+		}
+	}
+	else { /* order > 12 */
+		FLAC__int64 sum;
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
+				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
+				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
+				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
+				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
+				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
+				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
+				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
+				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
+				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
+				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
+				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
+				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
+				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
+				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
+				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
+				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
+				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
+				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
+				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
+				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+}
+
+FLAC__SSE_TARGET("sse4.1")
+void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
+	int i;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	if (!data_len)
+		return;
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
+
+	if(order <= 12) {
+		if(order > 8) { /* order == 9, 10, 11, 12 */
+			if(order > 10) { /* order == 11, 12 */
+				__m128i qlp[6], dat[6];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));	// 0  0  q[1]  q[0]
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));	// 0  0  q[3]  q[2]
+				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));	// 0  0  q[5]  q[4]
+				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));	// 0  0  q[7]  q[6]
+				qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));	// 0  0  q[9]  q[8]
+				if (order == 12)
+					qlp[5] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10));	// 0  0  q[11] q[10]
+				else
+					qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]);					// 0  0  0     q[10]
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));	// 0  q[0]  0  q[1]
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));	// 0  q[2]  0  q[3]
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));	// 0  q[4]  0  q[5]
+				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));	// 0  q[5]  0  q[7]
+				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));	// 0  q[8]  0  q[9]
+				qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1));	// 0  q[10] 0  q[11]
+
+				dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12)));	// ?  d[i-11]  ?  d[i-12]
+				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));	// ?  d[i-9]   ?  d[i-10]
+				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));	// ?  d[i-7]   ?  d[i-8]
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));	// ?  d[i-5]   ?  d[i-6]
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));	// ?  d[i-3]   ?  d[i-4]
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));	// ?  d[i-1]   ?  d[i-2]
+
+				summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));	// ?_64  sum_64
+				summ = _mm_srl_epi64(summ, cnt);						// ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
+				temp = _mm_cvtsi32_si128(residual[0]);					// 0  0  0  r[i]
+				temp = _mm_add_epi32(temp, summ);						// ?  ?  ?  d[i]
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[5] = _mm_alignr_epi8(dat[4], dat[5], 8);	//  ?  d[i-10] ?  d[i-11]
+					dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);	//  ?  d[i-8]  ?  d[i-9]
+					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);	//  ?  d[i-6]  ?  d[i-7]
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);	//  ?  d[i-4]  ?  d[i-5]
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);	//  ?  d[i-2]  ?  d[i-3]
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);	//  ?  d[i  ]  ?  d[i-1]
+
+					summ =                     _mm_mul_epi32(dat[5], qlp[5]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));	// ?_64  sum_64
+					summ = _mm_srl_epi64(summ, cnt);						// ?_64  (sum >> lp_quantization)_64  ==  ?_32  ?_32  ?_32  (sum >> lp_quantization)_32
+					temp = _mm_cvtsi32_si128(residual[i]);					// 0  0  0  r[i]
+					temp = _mm_add_epi32(temp, summ);						// ?  ?  ?  d[i]
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+			else { /* order == 9, 10 */
+				__m128i qlp[5], dat[5];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+				if (order == 10)
+					qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
+				else
+					qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
+				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
+				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1));
+
+				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10)));
+				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);
+					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[4], qlp[4]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+		}
+		else if(order > 4) { /* order == 5, 6, 7, 8 */
+			if(order > 6) { /* order == 7, 8 */
+				__m128i qlp[4], dat[4];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+				if (order == 8)
+					qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
+				else
+					qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
+				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1));
+
+				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 )));
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[3], qlp[3]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+			else { /* order == 5, 6 */
+				__m128i qlp[3], dat[3];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				if (order == 6)
+					qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
+				else
+					qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1));
+
+				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 )));
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[2], qlp[2]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+		}
+		else { /* order == 1, 2, 3, 4 */
+			if(order > 2) { /* order == 3, 4 */
+				__m128i qlp[2], dat[2];
+				__m128i summ, temp;
+				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
+				if (order == 4)
+					qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
+				else
+					qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2]);
+
+				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1));
+				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1));
+
+				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 )));
+				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+				summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
+				summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+				summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+				summ = _mm_srl_epi64(summ, cnt);
+				temp = _mm_cvtsi32_si128(residual[0]);
+				temp = _mm_add_epi32(temp, summ);
+				data[0] = _mm_cvtsi128_si32(temp);
+
+				for(i = 1; i < (int)data_len; i++) {
+					dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
+					dat[0] = _mm_alignr_epi8(temp,   dat[0], 8);
+
+					summ =                     _mm_mul_epi32(dat[1], qlp[1]) ;
+					summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[i]);
+					temp = _mm_add_epi32(temp, summ);
+					data[i] = _mm_cvtsi128_si32(temp);
+				}
+			}
+			else { /* order == 1, 2 */
+				if(order == 2) {
+					__m128i qlp0, dat0;
+					__m128i summ, temp;
+					qlp0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff));
+					qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFFLE(2,0,3,1));
+
+					dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 )));
+
+					summ = _mm_mul_epi32(dat0, qlp0) ;
+
+					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[0]);
+					temp = _mm_add_epi32(temp, summ);
+					data[0] = _mm_cvtsi128_si32(temp);
+
+					for(i = 1; i < (int)data_len; i++) {
+						dat0 = _mm_alignr_epi8(temp, dat0, 8);
+
+						summ = _mm_mul_epi32(dat0, qlp0) ;
+
+						summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
+						summ = _mm_srl_epi64(summ, cnt);
+						temp = _mm_cvtsi32_si128(residual[i]);
+						temp = _mm_add_epi32(temp, summ);
+						data[i] = _mm_cvtsi128_si32(temp);
+					}
+				}
+				else { /* order == 1 */
+					__m128i qlp0;
+					__m128i summ, temp;
+					qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
+					temp = _mm_cvtsi32_si128(data[-1]);
+
+					summ = _mm_mul_epi32(temp, qlp0);
+					summ = _mm_srl_epi64(summ, cnt);
+					temp = _mm_cvtsi32_si128(residual[0]);
+					temp = _mm_add_epi32(temp, summ);
+					data[0] = _mm_cvtsi128_si32(temp);
+
+					for(i = 1; i < (int)data_len; i++) {
+						summ = _mm_mul_epi32(temp, qlp0) ;
+						summ = _mm_srl_epi64(summ, cnt);
+						temp = _mm_cvtsi32_si128(residual[i]);
+						temp = _mm_add_epi32(temp, summ);
+						data[i] = _mm_cvtsi128_si32(temp);
+					}
+				}
+			}
+		}
+	}
+	else { /* order > 12 */
+		FLAC__int64 sum;
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
+				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
+				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
+				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
+				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
+				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
+				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
+				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
+				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
+				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
+				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
+				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
+				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
+				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
+				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
+				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
+				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
+				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
+				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
+				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
+				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
+				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
+				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
+				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
+				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
+				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
+				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
+				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
+				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
+				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
+				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
+				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
+			}
+			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
+		}
+	}
+}
+
+#endif /* defined FLAC__CPU_IA32 */
+
+FLAC__SSE_TARGET("sse4.1")
+void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+{
+	int i;
+	FLAC__int32 sum;
+	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
+
+	FLAC__ASSERT(order > 0);
+	FLAC__ASSERT(order <= 32);
+
+	if(order <= 12) {
+		if(order > 8) {
+			if(order > 10) {
+				if(order == 12) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+					q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
+						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 11 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+					q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
+						mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 10) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+					q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
+						mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 9 */
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+					q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
+						mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else if(order > 4) {
+			if(order > 6) {
+				if(order == 8) {
+					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+					q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
+						mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 7 */
+					__m128i q0, q1, q2, q3, q4, q5, q6;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+					q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
+						mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 6) {
+					__m128i q0, q1, q2, q3, q4, q5;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+					q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
+						mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 5 */
+					__m128i q0, q1, q2, q3, q4;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+					q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
+						mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		else {
+			if(order > 2) {
+				if(order == 4) {
+					__m128i q0, q1, q2, q3;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+					q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
+						mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 3 */
+					__m128i q0, q1, q2;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+					q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
+						mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+			else {
+				if(order == 2) {
+					__m128i q0, q1;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+					q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ, mull;
+						summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
+						mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+				else { /* order == 1 */
+					__m128i q0;
+					q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
+
+					for(i = 0; i < (int)data_len-3; i+=4) {
+						__m128i summ;
+						summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
+						summ = _mm_sra_epi32(summ, cnt);
+						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
+					}
+				}
+			}
+		}
+		for(; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 12: sum += qlp_coeff[11] * data[i-12];
+				case 11: sum += qlp_coeff[10] * data[i-11];
+				case 10: sum += qlp_coeff[ 9] * data[i-10];
+				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
+				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
+				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
+				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
+				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
+				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
+				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
+				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
+				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+	else { /* order > 12 */
+		for(i = 0; i < (int)data_len; i++) {
+			sum = 0;
+			switch(order) {
+				case 32: sum += qlp_coeff[31] * data[i-32];
+				case 31: sum += qlp_coeff[30] * data[i-31];
+				case 30: sum += qlp_coeff[29] * data[i-30];
+				case 29: sum += qlp_coeff[28] * data[i-29];
+				case 28: sum += qlp_coeff[27] * data[i-28];
+				case 27: sum += qlp_coeff[26] * data[i-27];
+				case 26: sum += qlp_coeff[25] * data[i-26];
+				case 25: sum += qlp_coeff[24] * data[i-25];
+				case 24: sum += qlp_coeff[23] * data[i-24];
+				case 23: sum += qlp_coeff[22] * data[i-23];
+				case 22: sum += qlp_coeff[21] * data[i-22];
+				case 21: sum += qlp_coeff[20] * data[i-21];
+				case 20: sum += qlp_coeff[19] * data[i-20];
+				case 19: sum += qlp_coeff[18] * data[i-19];
+				case 18: sum += qlp_coeff[17] * data[i-18];
+				case 17: sum += qlp_coeff[16] * data[i-17];
+				case 16: sum += qlp_coeff[15] * data[i-16];
+				case 15: sum += qlp_coeff[14] * data[i-15];
+				case 14: sum += qlp_coeff[13] * data[i-14];
+				case 13: sum += qlp_coeff[12] * data[i-13];
+				         sum += qlp_coeff[11] * data[i-12];
+				         sum += qlp_coeff[10] * data[i-11];
+				         sum += qlp_coeff[ 9] * data[i-10];
+				         sum += qlp_coeff[ 8] * data[i- 9];
+				         sum += qlp_coeff[ 7] * data[i- 8];
+				         sum += qlp_coeff[ 6] * data[i- 7];
+				         sum += qlp_coeff[ 5] * data[i- 6];
+				         sum += qlp_coeff[ 4] * data[i- 5];
+				         sum += qlp_coeff[ 3] * data[i- 4];
+				         sum += qlp_coeff[ 2] * data[i- 3];
+				         sum += qlp_coeff[ 1] * data[i- 2];
+				         sum += qlp_coeff[ 0] * data[i- 1];
+			}
+			residual[i] = data[i] - (sum >> lp_quantization);
+		}
+	}
+}
+
+#endif /* FLAC__SSE4_1_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__INTEGER_ONLY_LIBRARY */
diff --git a/src/libFLAC/md5.c b/src/libFLAC/md5.c
index 9f47944..e5adc3e 100644
--- a/src/libFLAC/md5.c
+++ b/src/libFLAC/md5.c
@@ -1,4 +1,4 @@
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
@@ -7,10 +7,7 @@
 
 #include "private/md5.h"
 #include "share/alloc.h"
-
-#ifndef FLaC__INLINE
-#define FLaC__INLINE
-#endif
+#include "share/endswap.h"
 
 /*
  * This code implements the MD5 message-digest algorithm.
@@ -143,7 +140,7 @@
 {
 	register FLAC__uint32 x;
 	do {
-		x = *buf; 
+		x = *buf;
 		x = ((x << 8) & 0xff00ff00) | ((x >> 8) & 0x00ff00ff);
 		*buf++ = (x >> 16) | (x << 16);
 	} while (--words);
@@ -227,7 +224,7 @@
 	ctx->bytes[0] = 0;
 	ctx->bytes[1] = 0;
 
-	ctx->internal_buf = 0;
+	ctx->internal_buf.p8= 0;
 	ctx->capacity = 0;
 }
 
@@ -263,9 +260,9 @@
 
 	byteSwap(ctx->buf, 4);
 	memcpy(digest, ctx->buf, 16);
-	if(0 != ctx->internal_buf) {
-		free(ctx->internal_buf);
-		ctx->internal_buf = 0;
+	if (0 != ctx->internal_buf.p8) {
+		free(ctx->internal_buf.p8);
+		ctx->internal_buf.p8= 0;
 		ctx->capacity = 0;
 	}
 	memset(ctx, 0, sizeof(*ctx));	/* In case it's sensitive */
@@ -274,58 +271,124 @@
 /*
  * Convert the incoming audio signal to a byte stream
  */
-static void format_input_(FLAC__byte *buf, const FLAC__int32 * const signal[], unsigned channels, unsigned samples, unsigned bytes_per_sample)
+static void format_input_(FLAC__multibyte *mbuf, const FLAC__int32 * const signal[], unsigned channels, unsigned samples, unsigned bytes_per_sample)
 {
+	FLAC__byte *buf_ = mbuf->p8;
+	FLAC__int16 *buf16 = mbuf->p16;
+	FLAC__int32 *buf32 = mbuf->p32;
+	FLAC__int32 a_word;
 	unsigned channel, sample;
-	register FLAC__int32 a_word;
-	register FLAC__byte *buf_ = buf;
 
-#if WORDS_BIGENDIAN
-#else
-	if(channels == 2 && bytes_per_sample == 2) {
-		FLAC__int16 *buf1_ = ((FLAC__int16*)buf_) + 1;
-		memcpy(buf_, signal[0], sizeof(FLAC__int32) * samples);
-		for(sample = 0; sample < samples; sample++, buf1_+=2)
-			*buf1_ = (FLAC__int16)signal[1][sample];
-	}
-	else if(channels == 1 && bytes_per_sample == 2) {
-		FLAC__int16 *buf1_ = (FLAC__int16*)buf_;
-		for(sample = 0; sample < samples; sample++)
-			*buf1_++ = (FLAC__int16)signal[0][sample];
-	}
-	else
-#endif
-	if(bytes_per_sample == 2) {
-		if(channels == 2) {
-			for(sample = 0; sample < samples; sample++) {
+	/* Storage in the output buffer, buf, is little endian. */
+
+#define BYTES_CHANNEL_SELECTOR(bytes, channels)   (bytes * 100 + channels)
+
+	/* First do the most commonly used combinations. */
+	switch (BYTES_CHANNEL_SELECTOR (bytes_per_sample, channels)) {
+		/* One byte per sample. */
+		case (BYTES_CHANNEL_SELECTOR (1, 1)):
+			for (sample = 0; sample < samples; sample++)
+				*buf_++ = signal[0][sample];
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (1, 2)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf_++ = signal[0][sample];
+				*buf_++ = signal[1][sample];
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (1, 4)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf_++ = signal[0][sample];
+				*buf_++ = signal[1][sample];
+				*buf_++ = signal[2][sample];
+				*buf_++ = signal[3][sample];
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (1, 6)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf_++ = signal[0][sample];
+				*buf_++ = signal[1][sample];
+				*buf_++ = signal[2][sample];
+				*buf_++ = signal[3][sample];
+				*buf_++ = signal[4][sample];
+				*buf_++ = signal[5][sample];
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (1, 8)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf_++ = signal[0][sample];
+				*buf_++ = signal[1][sample];
+				*buf_++ = signal[2][sample];
+				*buf_++ = signal[3][sample];
+				*buf_++ = signal[4][sample];
+				*buf_++ = signal[5][sample];
+				*buf_++ = signal[6][sample];
+				*buf_++ = signal[7][sample];
+			}
+			return;
+
+		/* Two bytes per sample. */
+		case (BYTES_CHANNEL_SELECTOR (2, 1)):
+			for (sample = 0; sample < samples; sample++)
+				*buf16++ = H2LE_16(signal[0][sample]);
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (2, 2)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf16++ = H2LE_16(signal[0][sample]);
+				*buf16++ = H2LE_16(signal[1][sample]);
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (2, 4)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf16++ = H2LE_16(signal[0][sample]);
+				*buf16++ = H2LE_16(signal[1][sample]);
+				*buf16++ = H2LE_16(signal[2][sample]);
+				*buf16++ = H2LE_16(signal[3][sample]);
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (2, 6)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf16++ = H2LE_16(signal[0][sample]);
+				*buf16++ = H2LE_16(signal[1][sample]);
+				*buf16++ = H2LE_16(signal[2][sample]);
+				*buf16++ = H2LE_16(signal[3][sample]);
+				*buf16++ = H2LE_16(signal[4][sample]);
+				*buf16++ = H2LE_16(signal[5][sample]);
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (2, 8)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf16++ = H2LE_16(signal[0][sample]);
+				*buf16++ = H2LE_16(signal[1][sample]);
+				*buf16++ = H2LE_16(signal[2][sample]);
+				*buf16++ = H2LE_16(signal[3][sample]);
+				*buf16++ = H2LE_16(signal[4][sample]);
+				*buf16++ = H2LE_16(signal[5][sample]);
+				*buf16++ = H2LE_16(signal[6][sample]);
+				*buf16++ = H2LE_16(signal[7][sample]);
+			}
+			return;
+
+		/* Three bytes per sample. */
+		case (BYTES_CHANNEL_SELECTOR (3, 1)):
+			for (sample = 0; sample < samples; sample++) {
 				a_word = signal[0][sample];
 				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word;
-				a_word = signal[1][sample];
 				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
 				*buf_++ = (FLAC__byte)a_word;
 			}
-		}
-		else if(channels == 1) {
-			for(sample = 0; sample < samples; sample++) {
-				a_word = signal[0][sample];
-				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word;
-			}
-		}
-		else {
-			for(sample = 0; sample < samples; sample++) {
-				for(channel = 0; channel < channels; channel++) {
-					a_word = signal[channel][sample];
-					*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-					*buf_++ = (FLAC__byte)a_word;
-				}
-			}
-		}
-	}
-	else if(bytes_per_sample == 3) {
-		if(channels == 2) {
-			for(sample = 0; sample < samples; sample++) {
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (3, 2)):
+			for (sample = 0; sample < samples; sample++) {
 				a_word = signal[0][sample];
 				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
 				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
@@ -335,60 +398,90 @@
 				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
 				*buf_++ = (FLAC__byte)a_word;
 			}
-		}
-		else if(channels == 1) {
-			for(sample = 0; sample < samples; sample++) {
-				a_word = signal[0][sample];
-				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word;
+			return;
+
+		/* Four bytes per sample. */
+		case (BYTES_CHANNEL_SELECTOR (4, 1)):
+			for (sample = 0; sample < samples; sample++)
+				*buf32++ = H2LE_32(signal[0][sample]);
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (4, 2)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf32++ = H2LE_32(signal[0][sample]);
+				*buf32++ = H2LE_32(signal[1][sample]);
 			}
-		}
-		else {
-			for(sample = 0; sample < samples; sample++) {
-				for(channel = 0; channel < channels; channel++) {
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (4, 4)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf32++ = H2LE_32(signal[0][sample]);
+				*buf32++ = H2LE_32(signal[1][sample]);
+				*buf32++ = H2LE_32(signal[2][sample]);
+				*buf32++ = H2LE_32(signal[3][sample]);
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (4, 6)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf32++ = H2LE_32(signal[0][sample]);
+				*buf32++ = H2LE_32(signal[1][sample]);
+				*buf32++ = H2LE_32(signal[2][sample]);
+				*buf32++ = H2LE_32(signal[3][sample]);
+				*buf32++ = H2LE_32(signal[4][sample]);
+				*buf32++ = H2LE_32(signal[5][sample]);
+			}
+			return;
+
+		case (BYTES_CHANNEL_SELECTOR (4, 8)):
+			for (sample = 0; sample < samples; sample++) {
+				*buf32++ = H2LE_32(signal[0][sample]);
+				*buf32++ = H2LE_32(signal[1][sample]);
+				*buf32++ = H2LE_32(signal[2][sample]);
+				*buf32++ = H2LE_32(signal[3][sample]);
+				*buf32++ = H2LE_32(signal[4][sample]);
+				*buf32++ = H2LE_32(signal[5][sample]);
+				*buf32++ = H2LE_32(signal[6][sample]);
+				*buf32++ = H2LE_32(signal[7][sample]);
+			}
+			return;
+
+		default:
+			break;
+	}
+
+	/* General version. */
+	switch (bytes_per_sample) {
+		case 1:
+			for (sample = 0; sample < samples; sample++)
+				for (channel = 0; channel < channels; channel++)
+					*buf_++ = signal[channel][sample];
+			return;
+
+		case 2:
+			for (sample = 0; sample < samples; sample++)
+				for (channel = 0; channel < channels; channel++)
+					*buf16++ = H2LE_16(signal[channel][sample]);
+			return;
+
+		case 3:
+			for (sample = 0; sample < samples; sample++)
+				for (channel = 0; channel < channels; channel++) {
 					a_word = signal[channel][sample];
 					*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
 					*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
 					*buf_++ = (FLAC__byte)a_word;
 				}
-			}
-		}
-	}
-	else if(bytes_per_sample == 1) {
-		if(channels == 2) {
-			for(sample = 0; sample < samples; sample++) {
-				a_word = signal[0][sample];
-				*buf_++ = (FLAC__byte)a_word;
-				a_word = signal[1][sample];
-				*buf_++ = (FLAC__byte)a_word;
-			}
-		}
-		else if(channels == 1) {
-			for(sample = 0; sample < samples; sample++) {
-				a_word = signal[0][sample];
-				*buf_++ = (FLAC__byte)a_word;
-			}
-		}
-		else {
-			for(sample = 0; sample < samples; sample++) {
-				for(channel = 0; channel < channels; channel++) {
-					a_word = signal[channel][sample];
-					*buf_++ = (FLAC__byte)a_word;
-				}
-			}
-		}
-	}
-	else { /* bytes_per_sample == 4, maybe optimize more later */
-		for(sample = 0; sample < samples; sample++) {
-			for(channel = 0; channel < channels; channel++) {
-				a_word = signal[channel][sample];
-				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word; a_word >>= 8;
-				*buf_++ = (FLAC__byte)a_word;
-			}
-		}
+			return;
+
+		case 4:
+			for (sample = 0; sample < samples; sample++)
+				for (channel = 0; channel < channels; channel++)
+					*buf32++ = H2LE_32(signal[channel][sample]);
+			return;
+
+		default:
+			break;
 	}
 }
 
@@ -400,25 +493,26 @@
 	const size_t bytes_needed = (size_t)channels * (size_t)samples * (size_t)bytes_per_sample;
 
 	/* overflow check */
-	if((size_t)channels > SIZE_MAX / (size_t)bytes_per_sample)
+	if ((size_t)channels > SIZE_MAX / (size_t)bytes_per_sample)
 		return false;
-	if((size_t)channels * (size_t)bytes_per_sample > SIZE_MAX / (size_t)samples)
+	if ((size_t)channels * (size_t)bytes_per_sample > SIZE_MAX / (size_t)samples)
 		return false;
 
-	if(ctx->capacity < bytes_needed) {
-		FLAC__byte *tmp = (FLAC__byte*)realloc(ctx->internal_buf, bytes_needed);
-		if(0 == tmp) {
-			free(ctx->internal_buf);
-			if(0 == (ctx->internal_buf = (FLAC__byte*)safe_malloc_(bytes_needed)))
+	if (ctx->capacity < bytes_needed) {
+		FLAC__byte *tmp = realloc(ctx->internal_buf.p8, bytes_needed);
+		if (0 == tmp) {
+			free(ctx->internal_buf.p8);
+			if (0 == (ctx->internal_buf.p8= safe_malloc_(bytes_needed)))
 				return false;
 		}
-		ctx->internal_buf = tmp;
+		else
+			ctx->internal_buf.p8= tmp;
 		ctx->capacity = bytes_needed;
 	}
 
-	format_input_(ctx->internal_buf, signal, channels, samples, bytes_per_sample);
+	format_input_(&ctx->internal_buf, signal, channels, samples, bytes_per_sample);
 
-	FLAC__MD5Update(ctx, ctx->internal_buf, (unsigned)bytes_needed);
+	FLAC__MD5Update(ctx, ctx->internal_buf.p8, bytes_needed);
 
 	return true;
 }
diff --git a/src/libFLAC/memory.c b/src/libFLAC/memory.c
index 4d10097..d22df70 100644
--- a/src/libFLAC/memory.c
+++ b/src/libFLAC/memory.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,10 +30,14 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+
 #include "private/memory.h"
 #include "FLAC/assert.h"
 #include "share/alloc.h"
@@ -45,25 +50,8 @@
 
 #ifdef FLAC__ALIGN_MALLOC_DATA
 	/* align on 32-byte (256-bit) boundary */
-	x = safe_malloc_add_2op_(bytes, /*+*/31);
-#ifdef SIZEOF_VOIDP
-#if SIZEOF_VOIDP == 4
-		/* could do  *aligned_address = x + ((unsigned) (32 - (((unsigned)x) & 31))) & 31; */
-		*aligned_address = (void*)(((unsigned)x + 31) & -32);
-#elif SIZEOF_VOIDP == 8
-		*aligned_address = (void*)(((FLAC__uint64)x + 31) & (FLAC__uint64)(-((FLAC__int64)32)));
-#else
-# error  Unsupported sizeof(void*)
-#endif
-#else
-	/* there's got to be a better way to do this right for all archs */
-	if(sizeof(void*) == sizeof(unsigned))
-		*aligned_address = (void*)(((unsigned)x + 31) & -32);
-	else if(sizeof(void*) == sizeof(FLAC__uint64))
-		*aligned_address = (void*)(((FLAC__uint64)x + 31) & (FLAC__uint64)(-((FLAC__int64)32)));
-	else
-		return 0;
-#endif
+	x = safe_malloc_add_2op_(bytes, /*+*/31L);
+	*aligned_address = (void*)(((uintptr_t)x + 31L) & -32L);
 #else
 	x = safe_malloc_(bytes);
 	*aligned_address = x;
@@ -71,7 +59,7 @@
 	return x;
 }
 
-FLAC__bool FLAC__memory_alloc_aligned_int32_array(unsigned elements, FLAC__int32 **unaligned_pointer, FLAC__int32 **aligned_pointer)
+FLAC__bool FLAC__memory_alloc_aligned_int32_array(size_t elements, FLAC__int32 **unaligned_pointer, FLAC__int32 **aligned_pointer)
 {
 	FLAC__int32 *pu; /* unaligned pointer */
 	union { /* union needed to comply with C99 pointer aliasing rules */
@@ -84,10 +72,10 @@
 	FLAC__ASSERT(0 != aligned_pointer);
 	FLAC__ASSERT(unaligned_pointer != aligned_pointer);
 
-	if((size_t)elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
+	if(elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
 		return false;
 
-	pu = (FLAC__int32*)FLAC__memory_alloc_aligned(sizeof(*pu) * (size_t)elements, &u.pv);
+	pu = FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
 	if(0 == pu) {
 		return false;
 	}
@@ -100,7 +88,7 @@
 	}
 }
 
-FLAC__bool FLAC__memory_alloc_aligned_uint32_array(unsigned elements, FLAC__uint32 **unaligned_pointer, FLAC__uint32 **aligned_pointer)
+FLAC__bool FLAC__memory_alloc_aligned_uint32_array(size_t elements, FLAC__uint32 **unaligned_pointer, FLAC__uint32 **aligned_pointer)
 {
 	FLAC__uint32 *pu; /* unaligned pointer */
 	union { /* union needed to comply with C99 pointer aliasing rules */
@@ -113,10 +101,10 @@
 	FLAC__ASSERT(0 != aligned_pointer);
 	FLAC__ASSERT(unaligned_pointer != aligned_pointer);
 
-	if((size_t)elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
+	if(elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
 		return false;
 
-	pu = (FLAC__uint32*)FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
+	pu = FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
 	if(0 == pu) {
 		return false;
 	}
@@ -129,7 +117,7 @@
 	}
 }
 
-FLAC__bool FLAC__memory_alloc_aligned_uint64_array(unsigned elements, FLAC__uint64 **unaligned_pointer, FLAC__uint64 **aligned_pointer)
+FLAC__bool FLAC__memory_alloc_aligned_uint64_array(size_t elements, FLAC__uint64 **unaligned_pointer, FLAC__uint64 **aligned_pointer)
 {
 	FLAC__uint64 *pu; /* unaligned pointer */
 	union { /* union needed to comply with C99 pointer aliasing rules */
@@ -142,10 +130,10 @@
 	FLAC__ASSERT(0 != aligned_pointer);
 	FLAC__ASSERT(unaligned_pointer != aligned_pointer);
 
-	if((size_t)elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
+	if(elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
 		return false;
 
-	pu = (FLAC__uint64*)FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
+	pu = FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
 	if(0 == pu) {
 		return false;
 	}
@@ -158,7 +146,7 @@
 	}
 }
 
-FLAC__bool FLAC__memory_alloc_aligned_unsigned_array(unsigned elements, unsigned **unaligned_pointer, unsigned **aligned_pointer)
+FLAC__bool FLAC__memory_alloc_aligned_unsigned_array(size_t elements, unsigned **unaligned_pointer, unsigned **aligned_pointer)
 {
 	unsigned *pu; /* unaligned pointer */
 	union { /* union needed to comply with C99 pointer aliasing rules */
@@ -171,10 +159,10 @@
 	FLAC__ASSERT(0 != aligned_pointer);
 	FLAC__ASSERT(unaligned_pointer != aligned_pointer);
 
-	if((size_t)elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
+	if(elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
 		return false;
 
-	pu = (unsigned*)FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
+	pu = FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
 	if(0 == pu) {
 		return false;
 	}
@@ -189,7 +177,7 @@
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 
-FLAC__bool FLAC__memory_alloc_aligned_real_array(unsigned elements, FLAC__real **unaligned_pointer, FLAC__real **aligned_pointer)
+FLAC__bool FLAC__memory_alloc_aligned_real_array(size_t elements, FLAC__real **unaligned_pointer, FLAC__real **aligned_pointer)
 {
 	FLAC__real *pu; /* unaligned pointer */
 	union { /* union needed to comply with C99 pointer aliasing rules */
@@ -202,10 +190,10 @@
 	FLAC__ASSERT(0 != aligned_pointer);
 	FLAC__ASSERT(unaligned_pointer != aligned_pointer);
 
-	if((size_t)elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
+	if(elements > SIZE_MAX / sizeof(*pu)) /* overflow check */
 		return false;
 
-	pu = (FLAC__real*)FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
+	pu = FLAC__memory_alloc_aligned(sizeof(*pu) * elements, &u.pv);
 	if(0 == pu) {
 		return false;
 	}
@@ -219,3 +207,12 @@
 }
 
 #endif
+
+void *safe_malloc_mul_2op_p(size_t size1, size_t size2)
+{
+	if(!size1 || !size2)
+		return malloc(1); /* malloc(0) is undefined; FLAC src convention is to always allocate */
+	if(size1 > SIZE_MAX / size2)
+		return 0;
+	return malloc(size1*size2);
+}
diff --git a/src/libFLAC/metadata_iterators.c b/src/libFLAC/metadata_iterators.c
new file mode 100644
index 0000000..1e28925
--- /dev/null
+++ b/src/libFLAC/metadata_iterators.c
@@ -0,0 +1,3402 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include <sys/stat.h> /* for stat(), maybe chmod() */
+
+#include "private/metadata.h"
+
+#include "FLAC/assert.h"
+#include "FLAC/stream_decoder.h"
+#include "share/alloc.h"
+#include "share/compat.h"
+#include "share/macros.h"
+#include "share/safe_str.h"
+#include "private/macros.h"
+#include "private/memory.h"
+
+/* Alias the first (in share/alloc.h) to the second (in src/libFLAC/memory.c). */
+#define safe_malloc_mul_2op_ safe_malloc_mul_2op_p
+
+/****************************************************************************
+ *
+ * Local function declarations
+ *
+ ***************************************************************************/
+
+static void pack_uint32_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes);
+static void pack_uint32_little_endian_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes);
+static void pack_uint64_(FLAC__uint64 val, FLAC__byte *b, unsigned bytes);
+static FLAC__uint32 unpack_uint32_(FLAC__byte *b, unsigned bytes);
+static FLAC__uint32 unpack_uint32_little_endian_(FLAC__byte *b, unsigned bytes);
+static FLAC__uint64 unpack_uint64_(FLAC__byte *b, unsigned bytes);
+
+static FLAC__bool read_metadata_block_header_(FLAC__Metadata_SimpleIterator *iterator);
+static FLAC__bool read_metadata_block_data_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block);
+static FLAC__bool read_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__bool *is_last, FLAC__MetadataType *type, unsigned *length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_StreamInfo *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_Padding *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Application *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_SeekTable *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_entry_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_VorbisComment_Entry *entry, unsigned max_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_VorbisComment *block, unsigned block_length);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_track_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet_Track *track);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Picture *block);
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Unknown *block, unsigned block_length);
+
+static FLAC__bool write_metadata_block_header_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_data_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_StreamInfo *block);
+static FLAC__bool write_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Padding *block, unsigned block_length);
+static FLAC__bool write_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Application *block, unsigned block_length);
+static FLAC__bool write_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_SeekTable *block);
+static FLAC__bool write_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_VorbisComment *block);
+static FLAC__bool write_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_CueSheet *block);
+static FLAC__bool write_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Picture *block);
+static FLAC__bool write_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Unknown *block, unsigned block_length);
+
+static FLAC__bool write_metadata_block_stationary_(FLAC__Metadata_SimpleIterator *iterator, const FLAC__StreamMetadata *block);
+static FLAC__bool write_metadata_block_stationary_with_padding_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, unsigned padding_length, FLAC__bool padding_is_last);
+static FLAC__bool rewrite_whole_file_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool append);
+
+static void simple_iterator_push_(FLAC__Metadata_SimpleIterator *iterator);
+static FLAC__bool simple_iterator_pop_(FLAC__Metadata_SimpleIterator *iterator);
+
+static unsigned seek_to_first_metadata_block_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb);
+static unsigned seek_to_first_metadata_block_(FILE *f);
+
+static FLAC__bool simple_iterator_copy_file_prefix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, FLAC__bool append);
+static FLAC__bool simple_iterator_copy_file_postfix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, int fixup_is_last_code, FLAC__off_t fixup_is_last_flag_offset, FLAC__bool backup);
+
+static FLAC__bool copy_n_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool copy_n_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool copy_remaining_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool copy_remaining_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Eof eof_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__Metadata_SimpleIteratorStatus *status);
+
+static FLAC__bool open_tempfile_(const char *filename, const char *tempfile_path_prefix, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status);
+static FLAC__bool transport_tempfile_(const char *filename, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status);
+static void cleanup_tempfile_(FILE **tempfile, char **tempfilename);
+
+static FLAC__bool get_file_stats_(const char *filename, struct flac_stat_s *stats);
+static void set_file_stats_(const char *filename, struct flac_stat_s *stats);
+
+static int fseek_wrapper_(FLAC__IOHandle handle, FLAC__int64 offset, int whence);
+static FLAC__int64 ftell_wrapper_(FLAC__IOHandle handle);
+
+static FLAC__Metadata_ChainStatus get_equivalent_status_(FLAC__Metadata_SimpleIteratorStatus status);
+
+
+#ifdef FLAC__VALGRIND_TESTING
+static size_t local__fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream)
+{
+	size_t ret = fwrite(ptr, size, nmemb, stream);
+	if(!ferror(stream))
+		fflush(stream);
+	return ret;
+}
+#else
+#define local__fwrite fwrite
+#endif
+
+/****************************************************************************
+ *
+ * Level 0 implementation
+ *
+ ***************************************************************************/
+
+static FLAC__StreamDecoderWriteStatus write_callback_(const FLAC__StreamDecoder *decoder, const FLAC__Frame *frame, const FLAC__int32 * const buffer[], void *client_data);
+static void metadata_callback_(const FLAC__StreamDecoder *decoder, const FLAC__StreamMetadata *metadata, void *client_data);
+static void error_callback_(const FLAC__StreamDecoder *decoder, FLAC__StreamDecoderErrorStatus status, void *client_data);
+
+typedef struct {
+	FLAC__bool got_error;
+	FLAC__StreamMetadata *object;
+} level0_client_data;
+
+static FLAC__StreamMetadata *get_one_metadata_block_(const char *filename, FLAC__MetadataType type)
+{
+	level0_client_data cd;
+	FLAC__StreamDecoder *decoder;
+
+	FLAC__ASSERT(0 != filename);
+
+	cd.got_error = false;
+	cd.object = 0;
+
+	decoder = FLAC__stream_decoder_new();
+
+	if(0 == decoder)
+		return 0;
+
+	FLAC__stream_decoder_set_md5_checking(decoder, false);
+	FLAC__stream_decoder_set_metadata_ignore_all(decoder);
+	FLAC__stream_decoder_set_metadata_respond(decoder, type);
+
+	if(FLAC__stream_decoder_init_file(decoder, filename, write_callback_, metadata_callback_, error_callback_, &cd) != FLAC__STREAM_DECODER_INIT_STATUS_OK || cd.got_error) {
+		(void)FLAC__stream_decoder_finish(decoder);
+		FLAC__stream_decoder_delete(decoder);
+		return 0;
+	}
+
+	if(!FLAC__stream_decoder_process_until_end_of_metadata(decoder) || cd.got_error) {
+		(void)FLAC__stream_decoder_finish(decoder);
+		FLAC__stream_decoder_delete(decoder);
+		if(0 != cd.object)
+			FLAC__metadata_object_delete(cd.object);
+		return 0;
+	}
+
+	(void)FLAC__stream_decoder_finish(decoder);
+	FLAC__stream_decoder_delete(decoder);
+
+	return cd.object;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_streaminfo(const char *filename, FLAC__StreamMetadata *streaminfo)
+{
+	FLAC__StreamMetadata *object;
+
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != streaminfo);
+
+	object = get_one_metadata_block_(filename, FLAC__METADATA_TYPE_STREAMINFO);
+
+	if (object) {
+		/* can just copy the contents since STREAMINFO has no internal structure */
+		*streaminfo = *object;
+		FLAC__metadata_object_delete(object);
+		return true;
+	}
+	else {
+		return false;
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_tags(const char *filename, FLAC__StreamMetadata **tags)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != tags);
+
+	*tags = get_one_metadata_block_(filename, FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	return 0 != *tags;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_cuesheet(const char *filename, FLAC__StreamMetadata **cuesheet)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != cuesheet);
+
+	*cuesheet = get_one_metadata_block_(filename, FLAC__METADATA_TYPE_CUESHEET);
+
+	return 0 != *cuesheet;
+}
+
+FLAC__StreamDecoderWriteStatus write_callback_(const FLAC__StreamDecoder *decoder, const FLAC__Frame *frame, const FLAC__int32 * const buffer[], void *client_data)
+{
+	(void)decoder, (void)frame, (void)buffer, (void)client_data;
+
+	return FLAC__STREAM_DECODER_WRITE_STATUS_CONTINUE;
+}
+
+void metadata_callback_(const FLAC__StreamDecoder *decoder, const FLAC__StreamMetadata *metadata, void *client_data)
+{
+	level0_client_data *cd = (level0_client_data *)client_data;
+	(void)decoder;
+
+	/*
+	 * we assume we only get here when the one metadata block we were
+	 * looking for was passed to us
+	 */
+	if(!cd->got_error && 0 == cd->object) {
+		if(0 == (cd->object = FLAC__metadata_object_clone(metadata)))
+			cd->got_error = true;
+	}
+}
+
+void error_callback_(const FLAC__StreamDecoder *decoder, FLAC__StreamDecoderErrorStatus status, void *client_data)
+{
+	level0_client_data *cd = (level0_client_data *)client_data;
+	(void)decoder;
+
+	if(status != FLAC__STREAM_DECODER_ERROR_STATUS_LOST_SYNC)
+		cd->got_error = true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_get_picture(const char *filename, FLAC__StreamMetadata **picture, FLAC__StreamMetadata_Picture_Type type, const char *mime_type, const FLAC__byte *description, unsigned max_width, unsigned max_height, unsigned max_depth, unsigned max_colors)
+{
+	FLAC__Metadata_SimpleIterator *it;
+	FLAC__uint64 max_area_seen = 0;
+	FLAC__uint64 max_depth_seen = 0;
+
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != picture);
+
+	*picture = 0;
+
+	it = FLAC__metadata_simple_iterator_new();
+	if(0 == it)
+		return false;
+	if(!FLAC__metadata_simple_iterator_init(it, filename, /*read_only=*/true, /*preserve_file_stats=*/true)) {
+		FLAC__metadata_simple_iterator_delete(it);
+		return false;
+	}
+	do {
+		if(FLAC__metadata_simple_iterator_get_block_type(it) == FLAC__METADATA_TYPE_PICTURE) {
+			FLAC__StreamMetadata *obj = FLAC__metadata_simple_iterator_get_block(it);
+			FLAC__uint64 area = (FLAC__uint64)obj->data.picture.width * (FLAC__uint64)obj->data.picture.height;
+			/* check constraints */
+			if(
+				(type == (FLAC__StreamMetadata_Picture_Type)(-1) || type == obj->data.picture.type) &&
+				(mime_type == 0 || !strcmp(mime_type, obj->data.picture.mime_type)) &&
+				(description == 0 || !strcmp((const char *)description, (const char *)obj->data.picture.description)) &&
+				obj->data.picture.width <= max_width &&
+				obj->data.picture.height <= max_height &&
+				obj->data.picture.depth <= max_depth &&
+				obj->data.picture.colors <= max_colors &&
+				(area > max_area_seen || (area == max_area_seen && obj->data.picture.depth > max_depth_seen))
+			) {
+				if(*picture)
+					FLAC__metadata_object_delete(*picture);
+				*picture = obj;
+				max_area_seen = area;
+				max_depth_seen = obj->data.picture.depth;
+			}
+			else {
+				FLAC__metadata_object_delete(obj);
+			}
+		}
+	} while(FLAC__metadata_simple_iterator_next(it));
+
+	FLAC__metadata_simple_iterator_delete(it);
+
+	return (0 != *picture);
+}
+
+
+/****************************************************************************
+ *
+ * Level 1 implementation
+ *
+ ***************************************************************************/
+
+#define SIMPLE_ITERATOR_MAX_PUSH_DEPTH (1+4)
+/* 1 for initial offset, +4 for our own personal use */
+
+struct FLAC__Metadata_SimpleIterator {
+	FILE *file;
+	char *filename, *tempfile_path_prefix;
+	struct flac_stat_s stats;
+	FLAC__bool has_stats;
+	FLAC__bool is_writable;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	FLAC__off_t offset[SIMPLE_ITERATOR_MAX_PUSH_DEPTH];
+	FLAC__off_t first_offset; /* this is the offset to the STREAMINFO block */
+	unsigned depth;
+	/* this is the metadata block header of the current block we are pointing to: */
+	FLAC__bool is_last;
+	FLAC__MetadataType type;
+	unsigned length;
+};
+
+FLAC_API const char * const FLAC__Metadata_SimpleIteratorStatusString[] = {
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_A_FLAC_FILE",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_RENAME_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_UNLINK_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR",
+	"FLAC__METADATA_SIMPLE_ITERATOR_STATUS_INTERNAL_ERROR"
+};
+
+
+FLAC_API FLAC__Metadata_SimpleIterator *FLAC__metadata_simple_iterator_new(void)
+{
+	FLAC__Metadata_SimpleIterator *iterator = calloc(1, sizeof(FLAC__Metadata_SimpleIterator));
+
+	if(0 != iterator) {
+		iterator->file = 0;
+		iterator->filename = 0;
+		iterator->tempfile_path_prefix = 0;
+		iterator->has_stats = false;
+		iterator->is_writable = false;
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+		iterator->first_offset = iterator->offset[0] = -1;
+		iterator->depth = 0;
+	}
+
+	return iterator;
+}
+
+static void simple_iterator_free_guts_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	if(0 != iterator->file) {
+		fclose(iterator->file);
+		iterator->file = 0;
+		if(iterator->has_stats)
+			set_file_stats_(iterator->filename, &iterator->stats);
+	}
+	if(0 != iterator->filename) {
+		free(iterator->filename);
+		iterator->filename = 0;
+	}
+	if(0 != iterator->tempfile_path_prefix) {
+		free(iterator->tempfile_path_prefix);
+		iterator->tempfile_path_prefix = 0;
+	}
+}
+
+FLAC_API void FLAC__metadata_simple_iterator_delete(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	simple_iterator_free_guts_(iterator);
+	free(iterator);
+}
+
+FLAC_API FLAC__Metadata_SimpleIteratorStatus FLAC__metadata_simple_iterator_status(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__Metadata_SimpleIteratorStatus status;
+
+	FLAC__ASSERT(0 != iterator);
+
+	status = iterator->status;
+	iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+	return status;
+}
+
+static FLAC__bool simple_iterator_prime_input_(FLAC__Metadata_SimpleIterator *iterator, FLAC__bool read_only)
+{
+	unsigned ret;
+
+	FLAC__ASSERT(0 != iterator);
+
+	if(read_only || 0 == (iterator->file = flac_fopen(iterator->filename, "r+b"))) {
+		iterator->is_writable = false;
+		if(read_only || errno == EACCES) {
+			if(0 == (iterator->file = flac_fopen(iterator->filename, "rb"))) {
+				iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE;
+				return false;
+			}
+		}
+		else {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE;
+			return false;
+		}
+	}
+	else {
+		iterator->is_writable = true;
+	}
+
+	ret = seek_to_first_metadata_block_(iterator->file);
+	switch(ret) {
+		case 0:
+			iterator->depth = 0;
+			iterator->first_offset = iterator->offset[iterator->depth] = ftello(iterator->file);
+			return read_metadata_block_header_(iterator);
+		case 1:
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		case 2:
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		case 3:
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_A_FLAC_FILE;
+			return false;
+		default:
+			FLAC__ASSERT(0);
+			return false;
+	}
+}
+
+#if 0
+@@@ If we decide to finish implementing this, put this comment back in metadata.h
+/*
+ * The 'tempfile_path_prefix' allows you to specify a directory where
+ * tempfiles should go.  Remember that if your metadata edits cause the
+ * FLAC file to grow, the entire file will have to be rewritten.  If
+ * 'tempfile_path_prefix' is NULL, the temp file will be written in the
+ * same directory as the original FLAC file.  This makes replacing the
+ * original with the tempfile fast but requires extra space in the same
+ * partition for the tempfile.  If space is a problem, you can pass a
+ * directory name belonging to a different partition in
+ * 'tempfile_path_prefix'.  Note that you should use the forward slash
+ * '/' as the directory separator.  A trailing slash is not needed; it
+ * will be added automatically.
+ */
+FLAC__bool FLAC__metadata_simple_iterator_init(FLAC__Metadata_SimpleIterator *iterator, const char *filename, FLAC__bool preserve_file_stats, const char *tempfile_path_prefix);
+#endif
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_init(FLAC__Metadata_SimpleIterator *iterator, const char *filename, FLAC__bool read_only, FLAC__bool preserve_file_stats)
+{
+	const char *tempfile_path_prefix = 0; /*@@@ search for comments near 'flac_rename(...)' for what it will take to finish implementing this */
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != filename);
+
+	simple_iterator_free_guts_(iterator);
+
+	if(!read_only && preserve_file_stats)
+		iterator->has_stats = get_file_stats_(filename, &iterator->stats);
+
+	if(0 == (iterator->filename = strdup(filename))) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+	if(0 != tempfile_path_prefix && 0 == (iterator->tempfile_path_prefix = strdup(tempfile_path_prefix))) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+
+	return simple_iterator_prime_input_(iterator, read_only);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_is_writable(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->is_writable;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_next(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(iterator->is_last)
+		return false;
+
+	if(0 != fseeko(iterator->file, iterator->length, SEEK_CUR)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	iterator->offset[iterator->depth] = ftello(iterator->file);
+
+	return read_metadata_block_header_(iterator);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_prev(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__off_t this_offset;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(iterator->offset[iterator->depth] == iterator->first_offset)
+		return false;
+
+	if(0 != fseeko(iterator->file, iterator->first_offset, SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+	this_offset = iterator->first_offset;
+	if(!read_metadata_block_header_(iterator))
+		return false;
+
+	/* we ignore any error from ftello() and catch it in fseeko() */
+	while(ftello(iterator->file) + (FLAC__off_t)iterator->length < iterator->offset[iterator->depth]) {
+		if(0 != fseeko(iterator->file, iterator->length, SEEK_CUR)) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		}
+		this_offset = ftello(iterator->file);
+		if(!read_metadata_block_header_(iterator))
+			return false;
+	}
+
+	iterator->offset[iterator->depth] = this_offset;
+
+	return true;
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_is_last(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->is_last;
+}
+
+/*@@@@add to tests*/
+FLAC_API off_t FLAC__metadata_simple_iterator_get_block_offset(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->offset[iterator->depth];
+}
+
+FLAC_API FLAC__MetadataType FLAC__metadata_simple_iterator_get_block_type(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->type;
+}
+
+/*@@@@add to tests*/
+FLAC_API unsigned FLAC__metadata_simple_iterator_get_block_length(const FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	return iterator->length;
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_get_application_id(FLAC__Metadata_SimpleIterator *iterator, FLAC__byte *id)
+{
+	const unsigned id_bytes = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+	FLAC__ASSERT(0 != id);
+
+	if(iterator->type != FLAC__METADATA_TYPE_APPLICATION) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+		return false;
+	}
+
+	if(fread(id, 1, id_bytes, iterator->file) != id_bytes) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		return false;
+	}
+
+	/* back up */
+	if(0 != fseeko(iterator->file, -((int)id_bytes), SEEK_CUR)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_simple_iterator_get_block(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__StreamMetadata *block = FLAC__metadata_object_new(iterator->type);
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(0 != block) {
+		block->is_last = iterator->is_last;
+		block->length = iterator->length;
+
+		if(!read_metadata_block_data_(iterator, block)) {
+			FLAC__metadata_object_delete(block);
+			return 0;
+		}
+
+		/* back up to the beginning of the block data to stay consistent */
+		if(0 != fseeko(iterator->file, iterator->offset[iterator->depth] + FLAC__STREAM_METADATA_HEADER_LENGTH, SEEK_SET)) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			FLAC__metadata_object_delete(block);
+			return 0;
+		}
+	}
+	else
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	return block;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_set_block(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool use_padding)
+{
+	FLAC__ASSERT_DECLARATION(FLAC__off_t debug_target_offset = iterator->offset[iterator->depth];)
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+	FLAC__ASSERT(0 != block);
+
+	if(!iterator->is_writable) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE;
+		return false;
+	}
+
+	if(iterator->type == FLAC__METADATA_TYPE_STREAMINFO || block->type == FLAC__METADATA_TYPE_STREAMINFO) {
+		if(iterator->type != block->type) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+			return false;
+		}
+	}
+
+	block->is_last = iterator->is_last;
+
+	if(iterator->length == block->length)
+		return write_metadata_block_stationary_(iterator, block);
+	else if(iterator->length > block->length) {
+		if(use_padding && iterator->length >= FLAC__STREAM_METADATA_HEADER_LENGTH + block->length) {
+			ret = write_metadata_block_stationary_with_padding_(iterator, block, iterator->length - FLAC__STREAM_METADATA_HEADER_LENGTH - block->length, block->is_last);
+			FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+		else {
+			ret = rewrite_whole_file_(iterator, block, /*append=*/false);
+			FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+	}
+	else /* iterator->length < block->length */ {
+		unsigned padding_leftover = 0;
+		FLAC__bool padding_is_last = false;
+		if(use_padding) {
+			/* first see if we can even use padding */
+			if(iterator->is_last) {
+				use_padding = false;
+			}
+			else {
+				const unsigned extra_padding_bytes_required = block->length - iterator->length;
+				simple_iterator_push_(iterator);
+				if(!FLAC__metadata_simple_iterator_next(iterator)) {
+					(void)simple_iterator_pop_(iterator);
+					return false;
+				}
+				if(iterator->type != FLAC__METADATA_TYPE_PADDING) {
+					use_padding = false;
+				}
+				else {
+					if(FLAC__STREAM_METADATA_HEADER_LENGTH + iterator->length == extra_padding_bytes_required) {
+						padding_leftover = 0;
+						block->is_last = iterator->is_last;
+					}
+					else if(iterator->length < extra_padding_bytes_required)
+						use_padding = false;
+					else {
+						padding_leftover = FLAC__STREAM_METADATA_HEADER_LENGTH + iterator->length - extra_padding_bytes_required;
+						padding_is_last = iterator->is_last;
+						block->is_last = false;
+					}
+				}
+				if(!simple_iterator_pop_(iterator))
+					return false;
+			}
+		}
+		if(use_padding) {
+			if(padding_leftover == 0) {
+				ret = write_metadata_block_stationary_(iterator, block);
+				FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+				FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+				return ret;
+			}
+			else {
+				FLAC__ASSERT(padding_leftover >= FLAC__STREAM_METADATA_HEADER_LENGTH);
+				ret = write_metadata_block_stationary_with_padding_(iterator, block, padding_leftover - FLAC__STREAM_METADATA_HEADER_LENGTH, padding_is_last);
+				FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+				FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+				return ret;
+			}
+		}
+		else {
+			ret = rewrite_whole_file_(iterator, block, /*append=*/false);
+			FLAC__ASSERT(!ret || iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(!ret || ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_insert_block_after(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool use_padding)
+{
+	unsigned padding_leftover = 0;
+	FLAC__bool padding_is_last = false;
+
+	FLAC__ASSERT_DECLARATION(FLAC__off_t debug_target_offset = iterator->offset[iterator->depth] + FLAC__STREAM_METADATA_HEADER_LENGTH + iterator->length;)
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+	FLAC__ASSERT(0 != block);
+
+	if(!iterator->is_writable) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE;
+		return false;
+	}
+
+	if(block->type == FLAC__METADATA_TYPE_STREAMINFO) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+		return false;
+	}
+
+	block->is_last = iterator->is_last;
+
+	if(use_padding) {
+		/* first see if we can even use padding */
+		if(iterator->is_last) {
+			use_padding = false;
+		}
+		else {
+			simple_iterator_push_(iterator);
+			if(!FLAC__metadata_simple_iterator_next(iterator)) {
+				(void)simple_iterator_pop_(iterator);
+				return false;
+			}
+			if(iterator->type != FLAC__METADATA_TYPE_PADDING) {
+				use_padding = false;
+			}
+			else {
+				if(iterator->length == block->length) {
+					padding_leftover = 0;
+					block->is_last = iterator->is_last;
+				}
+				else if(iterator->length < FLAC__STREAM_METADATA_HEADER_LENGTH + block->length)
+					use_padding = false;
+				else {
+					padding_leftover = iterator->length - block->length;
+					padding_is_last = iterator->is_last;
+					block->is_last = false;
+				}
+			}
+			if(!simple_iterator_pop_(iterator))
+				return false;
+		}
+	}
+	if(use_padding) {
+		/* move to the next block, which is suitable padding */
+		if(!FLAC__metadata_simple_iterator_next(iterator))
+			return false;
+		if(padding_leftover == 0) {
+			ret = write_metadata_block_stationary_(iterator, block);
+			FLAC__ASSERT(iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+		else {
+			FLAC__ASSERT(padding_leftover >= FLAC__STREAM_METADATA_HEADER_LENGTH);
+			ret = write_metadata_block_stationary_with_padding_(iterator, block, padding_leftover - FLAC__STREAM_METADATA_HEADER_LENGTH, padding_is_last);
+			FLAC__ASSERT(iterator->offset[iterator->depth] == debug_target_offset);
+			FLAC__ASSERT(ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+			return ret;
+		}
+	}
+	else {
+		ret = rewrite_whole_file_(iterator, block, /*append=*/true);
+		FLAC__ASSERT(iterator->offset[iterator->depth] == debug_target_offset);
+		FLAC__ASSERT(ftello(iterator->file) == debug_target_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH);
+		return ret;
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_simple_iterator_delete_block(FLAC__Metadata_SimpleIterator *iterator, FLAC__bool use_padding)
+{
+	FLAC__ASSERT_DECLARATION(FLAC__off_t debug_target_offset = iterator->offset[iterator->depth];)
+	FLAC__bool ret;
+
+	if(iterator->type == FLAC__METADATA_TYPE_STREAMINFO) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT;
+		return false;
+	}
+
+	if(use_padding) {
+		FLAC__StreamMetadata *padding = FLAC__metadata_object_new(FLAC__METADATA_TYPE_PADDING);
+		if(0 == padding) {
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+			return false;
+		}
+		padding->length = iterator->length;
+		if(!FLAC__metadata_simple_iterator_set_block(iterator, padding, false)) {
+			FLAC__metadata_object_delete(padding);
+			return false;
+		}
+		FLAC__metadata_object_delete(padding);
+		if(!FLAC__metadata_simple_iterator_prev(iterator))
+			return false;
+		FLAC__ASSERT(iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length == debug_target_offset);
+		FLAC__ASSERT(ftello(iterator->file) + (FLAC__off_t)iterator->length == debug_target_offset);
+		return true;
+	}
+	else {
+		ret = rewrite_whole_file_(iterator, 0, /*append=*/false);
+		FLAC__ASSERT(iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length == debug_target_offset);
+		FLAC__ASSERT(ftello(iterator->file) + (FLAC__off_t)iterator->length == debug_target_offset);
+		return ret;
+	}
+}
+
+
+
+/****************************************************************************
+ *
+ * Level 2 implementation
+ *
+ ***************************************************************************/
+
+
+typedef struct FLAC__Metadata_Node {
+	FLAC__StreamMetadata *data;
+	struct FLAC__Metadata_Node *prev, *next;
+} FLAC__Metadata_Node;
+
+struct FLAC__Metadata_Chain {
+	char *filename; /* will be NULL if using callbacks */
+	FLAC__bool is_ogg;
+	FLAC__Metadata_Node *head;
+	FLAC__Metadata_Node *tail;
+	unsigned nodes;
+	FLAC__Metadata_ChainStatus status;
+	FLAC__off_t first_offset, last_offset;
+	/*
+	 * This is the length of the chain initially read from the FLAC file.
+	 * it is used to compare against the current length to decide whether
+	 * or not the whole file has to be rewritten.
+	 */
+	FLAC__off_t initial_length;
+	/* @@@ hacky, these are currently only needed by ogg reader */
+	FLAC__IOHandle handle;
+	FLAC__IOCallback_Read read_cb;
+};
+
+struct FLAC__Metadata_Iterator {
+	FLAC__Metadata_Chain *chain;
+	FLAC__Metadata_Node *current;
+};
+
+FLAC_API const char * const FLAC__Metadata_ChainStatusString[] = {
+	"FLAC__METADATA_CHAIN_STATUS_OK",
+	"FLAC__METADATA_CHAIN_STATUS_ILLEGAL_INPUT",
+	"FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE",
+	"FLAC__METADATA_CHAIN_STATUS_NOT_A_FLAC_FILE",
+	"FLAC__METADATA_CHAIN_STATUS_NOT_WRITABLE",
+	"FLAC__METADATA_CHAIN_STATUS_BAD_METADATA",
+	"FLAC__METADATA_CHAIN_STATUS_READ_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_RENAME_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_UNLINK_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR",
+	"FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS",
+	"FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH",
+	"FLAC__METADATA_CHAIN_STATUS_WRONG_WRITE_CALL"
+};
+
+
+static FLAC__Metadata_Node *node_new_(void)
+{
+	return calloc(1, sizeof(FLAC__Metadata_Node));
+}
+
+static void node_delete_(FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != node);
+	if(0 != node->data)
+		FLAC__metadata_object_delete(node->data);
+	free(node);
+}
+
+static void chain_init_(FLAC__Metadata_Chain *chain)
+{
+	FLAC__ASSERT(0 != chain);
+
+	chain->filename = 0;
+	chain->is_ogg = false;
+	chain->head = chain->tail = 0;
+	chain->nodes = 0;
+	chain->status = FLAC__METADATA_CHAIN_STATUS_OK;
+	chain->initial_length = 0;
+	chain->read_cb = 0;
+}
+
+static void chain_clear_(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_Node *node, *next;
+
+	FLAC__ASSERT(0 != chain);
+
+	for(node = chain->head; node; ) {
+		next = node->next;
+		node_delete_(node);
+		node = next;
+	}
+
+	if(0 != chain->filename)
+		free(chain->filename);
+
+	chain_init_(chain);
+}
+
+static void chain_append_node_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != node);
+	FLAC__ASSERT(0 != node->data);
+
+	node->next = node->prev = 0;
+	node->data->is_last = true;
+	if(0 != chain->tail)
+		chain->tail->data->is_last = false;
+
+	if(0 == chain->head)
+		chain->head = node;
+	else {
+		FLAC__ASSERT(0 != chain->tail);
+		chain->tail->next = node;
+		node->prev = chain->tail;
+	}
+	chain->tail = node;
+	chain->nodes++;
+}
+
+static void chain_remove_node_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != node);
+
+	if(node == chain->head)
+		chain->head = node->next;
+	else
+		node->prev->next = node->next;
+
+	if(node == chain->tail)
+		chain->tail = node->prev;
+	else
+		node->next->prev = node->prev;
+
+	if(0 != chain->tail)
+		chain->tail->data->is_last = true;
+
+	chain->nodes--;
+}
+
+static void chain_delete_node_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	chain_remove_node_(chain, node);
+	node_delete_(node);
+}
+
+static FLAC__off_t chain_calculate_length_(FLAC__Metadata_Chain *chain)
+{
+	const FLAC__Metadata_Node *node;
+	FLAC__off_t length = 0;
+	for(node = chain->head; node; node = node->next)
+		length += (FLAC__STREAM_METADATA_HEADER_LENGTH + node->data->length);
+	return length;
+}
+
+static void iterator_insert_node_(FLAC__Metadata_Iterator *iterator, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != node);
+	FLAC__ASSERT(0 != node->data);
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != iterator->chain);
+	FLAC__ASSERT(0 != iterator->chain->head);
+	FLAC__ASSERT(0 != iterator->chain->tail);
+
+	node->data->is_last = false;
+
+	node->prev = iterator->current->prev;
+	node->next = iterator->current;
+
+	if(0 == node->prev)
+		iterator->chain->head = node;
+	else
+		node->prev->next = node;
+
+	iterator->current->prev = node;
+
+	iterator->chain->nodes++;
+}
+
+static void iterator_insert_node_after_(FLAC__Metadata_Iterator *iterator, FLAC__Metadata_Node *node)
+{
+	FLAC__ASSERT(0 != node);
+	FLAC__ASSERT(0 != node->data);
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != iterator->chain);
+	FLAC__ASSERT(0 != iterator->chain->head);
+	FLAC__ASSERT(0 != iterator->chain->tail);
+
+	iterator->current->data->is_last = false;
+
+	node->prev = iterator->current;
+	node->next = iterator->current->next;
+
+	if(0 == node->next)
+		iterator->chain->tail = node;
+	else
+		node->next->prev = node;
+
+	node->prev->next = node;
+
+	iterator->chain->tail->data->is_last = true;
+
+	iterator->chain->nodes++;
+}
+
+/* return true iff node and node->next are both padding */
+static FLAC__bool chain_merge_adjacent_padding_(FLAC__Metadata_Chain *chain, FLAC__Metadata_Node *node)
+{
+	if(node->data->type == FLAC__METADATA_TYPE_PADDING && 0 != node->next && node->next->data->type == FLAC__METADATA_TYPE_PADDING) {
+		const unsigned growth = FLAC__STREAM_METADATA_HEADER_LENGTH + node->next->data->length;
+		node->data->length += growth;
+
+		chain_delete_node_(chain, node->next);
+		return true;
+	}
+	else
+		return false;
+}
+
+/* Returns the new length of the chain, or 0 if there was an error. */
+/* WATCHOUT: This can get called multiple times before a write, so
+ * it should still work when this happens.
+ */
+/* WATCHOUT: Make sure to also update the logic in
+ * FLAC__metadata_chain_check_if_tempfile_needed() if the logic here changes.
+ */
+static FLAC__off_t chain_prepare_for_write_(FLAC__Metadata_Chain *chain, FLAC__bool use_padding)
+{
+	FLAC__off_t current_length = chain_calculate_length_(chain);
+
+	if(use_padding) {
+		/* if the metadata shrank and the last block is padding, we just extend the last padding block */
+		if(current_length < chain->initial_length && chain->tail->data->type == FLAC__METADATA_TYPE_PADDING) {
+			const FLAC__off_t delta = chain->initial_length - current_length;
+			chain->tail->data->length += delta;
+			current_length += delta;
+			FLAC__ASSERT(current_length == chain->initial_length);
+		}
+		/* if the metadata shrank more than 4 bytes then there's room to add another padding block */
+		else if(current_length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH <= chain->initial_length) {
+			FLAC__StreamMetadata *padding;
+			FLAC__Metadata_Node *node;
+			if(0 == (padding = FLAC__metadata_object_new(FLAC__METADATA_TYPE_PADDING))) {
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return 0;
+			}
+			padding->length = chain->initial_length - (FLAC__STREAM_METADATA_HEADER_LENGTH + current_length);
+			if(0 == (node = node_new_())) {
+				FLAC__metadata_object_delete(padding);
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return 0;
+			}
+			node->data = padding;
+			chain_append_node_(chain, node);
+			current_length = chain_calculate_length_(chain);
+			FLAC__ASSERT(current_length == chain->initial_length);
+		}
+		/* if the metadata grew but the last block is padding, try cutting the padding to restore the original length so we don't have to rewrite the whole file */
+		else if(current_length > chain->initial_length) {
+			const FLAC__off_t delta = current_length - chain->initial_length;
+			if(chain->tail->data->type == FLAC__METADATA_TYPE_PADDING) {
+				/* if the delta is exactly the size of the last padding block, remove the padding block */
+				if((FLAC__off_t)chain->tail->data->length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH == delta) {
+					chain_delete_node_(chain, chain->tail);
+					current_length = chain_calculate_length_(chain);
+					FLAC__ASSERT(current_length == chain->initial_length);
+				}
+				/* if there is at least 'delta' bytes of padding, trim the padding down */
+				else if((FLAC__off_t)chain->tail->data->length >= delta) {
+					chain->tail->data->length -= delta;
+					current_length -= delta;
+					FLAC__ASSERT(current_length == chain->initial_length);
+				}
+			}
+		}
+	}
+
+	return current_length;
+}
+
+static FLAC__bool chain_read_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__IOCallback_Tell tell_cb)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+
+	/* we assume we're already at the beginning of the file */
+
+	switch(seek_to_first_metadata_block_cb_(handle, read_cb, seek_cb)) {
+		case 0:
+			break;
+		case 1:
+			chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+			return false;
+		case 2:
+			chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+			return false;
+		case 3:
+			chain->status = FLAC__METADATA_CHAIN_STATUS_NOT_A_FLAC_FILE;
+			return false;
+		default:
+			FLAC__ASSERT(0);
+			return false;
+	}
+
+	{
+		FLAC__int64 pos = tell_cb(handle);
+		if(pos < 0) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+			return false;
+		}
+		chain->first_offset = (FLAC__off_t)pos;
+	}
+
+	{
+		FLAC__bool is_last;
+		FLAC__MetadataType type;
+		unsigned length;
+
+		do {
+			node = node_new_();
+			if(0 == node) {
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return false;
+			}
+
+			if(!read_metadata_block_header_cb_(handle, read_cb, &is_last, &type, &length)) {
+				node_delete_(node);
+				chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+				return false;
+			}
+
+			node->data = FLAC__metadata_object_new(type);
+			if(0 == node->data) {
+				node_delete_(node);
+				chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+				return false;
+			}
+
+			node->data->is_last = is_last;
+			node->data->length = length;
+
+			chain->status = get_equivalent_status_(read_metadata_block_data_cb_(handle, read_cb, seek_cb, node->data));
+			if(chain->status != FLAC__METADATA_CHAIN_STATUS_OK) {
+				node_delete_(node);
+				return false;
+			}
+			chain_append_node_(chain, node);
+		} while(!is_last);
+	}
+
+	{
+		FLAC__int64 pos = tell_cb(handle);
+		if(pos < 0) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+			return false;
+		}
+		chain->last_offset = (FLAC__off_t)pos;
+	}
+
+	chain->initial_length = chain_calculate_length_(chain);
+
+	return true;
+}
+
+static FLAC__StreamDecoderReadStatus chain_read_ogg_read_cb_(const FLAC__StreamDecoder *decoder, FLAC__byte buffer[], size_t *bytes, void *client_data)
+{
+	FLAC__Metadata_Chain *chain = (FLAC__Metadata_Chain*)client_data;
+	(void)decoder;
+	if(*bytes > 0 && chain->status == FLAC__METADATA_CHAIN_STATUS_OK) {
+		*bytes = chain->read_cb(buffer, sizeof(FLAC__byte), *bytes, chain->handle);
+		if(*bytes == 0)
+			return FLAC__STREAM_DECODER_READ_STATUS_END_OF_STREAM;
+		else
+			return FLAC__STREAM_DECODER_READ_STATUS_CONTINUE;
+	}
+	else
+		return FLAC__STREAM_DECODER_READ_STATUS_ABORT;
+}
+
+static FLAC__StreamDecoderWriteStatus chain_read_ogg_write_cb_(const FLAC__StreamDecoder *decoder, const FLAC__Frame *frame, const FLAC__int32 * const buffer[], void *client_data)
+{
+	(void)decoder, (void)frame, (void)buffer, (void)client_data;
+	return FLAC__STREAM_DECODER_WRITE_STATUS_ABORT;
+}
+
+static void chain_read_ogg_metadata_cb_(const FLAC__StreamDecoder *decoder, const FLAC__StreamMetadata *metadata, void *client_data)
+{
+	FLAC__Metadata_Chain *chain = (FLAC__Metadata_Chain*)client_data;
+	FLAC__Metadata_Node *node;
+
+	(void)decoder;
+
+	node = node_new_();
+	if(0 == node) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return;
+	}
+
+	node->data = FLAC__metadata_object_clone(metadata);
+	if(0 == node->data) {
+		node_delete_(node);
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return;
+	}
+
+	chain_append_node_(chain, node);
+}
+
+static void chain_read_ogg_error_cb_(const FLAC__StreamDecoder *decoder, FLAC__StreamDecoderErrorStatus status, void *client_data)
+{
+	FLAC__Metadata_Chain *chain = (FLAC__Metadata_Chain*)client_data;
+	(void)decoder, (void)status;
+	chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR; /*@@@ maybe needs better error code */
+}
+
+static FLAC__bool chain_read_ogg_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb)
+{
+	FLAC__StreamDecoder *decoder;
+
+	FLAC__ASSERT(0 != chain);
+
+	/* we assume we're already at the beginning of the file */
+
+	chain->handle = handle;
+	chain->read_cb = read_cb;
+	if(0 == (decoder = FLAC__stream_decoder_new())) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+	FLAC__stream_decoder_set_metadata_respond_all(decoder);
+	if(FLAC__stream_decoder_init_ogg_stream(decoder, chain_read_ogg_read_cb_, /*seek_callback=*/0, /*tell_callback=*/0, /*length_callback=*/0, /*eof_callback=*/0, chain_read_ogg_write_cb_, chain_read_ogg_metadata_cb_, chain_read_ogg_error_cb_, chain) != FLAC__STREAM_DECODER_INIT_STATUS_OK) {
+		FLAC__stream_decoder_delete(decoder);
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR; /*@@@ maybe needs better error code */
+		return false;
+	}
+
+	chain->first_offset = 0; /*@@@ wrong; will need to be set correctly to implement metadata writing for Ogg FLAC */
+
+	if(!FLAC__stream_decoder_process_until_end_of_metadata(decoder))
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR; /*@@@ maybe needs better error code */
+	if(chain->status != FLAC__METADATA_CHAIN_STATUS_OK) {
+		FLAC__stream_decoder_delete(decoder);
+		return false;
+	}
+
+	FLAC__stream_decoder_delete(decoder);
+
+	chain->last_offset = 0; /*@@@ wrong; will need to be set correctly to implement metadata writing for Ogg FLAC */
+
+	chain->initial_length = chain_calculate_length_(chain);
+
+	return true;
+}
+
+static FLAC__bool chain_rewrite_metadata_in_place_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, FLAC__IOCallback_Seek seek_cb)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != chain->head);
+
+	if(0 != seek_cb(handle, chain->first_offset, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	for(node = chain->head; node; node = node->next) {
+		if(!write_metadata_block_header_cb_(handle, write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+		if(!write_metadata_block_data_cb_(handle, write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	/*FLAC__ASSERT(fflush(), ftello() == chain->last_offset);*/
+
+	chain->status = FLAC__METADATA_CHAIN_STATUS_OK;
+	return true;
+}
+
+static FLAC__bool chain_rewrite_metadata_in_place_(FLAC__Metadata_Chain *chain)
+{
+	FILE *file;
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != chain->filename);
+
+	if(0 == (file = flac_fopen(chain->filename, "r+b"))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+
+	/* chain_rewrite_metadata_in_place_cb_() sets chain->status for us */
+	ret = chain_rewrite_metadata_in_place_cb_(chain, (FLAC__IOHandle)file, (FLAC__IOCallback_Write)fwrite, fseek_wrapper_);
+
+	fclose(file);
+
+	return ret;
+}
+
+static FLAC__bool chain_rewrite_file_(FLAC__Metadata_Chain *chain, const char *tempfile_path_prefix)
+{
+	FILE *f, *tempfile = NULL;
+	char *tempfilename;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	const FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != chain->filename);
+	FLAC__ASSERT(0 != chain->head);
+
+	/* copy the file prefix (data up to first metadata block */
+	if(0 == (f = flac_fopen(chain->filename, "rb"))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+	if(!open_tempfile_(chain->filename, tempfile_path_prefix, &tempfile, &tempfilename, &status)) {
+		chain->status = get_equivalent_status_(status);
+		goto err;
+	}
+	if(!copy_n_bytes_from_file_(f, tempfile, chain->first_offset, &status)) {
+		chain->status = get_equivalent_status_(status);
+		goto err;
+	}
+
+	/* write the metadata */
+	for(node = chain->head; node; node = node->next) {
+		if(!write_metadata_block_header_(tempfile, &status, node->data)) {
+			chain->status = get_equivalent_status_(status);
+			goto err;
+		}
+		if(!write_metadata_block_data_(tempfile, &status, node->data)) {
+			chain->status = get_equivalent_status_(status);
+			goto err;
+		}
+	}
+	/*FLAC__ASSERT(fflush(), ftello() == chain->last_offset);*/
+
+	/* copy the file postfix (everything after the metadata) */
+	if(0 != fseeko(f, chain->last_offset, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		goto err;
+	}
+	if(!copy_remaining_bytes_from_file_(f, tempfile, &status)) {
+		chain->status = get_equivalent_status_(status);
+		goto err;
+	}
+
+	/* move the tempfile on top of the original */
+	(void)fclose(f);
+	if(!transport_tempfile_(chain->filename, &tempfile, &tempfilename, &status))
+		return false;
+
+	return true;
+
+err:
+	(void)fclose(f);
+	cleanup_tempfile_(&tempfile, &tempfilename);
+	return false;
+}
+
+/* assumes 'handle' is already at beginning of file */
+static FLAC__bool chain_rewrite_file_cb_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__IOCallback_Eof eof_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb)
+{
+	FLAC__Metadata_SimpleIteratorStatus status;
+	const FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 == chain->filename);
+	FLAC__ASSERT(0 != chain->head);
+
+	/* copy the file prefix (data up to first metadata block */
+	if(!copy_n_bytes_from_file_cb_(handle, read_cb, temp_handle, temp_write_cb, chain->first_offset, &status)) {
+		chain->status = get_equivalent_status_(status);
+		return false;
+	}
+
+	/* write the metadata */
+	for(node = chain->head; node; node = node->next) {
+		if(!write_metadata_block_header_cb_(temp_handle, temp_write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+		if(!write_metadata_block_data_cb_(temp_handle, temp_write_cb, node->data)) {
+			chain->status = FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+	/*FLAC__ASSERT(fflush(), ftello() == chain->last_offset);*/
+
+	/* copy the file postfix (everything after the metadata) */
+	if(0 != seek_cb(handle, chain->last_offset, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+	if(!copy_remaining_bytes_from_file_cb_(handle, read_cb, eof_cb, temp_handle, temp_write_cb, &status)) {
+		chain->status = get_equivalent_status_(status);
+		return false;
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__Metadata_Chain *FLAC__metadata_chain_new(void)
+{
+	FLAC__Metadata_Chain *chain = calloc(1, sizeof(FLAC__Metadata_Chain));
+
+	if(0 != chain)
+		chain_init_(chain);
+
+	return chain;
+}
+
+FLAC_API void FLAC__metadata_chain_delete(FLAC__Metadata_Chain *chain)
+{
+	FLAC__ASSERT(0 != chain);
+
+	chain_clear_(chain);
+
+	free(chain);
+}
+
+FLAC_API FLAC__Metadata_ChainStatus FLAC__metadata_chain_status(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_ChainStatus status;
+
+	FLAC__ASSERT(0 != chain);
+
+	status = chain->status;
+	chain->status = FLAC__METADATA_CHAIN_STATUS_OK;
+	return status;
+}
+
+static FLAC__bool chain_read_(FLAC__Metadata_Chain *chain, const char *filename, FLAC__bool is_ogg)
+{
+	FILE *file;
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != filename);
+
+	chain_clear_(chain);
+
+	if(0 == (chain->filename = strdup(filename))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		return false;
+	}
+
+	chain->is_ogg = is_ogg;
+
+	if(0 == (file = flac_fopen(filename, "rb"))) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+
+	/* the function also sets chain->status for us */
+	ret = is_ogg?
+		chain_read_ogg_cb_(chain, file, (FLAC__IOCallback_Read)fread) :
+		chain_read_cb_(chain, file, (FLAC__IOCallback_Read)fread, fseek_wrapper_, ftell_wrapper_)
+	;
+
+	fclose(file);
+
+	return ret;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_read(FLAC__Metadata_Chain *chain, const char *filename)
+{
+	return chain_read_(chain, filename, /*is_ogg=*/false);
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_chain_read_ogg(FLAC__Metadata_Chain *chain, const char *filename)
+{
+	return chain_read_(chain, filename, /*is_ogg=*/true);
+}
+
+static FLAC__bool chain_read_with_callbacks_(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks, FLAC__bool is_ogg)
+{
+	FLAC__bool ret;
+
+	FLAC__ASSERT(0 != chain);
+
+	chain_clear_(chain);
+
+	if (0 == callbacks.read || 0 == callbacks.seek || 0 == callbacks.tell) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+
+	chain->is_ogg = is_ogg;
+
+	/* rewind */
+	if(0 != callbacks.seek(handle, 0, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	/* the function also sets chain->status for us */
+	ret = is_ogg?
+		chain_read_ogg_cb_(chain, handle, callbacks.read) :
+		chain_read_cb_(chain, handle, callbacks.read, callbacks.seek, callbacks.tell)
+	;
+
+	return ret;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_read_with_callbacks(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks)
+{
+	return chain_read_with_callbacks_(chain, handle, callbacks, /*is_ogg=*/false);
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__bool FLAC__metadata_chain_read_ogg_with_callbacks(FLAC__Metadata_Chain *chain, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks)
+{
+	return chain_read_with_callbacks_(chain, handle, callbacks, /*is_ogg=*/true);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_check_if_tempfile_needed(FLAC__Metadata_Chain *chain, FLAC__bool use_padding)
+{
+	/* This does all the same checks that are in chain_prepare_for_write_()
+	 * but doesn't actually alter the chain.  Make sure to update the logic
+	 * here if chain_prepare_for_write_() changes.
+	 */
+	const FLAC__off_t current_length = chain_calculate_length_(chain);
+
+	FLAC__ASSERT(0 != chain);
+
+	if(use_padding) {
+		/* if the metadata shrank and the last block is padding, we just extend the last padding block */
+		if(current_length < chain->initial_length && chain->tail->data->type == FLAC__METADATA_TYPE_PADDING)
+			return false;
+		/* if the metadata shrank more than 4 bytes then there's room to add another padding block */
+		else if(current_length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH <= chain->initial_length)
+			return false;
+		/* if the metadata grew but the last block is padding, try cutting the padding to restore the original length so we don't have to rewrite the whole file */
+		else if(current_length > chain->initial_length) {
+			const FLAC__off_t delta = current_length - chain->initial_length;
+			if(chain->tail->data->type == FLAC__METADATA_TYPE_PADDING) {
+				/* if the delta is exactly the size of the last padding block, remove the padding block */
+				if((FLAC__off_t)chain->tail->data->length + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH == delta)
+					return false;
+				/* if there is at least 'delta' bytes of padding, trim the padding down */
+				else if((FLAC__off_t)chain->tail->data->length >= delta)
+					return false;
+			}
+		}
+	}
+
+	return (current_length != chain->initial_length);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_write(FLAC__Metadata_Chain *chain, FLAC__bool use_padding, FLAC__bool preserve_file_stats)
+{
+	struct flac_stat_s stats;
+	const char *tempfile_path_prefix = 0;
+	FLAC__off_t current_length;
+
+	FLAC__ASSERT(0 != chain);
+
+	if (chain->is_ogg) { /* cannot write back to Ogg FLAC yet */
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+		return false;
+	}
+
+	if (0 == chain->filename) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH;
+		return false;
+	}
+
+	current_length = chain_prepare_for_write_(chain, use_padding);
+
+	/* a return value of 0 means there was an error; chain->status is already set */
+	if (0 == current_length)
+		return false;
+
+	if(preserve_file_stats)
+		get_file_stats_(chain->filename, &stats);
+
+	if(current_length == chain->initial_length) {
+		if(!chain_rewrite_metadata_in_place_(chain))
+			return false;
+	}
+	else {
+		if(!chain_rewrite_file_(chain, tempfile_path_prefix))
+			return false;
+
+		/* recompute lengths and offsets */
+		{
+			const FLAC__Metadata_Node *node;
+			chain->initial_length = current_length;
+			chain->last_offset = chain->first_offset;
+			for(node = chain->head; node; node = node->next)
+				chain->last_offset += (FLAC__STREAM_METADATA_HEADER_LENGTH + node->data->length);
+		}
+	}
+
+	if(preserve_file_stats)
+		set_file_stats_(chain->filename, &stats);
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_write_with_callbacks(FLAC__Metadata_Chain *chain, FLAC__bool use_padding, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks)
+{
+	FLAC__off_t current_length;
+
+	FLAC__ASSERT(0 != chain);
+
+	if (chain->is_ogg) { /* cannot write back to Ogg FLAC yet */
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+		return false;
+	}
+
+	if (0 != chain->filename) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH;
+		return false;
+	}
+
+	if (0 == callbacks.write || 0 == callbacks.seek) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+
+	if (FLAC__metadata_chain_check_if_tempfile_needed(chain, use_padding)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_WRONG_WRITE_CALL;
+		return false;
+	}
+
+	current_length = chain_prepare_for_write_(chain, use_padding);
+
+	/* a return value of 0 means there was an error; chain->status is already set */
+	if (0 == current_length)
+		return false;
+
+	FLAC__ASSERT(current_length == chain->initial_length);
+
+	return chain_rewrite_metadata_in_place_cb_(chain, handle, callbacks.write, callbacks.seek);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_chain_write_with_callbacks_and_tempfile(FLAC__Metadata_Chain *chain, FLAC__bool use_padding, FLAC__IOHandle handle, FLAC__IOCallbacks callbacks, FLAC__IOHandle temp_handle, FLAC__IOCallbacks temp_callbacks)
+{
+	FLAC__off_t current_length;
+
+	FLAC__ASSERT(0 != chain);
+
+	if (chain->is_ogg) { /* cannot write back to Ogg FLAC yet */
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+		return false;
+	}
+
+	if (0 != chain->filename) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_READ_WRITE_MISMATCH;
+		return false;
+	}
+
+	if (0 == callbacks.read || 0 == callbacks.seek || 0 == callbacks.eof) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+	if (0 == temp_callbacks.write) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_INVALID_CALLBACKS;
+		return false;
+	}
+
+	if (!FLAC__metadata_chain_check_if_tempfile_needed(chain, use_padding)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_WRONG_WRITE_CALL;
+		return false;
+	}
+
+	current_length = chain_prepare_for_write_(chain, use_padding);
+
+	/* a return value of 0 means there was an error; chain->status is already set */
+	if (0 == current_length)
+		return false;
+
+	FLAC__ASSERT(current_length != chain->initial_length);
+
+	/* rewind */
+	if(0 != callbacks.seek(handle, 0, SEEK_SET)) {
+		chain->status = FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	if(!chain_rewrite_file_cb_(chain, handle, callbacks.read, callbacks.seek, callbacks.eof, temp_handle, temp_callbacks.write))
+		return false;
+
+	/* recompute lengths and offsets */
+	{
+		const FLAC__Metadata_Node *node;
+		chain->initial_length = current_length;
+		chain->last_offset = chain->first_offset;
+		for(node = chain->head; node; node = node->next)
+			chain->last_offset += (FLAC__STREAM_METADATA_HEADER_LENGTH + node->data->length);
+	}
+
+	return true;
+}
+
+FLAC_API void FLAC__metadata_chain_merge_padding(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != chain);
+
+	for(node = chain->head; node; ) {
+		if(!chain_merge_adjacent_padding_(chain, node))
+			node = node->next;
+	}
+}
+
+FLAC_API void FLAC__metadata_chain_sort_padding(FLAC__Metadata_Chain *chain)
+{
+	FLAC__Metadata_Node *node, *save;
+	unsigned i;
+
+	FLAC__ASSERT(0 != chain);
+
+	/*
+	 * Don't try and be too smart... this simple algo is good enough for
+	 * the small number of nodes that we deal with.
+	 */
+	for(i = 0, node = chain->head; i < chain->nodes; i++) {
+		if(node->data->type == FLAC__METADATA_TYPE_PADDING) {
+			save = node->next;
+			chain_remove_node_(chain, node);
+			chain_append_node_(chain, node);
+			node = save;
+		}
+		else {
+			node = node->next;
+		}
+	}
+
+	FLAC__metadata_chain_merge_padding(chain);
+}
+
+
+FLAC_API FLAC__Metadata_Iterator *FLAC__metadata_iterator_new(void)
+{
+	FLAC__Metadata_Iterator *iterator = calloc(1, sizeof(FLAC__Metadata_Iterator));
+
+	/* calloc() implies:
+		iterator->current = 0;
+		iterator->chain = 0;
+	*/
+
+	return iterator;
+}
+
+FLAC_API void FLAC__metadata_iterator_delete(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	free(iterator);
+}
+
+FLAC_API void FLAC__metadata_iterator_init(FLAC__Metadata_Iterator *iterator, FLAC__Metadata_Chain *chain)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != chain);
+	FLAC__ASSERT(0 != chain->head);
+
+	iterator->chain = chain;
+	iterator->current = chain->head;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_next(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	if(0 == iterator->current || 0 == iterator->current->next)
+		return false;
+
+	iterator->current = iterator->current->next;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_prev(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+
+	if(0 == iterator->current || 0 == iterator->current->prev)
+		return false;
+
+	iterator->current = iterator->current->prev;
+	return true;
+}
+
+FLAC_API FLAC__MetadataType FLAC__metadata_iterator_get_block_type(const FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != iterator->current->data);
+
+	return iterator->current->data->type;
+}
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_iterator_get_block(FLAC__Metadata_Iterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+
+	return iterator->current->data;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_set_block(FLAC__Metadata_Iterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != block);
+	return FLAC__metadata_iterator_delete_block(iterator, false) && FLAC__metadata_iterator_insert_block_after(iterator, block);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_delete_block(FLAC__Metadata_Iterator *iterator, FLAC__bool replace_with_padding)
+{
+	FLAC__Metadata_Node *save;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+
+	if(0 == iterator->current->prev) {
+		FLAC__ASSERT(iterator->current->data->type == FLAC__METADATA_TYPE_STREAMINFO);
+		return false;
+	}
+
+	save = iterator->current->prev;
+
+	if(replace_with_padding) {
+		FLAC__metadata_object_delete_data(iterator->current->data);
+		iterator->current->data->type = FLAC__METADATA_TYPE_PADDING;
+	}
+	else {
+		chain_delete_node_(iterator->chain, iterator->current);
+	}
+
+	iterator->current = save;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_insert_block_before(FLAC__Metadata_Iterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != block);
+
+	if(block->type == FLAC__METADATA_TYPE_STREAMINFO)
+		return false;
+
+	if(0 == iterator->current->prev) {
+		FLAC__ASSERT(iterator->current->data->type == FLAC__METADATA_TYPE_STREAMINFO);
+		return false;
+	}
+
+	if(0 == (node = node_new_()))
+		return false;
+
+	node->data = block;
+	iterator_insert_node_(iterator, node);
+	iterator->current = node;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_iterator_insert_block_after(FLAC__Metadata_Iterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__Metadata_Node *node;
+
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->current);
+	FLAC__ASSERT(0 != block);
+
+	if(block->type == FLAC__METADATA_TYPE_STREAMINFO)
+		return false;
+
+	if(0 == (node = node_new_()))
+		return false;
+
+	node->data = block;
+	iterator_insert_node_after_(iterator, node);
+	iterator->current = node;
+	return true;
+}
+
+
+/****************************************************************************
+ *
+ * Local function definitions
+ *
+ ***************************************************************************/
+
+void pack_uint32_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes)
+{
+	unsigned i;
+
+	b += bytes;
+
+	for(i = 0; i < bytes; i++) {
+		*(--b) = (FLAC__byte)(val & 0xff);
+		val >>= 8;
+	}
+}
+
+void pack_uint32_little_endian_(FLAC__uint32 val, FLAC__byte *b, unsigned bytes)
+{
+	unsigned i;
+
+	for(i = 0; i < bytes; i++) {
+		*(b++) = (FLAC__byte)(val & 0xff);
+		val >>= 8;
+	}
+}
+
+void pack_uint64_(FLAC__uint64 val, FLAC__byte *b, unsigned bytes)
+{
+	unsigned i;
+
+	b += bytes;
+
+	for(i = 0; i < bytes; i++) {
+		*(--b) = (FLAC__byte)(val & 0xff);
+		val >>= 8;
+	}
+}
+
+FLAC__uint32 unpack_uint32_(FLAC__byte *b, unsigned bytes)
+{
+	FLAC__uint32 ret = 0;
+	unsigned i;
+
+	for(i = 0; i < bytes; i++)
+		ret = (ret << 8) | (FLAC__uint32)(*b++);
+
+	return ret;
+}
+
+FLAC__uint32 unpack_uint32_little_endian_(FLAC__byte *b, unsigned bytes)
+{
+	FLAC__uint32 ret = 0;
+	unsigned i;
+
+	b += bytes;
+
+	for(i = 0; i < bytes; i++)
+		ret = (ret << 8) | (FLAC__uint32)(*--b);
+
+	return ret;
+}
+
+FLAC__uint64 unpack_uint64_(FLAC__byte *b, unsigned bytes)
+{
+	FLAC__uint64 ret = 0;
+	unsigned i;
+
+	for(i = 0; i < bytes; i++)
+		ret = (ret << 8) | (FLAC__uint64)(*b++);
+
+	return ret;
+}
+
+FLAC__bool read_metadata_block_header_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	if(!read_metadata_block_header_cb_((FLAC__IOHandle)iterator->file, (FLAC__IOCallback_Read)fread, &iterator->is_last, &iterator->type, &iterator->length)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool read_metadata_block_data_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != iterator);
+	FLAC__ASSERT(0 != iterator->file);
+
+	iterator->status = read_metadata_block_data_cb_((FLAC__IOHandle)iterator->file, (FLAC__IOCallback_Read)fread, fseek_wrapper_, block);
+
+	return (iterator->status == FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK);
+}
+
+FLAC__bool read_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__bool *is_last, FLAC__MetadataType *type, unsigned *length)
+{
+	FLAC__byte raw_header[FLAC__STREAM_METADATA_HEADER_LENGTH];
+
+	if(read_cb(raw_header, 1, FLAC__STREAM_METADATA_HEADER_LENGTH, handle) != FLAC__STREAM_METADATA_HEADER_LENGTH)
+		return false;
+
+	*is_last = raw_header[0] & 0x80? true : false;
+	*type = (FLAC__MetadataType)(raw_header[0] & 0x7f);
+	*length = unpack_uint32_(raw_header + 1, 3);
+
+	/* Note that we don't check:
+	 *    if(iterator->type >= FLAC__METADATA_TYPE_UNDEFINED)
+	 * we just will read in an opaque block
+	 */
+
+	return true;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata *block)
+{
+	switch(block->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+			return read_metadata_block_data_streaminfo_cb_(handle, read_cb, &block->data.stream_info);
+		case FLAC__METADATA_TYPE_PADDING:
+			return read_metadata_block_data_padding_cb_(handle, seek_cb, &block->data.padding, block->length);
+		case FLAC__METADATA_TYPE_APPLICATION:
+			return read_metadata_block_data_application_cb_(handle, read_cb, &block->data.application, block->length);
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			return read_metadata_block_data_seektable_cb_(handle, read_cb, &block->data.seek_table, block->length);
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			return read_metadata_block_data_vorbis_comment_cb_(handle, read_cb, seek_cb, &block->data.vorbis_comment, block->length);
+		case FLAC__METADATA_TYPE_CUESHEET:
+			return read_metadata_block_data_cuesheet_cb_(handle, read_cb, &block->data.cue_sheet);
+		case FLAC__METADATA_TYPE_PICTURE:
+			return read_metadata_block_data_picture_cb_(handle, read_cb, &block->data.picture);
+		default:
+			return read_metadata_block_data_unknown_cb_(handle, read_cb, &block->data.unknown, block->length);
+	}
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_StreamInfo *block)
+{
+	FLAC__byte buffer[FLAC__STREAM_METADATA_STREAMINFO_LENGTH], *b;
+
+	if(read_cb(buffer, 1, FLAC__STREAM_METADATA_STREAMINFO_LENGTH, handle) != FLAC__STREAM_METADATA_STREAMINFO_LENGTH)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	b = buffer;
+
+	/* we are using hardcoded numbers for simplicity but we should
+	 * probably eventually write a bit-level unpacker and use the
+	 * _STREAMINFO_ constants.
+	 */
+	block->min_blocksize = unpack_uint32_(b, 2); b += 2;
+	block->max_blocksize = unpack_uint32_(b, 2); b += 2;
+	block->min_framesize = unpack_uint32_(b, 3); b += 3;
+	block->max_framesize = unpack_uint32_(b, 3); b += 3;
+	block->sample_rate = (unpack_uint32_(b, 2) << 4) | ((unsigned)(b[2] & 0xf0) >> 4);
+	block->channels = (unsigned)((b[2] & 0x0e) >> 1) + 1;
+	block->bits_per_sample = ((((unsigned)(b[2] & 0x01)) << 4) | (((unsigned)(b[3] & 0xf0)) >> 4)) + 1;
+	block->total_samples = (((FLAC__uint64)(b[3] & 0x0f)) << 32) | unpack_uint64_(b+4, 4);
+	memcpy(block->md5sum, b+8, 16);
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_Padding *block, unsigned block_length)
+{
+	(void)block; /* nothing to do; we don't care about reading the padding bytes */
+
+	if(0 != seek_cb(handle, block_length, SEEK_CUR))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Application *block, unsigned block_length)
+{
+	const unsigned id_bytes = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+
+	if(read_cb(block->id, 1, id_bytes, handle) != id_bytes)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	if(block_length < id_bytes)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	block_length -= id_bytes;
+
+	if(block_length == 0) {
+		block->data = 0;
+	}
+	else {
+		if(0 == (block->data = malloc(block_length)))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+		if(read_cb(block->data, 1, block_length, handle) != block_length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_SeekTable *block, unsigned block_length)
+{
+	unsigned i;
+	FLAC__byte buffer[FLAC__STREAM_METADATA_SEEKPOINT_LENGTH];
+
+	FLAC__ASSERT(block_length % FLAC__STREAM_METADATA_SEEKPOINT_LENGTH == 0);
+
+	block->num_points = block_length / FLAC__STREAM_METADATA_SEEKPOINT_LENGTH;
+
+	if(block->num_points == 0)
+		block->points = 0;
+	else if(0 == (block->points = safe_malloc_mul_2op_p(block->num_points, /*times*/sizeof(FLAC__StreamMetadata_SeekPoint))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < block->num_points; i++) {
+		if(read_cb(buffer, 1, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH, handle) != FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		/* some MAGIC NUMBERs here */
+		block->points[i].sample_number = unpack_uint64_(buffer, 8);
+		block->points[i].stream_offset = unpack_uint64_(buffer+8, 8);
+		block->points[i].frame_samples = unpack_uint32_(buffer+16, 2);
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_entry_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_VorbisComment_Entry *entry, unsigned max_length)
+{
+	const unsigned entry_length_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8 == sizeof(buffer));
+
+	if(max_length < entry_length_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA;
+
+	max_length -= entry_length_len;
+	if(read_cb(buffer, 1, entry_length_len, handle) != entry_length_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	entry->length = unpack_uint32_little_endian_(buffer, entry_length_len);
+	if(max_length < entry->length) {
+		entry->length = 0;
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA;
+	} else max_length -= entry->length;
+
+	if(0 != entry->entry)
+		free(entry->entry);
+
+	if(entry->length == 0) {
+		entry->entry = 0;
+	}
+	else {
+		if(0 == (entry->entry = safe_malloc_add_2op_(entry->length, /*+*/1)))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+		if(read_cb(entry->entry, 1, entry->length, handle) != entry->length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+		entry->entry[entry->length] = '\0';
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb, FLAC__StreamMetadata_VorbisComment *block, unsigned block_length)
+{
+	unsigned i;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	const unsigned num_comments_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN / 8;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN / 8 == sizeof(buffer));
+
+	status = read_metadata_block_data_vorbis_comment_entry_cb_(handle, read_cb, &(block->vendor_string), block_length);
+	if(block_length >= 4)
+		block_length -= 4;
+	if(status == FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA)
+		goto skip;
+	else if(status != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+	block_length -= block->vendor_string.length;
+
+	if(block_length < num_comments_len) goto skip; else block_length -= num_comments_len;
+	if(read_cb(buffer, 1, num_comments_len, handle) != num_comments_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->num_comments = unpack_uint32_little_endian_(buffer, num_comments_len);
+
+	if(block->num_comments == 0) {
+		block->comments = 0;
+	}
+	else if(0 == (block->comments = calloc(block->num_comments, sizeof(FLAC__StreamMetadata_VorbisComment_Entry))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < block->num_comments; i++) {
+		status = read_metadata_block_data_vorbis_comment_entry_cb_(handle, read_cb, block->comments + i, block_length);
+		if(block_length >= 4) block_length -= 4;
+		if(status == FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA) {
+			block->num_comments = i;
+			goto skip;
+		}
+		else if(status != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK) return status;
+		block_length -= block->comments[i].length;
+	}
+
+  skip:
+	if(block_length > 0) {
+		/* bad metadata */
+		if(0 != seek_cb(handle, block_length, SEEK_CUR))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_track_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet_Track *track)
+{
+	unsigned i, len;
+	FLAC__byte buffer[32]; /* asserted below that this is big enough */
+
+	FLAC__ASSERT(sizeof(buffer) >= sizeof(FLAC__uint64));
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) / 8);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	track->offset = unpack_uint64_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	track->number = (FLAC__byte)unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN / 8;
+	if(read_cb(track->isrc, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) % 8 == 0);
+	len = (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN == 1);
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN == 1);
+	track->type = buffer[0] >> 7;
+	track->pre_emphasis = (buffer[0] >> 6) & 1;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	track->num_indices = (FLAC__byte)unpack_uint32_(buffer, len);
+
+	if(track->num_indices == 0) {
+		track->indices = 0;
+	}
+	else if(0 == (track->indices = calloc(track->num_indices, sizeof(FLAC__StreamMetadata_CueSheet_Index))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < track->num_indices; i++) {
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN / 8;
+		if(read_cb(buffer, 1, len, handle) != len)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		track->indices[i].offset = unpack_uint64_(buffer, len);
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN / 8;
+		if(read_cb(buffer, 1, len, handle) != len)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+		track->indices[i].number = (FLAC__byte)unpack_uint32_(buffer, len);
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN / 8;
+		if(read_cb(buffer, 1, len, handle) != len)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_CueSheet *block)
+{
+	unsigned i, len;
+	FLAC__Metadata_SimpleIteratorStatus status;
+	FLAC__byte buffer[1024]; /* MSVC needs a constant expression so we put a magic number and assert */
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN)/8 <= sizeof(buffer));
+	FLAC__ASSERT(sizeof(FLAC__uint64) <= sizeof(buffer));
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN / 8;
+	if(read_cb(block->media_catalog_number, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->lead_in = unpack_uint64_(buffer, len);
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) % 8 == 0);
+	len = (FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->is_cd = buffer[0]&0x80? true : false;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->num_tracks = unpack_uint32_(buffer, len);
+
+	if(block->num_tracks == 0) {
+		block->tracks = 0;
+	}
+	else if(0 == (block->tracks = calloc(block->num_tracks, sizeof(FLAC__StreamMetadata_CueSheet_Track))))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	for(i = 0; i < block->num_tracks; i++) {
+		if(FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK != (status = read_metadata_block_data_cuesheet_track_cb_(handle, read_cb, block->tracks + i)))
+			return status;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+static FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_picture_cstring_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__byte **data, FLAC__uint32 *length, FLAC__uint32 length_len)
+{
+	FLAC__byte buffer[sizeof(FLAC__uint32)];
+
+	FLAC__ASSERT(0 != data);
+	FLAC__ASSERT(length_len%8 == 0);
+
+	length_len /= 8; /* convert to bytes */
+
+	FLAC__ASSERT(sizeof(buffer) >= length_len);
+
+	if(read_cb(buffer, 1, length_len, handle) != length_len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	*length = unpack_uint32_(buffer, length_len);
+
+	if(0 != *data)
+		free(*data);
+
+	if(0 == (*data = safe_malloc_add_2op_(*length, /*+*/1)))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	if(*length > 0) {
+		if(read_cb(*data, 1, *length, handle) != *length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	(*data)[*length] = '\0';
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Picture *block)
+{
+	FLAC__Metadata_SimpleIteratorStatus status;
+	FLAC__byte buffer[4]; /* asserted below that this is big enough */
+	FLAC__uint32 len;
+
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_TYPE_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_COLORS_LEN/8);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_TYPE_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_TYPE_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->type = (FLAC__StreamMetadata_Picture_Type)unpack_uint32_(buffer, len);
+
+	if((status = read_metadata_block_data_picture_cstring_cb_(handle, read_cb, (FLAC__byte**)(&(block->mime_type)), &len, FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN)) != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+
+	if((status = read_metadata_block_data_picture_cstring_cb_(handle, read_cb, &(block->description), &len, FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN)) != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->width = unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->height = unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->depth = unpack_uint32_(buffer, len);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_PICTURE_COLORS_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_PICTURE_COLORS_LEN / 8;
+	if(read_cb(buffer, 1, len, handle) != len)
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	block->colors = unpack_uint32_(buffer, len);
+
+	/* for convenience we use read_metadata_block_data_picture_cstring_cb_() even though it adds an extra terminating NUL we don't use */
+	if((status = read_metadata_block_data_picture_cstring_cb_(handle, read_cb, &(block->data), &(block->data_length), FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN)) != FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK)
+		return status;
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__Metadata_SimpleIteratorStatus read_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__StreamMetadata_Unknown *block, unsigned block_length)
+{
+	if(block_length == 0) {
+		block->data = 0;
+	}
+	else {
+		if(0 == (block->data = malloc(block_length)))
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+		if(read_cb(block->data, 1, block_length, handle) != block_length)
+			return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+	}
+
+	return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+}
+
+FLAC__bool write_metadata_block_header_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != file);
+	FLAC__ASSERT(0 != status);
+
+	if(!write_metadata_block_header_cb_((FLAC__IOHandle)file, (FLAC__IOCallback_Write)fwrite, block)) {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_(FILE *file, FLAC__Metadata_SimpleIteratorStatus *status, const FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != file);
+	FLAC__ASSERT(0 != status);
+
+	if (write_metadata_block_data_cb_((FLAC__IOHandle)file, (FLAC__IOCallback_Write)fwrite, block)) {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK;
+		return true;
+	}
+	else {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+		return false;
+	}
+}
+
+FLAC__bool write_metadata_block_header_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block)
+{
+	FLAC__byte buffer[FLAC__STREAM_METADATA_HEADER_LENGTH];
+
+	FLAC__ASSERT(block->length < (1u << FLAC__STREAM_METADATA_LENGTH_LEN));
+
+	buffer[0] = (block->is_last? 0x80 : 0) | (FLAC__byte)block->type;
+	pack_uint32_(block->length, buffer + 1, 3);
+
+	if(write_cb(buffer, 1, FLAC__STREAM_METADATA_HEADER_LENGTH, handle) != FLAC__STREAM_METADATA_HEADER_LENGTH)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata *block)
+{
+	FLAC__ASSERT(0 != block);
+
+	switch(block->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+			return write_metadata_block_data_streaminfo_cb_(handle, write_cb, &block->data.stream_info);
+		case FLAC__METADATA_TYPE_PADDING:
+			return write_metadata_block_data_padding_cb_(handle, write_cb, &block->data.padding, block->length);
+		case FLAC__METADATA_TYPE_APPLICATION:
+			return write_metadata_block_data_application_cb_(handle, write_cb, &block->data.application, block->length);
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			return write_metadata_block_data_seektable_cb_(handle, write_cb, &block->data.seek_table);
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			return write_metadata_block_data_vorbis_comment_cb_(handle, write_cb, &block->data.vorbis_comment);
+		case FLAC__METADATA_TYPE_CUESHEET:
+			return write_metadata_block_data_cuesheet_cb_(handle, write_cb, &block->data.cue_sheet);
+		case FLAC__METADATA_TYPE_PICTURE:
+			return write_metadata_block_data_picture_cb_(handle, write_cb, &block->data.picture);
+		default:
+			return write_metadata_block_data_unknown_cb_(handle, write_cb, &block->data.unknown, block->length);
+	}
+}
+
+FLAC__bool write_metadata_block_data_streaminfo_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_StreamInfo *block)
+{
+	FLAC__byte buffer[FLAC__STREAM_METADATA_STREAMINFO_LENGTH];
+	const unsigned channels1 = block->channels - 1;
+	const unsigned bps1 = block->bits_per_sample - 1;
+
+	/* we are using hardcoded numbers for simplicity but we should
+	 * probably eventually write a bit-level packer and use the
+	 * _STREAMINFO_ constants.
+	 */
+	pack_uint32_(block->min_blocksize, buffer, 2);
+	pack_uint32_(block->max_blocksize, buffer+2, 2);
+	pack_uint32_(block->min_framesize, buffer+4, 3);
+	pack_uint32_(block->max_framesize, buffer+7, 3);
+	buffer[10] = (block->sample_rate >> 12) & 0xff;
+	buffer[11] = (block->sample_rate >> 4) & 0xff;
+	buffer[12] = ((block->sample_rate & 0x0f) << 4) | (channels1 << 1) | (bps1 >> 4);
+	buffer[13] = (FLAC__byte)(((bps1 & 0x0f) << 4) | ((block->total_samples >> 32) & 0x0f));
+	pack_uint32_((FLAC__uint32)block->total_samples, buffer+14, 4);
+	memcpy(buffer+18, block->md5sum, 16);
+
+	if(write_cb(buffer, 1, FLAC__STREAM_METADATA_STREAMINFO_LENGTH, handle) != FLAC__STREAM_METADATA_STREAMINFO_LENGTH)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_padding_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Padding *block, unsigned block_length)
+{
+	unsigned i, n = block_length;
+	FLAC__byte buffer[1024];
+
+	(void)block;
+
+	memset(buffer, 0, 1024);
+
+	for(i = 0; i < n/1024; i++)
+		if(write_cb(buffer, 1, 1024, handle) != 1024)
+			return false;
+
+	n %= 1024;
+
+	if(write_cb(buffer, 1, n, handle) != n)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_application_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Application *block, unsigned block_length)
+{
+	const unsigned id_bytes = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+
+	if(write_cb(block->id, 1, id_bytes, handle) != id_bytes)
+		return false;
+
+	block_length -= id_bytes;
+
+	if(write_cb(block->data, 1, block_length, handle) != block_length)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_seektable_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_SeekTable *block)
+{
+	unsigned i;
+	FLAC__byte buffer[FLAC__STREAM_METADATA_SEEKPOINT_LENGTH];
+
+	for(i = 0; i < block->num_points; i++) {
+		/* some MAGIC NUMBERs here */
+		pack_uint64_(block->points[i].sample_number, buffer, 8);
+		pack_uint64_(block->points[i].stream_offset, buffer+8, 8);
+		pack_uint32_(block->points[i].frame_samples, buffer+16, 2);
+		if(write_cb(buffer, 1, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH, handle) != FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)
+			return false;
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_vorbis_comment_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_VorbisComment *block)
+{
+	unsigned i;
+	const unsigned entry_length_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8;
+	const unsigned num_comments_len = FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN / 8;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(flac_max(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN, FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN) / 8 == sizeof(buffer));
+
+	pack_uint32_little_endian_(block->vendor_string.length, buffer, entry_length_len);
+	if(write_cb(buffer, 1, entry_length_len, handle) != entry_length_len)
+		return false;
+	if(write_cb(block->vendor_string.entry, 1, block->vendor_string.length, handle) != block->vendor_string.length)
+		return false;
+
+	pack_uint32_little_endian_(block->num_comments, buffer, num_comments_len);
+	if(write_cb(buffer, 1, num_comments_len, handle) != num_comments_len)
+		return false;
+
+	for(i = 0; i < block->num_comments; i++) {
+		pack_uint32_little_endian_(block->comments[i].length, buffer, entry_length_len);
+		if(write_cb(buffer, 1, entry_length_len, handle) != entry_length_len)
+			return false;
+		if(write_cb(block->comments[i].entry, 1, block->comments[i].length, handle) != block->comments[i].length)
+			return false;
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_cuesheet_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_CueSheet *block)
+{
+	unsigned i, j, len;
+	FLAC__byte buffer[1024]; /* asserted below that this is big enough */
+
+	FLAC__ASSERT(sizeof(buffer) >= sizeof(FLAC__uint64));
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN)/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN/8);
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN / 8;
+	if(write_cb(block->media_catalog_number, 1, len, handle) != len)
+		return false;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN / 8;
+	pack_uint64_(block->lead_in, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) % 8 == 0);
+	len = (FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN + FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN) / 8;
+	memset(buffer, 0, len);
+	if(block->is_cd)
+		buffer[0] |= 0x80;
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN % 8 == 0);
+	len = FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN / 8;
+	pack_uint32_(block->num_tracks, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	for(i = 0; i < block->num_tracks; i++) {
+		FLAC__StreamMetadata_CueSheet_Track *track = block->tracks + i;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN / 8;
+		pack_uint64_(track->offset, buffer, len);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN / 8;
+		pack_uint32_(track->number, buffer, len);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN / 8;
+		if(write_cb(track->isrc, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT((FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) % 8 == 0);
+		len = (FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN + FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN) / 8;
+		memset(buffer, 0, len);
+		buffer[0] = (track->type << 7) | (track->pre_emphasis << 6);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN % 8 == 0);
+		len = FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN / 8;
+		pack_uint32_(track->num_indices, buffer, len);
+		if(write_cb(buffer, 1, len, handle) != len)
+			return false;
+
+		for(j = 0; j < track->num_indices; j++) {
+			FLAC__StreamMetadata_CueSheet_Index *indx = track->indices + j;
+
+			FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN % 8 == 0);
+			len = FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN / 8;
+			pack_uint64_(indx->offset, buffer, len);
+			if(write_cb(buffer, 1, len, handle) != len)
+				return false;
+
+			FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN % 8 == 0);
+			len = FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN / 8;
+			pack_uint32_(indx->number, buffer, len);
+			if(write_cb(buffer, 1, len, handle) != len)
+				return false;
+
+			FLAC__ASSERT(FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN % 8 == 0);
+			len = FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN / 8;
+			memset(buffer, 0, len);
+			if(write_cb(buffer, 1, len, handle) != len)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_picture_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Picture *block)
+{
+	unsigned len;
+	size_t slen;
+	FLAC__byte buffer[4]; /* magic number is asserted below */
+
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_TYPE_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_COLORS_LEN%8);
+	FLAC__ASSERT(0 == FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN%8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_TYPE_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_COLORS_LEN/8);
+	FLAC__ASSERT(sizeof(buffer) >= FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN/8);
+
+	len = FLAC__STREAM_METADATA_PICTURE_TYPE_LEN/8;
+	pack_uint32_(block->type, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN/8;
+	slen = strlen(block->mime_type);
+	pack_uint32_(slen, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+	if(write_cb(block->mime_type, 1, slen, handle) != slen)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN/8;
+	slen = strlen((const char *)block->description);
+	pack_uint32_(slen, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+	if(write_cb(block->description, 1, slen, handle) != slen)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN/8;
+	pack_uint32_(block->width, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN/8;
+	pack_uint32_(block->height, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN/8;
+	pack_uint32_(block->depth, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_COLORS_LEN/8;
+	pack_uint32_(block->colors, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+
+	len = FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN/8;
+	pack_uint32_(block->data_length, buffer, len);
+	if(write_cb(buffer, 1, len, handle) != len)
+		return false;
+	if(write_cb(block->data, 1, block->data_length, handle) != block->data_length)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_data_unknown_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Write write_cb, const FLAC__StreamMetadata_Unknown *block, unsigned block_length)
+{
+	if(write_cb(block->data, 1, block_length, handle) != block_length)
+		return false;
+
+	return true;
+}
+
+FLAC__bool write_metadata_block_stationary_(FLAC__Metadata_SimpleIterator *iterator, const FLAC__StreamMetadata *block)
+{
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	if(!write_metadata_block_header_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(!write_metadata_block_data_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return read_metadata_block_header_(iterator);
+}
+
+FLAC__bool write_metadata_block_stationary_with_padding_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, unsigned padding_length, FLAC__bool padding_is_last)
+{
+	FLAC__StreamMetadata *padding;
+
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	block->is_last = false;
+
+	if(!write_metadata_block_header_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(!write_metadata_block_data_(iterator->file, &iterator->status, block))
+		return false;
+
+	if(0 == (padding = FLAC__metadata_object_new(FLAC__METADATA_TYPE_PADDING)))
+		return FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+
+	padding->is_last = padding_is_last;
+	padding->length = padding_length;
+
+	if(!write_metadata_block_header_(iterator->file, &iterator->status, padding)) {
+		FLAC__metadata_object_delete(padding);
+		return false;
+	}
+
+	if(!write_metadata_block_data_(iterator->file, &iterator->status, padding)) {
+		FLAC__metadata_object_delete(padding);
+		return false;
+	}
+
+	FLAC__metadata_object_delete(padding);
+
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return read_metadata_block_header_(iterator);
+}
+
+FLAC__bool rewrite_whole_file_(FLAC__Metadata_SimpleIterator *iterator, FLAC__StreamMetadata *block, FLAC__bool append)
+{
+	FILE *tempfile = NULL;
+	char *tempfilename = NULL;
+	int fixup_is_last_code = 0; /* 0 => no need to change any is_last flags */
+	FLAC__off_t fixup_is_last_flag_offset = -1;
+
+	FLAC__ASSERT(0 != block || append == false);
+
+	if(iterator->is_last) {
+		if(append) {
+			fixup_is_last_code = 1; /* 1 => clear the is_last flag at the following offset */
+			fixup_is_last_flag_offset = iterator->offset[iterator->depth];
+		}
+		else if(0 == block) {
+			simple_iterator_push_(iterator);
+			if(!FLAC__metadata_simple_iterator_prev(iterator)) {
+				(void)simple_iterator_pop_(iterator);
+				return false;
+			}
+			fixup_is_last_code = -1; /* -1 => set the is_last the flag at the following offset */
+			fixup_is_last_flag_offset = iterator->offset[iterator->depth];
+			if(!simple_iterator_pop_(iterator))
+				return false;
+		}
+	}
+
+	if(!simple_iterator_copy_file_prefix_(iterator, &tempfile, &tempfilename, append))
+		return false;
+
+	if(0 != block) {
+		if(!write_metadata_block_header_(tempfile, &iterator->status, block)) {
+			cleanup_tempfile_(&tempfile, &tempfilename);
+			return false;
+		}
+
+		if(!write_metadata_block_data_(tempfile, &iterator->status, block)) {
+			cleanup_tempfile_(&tempfile, &tempfilename);
+			return false;
+		}
+	}
+
+	if(!simple_iterator_copy_file_postfix_(iterator, &tempfile, &tempfilename, fixup_is_last_code, fixup_is_last_flag_offset, block==0))
+		return false;
+
+	if(append)
+		return FLAC__metadata_simple_iterator_next(iterator);
+
+	return true;
+}
+
+void simple_iterator_push_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(iterator->depth+1 < SIMPLE_ITERATOR_MAX_PUSH_DEPTH);
+	iterator->offset[iterator->depth+1] = iterator->offset[iterator->depth];
+	iterator->depth++;
+}
+
+FLAC__bool simple_iterator_pop_(FLAC__Metadata_SimpleIterator *iterator)
+{
+	FLAC__ASSERT(iterator->depth > 0);
+	iterator->depth--;
+	if(0 != fseeko(iterator->file, iterator->offset[iterator->depth], SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+
+	return read_metadata_block_header_(iterator);
+}
+
+/* return meanings:
+ * 0: ok
+ * 1: read error
+ * 2: seek error
+ * 3: not a FLAC file
+ */
+unsigned seek_to_first_metadata_block_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Seek seek_cb)
+{
+	FLAC__byte buffer[4];
+	size_t n;
+	unsigned i;
+
+	FLAC__ASSERT(FLAC__STREAM_SYNC_LENGTH == sizeof(buffer));
+
+	/* skip any id3v2 tag */
+	errno = 0;
+	n = read_cb(buffer, 1, 4, handle);
+	if(errno)
+		return 1;
+	else if(n != 4)
+		return 3;
+	else if(0 == memcmp(buffer, "ID3", 3)) {
+		unsigned tag_length = 0;
+
+		/* skip to the tag length */
+		if(seek_cb(handle, 2, SEEK_CUR) < 0)
+			return 2;
+
+		/* read the length */
+		for(i = 0; i < 4; i++) {
+			if(read_cb(buffer, 1, 1, handle) < 1 || buffer[0] & 0x80)
+				return 1;
+			tag_length <<= 7;
+			tag_length |= (buffer[0] & 0x7f);
+		}
+
+		/* skip the rest of the tag */
+		if(seek_cb(handle, tag_length, SEEK_CUR) < 0)
+			return 2;
+
+		/* read the stream sync code */
+		errno = 0;
+		n = read_cb(buffer, 1, 4, handle);
+		if(errno)
+			return 1;
+		else if(n != 4)
+			return 3;
+	}
+
+	/* check for the fLaC signature */
+	if(0 == memcmp(FLAC__STREAM_SYNC_STRING, buffer, FLAC__STREAM_SYNC_LENGTH))
+		return 0;
+	else
+		return 3;
+}
+
+unsigned seek_to_first_metadata_block_(FILE *f)
+{
+	return seek_to_first_metadata_block_cb_((FLAC__IOHandle)f, (FLAC__IOCallback_Read)fread, fseek_wrapper_);
+}
+
+FLAC__bool simple_iterator_copy_file_prefix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, FLAC__bool append)
+{
+	const FLAC__off_t offset_end = append? iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length : iterator->offset[iterator->depth];
+
+	if(0 != fseeko(iterator->file, 0, SEEK_SET)) {
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+	if(!open_tempfile_(iterator->filename, iterator->tempfile_path_prefix, tempfile, tempfilename, &iterator->status)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		return false;
+	}
+	if(!copy_n_bytes_from_file_(iterator->file, *tempfile, offset_end, &iterator->status)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool simple_iterator_copy_file_postfix_(FLAC__Metadata_SimpleIterator *iterator, FILE **tempfile, char **tempfilename, int fixup_is_last_code, FLAC__off_t fixup_is_last_flag_offset, FLAC__bool backup)
+{
+	FLAC__off_t save_offset = iterator->offset[iterator->depth];
+	FLAC__ASSERT(0 != *tempfile);
+
+	if(0 != fseeko(iterator->file, save_offset + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length, SEEK_SET)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+		return false;
+	}
+	if(!copy_remaining_bytes_from_file_(iterator->file, *tempfile, &iterator->status)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		return false;
+	}
+
+	if(fixup_is_last_code != 0) {
+		/*
+		 * if code == 1, it means a block was appended to the end so
+		 *   we have to clear the is_last flag of the previous block
+		 * if code == -1, it means the last block was deleted so
+		 *   we have to set the is_last flag of the previous block
+		 */
+		/* MAGIC NUMBERs here; we know the is_last flag is the high bit of the byte at this location */
+		FLAC__byte x;
+		if(0 != fseeko(*tempfile, fixup_is_last_flag_offset, SEEK_SET)) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		}
+		if(fread(&x, 1, 1, *tempfile) != 1) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(fixup_is_last_code > 0) {
+			FLAC__ASSERT(x & 0x80);
+			x &= 0x7f;
+		}
+		else {
+			FLAC__ASSERT(!(x & 0x80));
+			x |= 0x80;
+		}
+		if(0 != fseeko(*tempfile, fixup_is_last_flag_offset, SEEK_SET)) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR;
+			return false;
+		}
+		if(local__fwrite(&x, 1, 1, *tempfile) != 1) {
+			cleanup_tempfile_(tempfile, tempfilename);
+			iterator->status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	(void)fclose(iterator->file);
+
+	if(!transport_tempfile_(iterator->filename, tempfile, tempfilename, &iterator->status))
+		return false;
+
+	if(iterator->has_stats)
+		set_file_stats_(iterator->filename, &iterator->stats);
+
+	if(!simple_iterator_prime_input_(iterator, !iterator->is_writable))
+		return false;
+	if(backup) {
+		while(iterator->offset[iterator->depth] + (FLAC__off_t)FLAC__STREAM_METADATA_HEADER_LENGTH + (FLAC__off_t)iterator->length < save_offset)
+			if(!FLAC__metadata_simple_iterator_next(iterator))
+				return false;
+		return true;
+	}
+	else {
+		/* move the iterator to it's original block faster by faking a push, then doing a pop_ */
+		FLAC__ASSERT(iterator->depth == 0);
+		iterator->offset[0] = save_offset;
+		iterator->depth++;
+		return simple_iterator_pop_(iterator);
+	}
+}
+
+FLAC__bool copy_n_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	FLAC__ASSERT(bytes >= 0);
+	while(bytes > 0) {
+		n = flac_min(sizeof(buffer), (size_t)bytes);
+		if(fread(buffer, 1, n, file) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(local__fwrite(buffer, 1, n, tempfile) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+		bytes -= n;
+	}
+
+	return true;
+}
+
+FLAC__bool copy_n_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__off_t bytes, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	FLAC__ASSERT(bytes >= 0);
+	while(bytes > 0) {
+		n = flac_min(sizeof(buffer), (size_t)bytes);
+		if(read_cb(buffer, 1, n, handle) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(temp_write_cb(buffer, 1, n, temp_handle) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+		bytes -= n;
+	}
+
+	return true;
+}
+
+FLAC__bool copy_remaining_bytes_from_file_(FILE *file, FILE *tempfile, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	while(!feof(file)) {
+		n = fread(buffer, 1, sizeof(buffer), file);
+		if(n == 0 && !feof(file)) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(n > 0 && local__fwrite(buffer, 1, n, tempfile) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	return true;
+}
+
+FLAC__bool copy_remaining_bytes_from_file_cb_(FLAC__IOHandle handle, FLAC__IOCallback_Read read_cb, FLAC__IOCallback_Eof eof_cb, FLAC__IOHandle temp_handle, FLAC__IOCallback_Write temp_write_cb, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__byte buffer[8192];
+	size_t n;
+
+	while(!eof_cb(handle)) {
+		n = read_cb(buffer, 1, sizeof(buffer), handle);
+		if(n == 0 && !eof_cb(handle)) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR;
+			return false;
+		}
+		if(n > 0 && temp_write_cb(buffer, 1, n, temp_handle) != n) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR;
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int
+local_snprintf(char *str, size_t size, const char *fmt, ...)
+{
+	va_list va;
+	int rc;
+
+	va_start (va, fmt);
+
+#if defined _MSC_VER
+	if (size == 0)
+		return 1024;
+	rc = vsnprintf_s (str, size, _TRUNCATE, fmt, va);
+	if (rc < 0)
+		rc = size - 1;
+#elif defined __MINGW32__
+	rc = __mingw_vsnprintf (str, size, fmt, va);
+#else
+	rc = vsnprintf (str, size, fmt, va);
+#endif
+	va_end (va);
+
+	return rc;
+}
+
+FLAC__bool open_tempfile_(const char *filename, const char *tempfile_path_prefix, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	static const char *tempfile_suffix = ".metadata_edit";
+	if(0 == tempfile_path_prefix) {
+		size_t dest_len = strlen(filename) + strlen(tempfile_suffix) + 1;
+		if(0 == (*tempfilename = safe_malloc_(dest_len))) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+			return false;
+		}
+		local_snprintf(*tempfilename, dest_len, "%s%s", filename, tempfile_suffix);
+	}
+	else {
+		const char *p = strrchr(filename, '/');
+		size_t dest_len;
+		if(0 == p)
+			p = filename;
+		else
+			p++;
+
+		dest_len = strlen(tempfile_path_prefix) + strlen(p) + strlen(tempfile_suffix) + 2;
+
+		if(0 == (*tempfilename = safe_malloc_(dest_len))) {
+			*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR;
+			return false;
+		}
+		local_snprintf(*tempfilename, dest_len, "%s/%s%s", tempfile_path_prefix, p, tempfile_suffix);
+	}
+
+	if(0 == (*tempfile = flac_fopen(*tempfilename, "w+b"))) {
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE;
+		return false;
+	}
+
+	return true;
+}
+
+FLAC__bool transport_tempfile_(const char *filename, FILE **tempfile, char **tempfilename, FLAC__Metadata_SimpleIteratorStatus *status)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != tempfile);
+	FLAC__ASSERT(0 != *tempfile);
+	FLAC__ASSERT(0 != tempfilename);
+	FLAC__ASSERT(0 != *tempfilename);
+	FLAC__ASSERT(0 != status);
+
+	(void)fclose(*tempfile);
+	*tempfile = 0;
+
+#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__ || defined __EMX__
+	/* on some flavors of windows, flac_rename() will fail if the destination already exists */
+	if(flac_unlink(filename) < 0) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_UNLINK_ERROR;
+		return false;
+	}
+#endif
+
+	/*@@@ to fully support the tempfile_path_prefix we need to update this piece to actually copy across filesystems instead of just flac_rename(): */
+	if(0 != flac_rename(*tempfilename, filename)) {
+		cleanup_tempfile_(tempfile, tempfilename);
+		*status = FLAC__METADATA_SIMPLE_ITERATOR_STATUS_RENAME_ERROR;
+		return false;
+	}
+
+	cleanup_tempfile_(tempfile, tempfilename);
+
+	return true;
+}
+
+void cleanup_tempfile_(FILE **tempfile, char **tempfilename)
+{
+	if(0 != *tempfile) {
+		(void)fclose(*tempfile);
+		*tempfile = 0;
+	}
+
+	if(0 != *tempfilename) {
+		(void)flac_unlink(*tempfilename);
+		free(*tempfilename);
+		*tempfilename = 0;
+	}
+}
+
+FLAC__bool get_file_stats_(const char *filename, struct flac_stat_s *stats)
+{
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != stats);
+	return (0 == flac_stat(filename, stats));
+}
+
+void set_file_stats_(const char *filename, struct flac_stat_s *stats)
+{
+	struct utimbuf srctime;
+
+	FLAC__ASSERT(0 != filename);
+	FLAC__ASSERT(0 != stats);
+
+	srctime.actime = stats->st_atime;
+	srctime.modtime = stats->st_mtime;
+	(void)flac_chmod(filename, stats->st_mode);
+	(void)flac_utime(filename, &srctime);
+#if !defined _MSC_VER && !defined __BORLANDC__ && !defined __MINGW32__
+	FLAC_CHECK_RETURN(chown(filename, stats->st_uid, -1));
+	FLAC_CHECK_RETURN(chown(filename, -1, stats->st_gid));
+#endif
+}
+
+int fseek_wrapper_(FLAC__IOHandle handle, FLAC__int64 offset, int whence)
+{
+	return fseeko((FILE*)handle, (FLAC__off_t)offset, whence);
+}
+
+FLAC__int64 ftell_wrapper_(FLAC__IOHandle handle)
+{
+	return ftello((FILE*)handle);
+}
+
+FLAC__Metadata_ChainStatus get_equivalent_status_(FLAC__Metadata_SimpleIteratorStatus status)
+{
+	switch(status) {
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_OK:
+			return FLAC__METADATA_CHAIN_STATUS_OK;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ILLEGAL_INPUT:
+			return FLAC__METADATA_CHAIN_STATUS_ILLEGAL_INPUT;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_ERROR_OPENING_FILE:
+			return FLAC__METADATA_CHAIN_STATUS_ERROR_OPENING_FILE;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_A_FLAC_FILE:
+			return FLAC__METADATA_CHAIN_STATUS_NOT_A_FLAC_FILE;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_NOT_WRITABLE:
+			return FLAC__METADATA_CHAIN_STATUS_NOT_WRITABLE;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_BAD_METADATA:
+			return FLAC__METADATA_CHAIN_STATUS_BAD_METADATA;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_READ_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_READ_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_SEEK_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_SEEK_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_WRITE_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_WRITE_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_RENAME_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_RENAME_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_UNLINK_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_UNLINK_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_MEMORY_ALLOCATION_ERROR:
+			return FLAC__METADATA_CHAIN_STATUS_MEMORY_ALLOCATION_ERROR;
+		case FLAC__METADATA_SIMPLE_ITERATOR_STATUS_INTERNAL_ERROR:
+		default:
+			return FLAC__METADATA_CHAIN_STATUS_INTERNAL_ERROR;
+	}
+}
diff --git a/src/libFLAC/metadata_object.c b/src/libFLAC/metadata_object.c
new file mode 100644
index 0000000..0456297
--- /dev/null
+++ b/src/libFLAC/metadata_object.c
@@ -0,0 +1,1824 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2001-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "private/metadata.h"
+#include "private/memory.h"
+
+#include "FLAC/assert.h"
+#include "share/alloc.h"
+#include "share/compat.h"
+
+/* Alias the first (in share/alloc.h) to the second (in src/libFLAC/memory.c). */
+#define safe_malloc_mul_2op_ safe_malloc_mul_2op_p
+
+
+/****************************************************************************
+ *
+ * Local routines
+ *
+ ***************************************************************************/
+
+/* copy bytes:
+ *  from = NULL  && bytes = 0
+ *       to <- NULL
+ *  from != NULL && bytes > 0
+ *       to <- copy of from
+ *  else ASSERT
+ * malloc error leaves 'to' unchanged
+ */
+static FLAC__bool copy_bytes_(FLAC__byte **to, const FLAC__byte *from, unsigned bytes)
+{
+	FLAC__ASSERT(0 != to);
+	if(bytes > 0 && 0 != from) {
+		FLAC__byte *x;
+		if(0 == (x = safe_malloc_(bytes)))
+			return false;
+		memcpy(x, from, bytes);
+		*to = x;
+	}
+	else {
+		FLAC__ASSERT(0 == from);
+		FLAC__ASSERT(bytes == 0);
+		*to = 0;
+	}
+	return true;
+}
+
+#if 0 /* UNUSED */
+/* like copy_bytes_(), but free()s the original '*to' if the copy succeeds and the original '*to' is non-NULL */
+static FLAC__bool free_copy_bytes_(FLAC__byte **to, const FLAC__byte *from, unsigned bytes)
+{
+	FLAC__byte *copy;
+	FLAC__ASSERT(0 != to);
+	if(copy_bytes_(&copy, from, bytes)) {
+		if(*to)
+			free(*to);
+		*to = copy;
+		return true;
+	}
+	else
+		return false;
+}
+#endif
+
+/* reallocate entry to 1 byte larger and add a terminating NUL */
+/* realloc() failure leaves entry unchanged */
+static FLAC__bool ensure_null_terminated_(FLAC__byte **entry, unsigned length)
+{
+	FLAC__byte *x = safe_realloc_add_2op_(*entry, length, /*+*/1);
+	if(0 != x) {
+		x[length] = '\0';
+		*entry = x;
+		return true;
+	}
+	else
+		return false;
+}
+
+/* copies the NUL-terminated C-string 'from' to '*to', leaving '*to'
+ * unchanged if malloc fails, free()ing the original '*to' if it
+ * succeeds and the original '*to' was not NULL
+ */
+static FLAC__bool copy_cstring_(char **to, const char *from)
+{
+	char *copy = strdup(from);
+	FLAC__ASSERT(to);
+	if(copy) {
+		if(*to)
+			free(*to);
+		*to = copy;
+		return true;
+	}
+	else
+		return false;
+}
+
+static FLAC__bool copy_vcentry_(FLAC__StreamMetadata_VorbisComment_Entry *to, const FLAC__StreamMetadata_VorbisComment_Entry *from)
+{
+	to->length = from->length;
+	if(0 == from->entry) {
+		FLAC__ASSERT(from->length == 0);
+		to->entry = 0;
+	}
+	else {
+		FLAC__byte *x;
+		FLAC__ASSERT(from->length > 0);
+		if(0 == (x = safe_malloc_add_2op_(from->length, /*+*/1)))
+			return false;
+		memcpy(x, from->entry, from->length);
+		x[from->length] = '\0';
+		to->entry = x;
+	}
+	return true;
+}
+
+static FLAC__bool copy_track_(FLAC__StreamMetadata_CueSheet_Track *to, const FLAC__StreamMetadata_CueSheet_Track *from)
+{
+	memcpy(to, from, sizeof(FLAC__StreamMetadata_CueSheet_Track));
+	if(0 == from->indices) {
+		FLAC__ASSERT(from->num_indices == 0);
+	}
+	else {
+		FLAC__StreamMetadata_CueSheet_Index *x;
+		FLAC__ASSERT(from->num_indices > 0);
+		if(0 == (x = safe_malloc_mul_2op_p(from->num_indices, /*times*/sizeof(FLAC__StreamMetadata_CueSheet_Index))))
+			return false;
+		memcpy(x, from->indices, from->num_indices * sizeof(FLAC__StreamMetadata_CueSheet_Index));
+		to->indices = x;
+	}
+	return true;
+}
+
+static void seektable_calculate_length_(FLAC__StreamMetadata *object)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	object->length = object->data.seek_table.num_points * FLAC__STREAM_METADATA_SEEKPOINT_LENGTH;
+}
+
+static FLAC__StreamMetadata_SeekPoint *seekpoint_array_new_(unsigned num_points)
+{
+	FLAC__StreamMetadata_SeekPoint *object_array;
+
+	FLAC__ASSERT(num_points > 0);
+
+	object_array = safe_malloc_mul_2op_p(num_points, /*times*/sizeof(FLAC__StreamMetadata_SeekPoint));
+
+	if(0 != object_array) {
+		unsigned i;
+		for(i = 0; i < num_points; i++) {
+			object_array[i].sample_number = FLAC__STREAM_METADATA_SEEKPOINT_PLACEHOLDER;
+			object_array[i].stream_offset = 0;
+			object_array[i].frame_samples = 0;
+		}
+	}
+
+	return object_array;
+}
+
+static void vorbiscomment_calculate_length_(FLAC__StreamMetadata *object)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	object->length = (FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN) / 8;
+	object->length += object->data.vorbis_comment.vendor_string.length;
+	object->length += (FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN) / 8;
+	for(i = 0; i < object->data.vorbis_comment.num_comments; i++) {
+		object->length += (FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN / 8);
+		object->length += object->data.vorbis_comment.comments[i].length;
+	}
+}
+
+static FLAC__StreamMetadata_VorbisComment_Entry *vorbiscomment_entry_array_new_(unsigned num_comments)
+{
+	FLAC__ASSERT(num_comments > 0);
+
+	return safe_calloc_(num_comments, sizeof(FLAC__StreamMetadata_VorbisComment_Entry));
+}
+
+static void vorbiscomment_entry_array_delete_(FLAC__StreamMetadata_VorbisComment_Entry *object_array, unsigned num_comments)
+{
+	unsigned i;
+
+	FLAC__ASSERT(0 != object_array && num_comments > 0);
+
+	for(i = 0; i < num_comments; i++)
+		if(0 != object_array[i].entry)
+			free(object_array[i].entry);
+
+	if(0 != object_array)
+		free(object_array);
+}
+
+static FLAC__StreamMetadata_VorbisComment_Entry *vorbiscomment_entry_array_copy_(const FLAC__StreamMetadata_VorbisComment_Entry *object_array, unsigned num_comments)
+{
+	FLAC__StreamMetadata_VorbisComment_Entry *return_array;
+
+	FLAC__ASSERT(0 != object_array);
+	FLAC__ASSERT(num_comments > 0);
+
+	return_array = vorbiscomment_entry_array_new_(num_comments);
+
+	if(0 != return_array) {
+		unsigned i;
+
+		for(i = 0; i < num_comments; i++) {
+			if(!copy_vcentry_(return_array+i, object_array+i)) {
+				vorbiscomment_entry_array_delete_(return_array, num_comments);
+				return 0;
+			}
+		}
+	}
+
+	return return_array;
+}
+
+static FLAC__bool vorbiscomment_set_entry_(FLAC__StreamMetadata *object, FLAC__StreamMetadata_VorbisComment_Entry *dest, const FLAC__StreamMetadata_VorbisComment_Entry *src, FLAC__bool copy)
+{
+	FLAC__byte *save;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(0 != dest);
+	FLAC__ASSERT(0 != src);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	FLAC__ASSERT((0 != src->entry && src->length > 0) || (0 == src->entry && src->length == 0));
+
+	save = dest->entry;
+
+	if(0 != src->entry) {
+		if(copy) {
+			/* do the copy first so that if we fail we leave the dest object untouched */
+			if(!copy_vcentry_(dest, src))
+				return false;
+		}
+		else {
+			/* we have to make sure that the string we're taking over is null-terminated */
+
+			/*
+			 * Stripping the const from src->entry is OK since we're taking
+			 * ownership of the pointer.  This is a hack around a deficiency
+			 * in the API where the same function is used for 'copy' and
+			 * 'own', but the source entry is a const pointer.  If we were
+			 * precise, the 'own' flavor would be a separate function with a
+			 * non-const source pointer.  But it's not, so we hack away.
+			 */
+			if(!ensure_null_terminated_((FLAC__byte**)(&src->entry), src->length))
+				return false;
+			*dest = *src;
+		}
+	}
+	else {
+		/* the src is null */
+		*dest = *src;
+	}
+
+	if(0 != save)
+		free(save);
+
+	vorbiscomment_calculate_length_(object);
+	return true;
+}
+
+static int vorbiscomment_find_entry_from_(const FLAC__StreamMetadata *object, unsigned offset, const char *field_name, unsigned field_name_length)
+{
+	unsigned i;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	FLAC__ASSERT(0 != field_name);
+
+	for(i = offset; i < object->data.vorbis_comment.num_comments; i++) {
+		if(FLAC__metadata_object_vorbiscomment_entry_matches(object->data.vorbis_comment.comments[i], field_name, field_name_length))
+			return (int)i;
+	}
+
+	return -1;
+}
+
+static void cuesheet_calculate_length_(FLAC__StreamMetadata *object)
+{
+	unsigned i;
+
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+
+	object->length = (
+		FLAC__STREAM_METADATA_CUESHEET_MEDIA_CATALOG_NUMBER_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_LEAD_IN_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_IS_CD_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_RESERVED_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_NUM_TRACKS_LEN
+	) / 8;
+
+	object->length += object->data.cue_sheet.num_tracks * (
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_OFFSET_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_NUMBER_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_ISRC_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_TYPE_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_PRE_EMPHASIS_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_RESERVED_LEN +
+		FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN
+	) / 8;
+
+	for(i = 0; i < object->data.cue_sheet.num_tracks; i++) {
+		object->length += object->data.cue_sheet.tracks[i].num_indices * (
+			FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN +
+			FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN +
+			FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN
+		) / 8;
+	}
+}
+
+static FLAC__StreamMetadata_CueSheet_Index *cuesheet_track_index_array_new_(unsigned num_indices)
+{
+	FLAC__ASSERT(num_indices > 0);
+
+	return safe_calloc_(num_indices, sizeof(FLAC__StreamMetadata_CueSheet_Index));
+}
+
+static FLAC__StreamMetadata_CueSheet_Track *cuesheet_track_array_new_(unsigned num_tracks)
+{
+	FLAC__ASSERT(num_tracks > 0);
+
+	return safe_calloc_(num_tracks, sizeof(FLAC__StreamMetadata_CueSheet_Track));
+}
+
+static void cuesheet_track_array_delete_(FLAC__StreamMetadata_CueSheet_Track *object_array, unsigned num_tracks)
+{
+	unsigned i;
+
+	FLAC__ASSERT(0 != object_array && num_tracks > 0);
+
+	for(i = 0; i < num_tracks; i++) {
+		if(0 != object_array[i].indices) {
+			FLAC__ASSERT(object_array[i].num_indices > 0);
+			free(object_array[i].indices);
+		}
+	}
+
+	if(0 != object_array)
+		free(object_array);
+}
+
+static FLAC__StreamMetadata_CueSheet_Track *cuesheet_track_array_copy_(const FLAC__StreamMetadata_CueSheet_Track *object_array, unsigned num_tracks)
+{
+	FLAC__StreamMetadata_CueSheet_Track *return_array;
+
+	FLAC__ASSERT(0 != object_array);
+	FLAC__ASSERT(num_tracks > 0);
+
+	return_array = cuesheet_track_array_new_(num_tracks);
+
+	if(0 != return_array) {
+		unsigned i;
+
+		for(i = 0; i < num_tracks; i++) {
+			if(!copy_track_(return_array+i, object_array+i)) {
+				cuesheet_track_array_delete_(return_array, num_tracks);
+				return 0;
+			}
+		}
+	}
+
+	return return_array;
+}
+
+static FLAC__bool cuesheet_set_track_(FLAC__StreamMetadata *object, FLAC__StreamMetadata_CueSheet_Track *dest, const FLAC__StreamMetadata_CueSheet_Track *src, FLAC__bool copy)
+{
+	FLAC__StreamMetadata_CueSheet_Index *save;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(0 != dest);
+	FLAC__ASSERT(0 != src);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT((0 != src->indices && src->num_indices > 0) || (0 == src->indices && src->num_indices == 0));
+
+	save = dest->indices;
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if(copy) {
+		if(!copy_track_(dest, src))
+			return false;
+	}
+	else {
+		*dest = *src;
+	}
+
+	if(0 != save)
+		free(save);
+
+	cuesheet_calculate_length_(object);
+	return true;
+}
+
+
+/****************************************************************************
+ *
+ * Metadata object routines
+ *
+ ***************************************************************************/
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_object_new(FLAC__MetadataType type)
+{
+	FLAC__StreamMetadata *object;
+
+	if(type > FLAC__MAX_METADATA_TYPE)
+		return 0;
+
+	object = calloc(1, sizeof(FLAC__StreamMetadata));
+	if(0 != object) {
+		object->is_last = false;
+		object->type = type;
+		switch(type) {
+			case FLAC__METADATA_TYPE_STREAMINFO:
+				object->length = FLAC__STREAM_METADATA_STREAMINFO_LENGTH;
+				break;
+			case FLAC__METADATA_TYPE_PADDING:
+				/* calloc() took care of this for us:
+				object->length = 0;
+				*/
+				break;
+			case FLAC__METADATA_TYPE_APPLICATION:
+				object->length = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8;
+				/* calloc() took care of this for us:
+				object->data.application.data = 0;
+				*/
+				break;
+			case FLAC__METADATA_TYPE_SEEKTABLE:
+				/* calloc() took care of this for us:
+				object->length = 0;
+				object->data.seek_table.num_points = 0;
+				object->data.seek_table.points = 0;
+				*/
+				break;
+			case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+				object->data.vorbis_comment.vendor_string.length = (unsigned)strlen(FLAC__VENDOR_STRING);
+				if(!copy_bytes_(&object->data.vorbis_comment.vendor_string.entry, (const FLAC__byte*)FLAC__VENDOR_STRING, object->data.vorbis_comment.vendor_string.length+1)) {
+					free(object);
+					return 0;
+				}
+				vorbiscomment_calculate_length_(object);
+				break;
+			case FLAC__METADATA_TYPE_CUESHEET:
+				cuesheet_calculate_length_(object);
+				break;
+			case FLAC__METADATA_TYPE_PICTURE:
+				object->length = (
+					FLAC__STREAM_METADATA_PICTURE_TYPE_LEN +
+					FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN + /* empty mime_type string */
+					FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN + /* empty description string */
+					FLAC__STREAM_METADATA_PICTURE_WIDTH_LEN +
+					FLAC__STREAM_METADATA_PICTURE_HEIGHT_LEN +
+					FLAC__STREAM_METADATA_PICTURE_DEPTH_LEN +
+					FLAC__STREAM_METADATA_PICTURE_COLORS_LEN +
+					FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN +
+					0 /* no data */
+				) / 8;
+				object->data.picture.type = FLAC__STREAM_METADATA_PICTURE_TYPE_OTHER;
+				object->data.picture.mime_type = 0;
+				object->data.picture.description = 0;
+				/* calloc() took care of this for us:
+				object->data.picture.width = 0;
+				object->data.picture.height = 0;
+				object->data.picture.depth = 0;
+				object->data.picture.colors = 0;
+				object->data.picture.data_length = 0;
+				object->data.picture.data = 0;
+				*/
+				/* now initialize mime_type and description with empty strings to make things easier on the client */
+				if(!copy_cstring_(&object->data.picture.mime_type, "")) {
+					free(object);
+					return 0;
+				}
+				if(!copy_cstring_((char**)(&object->data.picture.description), "")) {
+					if(object->data.picture.mime_type)
+						free(object->data.picture.mime_type);
+					free(object);
+					return 0;
+				}
+				break;
+			default:
+				/* calloc() took care of this for us:
+				object->length = 0;
+				object->data.unknown.data = 0;
+				*/
+				break;
+		}
+	}
+
+	return object;
+}
+
+FLAC_API FLAC__StreamMetadata *FLAC__metadata_object_clone(const FLAC__StreamMetadata *object)
+{
+	FLAC__StreamMetadata *to;
+
+	FLAC__ASSERT(0 != object);
+
+	if(0 != (to = FLAC__metadata_object_new(object->type))) {
+		to->is_last = object->is_last;
+		to->type = object->type;
+		to->length = object->length;
+		switch(to->type) {
+			case FLAC__METADATA_TYPE_STREAMINFO:
+				memcpy(&to->data.stream_info, &object->data.stream_info, sizeof(FLAC__StreamMetadata_StreamInfo));
+				break;
+			case FLAC__METADATA_TYPE_PADDING:
+				break;
+			case FLAC__METADATA_TYPE_APPLICATION:
+				if(to->length < FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8) { /* underflow check */
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				memcpy(&to->data.application.id, &object->data.application.id, FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8);
+				if(!copy_bytes_(&to->data.application.data, object->data.application.data, object->length - FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+			case FLAC__METADATA_TYPE_SEEKTABLE:
+				to->data.seek_table.num_points = object->data.seek_table.num_points;
+				if(to->data.seek_table.num_points > UINT32_MAX / sizeof(FLAC__StreamMetadata_SeekPoint)) { /* overflow check */
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				if(!copy_bytes_((FLAC__byte**)&to->data.seek_table.points, (FLAC__byte*)object->data.seek_table.points, object->data.seek_table.num_points * sizeof(FLAC__StreamMetadata_SeekPoint))) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+			case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+				if(0 != to->data.vorbis_comment.vendor_string.entry) {
+					free(to->data.vorbis_comment.vendor_string.entry);
+					to->data.vorbis_comment.vendor_string.entry = 0;
+				}
+				if(!copy_vcentry_(&to->data.vorbis_comment.vendor_string, &object->data.vorbis_comment.vendor_string)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				if(object->data.vorbis_comment.num_comments == 0) {
+					FLAC__ASSERT(0 == object->data.vorbis_comment.comments);
+					to->data.vorbis_comment.comments = 0;
+				}
+				else {
+					FLAC__ASSERT(0 != object->data.vorbis_comment.comments);
+					to->data.vorbis_comment.comments = vorbiscomment_entry_array_copy_(object->data.vorbis_comment.comments, object->data.vorbis_comment.num_comments);
+					if(0 == to->data.vorbis_comment.comments) {
+						FLAC__metadata_object_delete(to);
+						return 0;
+					}
+				}
+				to->data.vorbis_comment.num_comments = object->data.vorbis_comment.num_comments;
+				break;
+			case FLAC__METADATA_TYPE_CUESHEET:
+				memcpy(&to->data.cue_sheet, &object->data.cue_sheet, sizeof(FLAC__StreamMetadata_CueSheet));
+				if(object->data.cue_sheet.num_tracks == 0) {
+					FLAC__ASSERT(0 == object->data.cue_sheet.tracks);
+				}
+				else {
+					FLAC__ASSERT(0 != object->data.cue_sheet.tracks);
+					to->data.cue_sheet.tracks = cuesheet_track_array_copy_(object->data.cue_sheet.tracks, object->data.cue_sheet.num_tracks);
+					if(0 == to->data.cue_sheet.tracks) {
+						FLAC__metadata_object_delete(to);
+						return 0;
+					}
+				}
+				break;
+			case FLAC__METADATA_TYPE_PICTURE:
+				to->data.picture.type = object->data.picture.type;
+				if(!copy_cstring_(&to->data.picture.mime_type, object->data.picture.mime_type)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				if(!copy_cstring_((char**)(&to->data.picture.description), (const char*)object->data.picture.description)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				to->data.picture.width = object->data.picture.width;
+				to->data.picture.height = object->data.picture.height;
+				to->data.picture.depth = object->data.picture.depth;
+				to->data.picture.colors = object->data.picture.colors;
+				to->data.picture.data_length = object->data.picture.data_length;
+				if(!copy_bytes_((&to->data.picture.data), object->data.picture.data, object->data.picture.data_length)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+			default:
+				if(!copy_bytes_(&to->data.unknown.data, object->data.unknown.data, object->length)) {
+					FLAC__metadata_object_delete(to);
+					return 0;
+				}
+				break;
+		}
+	}
+
+	return to;
+}
+
+void FLAC__metadata_object_delete_data(FLAC__StreamMetadata *object)
+{
+	FLAC__ASSERT(0 != object);
+
+	switch(object->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+		case FLAC__METADATA_TYPE_PADDING:
+			break;
+		case FLAC__METADATA_TYPE_APPLICATION:
+			if(0 != object->data.application.data) {
+				free(object->data.application.data);
+				object->data.application.data = 0;
+			}
+			break;
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			if(0 != object->data.seek_table.points) {
+				free(object->data.seek_table.points);
+				object->data.seek_table.points = 0;
+			}
+			break;
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			if(0 != object->data.vorbis_comment.vendor_string.entry) {
+				free(object->data.vorbis_comment.vendor_string.entry);
+				object->data.vorbis_comment.vendor_string.entry = 0;
+			}
+			if(0 != object->data.vorbis_comment.comments) {
+				FLAC__ASSERT(object->data.vorbis_comment.num_comments > 0);
+				vorbiscomment_entry_array_delete_(object->data.vorbis_comment.comments, object->data.vorbis_comment.num_comments);
+			}
+			break;
+		case FLAC__METADATA_TYPE_CUESHEET:
+			if(0 != object->data.cue_sheet.tracks) {
+				FLAC__ASSERT(object->data.cue_sheet.num_tracks > 0);
+				cuesheet_track_array_delete_(object->data.cue_sheet.tracks, object->data.cue_sheet.num_tracks);
+			}
+			break;
+		case FLAC__METADATA_TYPE_PICTURE:
+			if(0 != object->data.picture.mime_type) {
+				free(object->data.picture.mime_type);
+				object->data.picture.mime_type = 0;
+			}
+			if(0 != object->data.picture.description) {
+				free(object->data.picture.description);
+				object->data.picture.description = 0;
+			}
+			if(0 != object->data.picture.data) {
+				free(object->data.picture.data);
+				object->data.picture.data = 0;
+			}
+			break;
+		default:
+			if(0 != object->data.unknown.data) {
+				free(object->data.unknown.data);
+				object->data.unknown.data = 0;
+			}
+			break;
+	}
+}
+
+FLAC_API void FLAC__metadata_object_delete(FLAC__StreamMetadata *object)
+{
+	FLAC__metadata_object_delete_data(object);
+	free(object);
+}
+
+static FLAC__bool compare_block_data_streaminfo_(const FLAC__StreamMetadata_StreamInfo *block1, const FLAC__StreamMetadata_StreamInfo *block2)
+{
+	if(block1->min_blocksize != block2->min_blocksize)
+		return false;
+	if(block1->max_blocksize != block2->max_blocksize)
+		return false;
+	if(block1->min_framesize != block2->min_framesize)
+		return false;
+	if(block1->max_framesize != block2->max_framesize)
+		return false;
+	if(block1->sample_rate != block2->sample_rate)
+		return false;
+	if(block1->channels != block2->channels)
+		return false;
+	if(block1->bits_per_sample != block2->bits_per_sample)
+		return false;
+	if(block1->total_samples != block2->total_samples)
+		return false;
+	if(0 != memcmp(block1->md5sum, block2->md5sum, 16))
+		return false;
+	return true;
+}
+
+static FLAC__bool compare_block_data_application_(const FLAC__StreamMetadata_Application *block1, const FLAC__StreamMetadata_Application *block2, unsigned block_length)
+{
+	FLAC__ASSERT(0 != block1);
+	FLAC__ASSERT(0 != block2);
+	FLAC__ASSERT(block_length >= sizeof(block1->id));
+
+	if(0 != memcmp(block1->id, block2->id, sizeof(block1->id)))
+		return false;
+	if(0 != block1->data && 0 != block2->data)
+		return 0 == memcmp(block1->data, block2->data, block_length - sizeof(block1->id));
+	else
+		return block1->data == block2->data;
+}
+
+static FLAC__bool compare_block_data_seektable_(const FLAC__StreamMetadata_SeekTable *block1, const FLAC__StreamMetadata_SeekTable *block2)
+{
+	unsigned i;
+
+	FLAC__ASSERT(0 != block1);
+	FLAC__ASSERT(0 != block2);
+
+	if(block1->num_points != block2->num_points)
+		return false;
+
+	if(0 != block1->points && 0 != block2->points) {
+		for(i = 0; i < block1->num_points; i++) {
+			if(block1->points[i].sample_number != block2->points[i].sample_number)
+				return false;
+			if(block1->points[i].stream_offset != block2->points[i].stream_offset)
+				return false;
+			if(block1->points[i].frame_samples != block2->points[i].frame_samples)
+				return false;
+		}
+		return true;
+	}
+	else
+		return block1->points == block2->points;
+}
+
+static FLAC__bool compare_block_data_vorbiscomment_(const FLAC__StreamMetadata_VorbisComment *block1, const FLAC__StreamMetadata_VorbisComment *block2)
+{
+	unsigned i;
+
+	if(block1->vendor_string.length != block2->vendor_string.length)
+		return false;
+
+	if(0 != block1->vendor_string.entry && 0 != block2->vendor_string.entry) {
+		if(0 != memcmp(block1->vendor_string.entry, block2->vendor_string.entry, block1->vendor_string.length))
+			return false;
+	}
+	else if(block1->vendor_string.entry != block2->vendor_string.entry)
+		return false;
+
+	if(block1->num_comments != block2->num_comments)
+		return false;
+
+	for(i = 0; i < block1->num_comments; i++) {
+		if(0 != block1->comments[i].entry && 0 != block2->comments[i].entry) {
+			if(0 != memcmp(block1->comments[i].entry, block2->comments[i].entry, block1->comments[i].length))
+				return false;
+		}
+		else if(block1->comments[i].entry != block2->comments[i].entry)
+			return false;
+	}
+	return true;
+}
+
+static FLAC__bool compare_block_data_cuesheet_(const FLAC__StreamMetadata_CueSheet *block1, const FLAC__StreamMetadata_CueSheet *block2)
+{
+	unsigned i, j;
+
+	if(0 != strcmp(block1->media_catalog_number, block2->media_catalog_number))
+		return false;
+
+	if(block1->lead_in != block2->lead_in)
+		return false;
+
+	if(block1->is_cd != block2->is_cd)
+		return false;
+
+	if(block1->num_tracks != block2->num_tracks)
+		return false;
+
+	if(0 != block1->tracks && 0 != block2->tracks) {
+		FLAC__ASSERT(block1->num_tracks > 0);
+		for(i = 0; i < block1->num_tracks; i++) {
+			if(block1->tracks[i].offset != block2->tracks[i].offset)
+				return false;
+			if(block1->tracks[i].number != block2->tracks[i].number)
+				return false;
+			if(0 != memcmp(block1->tracks[i].isrc, block2->tracks[i].isrc, sizeof(block1->tracks[i].isrc)))
+				return false;
+			if(block1->tracks[i].type != block2->tracks[i].type)
+				return false;
+			if(block1->tracks[i].pre_emphasis != block2->tracks[i].pre_emphasis)
+				return false;
+			if(block1->tracks[i].num_indices != block2->tracks[i].num_indices)
+				return false;
+			if(0 != block1->tracks[i].indices && 0 != block2->tracks[i].indices) {
+				FLAC__ASSERT(block1->tracks[i].num_indices > 0);
+				for(j = 0; j < block1->tracks[i].num_indices; j++) {
+					if(block1->tracks[i].indices[j].offset != block2->tracks[i].indices[j].offset)
+						return false;
+					if(block1->tracks[i].indices[j].number != block2->tracks[i].indices[j].number)
+						return false;
+				}
+			}
+			else if(block1->tracks[i].indices != block2->tracks[i].indices)
+				return false;
+		}
+	}
+	else if(block1->tracks != block2->tracks)
+		return false;
+	return true;
+}
+
+static FLAC__bool compare_block_data_picture_(const FLAC__StreamMetadata_Picture *block1, const FLAC__StreamMetadata_Picture *block2)
+{
+	if(block1->type != block2->type)
+		return false;
+	if(block1->mime_type != block2->mime_type && (0 == block1->mime_type || 0 == block2->mime_type || strcmp(block1->mime_type, block2->mime_type)))
+		return false;
+	if(block1->description != block2->description && (0 == block1->description || 0 == block2->description || strcmp((const char *)block1->description, (const char *)block2->description)))
+		return false;
+	if(block1->width != block2->width)
+		return false;
+	if(block1->height != block2->height)
+		return false;
+	if(block1->depth != block2->depth)
+		return false;
+	if(block1->colors != block2->colors)
+		return false;
+	if(block1->data_length != block2->data_length)
+		return false;
+	if(block1->data != block2->data && (0 == block1->data || 0 == block2->data || memcmp(block1->data, block2->data, block1->data_length)))
+		return false;
+	return true;
+}
+
+static FLAC__bool compare_block_data_unknown_(const FLAC__StreamMetadata_Unknown *block1, const FLAC__StreamMetadata_Unknown *block2, unsigned block_length)
+{
+	FLAC__ASSERT(0 != block1);
+	FLAC__ASSERT(0 != block2);
+
+	if(0 != block1->data && 0 != block2->data)
+		return 0 == memcmp(block1->data, block2->data, block_length);
+	else
+		return block1->data == block2->data;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_is_equal(const FLAC__StreamMetadata *block1, const FLAC__StreamMetadata *block2)
+{
+	FLAC__ASSERT(0 != block1);
+	FLAC__ASSERT(0 != block2);
+
+	if(block1->type != block2->type) {
+		return false;
+	}
+	if(block1->is_last != block2->is_last) {
+		return false;
+	}
+	if(block1->length != block2->length) {
+		return false;
+	}
+	switch(block1->type) {
+		case FLAC__METADATA_TYPE_STREAMINFO:
+			return compare_block_data_streaminfo_(&block1->data.stream_info, &block2->data.stream_info);
+		case FLAC__METADATA_TYPE_PADDING:
+			return true; /* we don't compare the padding guts */
+		case FLAC__METADATA_TYPE_APPLICATION:
+			return compare_block_data_application_(&block1->data.application, &block2->data.application, block1->length);
+		case FLAC__METADATA_TYPE_SEEKTABLE:
+			return compare_block_data_seektable_(&block1->data.seek_table, &block2->data.seek_table);
+		case FLAC__METADATA_TYPE_VORBIS_COMMENT:
+			return compare_block_data_vorbiscomment_(&block1->data.vorbis_comment, &block2->data.vorbis_comment);
+		case FLAC__METADATA_TYPE_CUESHEET:
+			return compare_block_data_cuesheet_(&block1->data.cue_sheet, &block2->data.cue_sheet);
+		case FLAC__METADATA_TYPE_PICTURE:
+			return compare_block_data_picture_(&block1->data.picture, &block2->data.picture);
+		default:
+			return compare_block_data_unknown_(&block1->data.unknown, &block2->data.unknown, block1->length);
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_application_set_data(FLAC__StreamMetadata *object, FLAC__byte *data, unsigned length, FLAC__bool copy)
+{
+	FLAC__byte *save;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_APPLICATION);
+	FLAC__ASSERT((0 != data && length > 0) || (0 == data && length == 0 && copy == false));
+
+	save = object->data.application.data;
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if(copy) {
+		if(!copy_bytes_(&object->data.application.data, data, length))
+			return false;
+	}
+	else {
+		object->data.application.data = data;
+	}
+
+	if(0 != save)
+		free(save);
+
+	object->length = FLAC__STREAM_METADATA_APPLICATION_ID_LEN / 8 + length;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_resize_points(FLAC__StreamMetadata *object, unsigned new_num_points)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	if(0 == object->data.seek_table.points) {
+		FLAC__ASSERT(object->data.seek_table.num_points == 0);
+		if(0 == new_num_points)
+			return true;
+		else if(0 == (object->data.seek_table.points = seekpoint_array_new_(new_num_points)))
+			return false;
+	}
+	else {
+		const size_t old_size = object->data.seek_table.num_points * sizeof(FLAC__StreamMetadata_SeekPoint);
+		const size_t new_size = new_num_points * sizeof(FLAC__StreamMetadata_SeekPoint);
+
+		/* overflow check */
+		if(new_num_points > UINT32_MAX / sizeof(FLAC__StreamMetadata_SeekPoint))
+			return false;
+
+		FLAC__ASSERT(object->data.seek_table.num_points > 0);
+
+		if(new_size == 0) {
+			free(object->data.seek_table.points);
+			object->data.seek_table.points = 0;
+		}
+		else if(0 == (object->data.seek_table.points = realloc(object->data.seek_table.points, new_size)))
+			return false;
+
+		/* if growing, set new elements to placeholders */
+		if(new_size > old_size) {
+			unsigned i;
+			for(i = object->data.seek_table.num_points; i < new_num_points; i++) {
+				object->data.seek_table.points[i].sample_number = FLAC__STREAM_METADATA_SEEKPOINT_PLACEHOLDER;
+				object->data.seek_table.points[i].stream_offset = 0;
+				object->data.seek_table.points[i].frame_samples = 0;
+			}
+		}
+	}
+
+	object->data.seek_table.num_points = new_num_points;
+
+	seektable_calculate_length_(object);
+	return true;
+}
+
+FLAC_API void FLAC__metadata_object_seektable_set_point(FLAC__StreamMetadata *object, unsigned point_num, FLAC__StreamMetadata_SeekPoint point)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(point_num < object->data.seek_table.num_points);
+
+	object->data.seek_table.points[point_num] = point;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_insert_point(FLAC__StreamMetadata *object, unsigned point_num, FLAC__StreamMetadata_SeekPoint point)
+{
+	int i;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(point_num <= object->data.seek_table.num_points);
+
+	if(!FLAC__metadata_object_seektable_resize_points(object, object->data.seek_table.num_points+1))
+		return false;
+
+	/* move all points >= point_num forward one space */
+	for(i = (int)object->data.seek_table.num_points-1; i > (int)point_num; i--)
+		object->data.seek_table.points[i] = object->data.seek_table.points[i-1];
+
+	FLAC__metadata_object_seektable_set_point(object, point_num, point);
+	seektable_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_delete_point(FLAC__StreamMetadata *object, unsigned point_num)
+{
+	unsigned i;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(point_num < object->data.seek_table.num_points);
+
+	/* move all points > point_num backward one space */
+	for(i = point_num; i < object->data.seek_table.num_points-1; i++)
+		object->data.seek_table.points[i] = object->data.seek_table.points[i+1];
+
+	return FLAC__metadata_object_seektable_resize_points(object, object->data.seek_table.num_points-1);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_is_legal(const FLAC__StreamMetadata *object)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	return FLAC__format_seektable_is_legal(&object->data.seek_table);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_template_append_placeholders(FLAC__StreamMetadata *object, unsigned num)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	if(num > 0)
+		/* WATCHOUT: we rely on the fact that growing the array adds PLACEHOLDERS at the end */
+		return FLAC__metadata_object_seektable_resize_points(object, object->data.seek_table.num_points + num);
+	else
+		return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_template_append_point(FLAC__StreamMetadata *object, FLAC__uint64 sample_number)
+{
+	FLAC__StreamMetadata_SeekTable *seek_table;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	seek_table = &object->data.seek_table;
+
+	if(!FLAC__metadata_object_seektable_resize_points(object, seek_table->num_points + 1))
+		return false;
+
+	seek_table->points[seek_table->num_points - 1].sample_number = sample_number;
+	seek_table->points[seek_table->num_points - 1].stream_offset = 0;
+	seek_table->points[seek_table->num_points - 1].frame_samples = 0;
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_template_append_points(FLAC__StreamMetadata *object, FLAC__uint64 sample_numbers[], unsigned num)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(0 != sample_numbers || num == 0);
+
+	if(num > 0) {
+		FLAC__StreamMetadata_SeekTable *seek_table = &object->data.seek_table;
+		unsigned i, j;
+
+		i = seek_table->num_points;
+
+		if(!FLAC__metadata_object_seektable_resize_points(object, seek_table->num_points + num))
+			return false;
+
+		for(j = 0; j < num; i++, j++) {
+			seek_table->points[i].sample_number = sample_numbers[j];
+			seek_table->points[i].stream_offset = 0;
+			seek_table->points[i].frame_samples = 0;
+		}
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_template_append_spaced_points(FLAC__StreamMetadata *object, unsigned num, FLAC__uint64 total_samples)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(total_samples > 0);
+
+	if(num > 0 && total_samples > 0) {
+		FLAC__StreamMetadata_SeekTable *seek_table = &object->data.seek_table;
+		unsigned i, j;
+
+		i = seek_table->num_points;
+
+		if(!FLAC__metadata_object_seektable_resize_points(object, seek_table->num_points + num))
+			return false;
+
+		for(j = 0; j < num; i++, j++) {
+			seek_table->points[i].sample_number = total_samples * (FLAC__uint64)j / (FLAC__uint64)num;
+			seek_table->points[i].stream_offset = 0;
+			seek_table->points[i].frame_samples = 0;
+		}
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_template_append_spaced_points_by_samples(FLAC__StreamMetadata *object, unsigned samples, FLAC__uint64 total_samples)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+	FLAC__ASSERT(samples > 0);
+	FLAC__ASSERT(total_samples > 0);
+
+	if(samples > 0 && total_samples > 0) {
+		FLAC__StreamMetadata_SeekTable *seek_table = &object->data.seek_table;
+		unsigned i, j;
+		FLAC__uint64 num, sample;
+
+		num = 1 + total_samples / samples; /* 1+ for the first sample at 0 */
+		/* now account for the fact that we don't place a seekpoint at "total_samples" since samples are number from 0: */
+		if(total_samples % samples == 0)
+			num--;
+
+		i = seek_table->num_points;
+
+		if(!FLAC__metadata_object_seektable_resize_points(object, seek_table->num_points + (unsigned)num))
+			return false;
+
+		sample = 0;
+		for(j = 0; j < num; i++, j++, sample += samples) {
+			seek_table->points[i].sample_number = sample;
+			seek_table->points[i].stream_offset = 0;
+			seek_table->points[i].frame_samples = 0;
+		}
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_seektable_template_sort(FLAC__StreamMetadata *object, FLAC__bool compact)
+{
+	unsigned unique;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_SEEKTABLE);
+
+	unique = FLAC__format_seektable_sort(&object->data.seek_table);
+
+	return !compact || FLAC__metadata_object_seektable_resize_points(object, unique);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_set_vendor_string(FLAC__StreamMetadata *object, FLAC__StreamMetadata_VorbisComment_Entry entry, FLAC__bool copy)
+{
+	if(!FLAC__format_vorbiscomment_entry_value_is_legal(entry.entry, entry.length))
+		return false;
+	return vorbiscomment_set_entry_(object, &object->data.vorbis_comment.vendor_string, &entry, copy);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_resize_comments(FLAC__StreamMetadata *object, unsigned new_num_comments)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	if(0 == object->data.vorbis_comment.comments) {
+		FLAC__ASSERT(object->data.vorbis_comment.num_comments == 0);
+		if(0 == new_num_comments)
+			return true;
+		else if(0 == (object->data.vorbis_comment.comments = vorbiscomment_entry_array_new_(new_num_comments)))
+			return false;
+	}
+	else {
+		const size_t old_size = object->data.vorbis_comment.num_comments * sizeof(FLAC__StreamMetadata_VorbisComment_Entry);
+		const size_t new_size = new_num_comments * sizeof(FLAC__StreamMetadata_VorbisComment_Entry);
+
+		/* overflow check */
+		if(new_num_comments > UINT32_MAX / sizeof(FLAC__StreamMetadata_VorbisComment_Entry))
+			return false;
+
+		FLAC__ASSERT(object->data.vorbis_comment.num_comments > 0);
+
+		/* if shrinking, free the truncated entries */
+		if(new_num_comments < object->data.vorbis_comment.num_comments) {
+			unsigned i;
+			for(i = new_num_comments; i < object->data.vorbis_comment.num_comments; i++)
+				if(0 != object->data.vorbis_comment.comments[i].entry)
+					free(object->data.vorbis_comment.comments[i].entry);
+		}
+
+		if(new_size == 0) {
+			free(object->data.vorbis_comment.comments);
+			object->data.vorbis_comment.comments = 0;
+		}
+		else if(0 == (object->data.vorbis_comment.comments = realloc(object->data.vorbis_comment.comments, new_size)))
+			return false;
+
+		/* if growing, zero all the length/pointers of new elements */
+		if(new_size > old_size)
+			memset(object->data.vorbis_comment.comments + object->data.vorbis_comment.num_comments, 0, new_size - old_size);
+	}
+
+	object->data.vorbis_comment.num_comments = new_num_comments;
+
+	vorbiscomment_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_set_comment(FLAC__StreamMetadata *object, unsigned comment_num, FLAC__StreamMetadata_VorbisComment_Entry entry, FLAC__bool copy)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(comment_num < object->data.vorbis_comment.num_comments);
+
+	if(!FLAC__format_vorbiscomment_entry_is_legal(entry.entry, entry.length))
+		return false;
+	return vorbiscomment_set_entry_(object, &object->data.vorbis_comment.comments[comment_num], &entry, copy);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_insert_comment(FLAC__StreamMetadata *object, unsigned comment_num, FLAC__StreamMetadata_VorbisComment_Entry entry, FLAC__bool copy)
+{
+	FLAC__StreamMetadata_VorbisComment *vc;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	FLAC__ASSERT(comment_num <= object->data.vorbis_comment.num_comments);
+
+	if(!FLAC__format_vorbiscomment_entry_is_legal(entry.entry, entry.length))
+		return false;
+
+	vc = &object->data.vorbis_comment;
+
+	if(!FLAC__metadata_object_vorbiscomment_resize_comments(object, vc->num_comments+1))
+		return false;
+
+	/* move all comments >= comment_num forward one space */
+	memmove(&vc->comments[comment_num+1], &vc->comments[comment_num], sizeof(FLAC__StreamMetadata_VorbisComment_Entry)*(vc->num_comments-1-comment_num));
+	vc->comments[comment_num].length = 0;
+	vc->comments[comment_num].entry = 0;
+
+	return FLAC__metadata_object_vorbiscomment_set_comment(object, comment_num, entry, copy);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_append_comment(FLAC__StreamMetadata *object, FLAC__StreamMetadata_VorbisComment_Entry entry, FLAC__bool copy)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	return FLAC__metadata_object_vorbiscomment_insert_comment(object, object->data.vorbis_comment.num_comments, entry, copy);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_replace_comment(FLAC__StreamMetadata *object, FLAC__StreamMetadata_VorbisComment_Entry entry, FLAC__bool all, FLAC__bool copy)
+{
+	FLAC__ASSERT(0 != entry.entry && entry.length > 0);
+
+	if(!FLAC__format_vorbiscomment_entry_is_legal(entry.entry, entry.length))
+		return false;
+
+	{
+		int i;
+		size_t field_name_length;
+		const FLAC__byte *eq = (FLAC__byte*)memchr(entry.entry, '=', entry.length);
+
+		FLAC__ASSERT(0 != eq);
+
+		if(0 == eq)
+			return false; /* double protection */
+
+		field_name_length = eq-entry.entry;
+
+		i = vorbiscomment_find_entry_from_(object, 0, (const char *)entry.entry, field_name_length);
+		if(i >= 0) {
+			unsigned indx = (unsigned)i;
+			if(!FLAC__metadata_object_vorbiscomment_set_comment(object, indx, entry, copy))
+				return false;
+			entry = object->data.vorbis_comment.comments[indx];
+			indx++; /* skip over replaced comment */
+			if(all && indx < object->data.vorbis_comment.num_comments) {
+				i = vorbiscomment_find_entry_from_(object, indx, (const char *)entry.entry, field_name_length);
+				while(i >= 0) {
+					indx = (unsigned)i;
+					if(!FLAC__metadata_object_vorbiscomment_delete_comment(object, indx))
+						return false;
+					if(indx < object->data.vorbis_comment.num_comments)
+						i = vorbiscomment_find_entry_from_(object, indx, (const char *)entry.entry, field_name_length);
+					else
+						i = -1;
+				}
+			}
+			return true;
+		}
+		else
+			return FLAC__metadata_object_vorbiscomment_append_comment(object, entry, copy);
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_delete_comment(FLAC__StreamMetadata *object, unsigned comment_num)
+{
+	FLAC__StreamMetadata_VorbisComment *vc;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+	FLAC__ASSERT(comment_num < object->data.vorbis_comment.num_comments);
+
+	vc = &object->data.vorbis_comment;
+
+	/* free the comment at comment_num */
+	if(0 != vc->comments[comment_num].entry)
+		free(vc->comments[comment_num].entry);
+
+	/* move all comments > comment_num backward one space */
+	memmove(&vc->comments[comment_num], &vc->comments[comment_num+1], sizeof(FLAC__StreamMetadata_VorbisComment_Entry)*(vc->num_comments-comment_num-1));
+	vc->comments[vc->num_comments-1].length = 0;
+	vc->comments[vc->num_comments-1].entry = 0;
+
+	return FLAC__metadata_object_vorbiscomment_resize_comments(object, vc->num_comments-1);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_entry_from_name_value_pair(FLAC__StreamMetadata_VorbisComment_Entry *entry, const char *field_name, const char *field_value)
+{
+	FLAC__ASSERT(0 != entry);
+	FLAC__ASSERT(0 != field_name);
+	FLAC__ASSERT(0 != field_value);
+
+	if(!FLAC__format_vorbiscomment_entry_name_is_legal(field_name))
+		return false;
+	if(!FLAC__format_vorbiscomment_entry_value_is_legal((const FLAC__byte *)field_value, (unsigned)(-1)))
+		return false;
+
+	{
+		const size_t nn = strlen(field_name);
+		const size_t nv = strlen(field_value);
+		entry->length = nn + 1 /*=*/ + nv;
+		if(0 == (entry->entry = safe_malloc_add_4op_(nn, /*+*/1, /*+*/nv, /*+*/1)))
+			return false;
+		memcpy(entry->entry, field_name, nn);
+		entry->entry[nn] = '=';
+		memcpy(entry->entry+nn+1, field_value, nv);
+		entry->entry[entry->length] = '\0';
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_entry_to_name_value_pair(const FLAC__StreamMetadata_VorbisComment_Entry entry, char **field_name, char **field_value)
+{
+	FLAC__ASSERT(0 != entry.entry && entry.length > 0);
+	FLAC__ASSERT(0 != field_name);
+	FLAC__ASSERT(0 != field_value);
+
+	if(!FLAC__format_vorbiscomment_entry_is_legal(entry.entry, entry.length))
+		return false;
+
+	{
+		const FLAC__byte *eq = (FLAC__byte*)memchr(entry.entry, '=', entry.length);
+		const size_t nn = eq-entry.entry;
+		const size_t nv = entry.length-nn-1; /* -1 for the '=' */
+		FLAC__ASSERT(0 != eq);
+		if(0 == eq)
+			return false; /* double protection */
+		if(0 == (*field_name = safe_malloc_add_2op_(nn, /*+*/1)))
+			return false;
+		if(0 == (*field_value = safe_malloc_add_2op_(nv, /*+*/1))) {
+			free(*field_name);
+			return false;
+		}
+		memcpy(*field_name, entry.entry, nn);
+		memcpy(*field_value, entry.entry+nn+1, nv);
+		(*field_name)[nn] = '\0';
+		(*field_value)[nv] = '\0';
+	}
+
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_vorbiscomment_entry_matches(const FLAC__StreamMetadata_VorbisComment_Entry entry, const char *field_name, unsigned field_name_length)
+{
+	FLAC__ASSERT(0 != entry.entry && entry.length > 0);
+	{
+		const FLAC__byte *eq = (FLAC__byte*)memchr(entry.entry, '=', entry.length);
+		return (0 != eq && (unsigned)(eq-entry.entry) == field_name_length && 0 == FLAC__STRNCASECMP(field_name, (const char *)entry.entry, field_name_length));
+	}
+}
+
+FLAC_API int FLAC__metadata_object_vorbiscomment_find_entry_from(const FLAC__StreamMetadata *object, unsigned offset, const char *field_name)
+{
+	FLAC__ASSERT(0 != field_name);
+
+	return vorbiscomment_find_entry_from_(object, offset, field_name, strlen(field_name));
+}
+
+FLAC_API int FLAC__metadata_object_vorbiscomment_remove_entry_matching(FLAC__StreamMetadata *object, const char *field_name)
+{
+	const unsigned field_name_length = strlen(field_name);
+	unsigned i;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	for(i = 0; i < object->data.vorbis_comment.num_comments; i++) {
+		if(FLAC__metadata_object_vorbiscomment_entry_matches(object->data.vorbis_comment.comments[i], field_name, field_name_length)) {
+			if(!FLAC__metadata_object_vorbiscomment_delete_comment(object, i))
+				return -1;
+			else
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+FLAC_API int FLAC__metadata_object_vorbiscomment_remove_entries_matching(FLAC__StreamMetadata *object, const char *field_name)
+{
+	FLAC__bool ok = true;
+	unsigned matching = 0;
+	const unsigned field_name_length = strlen(field_name);
+	int i;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_VORBIS_COMMENT);
+
+	/* must delete from end to start otherwise it will interfere with our iteration */
+	for(i = (int)object->data.vorbis_comment.num_comments - 1; ok && i >= 0; i--) {
+		if(FLAC__metadata_object_vorbiscomment_entry_matches(object->data.vorbis_comment.comments[i], field_name, field_name_length)) {
+			matching++;
+			ok &= FLAC__metadata_object_vorbiscomment_delete_comment(object, (unsigned)i);
+		}
+	}
+
+	return ok? (int)matching : -1;
+}
+
+FLAC_API FLAC__StreamMetadata_CueSheet_Track *FLAC__metadata_object_cuesheet_track_new(void)
+{
+	return calloc(1, sizeof(FLAC__StreamMetadata_CueSheet_Track));
+}
+
+FLAC_API FLAC__StreamMetadata_CueSheet_Track *FLAC__metadata_object_cuesheet_track_clone(const FLAC__StreamMetadata_CueSheet_Track *object)
+{
+	FLAC__StreamMetadata_CueSheet_Track *to;
+
+	FLAC__ASSERT(0 != object);
+
+	if(0 != (to = FLAC__metadata_object_cuesheet_track_new())) {
+		if(!copy_track_(to, object)) {
+			FLAC__metadata_object_cuesheet_track_delete(to);
+			return 0;
+		}
+	}
+
+	return to;
+}
+
+void FLAC__metadata_object_cuesheet_track_delete_data(FLAC__StreamMetadata_CueSheet_Track *object)
+{
+	FLAC__ASSERT(0 != object);
+
+	if(0 != object->indices) {
+		FLAC__ASSERT(object->num_indices > 0);
+		free(object->indices);
+	}
+}
+
+FLAC_API void FLAC__metadata_object_cuesheet_track_delete(FLAC__StreamMetadata_CueSheet_Track *object)
+{
+	FLAC__metadata_object_cuesheet_track_delete_data(object);
+	free(object);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_track_resize_indices(FLAC__StreamMetadata *object, unsigned track_num, unsigned new_num_indices)
+{
+	FLAC__StreamMetadata_CueSheet_Track *track;
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT(track_num < object->data.cue_sheet.num_tracks);
+
+	track = &object->data.cue_sheet.tracks[track_num];
+
+	if(0 == track->indices) {
+		FLAC__ASSERT(track->num_indices == 0);
+		if(0 == new_num_indices)
+			return true;
+		else if(0 == (track->indices = cuesheet_track_index_array_new_(new_num_indices)))
+			return false;
+	}
+	else {
+		const size_t old_size = track->num_indices * sizeof(FLAC__StreamMetadata_CueSheet_Index);
+		const size_t new_size = new_num_indices * sizeof(FLAC__StreamMetadata_CueSheet_Index);
+
+		/* overflow check */
+		if(new_num_indices > UINT32_MAX / sizeof(FLAC__StreamMetadata_CueSheet_Index))
+			return false;
+
+		FLAC__ASSERT(track->num_indices > 0);
+
+		if(new_size == 0) {
+			free(track->indices);
+			track->indices = 0;
+		}
+		else if(0 == (track->indices = realloc(track->indices, new_size)))
+			return false;
+
+		/* if growing, zero all the lengths/pointers of new elements */
+		if(new_size > old_size)
+			memset(track->indices + track->num_indices, 0, new_size - old_size);
+	}
+
+	track->num_indices = new_num_indices;
+
+	cuesheet_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_track_insert_index(FLAC__StreamMetadata *object, unsigned track_num, unsigned index_num, FLAC__StreamMetadata_CueSheet_Index indx)
+{
+	FLAC__StreamMetadata_CueSheet_Track *track;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT(track_num < object->data.cue_sheet.num_tracks);
+	FLAC__ASSERT(index_num <= object->data.cue_sheet.tracks[track_num].num_indices);
+
+	track = &object->data.cue_sheet.tracks[track_num];
+
+	if(!FLAC__metadata_object_cuesheet_track_resize_indices(object, track_num, track->num_indices+1))
+		return false;
+
+	/* move all indices >= index_num forward one space */
+	memmove(&track->indices[index_num+1], &track->indices[index_num], sizeof(FLAC__StreamMetadata_CueSheet_Index)*(track->num_indices-1-index_num));
+
+	track->indices[index_num] = indx;
+	cuesheet_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_track_insert_blank_index(FLAC__StreamMetadata *object, unsigned track_num, unsigned index_num)
+{
+	FLAC__StreamMetadata_CueSheet_Index indx;
+	memset(&indx, 0, sizeof(indx));
+	return FLAC__metadata_object_cuesheet_track_insert_index(object, track_num, index_num, indx);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_track_delete_index(FLAC__StreamMetadata *object, unsigned track_num, unsigned index_num)
+{
+	FLAC__StreamMetadata_CueSheet_Track *track;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT(track_num < object->data.cue_sheet.num_tracks);
+	FLAC__ASSERT(index_num < object->data.cue_sheet.tracks[track_num].num_indices);
+
+	track = &object->data.cue_sheet.tracks[track_num];
+
+	/* move all indices > index_num backward one space */
+	memmove(&track->indices[index_num], &track->indices[index_num+1], sizeof(FLAC__StreamMetadata_CueSheet_Index)*(track->num_indices-index_num-1));
+
+	FLAC__metadata_object_cuesheet_track_resize_indices(object, track_num, track->num_indices-1);
+	cuesheet_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_resize_tracks(FLAC__StreamMetadata *object, unsigned new_num_tracks)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+
+	if(0 == object->data.cue_sheet.tracks) {
+		FLAC__ASSERT(object->data.cue_sheet.num_tracks == 0);
+		if(0 == new_num_tracks)
+			return true;
+		else if(0 == (object->data.cue_sheet.tracks = cuesheet_track_array_new_(new_num_tracks)))
+			return false;
+	}
+	else {
+		const size_t old_size = object->data.cue_sheet.num_tracks * sizeof(FLAC__StreamMetadata_CueSheet_Track);
+		const size_t new_size = new_num_tracks * sizeof(FLAC__StreamMetadata_CueSheet_Track);
+
+		/* overflow check */
+		if(new_num_tracks > UINT32_MAX / sizeof(FLAC__StreamMetadata_CueSheet_Track))
+			return false;
+
+		FLAC__ASSERT(object->data.cue_sheet.num_tracks > 0);
+
+		/* if shrinking, free the truncated entries */
+		if(new_num_tracks < object->data.cue_sheet.num_tracks) {
+			unsigned i;
+			for(i = new_num_tracks; i < object->data.cue_sheet.num_tracks; i++)
+				if(0 != object->data.cue_sheet.tracks[i].indices)
+					free(object->data.cue_sheet.tracks[i].indices);
+		}
+
+		if(new_size == 0) {
+			free(object->data.cue_sheet.tracks);
+			object->data.cue_sheet.tracks = 0;
+		}
+		else if(0 == (object->data.cue_sheet.tracks = realloc(object->data.cue_sheet.tracks, new_size)))
+			return false;
+
+		/* if growing, zero all the lengths/pointers of new elements */
+		if(new_size > old_size)
+			memset(object->data.cue_sheet.tracks + object->data.cue_sheet.num_tracks, 0, new_size - old_size);
+	}
+
+	object->data.cue_sheet.num_tracks = new_num_tracks;
+
+	cuesheet_calculate_length_(object);
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_set_track(FLAC__StreamMetadata *object, unsigned track_num, FLAC__StreamMetadata_CueSheet_Track *track, FLAC__bool copy)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(track_num < object->data.cue_sheet.num_tracks);
+
+	return cuesheet_set_track_(object, object->data.cue_sheet.tracks + track_num, track, copy);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_insert_track(FLAC__StreamMetadata *object, unsigned track_num, FLAC__StreamMetadata_CueSheet_Track *track, FLAC__bool copy)
+{
+	FLAC__StreamMetadata_CueSheet *cs;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT(track_num <= object->data.cue_sheet.num_tracks);
+
+	cs = &object->data.cue_sheet;
+
+	if(!FLAC__metadata_object_cuesheet_resize_tracks(object, cs->num_tracks+1))
+		return false;
+
+	/* move all tracks >= track_num forward one space */
+	memmove(&cs->tracks[track_num+1], &cs->tracks[track_num], sizeof(FLAC__StreamMetadata_CueSheet_Track)*(cs->num_tracks-1-track_num));
+	cs->tracks[track_num].num_indices = 0;
+	cs->tracks[track_num].indices = 0;
+
+	return FLAC__metadata_object_cuesheet_set_track(object, track_num, track, copy);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_insert_blank_track(FLAC__StreamMetadata *object, unsigned track_num)
+{
+	FLAC__StreamMetadata_CueSheet_Track track;
+	memset(&track, 0, sizeof(track));
+	return FLAC__metadata_object_cuesheet_insert_track(object, track_num, &track, /*copy=*/false);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_delete_track(FLAC__StreamMetadata *object, unsigned track_num)
+{
+	FLAC__StreamMetadata_CueSheet *cs;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+	FLAC__ASSERT(track_num < object->data.cue_sheet.num_tracks);
+
+	cs = &object->data.cue_sheet;
+
+	/* free the track at track_num */
+	if(0 != cs->tracks[track_num].indices)
+		free(cs->tracks[track_num].indices);
+
+	/* move all tracks > track_num backward one space */
+	memmove(&cs->tracks[track_num], &cs->tracks[track_num+1], sizeof(FLAC__StreamMetadata_CueSheet_Track)*(cs->num_tracks-track_num-1));
+	cs->tracks[cs->num_tracks-1].num_indices = 0;
+	cs->tracks[cs->num_tracks-1].indices = 0;
+
+	return FLAC__metadata_object_cuesheet_resize_tracks(object, cs->num_tracks-1);
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_cuesheet_is_legal(const FLAC__StreamMetadata *object, FLAC__bool check_cd_da_subset, const char **violation)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+
+	return FLAC__format_cuesheet_is_legal(&object->data.cue_sheet, check_cd_da_subset, violation);
+}
+
+static FLAC__uint64 get_index_01_offset_(const FLAC__StreamMetadata_CueSheet *cs, unsigned track)
+{
+	if (track >= (cs->num_tracks-1) || cs->tracks[track].num_indices < 1)
+		return 0;
+	else if (cs->tracks[track].indices[0].number == 1)
+		return cs->tracks[track].indices[0].offset + cs->tracks[track].offset + cs->lead_in;
+	else if (cs->tracks[track].num_indices < 2)
+		return 0;
+	else if (cs->tracks[track].indices[1].number == 1)
+		return cs->tracks[track].indices[1].offset + cs->tracks[track].offset + cs->lead_in;
+	else
+		return 0;
+}
+
+static FLAC__uint32 cddb_add_digits_(FLAC__uint32 x)
+{
+	FLAC__uint32 n = 0;
+	while (x) {
+		n += (x%10);
+		x /= 10;
+	}
+	return n;
+}
+
+/*@@@@add to tests*/
+FLAC_API FLAC__uint32 FLAC__metadata_object_cuesheet_calculate_cddb_id(const FLAC__StreamMetadata *object)
+{
+	const FLAC__StreamMetadata_CueSheet *cs;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_CUESHEET);
+
+	cs = &object->data.cue_sheet;
+
+	if (cs->num_tracks < 2) /* need at least one real track and the lead-out track */
+		return 0;
+
+	{
+		FLAC__uint32 i, length, sum = 0;
+		for (i = 0; i < (cs->num_tracks-1); i++) /* -1 to avoid counting the lead-out */
+			sum += cddb_add_digits_((FLAC__uint32)(get_index_01_offset_(cs, i) / 44100));
+		length = (FLAC__uint32)((cs->tracks[cs->num_tracks-1].offset+cs->lead_in) / 44100) - (FLAC__uint32)(get_index_01_offset_(cs, 0) / 44100);
+
+		return (sum % 0xFF) << 24 | length << 8 | (FLAC__uint32)(cs->num_tracks-1);
+	}
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_picture_set_mime_type(FLAC__StreamMetadata *object, char *mime_type, FLAC__bool copy)
+{
+	char *old;
+	size_t old_length, new_length;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_PICTURE);
+	FLAC__ASSERT(0 != mime_type);
+
+	old = object->data.picture.mime_type;
+	old_length = old? strlen(old) : 0;
+	new_length = strlen(mime_type);
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if(copy) {
+		if(new_length >= SIZE_MAX) /* overflow check */
+			return false;
+		if(!copy_bytes_((FLAC__byte**)(&object->data.picture.mime_type), (FLAC__byte*)mime_type, new_length+1))
+			return false;
+	}
+	else {
+		object->data.picture.mime_type = mime_type;
+	}
+
+	if(0 != old)
+		free(old);
+
+	object->length -= old_length;
+	object->length += new_length;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_picture_set_description(FLAC__StreamMetadata *object, FLAC__byte *description, FLAC__bool copy)
+{
+	FLAC__byte *old;
+	size_t old_length, new_length;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_PICTURE);
+	FLAC__ASSERT(0 != description);
+
+	old = object->data.picture.description;
+	old_length = old? strlen((const char *)old) : 0;
+	new_length = strlen((const char *)description);
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if(copy) {
+		if(new_length >= SIZE_MAX) /* overflow check */
+			return false;
+		if(!copy_bytes_(&object->data.picture.description, description, new_length+1))
+			return false;
+	}
+	else {
+		object->data.picture.description = description;
+	}
+
+	if(0 != old)
+		free(old);
+
+	object->length -= old_length;
+	object->length += new_length;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_picture_set_data(FLAC__StreamMetadata *object, FLAC__byte *data, FLAC__uint32 length, FLAC__bool copy)
+{
+	FLAC__byte *old;
+
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_PICTURE);
+	FLAC__ASSERT((0 != data && length > 0) || (0 == data && length == 0 && copy == false));
+
+	old = object->data.picture.data;
+
+	/* do the copy first so that if we fail we leave the object untouched */
+	if(copy) {
+		if(!copy_bytes_(&object->data.picture.data, data, length))
+			return false;
+	}
+	else {
+		object->data.picture.data = data;
+	}
+
+	if(0 != old)
+		free(old);
+
+	object->length -= object->data.picture.data_length;
+	object->data.picture.data_length = length;
+	object->length += length;
+	return true;
+}
+
+FLAC_API FLAC__bool FLAC__metadata_object_picture_is_legal(const FLAC__StreamMetadata *object, const char **violation)
+{
+	FLAC__ASSERT(0 != object);
+	FLAC__ASSERT(object->type == FLAC__METADATA_TYPE_PICTURE);
+
+	return FLAC__format_picture_is_legal(&object->data.picture, violation);
+}
diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c
index 1762648..6632d31 100644
--- a/src/libFLAC/stream_decoder.c
+++ b/src/libFLAC/stream_decoder.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,32 +30,16 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
-#if defined _MSC_VER || defined __MINGW32__
-#include <io.h> /* for _setmode() */
-#include <fcntl.h> /* for _O_BINARY */
-#endif
-#if defined __CYGWIN__ || defined __EMX__
-#include <io.h> /* for setmode(), O_BINARY */
-#include <fcntl.h> /* for _O_BINARY */
-#endif
 #include <stdio.h>
 #include <stdlib.h> /* for malloc() */
 #include <string.h> /* for memset/memcpy() */
 #include <sys/stat.h> /* for stat() */
 #include <sys/types.h> /* for off_t */
-#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__
-#if _MSC_VER <= 1600 || defined __BORLANDC__ /* @@@ [2G limit] */
-#define fseeko fseek
-#define ftello ftell
-#else
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#endif
-#endif
+#include "share/compat.h"
 #include "FLAC/assert.h"
 #include "share/alloc.h"
 #include "protected/stream_decoder.h"
@@ -67,18 +52,7 @@
 #include "private/lpc.h"
 #include "private/md5.h"
 #include "private/memory.h"
-
-#ifdef max
-#undef max
-#endif
-#define max(a,b) ((a)>(b)?(a):(b))
-
-/* adjust for compilers that can't understand using LLU suffix for uint64_t literals */
-#ifdef _MSC_VER
-#define FLAC__U64L(x) x
-#else
-#define FLAC__U64L(x) x##LLU
-#endif
+#include "private/macros.h"
 
 
 /* technically this should be in an "export.c" but this is convenient enough */
@@ -97,7 +71,7 @@
  *
  ***********************************************************************/
 
-static FLAC__byte ID3V2_TAG_[3] = { 'I', 'D', '3' };
+static const FLAC__byte ID3V2_TAG_[3] = { 'I', 'D', '3' };
 
 /***********************************************************************
  *
@@ -113,7 +87,7 @@
 static FLAC__bool read_metadata_(FLAC__StreamDecoder *decoder);
 static FLAC__bool read_metadata_streaminfo_(FLAC__StreamDecoder *decoder, FLAC__bool is_last, unsigned length);
 static FLAC__bool read_metadata_seektable_(FLAC__StreamDecoder *decoder, FLAC__bool is_last, unsigned length);
-static FLAC__bool read_metadata_vorbiscomment_(FLAC__StreamDecoder *decoder, FLAC__StreamMetadata_VorbisComment *obj);
+static FLAC__bool read_metadata_vorbiscomment_(FLAC__StreamDecoder *decoder, FLAC__StreamMetadata_VorbisComment *obj, unsigned length);
 static FLAC__bool read_metadata_cuesheet_(FLAC__StreamDecoder *decoder, FLAC__StreamMetadata_CueSheet *obj);
 static FLAC__bool read_metadata_picture_(FLAC__StreamDecoder *decoder, FLAC__StreamMetadata_Picture *obj);
 static FLAC__bool skip_id3v2_tag_(FLAC__StreamDecoder *decoder);
@@ -168,9 +142,6 @@
 	void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 	/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
 	void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
-	/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit), AND order <= 8: */
-	void (*local_lpc_restore_signal_16bit_order8)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
-	FLAC__bool (*local_bitreader_read_rice_signed_block)(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter);
 	void *client_data;
 	FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
 	FLAC__BitReader *input;
@@ -284,18 +255,18 @@
 
 	FLAC__ASSERT(sizeof(int) >= 4); /* we want to die right away if this is not true */
 
-	decoder = (FLAC__StreamDecoder*)calloc(1, sizeof(FLAC__StreamDecoder));
+	decoder = calloc(1, sizeof(FLAC__StreamDecoder));
 	if(decoder == 0) {
 		return 0;
 	}
 
-	decoder->protected_ = (FLAC__StreamDecoderProtected*)calloc(1, sizeof(FLAC__StreamDecoderProtected));
+	decoder->protected_ = calloc(1, sizeof(FLAC__StreamDecoderProtected));
 	if(decoder->protected_ == 0) {
 		free(decoder);
 		return 0;
 	}
 
-	decoder->private_ = (FLAC__StreamDecoderPrivate*)calloc(1, sizeof(FLAC__StreamDecoderPrivate));
+	decoder->private_ = calloc(1, sizeof(FLAC__StreamDecoderPrivate));
 	if(decoder->private_ == 0) {
 		free(decoder->protected_);
 		free(decoder);
@@ -311,7 +282,7 @@
 	}
 
 	decoder->private_->metadata_filter_ids_capacity = 16;
-	if(0 == (decoder->private_->metadata_filter_ids = (FLAC__byte*)malloc((FLAC__STREAM_METADATA_APPLICATION_ID_LEN/8) * decoder->private_->metadata_filter_ids_capacity))) {
+	if(0 == (decoder->private_->metadata_filter_ids = malloc((FLAC__STREAM_METADATA_APPLICATION_ID_LEN/8) * decoder->private_->metadata_filter_ids_capacity))) {
 		FLAC__bitreader_delete(decoder->private_->input);
 		free(decoder->private_);
 		free(decoder->protected_);
@@ -344,7 +315,9 @@
 {
 	unsigned i;
 
-	FLAC__ASSERT(0 != decoder);
+	if (decoder == NULL)
+		return ;
+
 	FLAC__ASSERT(0 != decoder->protected_);
 	FLAC__ASSERT(0 != decoder->private_);
 	FLAC__ASSERT(0 != decoder->private_->input);
@@ -405,7 +378,7 @@
 #if FLAC__HAS_OGG
 	decoder->private_->is_ogg = is_ogg;
 	if(is_ogg && !FLAC__ogg_decoder_aspect_init(&decoder->protected_->ogg_decoder_aspect))
-		return decoder->protected_->state = FLAC__STREAM_DECODER_OGG_ERROR;
+		return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE;
 #endif
 
 	/*
@@ -416,42 +389,44 @@
 	decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
 	decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
 	decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
-	decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal;
-	decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block;
 	/* now override with asm where appropriate */
 #ifndef FLAC__NO_ASM
 	if(decoder->private_->cpuinfo.use_asm) {
 #ifdef FLAC__CPU_IA32
 		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
 #ifdef FLAC__HAS_NASM
-#if 1 /*@@@@@@ OPT: not clearly faster, needs more testing */
-		if(decoder->private_->cpuinfo.data.ia32.bswap)
-			decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap;
-#endif
-		if(decoder->private_->cpuinfo.data.ia32.mmx) {
+		decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; /* OPT_IA32: was really necessary for GCC < 4.9 */
+		if(decoder->private_->cpuinfo.ia32.mmx) {
 			decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
 			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
-			decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ia32_mmx;
 		}
 		else {
 			decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
 			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32;
-			decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ia32;
 		}
 #endif
-#elif defined FLAC__CPU_PPC
-		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_PPC);
-		if(decoder->private_->cpuinfo.data.ppc.altivec) {
-			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ppc_altivec_16;
-			decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8;
+#ifdef FLAC__HAS_X86INTRIN
+# if defined FLAC__SSE2_SUPPORTED && !defined FLAC__HAS_NASM /* OPT_SSE: not better than MMX asm */
+		if(decoder->private_->cpuinfo.ia32.sse2) {
+			decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse2;
 		}
+# endif
+# if defined FLAC__SSE4_1_SUPPORTED
+		if(decoder->private_->cpuinfo.ia32.sse41) {
+			decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
+		}
+# endif
+#endif
+#elif defined FLAC__CPU_X86_64
+		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
+		/* No useful SSE optimizations yet */
 #endif
 	}
 #endif
 
 	/* from here on, errors are fatal */
 
-	if(!FLAC__bitreader_init(decoder->private_->input, decoder->private_->cpuinfo, read_callback_, decoder)) {
+	if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) {
 		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 		return FLAC__STREAM_DECODER_INIT_STATUS_MEMORY_ALLOCATION_ERROR;
 	}
@@ -552,10 +527,10 @@
 	FLAC__ASSERT(0 != file);
 
 	if(decoder->protected_->state != FLAC__STREAM_DECODER_UNINITIALIZED)
-		return decoder->protected_->state = FLAC__STREAM_DECODER_INIT_STATUS_ALREADY_INITIALIZED;
+		return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ALREADY_INITIALIZED;
 
 	if(0 == write_callback || 0 == error_callback)
-		return decoder->protected_->state = FLAC__STREAM_DECODER_INIT_STATUS_INVALID_CALLBACKS;
+		return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_INVALID_CALLBACKS;
 
 	/*
 	 * To make sure that our file does not go unclosed after an error, we
@@ -626,12 +601,12 @@
 	 * in FLAC__stream_decoder_init_FILE() before the FILE* is assigned.
 	 */
 	if(decoder->protected_->state != FLAC__STREAM_DECODER_UNINITIALIZED)
-		return decoder->protected_->state = FLAC__STREAM_DECODER_INIT_STATUS_ALREADY_INITIALIZED;
+		return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ALREADY_INITIALIZED;
 
 	if(0 == write_callback || 0 == error_callback)
-		return decoder->protected_->state = FLAC__STREAM_DECODER_INIT_STATUS_INVALID_CALLBACKS;
+		return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_INVALID_CALLBACKS;
 
-	file = filename? fopen(filename, "rb") : stdin;
+	file = filename? flac_fopen(filename, "rb") : stdin;
 
 	if(0 == file)
 		return FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE;
@@ -675,7 +650,7 @@
 	if(decoder->protected_->state == FLAC__STREAM_DECODER_UNINITIALIZED)
 		return true;
 
-	/* see the comment in FLAC__seekable_stream_decoder_reset() as to why we
+	/* see the comment in FLAC__stream_decoder_reset() as to why we
 	 * always call FLAC__MD5Final()
 	 */
 	FLAC__MD5Final(decoder->private_->computed_md5sum, &decoder->private_->md5context);
@@ -788,7 +763,7 @@
 	FLAC__ASSERT(0 != decoder->private_->metadata_filter_ids);
 
 	if(decoder->private_->metadata_filter_ids_count == decoder->private_->metadata_filter_ids_capacity) {
-		if(0 == (decoder->private_->metadata_filter_ids = (FLAC__byte*)safe_realloc_mul_2op_(decoder->private_->metadata_filter_ids, decoder->private_->metadata_filter_ids_capacity, /*times*/2))) {
+		if(0 == (decoder->private_->metadata_filter_ids = safe_realloc_mul_2op_(decoder->private_->metadata_filter_ids, decoder->private_->metadata_filter_ids_capacity, /*times*/2))) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
 		}
@@ -847,7 +822,7 @@
 	FLAC__ASSERT(0 != decoder->private_->metadata_filter_ids);
 
 	if(decoder->private_->metadata_filter_ids_count == decoder->private_->metadata_filter_ids_capacity) {
-		if(0 == (decoder->private_->metadata_filter_ids = (FLAC__byte*)safe_realloc_mul_2op_(decoder->private_->metadata_filter_ids, decoder->private_->metadata_filter_ids_capacity, /*times*/2))) {
+		if(0 == (decoder->private_->metadata_filter_ids = safe_realloc_mul_2op_(decoder->private_->metadata_filter_ids, decoder->private_->metadata_filter_ids_capacity, /*times*/2))) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
 		}
@@ -1291,7 +1266,7 @@
 	 */
 #if defined _MSC_VER || defined __MINGW32__
 	_setmode(_fileno(stdin), _O_BINARY);
-#elif defined __CYGWIN__ 
+#elif defined __CYGWIN__
 	/* almost certainly not needed for any modern Cygwin, but let's be safe... */
 	setmode(_fileno(stdin), _O_BINARY);
 #elif defined __EMX__
@@ -1329,7 +1304,7 @@
 		 * (at negative indices) for alignment purposes; we use 4
 		 * to keep the data well-aligned.
 		 */
-		tmp = (FLAC__int32*)safe_malloc_muladd2_(sizeof(FLAC__int32), /*times (*/size, /*+*/4/*)*/);
+		tmp = safe_malloc_muladd2_(sizeof(FLAC__int32), /*times (*/size, /*+*/4/*)*/);
 		if(tmp == 0) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
@@ -1337,9 +1312,6 @@
 		memset(tmp, 0, sizeof(FLAC__int32)*4);
 		decoder->private_->output[i] = tmp + 4;
 
-		/* WATCHOUT:
-		 * minimum of quadword alignment for PPC vector optimizations is REQUIRED:
-		 */
 		if(!FLAC__memory_alloc_aligned_int32_array(size, &decoder->private_->residual_unaligned[i], &decoder->private_->residual[i])) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
@@ -1389,6 +1361,10 @@
 			id = 0;
 			continue;
 		}
+
+		if(id >= 3)
+			return false;
+
 		if(x == ID3V2_TAG_[id]) {
 			id++;
 			i = 0;
@@ -1410,7 +1386,7 @@
 				decoder->private_->lookahead = (FLAC__byte)x;
 				decoder->private_->cached = true;
 			}
-			else if(x >> 2 == 0x3e) { /* MAGIC NUMBER for the last 6 sync bits */
+			else if(x >> 1 == 0x7c) { /* MAGIC NUMBER for the last 6 sync bits and reserved 7th bit */
 				decoder->private_->header_warmup[1] = (FLAC__byte)x;
 				decoder->protected_->state = FLAC__STREAM_DECODER_READ_FRAME;
 				return true;
@@ -1467,6 +1443,7 @@
 		unsigned real_length = length;
 		FLAC__StreamMetadata block;
 
+		memset(&block, 0, sizeof(block));
 		block.is_last = is_last;
 		block.type = (FLAC__MetadataType)type;
 		block.length = length;
@@ -1491,36 +1468,37 @@
 				return false; /* read_callback_ sets the state for us */
 		}
 		else {
+			FLAC__bool ok = true;
 			switch(type) {
 				case FLAC__METADATA_TYPE_PADDING:
 					/* skip the padding bytes */
 					if(!FLAC__bitreader_skip_byte_block_aligned_no_crc(decoder->private_->input, real_length))
-						return false; /* read_callback_ sets the state for us */
+						ok = false; /* read_callback_ sets the state for us */
 					break;
 				case FLAC__METADATA_TYPE_APPLICATION:
 					/* remember, we read the ID already */
 					if(real_length > 0) {
-						if(0 == (block.data.application.data = (FLAC__byte*)malloc(real_length))) {
+						if(0 == (block.data.application.data = malloc(real_length))) {
 							decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
-							return false;
+							ok = false;
 						}
-						if(!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, block.data.application.data, real_length))
-							return false; /* read_callback_ sets the state for us */
+						else if(!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, block.data.application.data, real_length))
+							ok = false; /* read_callback_ sets the state for us */
 					}
 					else
 						block.data.application.data = 0;
 					break;
 				case FLAC__METADATA_TYPE_VORBIS_COMMENT:
-					if(!read_metadata_vorbiscomment_(decoder, &block.data.vorbis_comment))
-						return false;
+					if(!read_metadata_vorbiscomment_(decoder, &block.data.vorbis_comment, real_length))
+						ok = false;
 					break;
 				case FLAC__METADATA_TYPE_CUESHEET:
 					if(!read_metadata_cuesheet_(decoder, &block.data.cue_sheet))
-						return false;
+						ok = false;
 					break;
 				case FLAC__METADATA_TYPE_PICTURE:
 					if(!read_metadata_picture_(decoder, &block.data.picture))
-						return false;
+						ok = false;
 					break;
 				case FLAC__METADATA_TYPE_STREAMINFO:
 				case FLAC__METADATA_TYPE_SEEKTABLE:
@@ -1528,18 +1506,18 @@
 					break;
 				default:
 					if(real_length > 0) {
-						if(0 == (block.data.unknown.data = (FLAC__byte*)malloc(real_length))) {
+						if(0 == (block.data.unknown.data = malloc(real_length))) {
 							decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
-							return false;
+							ok = false;
 						}
-						if(!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, block.data.unknown.data, real_length))
-							return false; /* read_callback_ sets the state for us */
+						else if(!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, block.data.unknown.data, real_length))
+							ok = false; /* read_callback_ sets the state for us */
 					}
 					else
 						block.data.unknown.data = 0;
 					break;
 			}
-			if(!decoder->private_->is_seeking && decoder->private_->metadata_callback)
+			if(ok && !decoder->private_->is_seeking && decoder->private_->metadata_callback)
 				decoder->private_->metadata_callback(decoder, &block, decoder->private_->client_data);
 
 			/* now we have to free any malloc()ed data in the block */
@@ -1584,6 +1562,9 @@
 						free(block.data.unknown.data);
 					break;
 			}
+
+			if(!ok) /* anything that unsets "ok" should also make sure decoder->protected_->state is updated */
+				return false;
 		}
 	}
 
@@ -1682,7 +1663,7 @@
 	decoder->private_->seek_table.data.seek_table.num_points = length / FLAC__STREAM_METADATA_SEEKPOINT_LENGTH;
 
 	/* use realloc since we may pass through here several times (e.g. after seeking) */
-	if(0 == (decoder->private_->seek_table.data.seek_table.points = (FLAC__StreamMetadata_SeekPoint*)safe_realloc_mul_2op_(decoder->private_->seek_table.data.seek_table.points, decoder->private_->seek_table.data.seek_table.num_points, /*times*/sizeof(FLAC__StreamMetadata_SeekPoint)))) {
+	if(0 == (decoder->private_->seek_table.data.seek_table.points = safe_realloc_mul_2op_(decoder->private_->seek_table.data.seek_table.points, decoder->private_->seek_table.data.seek_table.num_points, /*times*/sizeof(FLAC__StreamMetadata_SeekPoint)))) {
 		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 		return false;
 	}
@@ -1710,58 +1691,88 @@
 	return true;
 }
 
-FLAC__bool read_metadata_vorbiscomment_(FLAC__StreamDecoder *decoder, FLAC__StreamMetadata_VorbisComment *obj)
+FLAC__bool read_metadata_vorbiscomment_(FLAC__StreamDecoder *decoder, FLAC__StreamMetadata_VorbisComment *obj, unsigned length)
 {
 	FLAC__uint32 i;
 
 	FLAC__ASSERT(FLAC__bitreader_is_consumed_byte_aligned(decoder->private_->input));
 
 	/* read vendor string */
-	FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN == 32);
-	if(!FLAC__bitreader_read_uint32_little_endian(decoder->private_->input, &obj->vendor_string.length))
-		return false; /* read_callback_ sets the state for us */
-	if(obj->vendor_string.length > 0) {
-		if(0 == (obj->vendor_string.entry = (FLAC__byte*)safe_malloc_add_2op_(obj->vendor_string.length, /*+*/1))) {
-			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
-			return false;
-		}
-		if(!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, obj->vendor_string.entry, obj->vendor_string.length))
+	if (length >= 8) {
+		length -= 8; /* vendor string length + num comments entries alone take 8 bytes */
+		FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN == 32);
+		if (!FLAC__bitreader_read_uint32_little_endian(decoder->private_->input, &obj->vendor_string.length))
 			return false; /* read_callback_ sets the state for us */
-		obj->vendor_string.entry[obj->vendor_string.length] = '\0';
-	}
-	else
-		obj->vendor_string.entry = 0;
-
-	/* read num comments */
-	FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN == 32);
-	if(!FLAC__bitreader_read_uint32_little_endian(decoder->private_->input, &obj->num_comments))
-		return false; /* read_callback_ sets the state for us */
-
-	/* read comments */
-	if(obj->num_comments > 0) {
-		if(0 == (obj->comments = (FLAC__StreamMetadata_VorbisComment_Entry*)safe_malloc_mul_2op_(obj->num_comments, /*times*/sizeof(FLAC__StreamMetadata_VorbisComment_Entry)))) {
-			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
-			return false;
-		}
-		for(i = 0; i < obj->num_comments; i++) {
-			FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN == 32);
-			if(!FLAC__bitreader_read_uint32_little_endian(decoder->private_->input, &obj->comments[i].length))
-				return false; /* read_callback_ sets the state for us */
-			if(obj->comments[i].length > 0) {
-				if(0 == (obj->comments[i].entry = (FLAC__byte*)safe_malloc_add_2op_(obj->comments[i].length, /*+*/1))) {
-					decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
-					return false;
-				}
-				if(!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, obj->comments[i].entry, obj->comments[i].length))
-					return false; /* read_callback_ sets the state for us */
-				obj->comments[i].entry[obj->comments[i].length] = '\0';
+		if (obj->vendor_string.length > 0) {
+			if (length < obj->vendor_string.length) {
+				obj->vendor_string.length = 0;
+				obj->vendor_string.entry = 0;
+				goto skip;
 			}
 			else
-				obj->comments[i].entry = 0;
+				length -= obj->vendor_string.length;
+			if (0 == (obj->vendor_string.entry = safe_malloc_add_2op_(obj->vendor_string.length, /*+*/1))) {
+				decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
+				return false;
+			}
+			if (!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, obj->vendor_string.entry, obj->vendor_string.length))
+				return false; /* read_callback_ sets the state for us */
+			obj->vendor_string.entry[obj->vendor_string.length] = '\0';
 		}
+		else
+			obj->vendor_string.entry = 0;
+
+		/* read num comments */
+		FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_NUM_COMMENTS_LEN == 32);
+		if (!FLAC__bitreader_read_uint32_little_endian(decoder->private_->input, &obj->num_comments))
+			return false; /* read_callback_ sets the state for us */
+
+		/* read comments */
+		if (obj->num_comments > 0) {
+			if (0 == (obj->comments = safe_malloc_mul_2op_p(obj->num_comments, /*times*/sizeof(FLAC__StreamMetadata_VorbisComment_Entry)))) {
+				decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
+				return false;
+			}
+			for (i = 0; i < obj->num_comments; i++) {
+				FLAC__ASSERT(FLAC__STREAM_METADATA_VORBIS_COMMENT_ENTRY_LENGTH_LEN == 32);
+				if (length < 4) {
+					obj->num_comments = i;
+					goto skip;
+				}
+				else
+					length -= 4;
+				if (!FLAC__bitreader_read_uint32_little_endian(decoder->private_->input, &obj->comments[i].length))
+					return false; /* read_callback_ sets the state for us */
+				if (obj->comments[i].length > 0) {
+					if (length < obj->comments[i].length) {
+						obj->comments[i].length = 0;
+						obj->comments[i].entry = 0;
+						obj->num_comments = i;
+						goto skip;
+					}
+					else
+						length -= obj->comments[i].length;
+					if (0 == (obj->comments[i].entry = safe_malloc_add_2op_(obj->comments[i].length, /*+*/1))) {
+						decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
+						return false;
+					}
+					if (!FLAC__bitreader_read_byte_block_aligned_no_crc(decoder->private_->input, obj->comments[i].entry, obj->comments[i].length))
+						return false; /* read_callback_ sets the state for us */
+					obj->comments[i].entry[obj->comments[i].length] = '\0';
+				}
+				else
+					obj->comments[i].entry = 0;
+			}
+		}
+		else
+			obj->comments = 0;
 	}
-	else {
-		obj->comments = 0;
+
+  skip:
+	if (length > 0) {
+		/* This will only happen on files with invalid data in comments */
+		if(!FLAC__bitreader_skip_byte_block_aligned_no_crc(decoder->private_->input, length))
+			return false; /* read_callback_ sets the state for us */
 	}
 
 	return true;
@@ -1794,7 +1805,7 @@
 	obj->num_tracks = x;
 
 	if(obj->num_tracks > 0) {
-		if(0 == (obj->tracks = (FLAC__StreamMetadata_CueSheet_Track*)safe_calloc_(obj->num_tracks, sizeof(FLAC__StreamMetadata_CueSheet_Track)))) {
+		if(0 == (obj->tracks = safe_calloc_(obj->num_tracks, sizeof(FLAC__StreamMetadata_CueSheet_Track)))) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 			return false;
 		}
@@ -1827,18 +1838,18 @@
 			track->num_indices = (FLAC__byte)x;
 
 			if(track->num_indices > 0) {
-				if(0 == (track->indices = (FLAC__StreamMetadata_CueSheet_Index*)safe_calloc_(track->num_indices, sizeof(FLAC__StreamMetadata_CueSheet_Index)))) {
+				if(0 == (track->indices = safe_calloc_(track->num_indices, sizeof(FLAC__StreamMetadata_CueSheet_Index)))) {
 					decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 					return false;
 				}
 				for(j = 0; j < track->num_indices; j++) {
-					FLAC__StreamMetadata_CueSheet_Index *index = &track->indices[j];
-					if(!FLAC__bitreader_read_raw_uint64(decoder->private_->input, &index->offset, FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN))
+					FLAC__StreamMetadata_CueSheet_Index *indx = &track->indices[j];
+					if(!FLAC__bitreader_read_raw_uint64(decoder->private_->input, &indx->offset, FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN))
 						return false; /* read_callback_ sets the state for us */
 
 					if(!FLAC__bitreader_read_raw_uint32(decoder->private_->input, &x, FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN))
 						return false; /* read_callback_ sets the state for us */
-					index->number = (FLAC__byte)x;
+					indx->number = (FLAC__byte)x;
 
 					if(!FLAC__bitreader_skip_bits_no_crc(decoder->private_->input, FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN))
 						return false; /* read_callback_ sets the state for us */
@@ -1864,7 +1875,7 @@
 	/* read MIME type */
 	if(!FLAC__bitreader_read_raw_uint32(decoder->private_->input, &x, FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN))
 		return false; /* read_callback_ sets the state for us */
-	if(0 == (obj->mime_type = (char*)safe_malloc_add_2op_(x, /*+*/1))) {
+	if(0 == (obj->mime_type = safe_malloc_add_2op_(x, /*+*/1))) {
 		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 		return false;
 	}
@@ -1877,7 +1888,7 @@
 	/* read description */
 	if(!FLAC__bitreader_read_raw_uint32(decoder->private_->input, &x, FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN))
 		return false; /* read_callback_ sets the state for us */
-	if(0 == (obj->description = (FLAC__byte*)safe_malloc_add_2op_(x, /*+*/1))) {
+	if(0 == (obj->description = safe_malloc_add_2op_(x, /*+*/1))) {
 		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 		return false;
 	}
@@ -1906,7 +1917,7 @@
 	/* read data */
 	if(!FLAC__bitreader_read_raw_uint32(decoder->private_->input, &(obj->data_length), FLAC__STREAM_METADATA_PICTURE_DATA_LENGTH_LEN))
 		return false; /* read_callback_ sets the state for us */
-	if(0 == (obj->data = (FLAC__byte*)safe_malloc_(obj->data_length))) {
+	if(0 == (obj->data = safe_malloc_(obj->data_length))) {
 		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 		return false;
 	}
@@ -1980,7 +1991,7 @@
 				decoder->private_->lookahead = (FLAC__byte)x;
 				decoder->private_->cached = true;
 			}
-			else if(x >> 2 == 0x3e) { /* MAGIC NUMBER for the last 6 sync bits */
+			else if(x >> 1 == 0x7c) { /* MAGIC NUMBER for the last 6 sync bits and reserved 7th bit */
 				decoder->private_->header_warmup[1] = (FLAC__byte)x;
 				decoder->protected_->state = FLAC__STREAM_DECODER_READ_FRAME;
 				return true;
@@ -2676,12 +2687,8 @@
 		if( (FLAC__uint64)order * ((((FLAC__uint64)1)<<bps)-1) * ((1<<subframe->qlp_coeff_precision)-1) < (((FLAC__uint64)-1) << 32) )
 		*/
 		if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
-			if(bps <= 16 && subframe->qlp_coeff_precision <= 16) {
-				if(order <= 8)
-					decoder->private_->local_lpc_restore_signal_16bit_order8(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
-				else
-					decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
-			}
+			if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
+				decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
 			else
 				decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
 		else
@@ -2729,18 +2736,20 @@
 		if(decoder->private_->frame.header.blocksize < predictor_order) {
 			send_error_to_client_(decoder, FLAC__STREAM_DECODER_ERROR_STATUS_LOST_SYNC);
 			decoder->protected_->state = FLAC__STREAM_DECODER_SEARCH_FOR_FRAME_SYNC;
-			return true;
+			/* We have received a potentially malicious bit stream. All we can do is error out to avoid a heap overflow. */
+			return false;
 		}
 	}
 	else {
 		if(partition_samples < predictor_order) {
 			send_error_to_client_(decoder, FLAC__STREAM_DECODER_ERROR_STATUS_LOST_SYNC);
 			decoder->protected_->state = FLAC__STREAM_DECODER_SEARCH_FOR_FRAME_SYNC;
-			return true;
+			/* We have received a potentially malicious bit stream. All we can do is error out to avoid a heap overflow. */
+			return false;
 		}
 	}
 
-	if(!FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(partitioned_rice_contents, max(6, partition_order))) {
+	if(!FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(partitioned_rice_contents, flac_max(6u, partition_order))) {
 		decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
 		return false;
 	}
@@ -2753,7 +2762,7 @@
 		if(rice_parameter < pesc) {
 			partitioned_rice_contents->raw_bits[partition] = 0;
 			u = (partition_order == 0 || partition > 0)? partition_samples : partition_samples - predictor_order;
-			if(!decoder->private_->local_bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
+			if(!FLAC__bitreader_read_rice_signed_block(decoder->private_->input, residual + sample, u, rice_parameter))
 				return false; /* read_callback_ sets the state for us */
 			sample += u;
 		}
@@ -3095,12 +3104,7 @@
 			return false;
 		}
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-#if defined _MSC_VER || defined __MINGW32__
-		/* with VC++ you have to spoon feed it the casting */
-		pos = (FLAC__int64)lower_bound + (FLAC__int64)((FLAC__double)(FLAC__int64)(target_sample - lower_bound_sample) / (FLAC__double)(FLAC__int64)(upper_bound_sample - lower_bound_sample) * (FLAC__double)(FLAC__int64)(upper_bound - lower_bound)) - approx_bytes_per_frame;
-#else
 		pos = (FLAC__int64)lower_bound + (FLAC__int64)((FLAC__double)(target_sample - lower_bound_sample) / (FLAC__double)(upper_bound_sample - lower_bound_sample) * (FLAC__double)(upper_bound - lower_bound)) - approx_bytes_per_frame;
-#endif
 #else
 		/* a little less accurate: */
 		if(upper_bound - lower_bound < 0xffffffff)
@@ -3152,11 +3156,11 @@
 			}
 			/* our last move backwards wasn't big enough, try again */
 			approx_bytes_per_frame = approx_bytes_per_frame? approx_bytes_per_frame * 2 : 16;
-			continue;	
+			continue;
 		}
 		/* allow one seek over upper bound, so we can get a correct upper_bound_sample for streams with unknown total_samples */
 		first_seek = false;
-		
+
 		/* make sure we are not seeking in corrupted stream */
 		if (this_frame_sample < lower_bound_sample) {
 			decoder->protected_->state = FLAC__STREAM_DECODER_SEEK_ERROR;
@@ -3196,7 +3200,7 @@
 	FLAC__bool did_a_seek;
 	unsigned iteration = 0;
 
-	/* In the first iterations, we will calculate the target byte position 
+	/* In the first iterations, we will calculate the target byte position
 	 * by the distance from the target sample to left_sample and
 	 * right_sample (let's call it "proportional search").  After that, we
 	 * will switch to binary search.
@@ -3224,12 +3228,7 @@
 			}
 			else {
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-#if defined _MSC_VER || defined __MINGW32__
-				/* with MSVC you have to spoon feed it the casting */
-				pos = (FLAC__uint64)((FLAC__double)(FLAC__int64)(target_sample - left_sample) / (FLAC__double)(FLAC__int64)(right_sample - left_sample) * (FLAC__double)(FLAC__int64)(right_pos - left_pos));
-#else
 				pos = (FLAC__uint64)((FLAC__double)(target_sample - left_sample) / (FLAC__double)(right_sample - left_sample) * (FLAC__double)(right_pos - left_pos));
-#endif
 #else
 				/* a little less accurate: */
 				if ((target_sample-left_sample <= 0xffffffff) && (right_pos-left_pos <= 0xffffffff))
@@ -3346,7 +3345,7 @@
 
 	if(decoder->private_->file == stdin)
 		return FLAC__STREAM_DECODER_SEEK_STATUS_UNSUPPORTED;
-	else if(fseeko(decoder->private_->file, (off_t)absolute_byte_offset, SEEK_SET) < 0)
+	else if(fseeko(decoder->private_->file, (FLAC__off_t)absolute_byte_offset, SEEK_SET) < 0)
 		return FLAC__STREAM_DECODER_SEEK_STATUS_ERROR;
 	else
 		return FLAC__STREAM_DECODER_SEEK_STATUS_OK;
@@ -3354,7 +3353,7 @@
 
 FLAC__StreamDecoderTellStatus file_tell_callback_(const FLAC__StreamDecoder *decoder, FLAC__uint64 *absolute_byte_offset, void *client_data)
 {
-	off_t pos;
+	FLAC__off_t pos;
 	(void)client_data;
 
 	if(decoder->private_->file == stdin)
@@ -3369,12 +3368,12 @@
 
 FLAC__StreamDecoderLengthStatus file_length_callback_(const FLAC__StreamDecoder *decoder, FLAC__uint64 *stream_length, void *client_data)
 {
-	struct stat filestats;
+	struct flac_stat_s filestats;
 	(void)client_data;
 
 	if(decoder->private_->file == stdin)
 		return FLAC__STREAM_DECODER_LENGTH_STATUS_UNSUPPORTED;
-	else if(fstat(fileno(decoder->private_->file), &filestats) != 0)
+	else if(flac_fstat(fileno(decoder->private_->file), &filestats) != 0)
 		return FLAC__STREAM_DECODER_LENGTH_STATUS_ERROR;
 	else {
 		*stream_length = (FLAC__uint64)filestats.st_size;
diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c
index fc0d22d..45bdb25 100644
--- a/src/libFLAC/stream_encoder.c
+++ b/src/libFLAC/stream_encoder.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,35 +30,18 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
-#if defined _MSC_VER || defined __MINGW32__
-#include <io.h> /* for _setmode() */
-#include <fcntl.h> /* for _O_BINARY */
-#endif
-#if defined __CYGWIN__ || defined __EMX__
-#include <io.h> /* for setmode(), O_BINARY */
-#include <fcntl.h> /* for _O_BINARY */
-#endif
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h> /* for malloc() */
 #include <string.h> /* for memcpy() */
 #include <sys/types.h> /* for off_t */
-#if defined _MSC_VER || defined __BORLANDC__ || defined __MINGW32__
-#if _MSC_VER <= 1600 || defined __BORLANDC__ /* @@@ [2G limit] */
-#define fseeko fseek
-#define ftello ftell
-#else
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#endif
-#endif
+#include "share/compat.h"
 #include "FLAC/assert.h"
 #include "FLAC/stream_decoder.h"
-#include "share/alloc.h"
 #include "protected/stream_encoder.h"
 #include "private/bitwriter.h"
 #include "private/bitmath.h"
@@ -68,26 +52,17 @@
 #include "private/lpc.h"
 #include "private/md5.h"
 #include "private/memory.h"
+#include "private/macros.h"
 #if FLAC__HAS_OGG
 #include "private/ogg_helper.h"
 #include "private/ogg_mapping.h"
 #endif
+#include "private/stream_encoder.h"
 #include "private/stream_encoder_framing.h"
 #include "private/window.h"
+#include "share/alloc.h"
+#include "share/private.h"
 
-#ifndef FLaC__INLINE
-#define FLaC__INLINE
-#endif
-
-#ifdef min
-#undef min
-#endif
-#define min(x,y) ((x)<(y)?(x):(y))
-
-#ifdef max
-#undef max
-#endif
-#define max(x,y) ((x)>(y)?(x):(y))
 
 /* Exact Rice codeword length calculation is off by default.  The simple
  * (and fast) estimation (of how many bits a residual value will be
@@ -99,7 +74,7 @@
  * parameter estimation in this encoder is very good, almost always
  * yielding compression within 0.1% of the optimal parameters.
  */
-#undef ENABLE_RICE_PARAMETER_SEARCH 
+#undef ENABLE_RICE_PARAMETER_SEARCH
 
 
 typedef struct {
@@ -111,7 +86,7 @@
 typedef struct {
 	const FLAC__byte *data;
 	unsigned capacity;
-	size_t bytes;
+	unsigned bytes;
 } verify_output;
 
 typedef enum {
@@ -131,16 +106,18 @@
 	unsigned min_residual_partition_order;
 	unsigned max_residual_partition_order;
 	unsigned rice_parameter_search_dist;
+	const char *apodization;
 } compression_levels_[] = {
-	{ false, false,  0, 0, false, false, false, 0, 3, 0 },
-	{ true , true ,  0, 0, false, false, false, 0, 3, 0 },
-	{ true , false,  0, 0, false, false, false, 0, 3, 0 },
-	{ false, false,  6, 0, false, false, false, 0, 4, 0 },
-	{ true , true ,  8, 0, false, false, false, 0, 4, 0 },
-	{ true , false,  8, 0, false, false, false, 0, 5, 0 },
-	{ true , false,  8, 0, false, false, false, 0, 6, 0 },
-	{ true , false,  8, 0, false, false, true , 0, 6, 0 },
-	{ true , false, 12, 0, false, false, true , 0, 6, 0 }
+	{ false, false,  0, 0, false, false, false, 0, 3, 0, "tukey(5e-1)" },
+	{ true , true ,  0, 0, false, false, false, 0, 3, 0, "tukey(5e-1)" },
+	{ true , false,  0, 0, false, false, false, 0, 3, 0, "tukey(5e-1)" },
+	{ false, false,  6, 0, false, false, false, 0, 4, 0, "tukey(5e-1)" },
+	{ true , true ,  8, 0, false, false, false, 0, 4, 0, "tukey(5e-1)" },
+	{ true , false,  8, 0, false, false, false, 0, 5, 0, "tukey(5e-1)" },
+	{ true , false,  8, 0, false, false, false, 0, 6, 0, "tukey(5e-1);partial_tukey(2)" },
+	{ true , false, 12, 0, false, false, false, 0, 6, 0, "tukey(5e-1);partial_tukey(2)" },
+	{ true , false, 12, 0, false, false, false, 0, 6, 0, "tukey(5e-1);partial_tukey(2);punchout_tukey(3)" }
+	/* here we use locale-independent 5e-1 instead of 0.5 or 0,5 */
 };
 
 
@@ -235,7 +212,7 @@
 #endif
 
 static unsigned evaluate_verbatim_subframe_(
-	FLAC__StreamEncoder *encoder, 
+	FLAC__StreamEncoder *encoder,
 	const FLAC__int32 signal[],
 	unsigned blocksize,
 	unsigned subframe_bps,
@@ -370,10 +347,13 @@
 	unsigned current_frame_number;
 	FLAC__MD5Context md5context;
 	FLAC__CPUInfo cpuinfo;
+	void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 	unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+	unsigned (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #else
 	unsigned (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
+	unsigned (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], unsigned data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]);
 #endif
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 	void (*local_lpc_compute_autocorrelation)(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
@@ -480,7 +460,7 @@
 	"FLAC__STREAM_ENCODER_INIT_STATUS_ALREADY_INITIALIZED"
 };
 
-FLAC_API const char * const FLAC__treamEncoderReadStatusString[] = {
+FLAC_API const char * const FLAC__StreamEncoderReadStatusString[] = {
 	"FLAC__STREAM_ENCODER_READ_STATUS_CONTINUE",
 	"FLAC__STREAM_ENCODER_READ_STATUS_END_OF_STREAM",
 	"FLAC__STREAM_ENCODER_READ_STATUS_ABORT",
@@ -529,18 +509,18 @@
 
 	FLAC__ASSERT(sizeof(int) >= 4); /* we want to die right away if this is not true */
 
-	encoder = (FLAC__StreamEncoder*)calloc(1, sizeof(FLAC__StreamEncoder));
+	encoder = calloc(1, sizeof(FLAC__StreamEncoder));
 	if(encoder == 0) {
 		return 0;
 	}
 
-	encoder->protected_ = (FLAC__StreamEncoderProtected*)calloc(1, sizeof(FLAC__StreamEncoderProtected));
+	encoder->protected_ = calloc(1, sizeof(FLAC__StreamEncoderProtected));
 	if(encoder->protected_ == 0) {
 		free(encoder);
 		return 0;
 	}
 
-	encoder->private_ = (FLAC__StreamEncoderPrivate*)calloc(1, sizeof(FLAC__StreamEncoderPrivate));
+	encoder->private_ = calloc(1, sizeof(FLAC__StreamEncoderPrivate));
 	if(encoder->private_ == 0) {
 		free(encoder->protected_);
 		free(encoder);
@@ -598,7 +578,9 @@
 {
 	unsigned i;
 
-	FLAC__ASSERT(0 != encoder);
+	if (encoder == NULL)
+		return ;
+
 	FLAC__ASSERT(0 != encoder->protected_);
 	FLAC__ASSERT(0 != encoder->private_);
 	FLAC__ASSERT(0 != encoder->private_->frame);
@@ -699,7 +681,7 @@
 		if(encoder->protected_->bits_per_sample < 16) {
 			/* @@@ need some data about how to set this here w.r.t. blocksize and sample rate */
 			/* @@@ until then we'll make a guess */
-			encoder->protected_->qlp_coeff_precision = max(FLAC__MIN_QLP_COEFF_PRECISION, 2 + encoder->protected_->bits_per_sample / 2);
+			encoder->protected_->qlp_coeff_precision = flac_max(FLAC__MIN_QLP_COEFF_PRECISION, 2 + encoder->protected_->bits_per_sample / 2);
 		}
 		else if(encoder->protected_->bits_per_sample == 16) {
 			if(encoder->protected_->blocksize <= 192)
@@ -731,20 +713,7 @@
 		return FLAC__STREAM_ENCODER_INIT_STATUS_INVALID_QLP_COEFF_PRECISION;
 
 	if(encoder->protected_->streamable_subset) {
-		if(
-			encoder->protected_->blocksize != 192 &&
-			encoder->protected_->blocksize != 576 &&
-			encoder->protected_->blocksize != 1152 &&
-			encoder->protected_->blocksize != 2304 &&
-			encoder->protected_->blocksize != 4608 &&
-			encoder->protected_->blocksize != 256 &&
-			encoder->protected_->blocksize != 512 &&
-			encoder->protected_->blocksize != 1024 &&
-			encoder->protected_->blocksize != 2048 &&
-			encoder->protected_->blocksize != 4096 &&
-			encoder->protected_->blocksize != 8192 &&
-			encoder->protected_->blocksize != 16384
-		)
+		if(!FLAC__format_blocksize_is_subset(encoder->protected_->blocksize, encoder->protected_->sample_rate))
 			return FLAC__STREAM_ENCODER_INIT_STATUS_NOT_STREAMABLE;
 		if(!FLAC__format_sample_rate_is_subset(encoder->protected_->sample_rate))
 			return FLAC__STREAM_ENCODER_INIT_STATUS_NOT_STREAMABLE;
@@ -777,12 +746,12 @@
 #if FLAC__HAS_OGG
 	/* reorder metadata if necessary to ensure that any VORBIS_COMMENT is the first, according to the mapping spec */
 	if(is_ogg && 0 != encoder->protected_->metadata && encoder->protected_->num_metadata_blocks > 1) {
-		unsigned i;
-		for(i = 1; i < encoder->protected_->num_metadata_blocks; i++) {
-			if(0 != encoder->protected_->metadata[i] && encoder->protected_->metadata[i]->type == FLAC__METADATA_TYPE_VORBIS_COMMENT) {
-				FLAC__StreamMetadata *vc = encoder->protected_->metadata[i];
-				for( ; i > 0; i--)
-					encoder->protected_->metadata[i] = encoder->protected_->metadata[i-1];
+		unsigned i1;
+		for(i1 = 1; i1 < encoder->protected_->num_metadata_blocks; i1++) {
+			if(0 != encoder->protected_->metadata[i1] && encoder->protected_->metadata[i1]->type == FLAC__METADATA_TYPE_VORBIS_COMMENT) {
+				FLAC__StreamMetadata *vc = encoder->protected_->metadata[i1];
+				for( ; i1 > 0; i1--)
+					encoder->protected_->metadata[i1] = encoder->protected_->metadata[i1-1];
 				encoder->protected_->metadata[0] = vc;
 				break;
 			}
@@ -791,10 +760,10 @@
 #endif
 	/* keep track of any SEEKTABLE block */
 	if(0 != encoder->protected_->metadata && encoder->protected_->num_metadata_blocks > 0) {
-		unsigned i;
-		for(i = 0; i < encoder->protected_->num_metadata_blocks; i++) {
-			if(0 != encoder->protected_->metadata[i] && encoder->protected_->metadata[i]->type == FLAC__METADATA_TYPE_SEEKTABLE) {
-				encoder->private_->seek_table = &encoder->protected_->metadata[i]->data.seek_table;
+		unsigned i2;
+		for(i2 = 0; i2 < encoder->protected_->num_metadata_blocks; i2++) {
+			if(0 != encoder->protected_->metadata[i2] && encoder->protected_->metadata[i2]->type == FLAC__METADATA_TYPE_SEEKTABLE) {
+				encoder->private_->seek_table = &encoder->protected_->metadata[i2]->data.seek_table;
 				break; /* take only the first one */
 			}
 		}
@@ -836,7 +805,7 @@
 				metadata_picture_has_type1 = true;
 				/* standard icon must be 32x32 pixel PNG */
 				if(
-					m->data.picture.type == FLAC__STREAM_METADATA_PICTURE_TYPE_FILE_ICON_STANDARD && 
+					m->data.picture.type == FLAC__STREAM_METADATA_PICTURE_TYPE_FILE_ICON_STANDARD &&
 					(
 						(strcmp(m->data.picture.mime_type, "image/png") && strcmp(m->data.picture.mime_type, "-->")) ||
 						m->data.picture.width != 32 ||
@@ -901,7 +870,7 @@
 	encoder->private_->current_frame_number = 0;
 
 	encoder->private_->use_wide_by_block = (encoder->protected_->bits_per_sample + FLAC__bitmath_ilog2(encoder->protected_->blocksize)+1 > 30);
-	encoder->private_->use_wide_by_order = (encoder->protected_->bits_per_sample + FLAC__bitmath_ilog2(max(encoder->protected_->max_lpc_order, FLAC__MAX_FIXED_ORDER))+1 > 30); /*@@@ need to use this? */
+	encoder->private_->use_wide_by_order = (encoder->protected_->bits_per_sample + FLAC__bitmath_ilog2(flac_max(encoder->protected_->max_lpc_order, FLAC__MAX_FIXED_ORDER))+1 > 30); /*@@@ need to use this? */
 	encoder->private_->use_wide_by_partition = (false); /*@@@ need to set this */
 
 	/*
@@ -912,7 +881,9 @@
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 	encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
 #endif
+	encoder->private_->local_precompute_partition_info_sums = precompute_partition_info_sums_;
 	encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor;
+	encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide;
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients;
 	encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide;
@@ -925,21 +896,23 @@
 #  ifdef FLAC__CPU_IA32
 		FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
 #   ifdef FLAC__HAS_NASM
-		if(encoder->private_->cpuinfo.data.ia32.sse) {
+		if(encoder->private_->cpuinfo.ia32.sse) {
 			if(encoder->protected_->max_lpc_order < 4)
 				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
 			else if(encoder->protected_->max_lpc_order < 8)
 				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8;
 			else if(encoder->protected_->max_lpc_order < 12)
 				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12;
+			else if(encoder->protected_->max_lpc_order < 16)
+				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16;
 			else
 				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
 		}
-		else if(encoder->private_->cpuinfo.data.ia32._3dnow)
-			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow;
 		else
 			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
-		if(encoder->private_->cpuinfo.data.ia32.mmx) {
+
+		encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32; /* OPT_IA32: was really necessary for GCC < 4.9 */
+		if(encoder->private_->cpuinfo.ia32.mmx) {
 			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32;
 			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx;
 		}
@@ -947,16 +920,137 @@
 			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32;
 			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32;
 		}
-		if(encoder->private_->cpuinfo.data.ia32.mmx && encoder->private_->cpuinfo.data.ia32.cmov)
+
+		if(encoder->private_->cpuinfo.ia32.mmx && encoder->private_->cpuinfo.ia32.cmov)
 			encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov;
 #   endif /* FLAC__HAS_NASM */
-#  endif /* FLAC__CPU_IA32 */
+#   ifdef FLAC__HAS_X86INTRIN
+#    if defined FLAC__SSE_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.sse) {
+			if(encoder->protected_->max_lpc_order < 4)
+				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
+			else if(encoder->protected_->max_lpc_order < 8)
+				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8;
+			else if(encoder->protected_->max_lpc_order < 12)
+				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
+			else if(encoder->protected_->max_lpc_order < 16)
+				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
+			else
+				encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
+		}
+#    endif
+
+#    ifdef FLAC__SSE2_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.sse2) {
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
+		}
+#    endif
+#    ifdef FLAC__SSE4_1_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.sse41) {
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41;
+		}
+#    endif
+#    ifdef FLAC__AVX2_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.avx2) {
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
+		}
+#    endif
+
+#    ifdef FLAC__SSE2_SUPPORTED
+		if (encoder->private_->cpuinfo.ia32.sse2) {
+			encoder->private_->local_fixed_compute_best_predictor      = FLAC__fixed_compute_best_predictor_intrin_sse2;
+			encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
+		}
+#    endif
+#    ifdef FLAC__SSSE3_SUPPORTED
+		if (encoder->private_->cpuinfo.ia32.ssse3) {
+			encoder->private_->local_fixed_compute_best_predictor      = FLAC__fixed_compute_best_predictor_intrin_ssse3;
+			encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_ssse3;
+		}
+#    endif
+#   endif /* FLAC__HAS_X86INTRIN */
+#  elif defined FLAC__CPU_X86_64
+		FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
+#   ifdef FLAC__HAS_X86INTRIN
+#    ifdef FLAC__SSE_SUPPORTED
+		if(encoder->protected_->max_lpc_order < 4)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
+		else if(encoder->protected_->max_lpc_order < 8)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8;
+		else if(encoder->protected_->max_lpc_order < 12)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
+		else if(encoder->protected_->max_lpc_order < 16)
+			encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
+#    endif
+
+#    ifdef FLAC__SSE2_SUPPORTED
+		encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
+#    endif
+#    ifdef FLAC__SSE4_1_SUPPORTED
+		if(encoder->private_->cpuinfo.x86.sse41) {
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41;
+		}
+#    endif
+#    ifdef FLAC__AVX2_SUPPORTED
+		if(encoder->private_->cpuinfo.x86.avx2) {
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients       = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2;
+			encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
+		}
+#    endif
+
+#    ifdef FLAC__SSE2_SUPPORTED
+		encoder->private_->local_fixed_compute_best_predictor      = FLAC__fixed_compute_best_predictor_intrin_sse2;
+		encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
+#    endif
+#    ifdef FLAC__SSSE3_SUPPORTED
+		if (encoder->private_->cpuinfo.x86.ssse3) {
+			encoder->private_->local_fixed_compute_best_predictor      = FLAC__fixed_compute_best_predictor_intrin_ssse3;
+			encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_ssse3;
+		}
+#    endif
+#   endif /* FLAC__HAS_X86INTRIN */
+#  endif /* FLAC__CPU_... */
 	}
 # endif /* !FLAC__NO_ASM */
 #endif /* !FLAC__INTEGER_ONLY_LIBRARY */
+#if !defined FLAC__NO_ASM && defined FLAC__HAS_X86INTRIN
+	if(encoder->private_->cpuinfo.use_asm) {
+# if defined FLAC__CPU_IA32
+#  ifdef FLAC__SSE2_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.sse2)
+			encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
+#  endif
+#  ifdef FLAC__SSSE3_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.ssse3)
+			encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
+#  endif
+#  ifdef FLAC__AVX2_SUPPORTED
+		if(encoder->private_->cpuinfo.ia32.avx2)
+			encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_avx2;
+#  endif
+# elif defined FLAC__CPU_X86_64
+#  ifdef FLAC__SSE2_SUPPORTED
+		encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
+#  endif
+#  ifdef FLAC__SSSE3_SUPPORTED
+		if(encoder->private_->cpuinfo.x86.ssse3)
+			encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
+#  endif
+#  ifdef FLAC__AVX2_SUPPORTED
+		if(encoder->private_->cpuinfo.x86.avx2)
+			encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_avx2;
+#  endif
+# endif /* FLAC__CPU_... */
+	}
+#endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */
 	/* finally override based on wide-ness if necessary */
 	if(encoder->private_->use_wide_by_block) {
-		encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_wide;
+		encoder->private_->local_fixed_compute_best_predictor = encoder->private_->local_fixed_compute_best_predictor_wide;
 	}
 
 	/* set state to OK; from here on, errors are fatal and we'll override the state then */
@@ -997,7 +1091,7 @@
 		 */
 		encoder->private_->verify.input_fifo.size = encoder->protected_->blocksize+OVERREAD_;
 		for(i = 0; i < encoder->protected_->channels; i++) {
-			if(0 == (encoder->private_->verify.input_fifo.data[i] = (FLAC__int32*)safe_malloc_mul_2op_(sizeof(FLAC__int32), /*times*/encoder->private_->verify.input_fifo.size))) {
+			if(0 == (encoder->private_->verify.input_fifo.data[i] = safe_malloc_mul_2op_p(sizeof(FLAC__int32), /*times*/encoder->private_->verify.input_fifo.size))) {
 				encoder->protected_->state = FLAC__STREAM_ENCODER_MEMORY_ALLOCATION_ERROR;
 				return FLAC__STREAM_ENCODER_INIT_STATUS_ENCODER_ERROR;
 			}
@@ -1007,10 +1101,12 @@
 		/*
 		 * Now set up a stream decoder for verification
 		 */
-		encoder->private_->verify.decoder = FLAC__stream_decoder_new();
 		if(0 == encoder->private_->verify.decoder) {
-			encoder->protected_->state = FLAC__STREAM_ENCODER_VERIFY_DECODER_ERROR;
-			return FLAC__STREAM_ENCODER_INIT_STATUS_ENCODER_ERROR;
+			encoder->private_->verify.decoder = FLAC__stream_decoder_new();
+			if(0 == encoder->private_->verify.decoder) {
+				encoder->protected_->state = FLAC__STREAM_ENCODER_VERIFY_DECODER_ERROR;
+				return FLAC__STREAM_ENCODER_INIT_STATUS_ENCODER_ERROR;
+			}
 		}
 
 		if(FLAC__stream_decoder_init_stream(encoder->private_->verify.decoder, verify_read_callback_, /*seek_callback=*/0, /*tell_callback=*/0, /*length_callback=*/0, /*eof_callback=*/0, verify_write_callback_, verify_metadata_callback_, verify_error_callback_, /*client_data=*/encoder) != FLAC__STREAM_DECODER_INIT_STATUS_OK) {
@@ -1183,7 +1279,7 @@
 		/*is_ogg=*/true
 	);
 }
- 
+
 static FLAC__StreamEncoderInitStatus init_FILE_internal_(
 	FLAC__StreamEncoder *encoder,
 	FILE *file,
@@ -1214,6 +1310,13 @@
 	if(file == stdout)
 		file = get_binary_stdout_(); /* just to be safe */
 
+#ifdef _WIN32
+	/*
+	 * Windows can suffer quite badly from disk fragmentation. This can be
+	 * reduced significantly by setting the output buffer size to be 10MB.
+	 */
+	setvbuf(file, NULL, _IOFBF, 10*1024*1024);
+#endif
 	encoder->private_->file = file;
 
 	encoder->private_->progress_callback = progress_callback;
@@ -1245,7 +1348,7 @@
 
 	return init_status;
 }
- 
+
 FLAC_API FLAC__StreamEncoderInitStatus FLAC__stream_encoder_init_FILE(
 	FLAC__StreamEncoder *encoder,
 	FILE *file,
@@ -1255,7 +1358,7 @@
 {
 	return init_FILE_internal_(encoder, file, progress_callback, client_data, /*is_ogg=*/false);
 }
- 
+
 FLAC_API FLAC__StreamEncoderInitStatus FLAC__stream_encoder_init_ogg_FILE(
 	FLAC__StreamEncoder *encoder,
 	FILE *file,
@@ -1286,7 +1389,7 @@
 	if(encoder->protected_->state != FLAC__STREAM_ENCODER_UNINITIALIZED)
 		return FLAC__STREAM_ENCODER_INIT_STATUS_ALREADY_INITIALIZED;
 
-	file = filename? fopen(filename, "w+b") : stdout;
+	file = filename? flac_fopen(filename, "w+b") : stdout;
 
 	if(file == 0) {
 		encoder->protected_->state = FLAC__STREAM_ENCODER_IO_ERROR;
@@ -1482,11 +1585,10 @@
 	ok &= FLAC__stream_encoder_set_do_mid_side_stereo          (encoder, compression_levels_[value].do_mid_side_stereo);
 	ok &= FLAC__stream_encoder_set_loose_mid_side_stereo       (encoder, compression_levels_[value].loose_mid_side_stereo);
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
-#if 0
-	/* was: */
+#if 1
 	ok &= FLAC__stream_encoder_set_apodization                 (encoder, compression_levels_[value].apodization);
-	/* but it's too hard to specify the string in a locale-specific way */
 #else
+	/* equivalent to -A tukey(0.5) */
 	encoder->protected_->num_apodizations = 1;
 	encoder->protected_->apodizations[0].type = FLAC__APODIZATION_TUKEY;
 	encoder->protected_->apodizations[0].parameters.tukey.p = 0.5;
@@ -1590,6 +1692,48 @@
 				encoder->protected_->apodizations[encoder->protected_->num_apodizations++].type = FLAC__APODIZATION_TUKEY;
 			}
 		}
+		else if(n>15   && 0 == strncmp("partial_tukey("       , specification, 14)) {
+			FLAC__int32 tukey_parts = (FLAC__int32)strtod(specification+14, 0);
+			const char *si_1 = strchr(specification, '/');
+			FLAC__real overlap = si_1?flac_min((FLAC__real)strtod(si_1+1, 0),0.99f):0.1f;
+			FLAC__real overlap_units = 1.0f/(1.0f - overlap) - 1.0f;
+			const char *si_2 = strchr((si_1?(si_1+1):specification), '/');
+			FLAC__real tukey_p = si_2?(FLAC__real)strtod(si_2+1, 0):0.2f;
+
+			if (tukey_parts <= 1) {
+				encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.tukey.p = tukey_p;
+				encoder->protected_->apodizations[encoder->protected_->num_apodizations++].type = FLAC__APODIZATION_TUKEY;
+			}else if (encoder->protected_->num_apodizations + tukey_parts < 32){
+				FLAC__int32 m;
+				for(m = 0; m < tukey_parts; m++){
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.multiple_tukey.p = tukey_p;
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.multiple_tukey.start = m/(tukey_parts+overlap_units);
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.multiple_tukey.end = (m+1+overlap_units)/(tukey_parts+overlap_units);
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations++].type = FLAC__APODIZATION_PARTIAL_TUKEY;
+				}
+			}
+		}
+		else if(n>16   && 0 == strncmp("punchout_tukey("       , specification, 15)) {
+			FLAC__int32 tukey_parts = (FLAC__int32)strtod(specification+15, 0);
+			const char *si_1 = strchr(specification, '/');
+			FLAC__real overlap = si_1?flac_min((FLAC__real)strtod(si_1+1, 0),0.99f):0.2f;
+			FLAC__real overlap_units = 1.0f/(1.0f - overlap) - 1.0f;
+			const char *si_2 = strchr((si_1?(si_1+1):specification), '/');
+			FLAC__real tukey_p = si_2?(FLAC__real)strtod(si_2+1, 0):0.2f;
+
+			if (tukey_parts <= 1) {
+				encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.tukey.p = tukey_p;
+				encoder->protected_->apodizations[encoder->protected_->num_apodizations++].type = FLAC__APODIZATION_TUKEY;
+			}else if (encoder->protected_->num_apodizations + tukey_parts < 32){
+				FLAC__int32 m;
+				for(m = 0; m < tukey_parts; m++){
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.multiple_tukey.p = tukey_p;
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.multiple_tukey.start = m/(tukey_parts+overlap_units);
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations].parameters.multiple_tukey.end = (m+1+overlap_units)/(tukey_parts+overlap_units);
+					encoder->protected_->apodizations[encoder->protected_->num_apodizations++].type = FLAC__APODIZATION_PUNCHOUT_TUKEY;
+				}
+			}
+		}
 		else if(n==5  && 0 == strncmp("welch"        , specification, n))
 			encoder->protected_->apodizations[encoder->protected_->num_apodizations++].type = FLAC__APODIZATION_WELCH;
 		if (encoder->protected_->num_apodizations == 32)
@@ -1736,7 +1880,7 @@
 	}
 	if(num_blocks) {
 		FLAC__StreamMetadata **m;
-		if(0 == (m = (FLAC__StreamMetadata**)safe_malloc_mul_2op_(sizeof(m[0]), /*times*/num_blocks)))
+		if(0 == (m = safe_malloc_mul_2op_p(sizeof(m[0]), /*times*/num_blocks)))
 			return false;
 		memcpy(m, metadata, sizeof(m[0]) * num_blocks);
 		encoder->protected_->metadata = m;
@@ -1990,7 +2134,7 @@
 	FLAC__ASSERT(encoder->protected_->state == FLAC__STREAM_ENCODER_OK);
 
 	do {
-		const unsigned n = min(blocksize+OVERREAD_-encoder->private_->current_sample_number, samples-j);
+		const unsigned n = flac_min(blocksize+OVERREAD_-encoder->private_->current_sample_number, samples-j);
 
 		if(encoder->protected_->verify)
 			append_to_verify_fifo_(&encoder->private_->verify.input_fifo, buffer, j, channels, n);
@@ -2053,7 +2197,7 @@
 		 */
 		do {
 			if(encoder->protected_->verify)
-				append_to_verify_fifo_interleaved_(&encoder->private_->verify.input_fifo, buffer, j, channels, min(blocksize+OVERREAD_-encoder->private_->current_sample_number, samples-j));
+				append_to_verify_fifo_interleaved_(&encoder->private_->verify.input_fifo, buffer, j, channels, flac_min(blocksize+OVERREAD_-encoder->private_->current_sample_number, samples-j));
 
 			/* "i <= blocksize" to overread 1 sample; see comment in OVERREAD_ decl */
 			for(i = encoder->private_->current_sample_number; i <= blocksize && j < samples; i++, j++) {
@@ -2088,7 +2232,7 @@
 		 */
 		do {
 			if(encoder->protected_->verify)
-				append_to_verify_fifo_interleaved_(&encoder->private_->verify.input_fifo, buffer, j, channels, min(blocksize+OVERREAD_-encoder->private_->current_sample_number, samples-j));
+				append_to_verify_fifo_interleaved_(&encoder->private_->verify.input_fifo, buffer, j, channels, flac_min(blocksize+OVERREAD_-encoder->private_->current_sample_number, samples-j));
 
 			/* "i <= blocksize" to overread 1 sample; see comment in OVERREAD_ decl */
 			for(i = encoder->private_->current_sample_number; i <= blocksize && j < samples; i++, j++) {
@@ -2171,6 +2315,8 @@
 #if FLAC__HAS_OGG
 	FLAC__ogg_encoder_aspect_set_defaults(&encoder->protected_->ogg_encoder_aspect);
 #endif
+
+	FLAC__stream_encoder_set_compression_level(encoder, 5);
 }
 
 void free_(FLAC__StreamEncoder *encoder)
@@ -2269,8 +2415,8 @@
 
 	ok = true;
 
-	/* WATCHOUT: FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx()
-	 * requires that the input arrays (in our case the integer signals)
+	/* WATCHOUT: FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx() and ..._intrin_sse2()
+	 * require that the input arrays (in our case the integer signals)
 	 * have a buffer of up to 3 zeroes in front (at negative indices) for
 	 * alignment purposes; we use 4 in front to keep the data well-aligned.
 	 */
@@ -2367,6 +2513,12 @@
 				case FLAC__APODIZATION_TUKEY:
 					FLAC__window_tukey(encoder->private_->window[i], new_blocksize, encoder->protected_->apodizations[i].parameters.tukey.p);
 					break;
+				case FLAC__APODIZATION_PARTIAL_TUKEY:
+					FLAC__window_partial_tukey(encoder->private_->window[i], new_blocksize, encoder->protected_->apodizations[i].parameters.multiple_tukey.p, encoder->protected_->apodizations[i].parameters.multiple_tukey.start, encoder->protected_->apodizations[i].parameters.multiple_tukey.end);
+					break;
+				case FLAC__APODIZATION_PUNCHOUT_TUKEY:
+					FLAC__window_punchout_tukey(encoder->private_->window[i], new_blocksize, encoder->protected_->apodizations[i].parameters.multiple_tukey.p, encoder->protected_->apodizations[i].parameters.multiple_tukey.start, encoder->protected_->apodizations[i].parameters.multiple_tukey.end);
+					break;
 				case FLAC__APODIZATION_WELCH:
 					FLAC__window_welch(encoder->private_->window[i], new_blocksize);
 					break;
@@ -2428,8 +2580,8 @@
 	FLAC__bitwriter_clear(encoder->private_->frame);
 
 	if(samples > 0) {
-		encoder->private_->streaminfo.data.stream_info.min_framesize = (unsigned)min(bytes, encoder->private_->streaminfo.data.stream_info.min_framesize);
-		encoder->private_->streaminfo.data.stream_info.max_framesize = (unsigned)max(bytes, encoder->private_->streaminfo.data.stream_info.max_framesize);
+		encoder->private_->streaminfo.data.stream_info.min_framesize = flac_min(bytes, encoder->private_->streaminfo.data.stream_info.min_framesize);
+		encoder->private_->streaminfo.data.stream_info.max_framesize = flac_max(bytes, encoder->private_->streaminfo.data.stream_info.max_framesize);
 	}
 
 	return true;
@@ -2440,6 +2592,10 @@
 	FLAC__StreamEncoderWriteStatus status;
 	FLAC__uint64 output_position = 0;
 
+#if FLAC__HAS_OGG == 0
+	(void)is_last_block;
+#endif
+
 	/* FLAC__STREAM_ENCODER_TELL_STATUS_UNSUPPORTED just means we didn't get the offset; no error */
 	if(encoder->private_->tell_callback && encoder->private_->tell_callback(encoder, &output_position, encoder->private_->client_data) == FLAC__STREAM_ENCODER_TELL_STATUS_ERROR) {
 		encoder->protected_->state = FLAC__STREAM_ENCODER_CLIENT_ERROR;
@@ -2516,7 +2672,7 @@
 		 * when the encoder goes back to write metadata, 'current_frame'
 		 * will drop back to 0.
 		 */
-		encoder->private_->frames_written = max(encoder->private_->frames_written, encoder->private_->current_frame_number+1);
+		encoder->private_->frames_written = flac_max(encoder->private_->frames_written, encoder->private_->current_frame_number+1);
 	}
 	else
 		encoder->protected_->state = FLAC__STREAM_ENCODER_CLIENT_ERROR;
@@ -2527,7 +2683,7 @@
 /* Gets called when the encoding process has finished so that we can update the STREAMINFO and SEEKTABLE blocks.  */
 void update_metadata_(const FLAC__StreamEncoder *encoder)
 {
-	FLAC__byte b[max(6, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)];
+	FLAC__byte b[flac_max(6u, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)];
 	const FLAC__StreamMetadata *metadata = &encoder->private_->streaminfo;
 	const FLAC__uint64 samples = metadata->data.stream_info.total_samples;
 	const unsigned min_framesize = metadata->data.stream_info.min_framesize;
@@ -2692,7 +2848,7 @@
 		FLAC__OGG_MAPPING_NUM_HEADERS_LENGTH +
 		FLAC__STREAM_SYNC_LENGTH
 	;
-	FLAC__byte b[max(6, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)];
+	FLAC__byte b[flac_max(6u, FLAC__STREAM_METADATA_SEEKPOINT_LENGTH)];
 	const FLAC__StreamMetadata *metadata = &encoder->private_->streaminfo;
 	const FLAC__uint64 samples = metadata->data.stream_info.total_samples;
 	const unsigned min_framesize = metadata->data.stream_info.min_framesize;
@@ -2943,9 +3099,9 @@
 	}
 	else {
 		max_partition_order = FLAC__format_get_max_rice_partition_order_from_blocksize(encoder->protected_->blocksize);
-		max_partition_order = min(max_partition_order, encoder->protected_->max_residual_partition_order);
+		max_partition_order = flac_min(max_partition_order, encoder->protected_->max_residual_partition_order);
 	}
-	min_partition_order = min(min_partition_order, max_partition_order);
+	min_partition_order = flac_min(min_partition_order, max_partition_order);
 
 	/*
 	 * Setup the frame
@@ -3194,7 +3350,7 @@
 #endif
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 	FLAC__double lpc_residual_bits_per_sample;
-	FLAC__real autoc[FLAC__MAX_LPC_ORDER+1]; /* WATCHOUT: the size is important even though encoder->protected_->max_lpc_order might be less; some asm routines need all the space */
+	FLAC__real autoc[FLAC__MAX_LPC_ORDER+1]; /* WATCHOUT: the size is important even though encoder->protected_->max_lpc_order might be less; some asm and x86 intrinsic routines need all the space */
 	FLAC__double lpc_error[FLAC__MAX_LPC_ORDER];
 	unsigned min_lpc_order, max_lpc_order, lpc_order;
 	unsigned min_qlp_coeff_precision, max_qlp_coeff_precision, qlp_coeff_precision;
@@ -3347,10 +3503,10 @@
 								}
 								if(encoder->protected_->do_qlp_coeff_prec_search) {
 									min_qlp_coeff_precision = FLAC__MIN_QLP_COEFF_PRECISION;
-									/* try to ensure a 32-bit datapath throughout for 16bps(+1bps for side channel) or less */
-									if(subframe_bps <= 17) {
-										max_qlp_coeff_precision = min(32 - subframe_bps - lpc_order, FLAC__MAX_QLP_COEFF_PRECISION);
-										max_qlp_coeff_precision = max(max_qlp_coeff_precision, min_qlp_coeff_precision);
+									/* try to keep qlp coeff precision such that only 32-bit math is required for decode of <=16bps streams */
+									if(subframe_bps <= 16) {
+										max_qlp_coeff_precision = flac_min(32 - subframe_bps - FLAC__bitmath_ilog2(lpc_order), FLAC__MAX_QLP_COEFF_PRECISION);
+										max_qlp_coeff_precision = flac_max(max_qlp_coeff_precision, min_qlp_coeff_precision);
 									}
 									else
 										max_qlp_coeff_precision = FLAC__MAX_QLP_COEFF_PRECISION;
@@ -3585,7 +3741,7 @@
 	FLAC__EntropyCodingMethod_PartitionedRiceContents *partitioned_rice_contents
 )
 {
-	FLAC__int32 qlp_coeff[FLAC__MAX_LPC_ORDER];
+	FLAC__int32 qlp_coeff[FLAC__MAX_LPC_ORDER]; /* WATCHOUT: the size is important; some x86 intrinsic routines need more than lpc order elements */
 	unsigned i, residual_bits, estimate;
 	int quantization, ret;
 	const unsigned residual_samples = blocksize - order;
@@ -3594,7 +3750,7 @@
 	if(subframe_bps <= 16) {
 		FLAC__ASSERT(order > 0);
 		FLAC__ASSERT(order <= FLAC__MAX_LPC_ORDER);
-		qlp_coeff_precision = min(qlp_coeff_precision, 32 - subframe_bps - FLAC__bitmath_ilog2(order));
+		qlp_coeff_precision = flac_min(qlp_coeff_precision, 32 - subframe_bps - FLAC__bitmath_ilog2(order));
 	}
 
 	ret = FLAC__lpc_quantize_coefficients(lp_coeff, order, qlp_coeff_precision, qlp_coeff, &quantization);
@@ -3698,9 +3854,9 @@
 	const unsigned blocksize = residual_samples + predictor_order;
 
 	max_partition_order = FLAC__format_get_max_rice_partition_order_from_blocksize_limited_max_and_predictor_order(max_partition_order, blocksize, predictor_order);
-	min_partition_order = min(min_partition_order, max_partition_order);
+	min_partition_order = flac_min(min_partition_order, max_partition_order);
 
-	precompute_partition_info_sums_(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, bps);
+	private_->local_precompute_partition_info_sums(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, bps);
 
 	if(do_escape_coding)
 		precompute_partition_info_escapes_(residual, raw_bits_per_partition, residual_samples, predictor_order, min_partition_order, max_partition_order);
@@ -3752,10 +3908,10 @@
 		unsigned partition;
 
 		/* save best parameters and raw_bits */
-		FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(prc, max(6, best_partition_order));
-		memcpy(prc->parameters, private_->partitioned_rice_contents_extra[best_parameters_index].parameters, sizeof(unsigned)*((size_t)1<<(best_partition_order)));
+		FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(prc, flac_max(6u, best_partition_order));
+		memcpy(prc->parameters, private_->partitioned_rice_contents_extra[best_parameters_index].parameters, sizeof(unsigned)*(1<<(best_partition_order)));
 		if(do_escape_coding)
-			memcpy(prc->raw_bits, private_->partitioned_rice_contents_extra[best_parameters_index].raw_bits, sizeof(unsigned)*((size_t)1<<(best_partition_order)));
+			memcpy(prc->raw_bits, private_->partitioned_rice_contents_extra[best_parameters_index].raw_bits, sizeof(unsigned)*(1<<(best_partition_order)));
 		/*
 		 * Now need to check if the type should be changed to
 		 * FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE2 based on the
@@ -3772,17 +3928,6 @@
 	return best_residual_bits;
 }
 
-#if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && defined FLAC__HAS_NASM
-extern void precompute_partition_info_sums_32bit_asm_ia32_(
-	const FLAC__int32 residual[],
-	FLAC__uint64 abs_residual_partition_sums[],
-	unsigned blocksize,
-	unsigned predictor_order,
-	unsigned min_partition_order,
-	unsigned max_partition_order
-);
-#endif
-
 void precompute_partition_info_sums_(
 	const FLAC__int32 residual[],
 	FLAC__uint64 abs_residual_partition_sums[],
@@ -3798,21 +3943,12 @@
 
 	FLAC__ASSERT(default_partition_samples > predictor_order);
 
-#if defined(FLAC__CPU_IA32) && !defined FLAC__NO_ASM && defined FLAC__HAS_NASM
-	/* slightly pessimistic but still catches all common cases */
-	/* WATCHOUT: "+ bps" is an assumption that the average residual magnitude will not be more than "bps" bits */
-	if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) {
-		precompute_partition_info_sums_32bit_asm_ia32_(residual, abs_residual_partition_sums, residual_samples + predictor_order, predictor_order, min_partition_order, max_partition_order);
-		return;
-	}
-#endif
-
 	/* first do max_partition_order */
 	{
 		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
-		/* slightly pessimistic but still catches all common cases */
-		/* WATCHOUT: "+ bps" is an assumption that the average residual magnitude will not be more than "bps" bits */
-		if(FLAC__bitmath_ilog2(default_partition_samples) + bps < 32) {
+		/* WATCHOUT: "+ bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum
+		 * assumed size of the average residual magnitude */
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
 			FLAC__uint32 abs_residual_partition_sum;
 
 			for(partition = residual_sample = 0; partition < partitions; partition++) {
@@ -3904,7 +4040,7 @@
 		for(i = 0; i < partitions; i++) {
 			m = raw_bits_per_partition[from_partition];
 			from_partition++;
-			raw_bits_per_partition[to_partition] = max(m, raw_bits_per_partition[from_partition]);
+			raw_bits_per_partition[to_partition] = flac_max(m, raw_bits_per_partition[from_partition]);
 			from_partition++;
 			to_partition++;
 		}
@@ -3912,7 +4048,7 @@
 }
 
 #ifdef EXACT_RICE_BITS_CALCULATION
-static FLaC__INLINE unsigned count_rice_bits_in_partition_(
+static inline unsigned count_rice_bits_in_partition_(
 	const unsigned rice_parameter,
 	const unsigned partition_samples,
 	const FLAC__int32 *residual
@@ -3927,7 +4063,7 @@
 	return partition_bits;
 }
 #else
-static FLaC__INLINE unsigned count_rice_bits_in_partition_(
+static inline unsigned count_rice_bits_in_partition_(
 	const unsigned rice_parameter,
 	const unsigned partition_samples,
 	const FLAC__uint64 abs_residual_partition_sum
@@ -3982,7 +4118,7 @@
 	FLAC__ASSERT(suggested_rice_parameter < FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE2_ESCAPE_PARAMETER);
 	FLAC__ASSERT(rice_parameter_limit <= FLAC__ENTROPY_CODING_METHOD_PARTITIONED_RICE2_ESCAPE_PARAMETER);
 
-	FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(partitioned_rice_contents, max(6, partition_order));
+	FLAC__format_entropy_coding_method_partitioned_rice_contents_ensure_size(partitioned_rice_contents, flac_max(6u, partition_order));
 	parameters = partitioned_rice_contents->parameters;
 	raw_bits = partitioned_rice_contents->raw_bits;
 
@@ -4056,8 +4192,35 @@
 			 * in the partition, so the actual mean is
 			 * mean/partition_samples
 			 */
+#if 0 /* old simple code */
 			for(rice_parameter = 0, k = partition_samples; k < mean; rice_parameter++, k <<= 1)
 				;
+#else
+#if defined FLAC__CPU_X86_64 /* and other 64-bit arch, too */
+			if(mean <= 0x80000000/512) { /* 512: more or less optimal for both 16- and 24-bit input */
+#else
+			if(mean <= 0x80000000/8) { /* 32-bit arch: use 32-bit math if possible */
+#endif
+				FLAC__uint32 k2, mean2 = (FLAC__uint32) mean;
+				rice_parameter = 0; k2 = partition_samples;
+				while(k2*8 < mean2) { /* requires: mean <= (2^31)/8 */
+					rice_parameter += 4; k2 <<= 4; /* tuned for 16-bit input */
+				}
+				while(k2 < mean2) { /* requires: mean <= 2^31 */
+					rice_parameter++; k2 <<= 1;
+				}
+			}
+			else {
+				rice_parameter = 0; k = partition_samples;
+				if(mean <= FLAC__U64L(0x8000000000000000)/128) /* usually mean is _much_ smaller than this value */
+					while(k*128 < mean) { /* requires: mean <= (2^63)/128 */
+						rice_parameter += 8; k <<= 8; /* tuned for 24-bit input */
+					}
+				while(k < mean) { /* requires: mean <= 2^63 */
+					rice_parameter++; k <<= 1;
+				}
+			}
+#endif
 			if(rice_parameter >= rice_parameter_limit) {
 #ifdef DEBUG_VERBOSE
 				fprintf(stderr, "clipping rice_parameter (%u -> %u) @6\n", rice_parameter, rice_parameter_limit - 1);
@@ -4274,7 +4437,7 @@
 {
 	(void)client_data;
 
-	if(fseeko(encoder->private_->file, (off_t)absolute_byte_offset, SEEK_SET) < 0)
+	if(fseeko(encoder->private_->file, (FLAC__off_t)absolute_byte_offset, SEEK_SET) < 0)
 		return FLAC__STREAM_ENCODER_SEEK_STATUS_ERROR;
 	else
 		return FLAC__STREAM_ENCODER_SEEK_STATUS_OK;
@@ -4282,7 +4445,7 @@
 
 FLAC__StreamEncoderTellStatus file_tell_callback_(const FLAC__StreamEncoder *encoder, FLAC__uint64 *absolute_byte_offset, void *client_data)
 {
-	off_t offset;
+	FLAC__off_t offset;
 
 	(void)client_data;
 
diff --git a/src/libFLAC/stream_encoder_framing.c b/src/libFLAC/stream_encoder_framing.c
index beb6fee..959eca0 100644
--- a/src/libFLAC/stream_encoder_framing.c
+++ b/src/libFLAC/stream_encoder_framing.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2000,2001,2002,2003,2004,2005,2006,2007  Josh Coalson
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,7 +30,7 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
@@ -39,11 +40,6 @@
 #include "private/crc.h"
 #include "FLAC/assert.h"
 
-#ifdef max
-#undef max
-#endif
-#define max(x,y) ((x)>(y)?(x):(y))
-
 static FLAC__bool add_entropy_coding_method_(FLAC__BitWriter *bw, const FLAC__EntropyCodingMethod *method);
 static FLAC__bool add_residual_partitioned_rice_(FLAC__BitWriter *bw, const FLAC__int32 residual[], const unsigned residual_samples, const unsigned predictor_order, const unsigned rice_parameters[], const unsigned raw_bits[], const unsigned partition_order, const FLAC__bool is_extended);
 
@@ -166,11 +162,11 @@
 				if(!FLAC__bitwriter_write_raw_uint32(bw, track->num_indices, FLAC__STREAM_METADATA_CUESHEET_TRACK_NUM_INDICES_LEN))
 					return false;
 				for(j = 0; j < track->num_indices; j++) {
-					const FLAC__StreamMetadata_CueSheet_Index *index = track->indices + j;
+					const FLAC__StreamMetadata_CueSheet_Index *indx = track->indices + j;
 
-					if(!FLAC__bitwriter_write_raw_uint64(bw, index->offset, FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN))
+					if(!FLAC__bitwriter_write_raw_uint64(bw, indx->offset, FLAC__STREAM_METADATA_CUESHEET_INDEX_OFFSET_LEN))
 						return false;
-					if(!FLAC__bitwriter_write_raw_uint32(bw, index->number, FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN))
+					if(!FLAC__bitwriter_write_raw_uint32(bw, indx->number, FLAC__STREAM_METADATA_CUESHEET_INDEX_NUMBER_LEN))
 						return false;
 					if(!FLAC__bitwriter_write_zeroes(bw, FLAC__STREAM_METADATA_CUESHEET_INDEX_RESERVED_LEN))
 						return false;
@@ -179,15 +175,15 @@
 			break;
 		case FLAC__METADATA_TYPE_PICTURE:
 			{
-				unsigned len;
+				size_t len;
 				if(!FLAC__bitwriter_write_raw_uint32(bw, metadata->data.picture.type, FLAC__STREAM_METADATA_PICTURE_TYPE_LEN))
 					return false;
-				len = (unsigned)strlen(metadata->data.picture.mime_type);
+				len = strlen(metadata->data.picture.mime_type);
 				if(!FLAC__bitwriter_write_raw_uint32(bw, len, FLAC__STREAM_METADATA_PICTURE_MIME_TYPE_LENGTH_LEN))
 					return false;
 				if(!FLAC__bitwriter_write_byte_block(bw, (const FLAC__byte*)metadata->data.picture.mime_type, len))
 					return false;
-				len = (unsigned)strlen((const char *)metadata->data.picture.description);
+				len = strlen((const char *)metadata->data.picture.description);
 				if(!FLAC__bitwriter_write_raw_uint32(bw, len, FLAC__STREAM_METADATA_PICTURE_DESCRIPTION_LENGTH_LEN))
 					return false;
 				if(!FLAC__bitwriter_write_byte_block(bw, metadata->data.picture.description, len))
diff --git a/src/libFLAC/stream_encoder_intrin_avx2.c b/src/libFLAC/stream_encoder_intrin_avx2.c
new file mode 100644
index 0000000..3aa3197
--- /dev/null
+++ b/src/libFLAC/stream_encoder_intrin_avx2.c
@@ -0,0 +1,142 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/stream_encoder.h"
+#include "private/bitmath.h"
+#ifdef FLAC__AVX2_SUPPORTED
+
+#include <stdlib.h>    /* for abs() */
+#include <immintrin.h> /* AVX2 */
+#include "FLAC/assert.h"
+
+FLAC__SSE_TARGET("avx2")
+void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
+		unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
+{
+	const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
+	unsigned partitions = 1u << max_partition_order;
+
+	FLAC__ASSERT(default_partition_samples > predictor_order);
+
+	/* first do max_partition_order */
+	{
+		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
+		__m256i res256, sum256;
+		__m128i res128, sum128;
+
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
+			for(partition = residual_sample = 0; partition < partitions; partition++) {
+				end += default_partition_samples;
+				sum256 = _mm256_setzero_si256();
+
+				for( ; (int)residual_sample < (int)end-7; residual_sample+=8) {
+					res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample)));
+					sum256 = _mm256_add_epi32(sum256, res256);
+				}
+
+				sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
+
+				for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
+					res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
+					sum128 = _mm_add_epi32(sum128, res128);
+				}
+
+				for( ; residual_sample < end; residual_sample++) {
+					res128 = _mm_cvtsi32_si128(residual[residual_sample]);
+					res128 = _mm_abs_epi32(res128);
+					sum128 = _mm_add_epi32(sum128, res128);
+				}
+
+				sum128 = _mm_hadd_epi32(sum128, sum128);
+				sum128 = _mm_hadd_epi32(sum128, sum128);
+				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
+			}
+		}
+		else { /* have to pessimistically use 64 bits for accumulator */
+			for(partition = residual_sample = 0; partition < partitions; partition++) {
+				end += default_partition_samples;
+				sum256 = _mm256_setzero_si256();
+
+				for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
+					res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
+					res256 = _mm256_cvtepu32_epi64(res128);
+					sum256 = _mm256_add_epi64(sum256, res256);
+				}
+
+				sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
+
+				for( ; (int)residual_sample < (int)end-1; residual_sample+=2) {
+					res128 = _mm_loadl_epi64((const __m128i*)(residual+residual_sample));
+					res128 = _mm_abs_epi32(res128);
+					res128 = _mm_cvtepu32_epi64(res128);
+					sum128 = _mm_add_epi64(sum128, res128);
+				}
+
+				for( ; residual_sample < end; residual_sample++) {
+					res128 = _mm_cvtsi32_si128(residual[residual_sample]);
+					res128 = _mm_abs_epi32(res128);
+					sum128 = _mm_add_epi64(sum128, res128);
+				}
+
+				sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
+				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128);
+			}
+		}
+	}
+
+	/* now merge partitions for lower orders */
+	{
+		unsigned from_partition = 0, to_partition = partitions;
+		int partition_order;
+		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
+			unsigned i;
+			partitions >>= 1;
+			for(i = 0; i < partitions; i++) {
+				abs_residual_partition_sums[to_partition++] =
+					abs_residual_partition_sums[from_partition  ] +
+					abs_residual_partition_sums[from_partition+1];
+				from_partition += 2;
+			}
+		}
+	}
+	_mm256_zeroupper();
+}
+
+#endif /* FLAC__AVX2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c b/src/libFLAC/stream_encoder_intrin_sse2.c
new file mode 100644
index 0000000..ec5541c
--- /dev/null
+++ b/src/libFLAC/stream_encoder_intrin_sse2.c
@@ -0,0 +1,159 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/stream_encoder.h"
+#include "private/bitmath.h"
+#ifdef FLAC__SSE2_SUPPORTED
+
+#include <stdlib.h>    /* for abs() */
+#include <emmintrin.h> /* SSE2 */
+#include "FLAC/assert.h"
+
+FLAC__SSE_TARGET("sse2")
+void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
+		unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
+{
+	const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
+	unsigned partitions = 1u << max_partition_order;
+
+	FLAC__ASSERT(default_partition_samples > predictor_order);
+
+	/* first do max_partition_order */
+	{
+		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
+		unsigned e1, e3;
+		__m128i mm_res, mm_sum, mm_mask;
+
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
+			for(partition = residual_sample = 0; partition < partitions; partition++) {
+				end += default_partition_samples;
+				mm_sum = _mm_setzero_si128();
+
+				e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
+				if(e1 > end)
+					e1 = end; /* try flac -l 1 -b 16 and you'll be here */
+
+				/* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
+				for( ; residual_sample < e1; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
+					mm_mask = _mm_srai_epi32(mm_res, 31);
+					mm_res = _mm_xor_si128(mm_res, mm_mask);
+					mm_res = _mm_sub_epi32(mm_res, mm_mask); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
+					mm_sum = _mm_add_epi32(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < e3; residual_sample+=4) {
+					mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample));
+					mm_mask = _mm_srai_epi32(mm_res, 31);
+					mm_res = _mm_xor_si128(mm_res, mm_mask);
+					mm_res = _mm_sub_epi32(mm_res, mm_mask);
+					mm_sum = _mm_add_epi32(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < end; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
+					mm_mask = _mm_srai_epi32(mm_res, 31);
+					mm_res = _mm_xor_si128(mm_res, mm_mask);
+					mm_res = _mm_sub_epi32(mm_res, mm_mask);
+					mm_sum = _mm_add_epi32(mm_sum, mm_res);
+				}
+
+				mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8));
+				mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4));
+				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
+			}
+		}
+		else { /* have to pessimistically use 64 bits for accumulator */
+			for(partition = residual_sample = 0; partition < partitions; partition++) {
+				end += default_partition_samples;
+				mm_sum = _mm_setzero_si128();
+
+				e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
+				FLAC__ASSERT(e1 <= end);
+
+				for( ; residual_sample < e1; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]); /*  0   0   0   r0 */
+					mm_mask = _mm_srai_epi32(mm_res, 31);
+					mm_res = _mm_xor_si128(mm_res, mm_mask);
+					mm_res = _mm_sub_epi32(mm_res, mm_mask); /*  0   0   0  |r0|  ==   00   |r0_64| */
+					mm_sum = _mm_add_epi64(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < e3; residual_sample+=2) {
+					mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /*  0   0   r1  r0 */
+					mm_mask = _mm_srai_epi32(mm_res, 31);
+					mm_res = _mm_xor_si128(mm_res, mm_mask);
+					mm_res = _mm_sub_epi32(mm_res, mm_mask); /*  0   0  |r1|   |r0| */
+					mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0  |r1|  0  |r0|  ==  |r1_64|  |r0_64|  */
+					mm_sum = _mm_add_epi64(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < end; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
+					mm_mask = _mm_srai_epi32(mm_res, 31);
+					mm_res = _mm_xor_si128(mm_res, mm_mask);
+					mm_res = _mm_sub_epi32(mm_res, mm_mask);
+					mm_sum = _mm_add_epi64(mm_sum, mm_res);
+				}
+
+				mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
+				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
+			}
+		}
+	}
+
+	/* now merge partitions for lower orders */
+	{
+		unsigned from_partition = 0, to_partition = partitions;
+		int partition_order;
+		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
+			unsigned i;
+			partitions >>= 1;
+			for(i = 0; i < partitions; i++) {
+				abs_residual_partition_sums[to_partition++] =
+					abs_residual_partition_sums[from_partition  ] +
+					abs_residual_partition_sums[from_partition+1];
+				from_partition += 2;
+			}
+		}
+	}
+}
+
+#endif /* FLAC__SSE2_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c b/src/libFLAC/stream_encoder_intrin_ssse3.c
new file mode 100644
index 0000000..2dbd18c
--- /dev/null
+++ b/src/libFLAC/stream_encoder_intrin_ssse3.c
@@ -0,0 +1,147 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2000-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#ifndef FLAC__NO_ASM
+#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
+#include "private/stream_encoder.h"
+#include "private/bitmath.h"
+#ifdef FLAC__SSSE3_SUPPORTED
+
+#include <stdlib.h>    /* for abs() */
+#include <tmmintrin.h> /* SSSE3 */
+#include "FLAC/assert.h"
+
+FLAC__SSE_TARGET("ssse3")
+void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
+		unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
+{
+	const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
+	unsigned partitions = 1u << max_partition_order;
+
+	FLAC__ASSERT(default_partition_samples > predictor_order);
+
+	/* first do max_partition_order */
+	{
+		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);
+		unsigned e1, e3;
+		__m128i mm_res, mm_sum;
+
+		if(FLAC__bitmath_ilog2(default_partition_samples) + bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < 32) {
+			for(partition = residual_sample = 0; partition < partitions; partition++) {
+				end += default_partition_samples;
+				mm_sum = _mm_setzero_si128();
+
+				e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
+				if(e1 > end)
+					e1 = end; /* try flac -l 1 -b 16 and you'll be here */
+
+				/* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
+				for( ; residual_sample < e1; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
+					mm_res = _mm_abs_epi32(mm_res); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */
+					mm_sum = _mm_add_epi32(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < e3; residual_sample+=4) {
+					mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample));
+					mm_res = _mm_abs_epi32(mm_res);
+					mm_sum = _mm_add_epi32(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < end; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
+					mm_res = _mm_abs_epi32(mm_res);
+					mm_sum = _mm_add_epi32(mm_sum, mm_res);
+				}
+
+				mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
+				mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
+				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
+			}
+		}
+		else { /* have to pessimistically use 64 bits for accumulator */
+			for(partition = residual_sample = 0; partition < partitions; partition++) {
+				end += default_partition_samples;
+				mm_sum = _mm_setzero_si128();
+
+				e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
+				FLAC__ASSERT(e1 <= end);
+
+				for( ; residual_sample < e1; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]); /*  0   0   0   r0 */
+					mm_res = _mm_abs_epi32(mm_res); /*  0   0   0  |r0|  ==   00   |r0_64| */
+					mm_sum = _mm_add_epi64(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < e3; residual_sample+=2) {
+					mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /*  0   0   r1  r0 */
+					mm_res = _mm_abs_epi32(mm_res); /*  0   0  |r1|   |r0| */
+					mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0  |r1|  0  |r0|  ==  |r1_64|  |r0_64|  */
+					mm_sum = _mm_add_epi64(mm_sum, mm_res);
+				}
+
+				for( ; residual_sample < end; residual_sample++) {
+					mm_res = _mm_cvtsi32_si128(residual[residual_sample]);
+					mm_res = _mm_abs_epi32(mm_res);
+					mm_sum = _mm_add_epi64(mm_sum, mm_res);
+				}
+
+				mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
+				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
+			}
+		}
+	}
+
+	/* now merge partitions for lower orders */
+	{
+		unsigned from_partition = 0, to_partition = partitions;
+		int partition_order;
+		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
+			unsigned i;
+			partitions >>= 1;
+			for(i = 0; i < partitions; i++) {
+				abs_residual_partition_sums[to_partition++] =
+					abs_residual_partition_sums[from_partition  ] +
+					abs_residual_partition_sums[from_partition+1];
+				from_partition += 2;
+			}
+		}
+	}
+}
+
+#endif /* FLAC__SSSE3_SUPPORTED */
+#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
+#endif /* FLAC__NO_ASM */
diff --git a/src/libFLAC/window.c b/src/libFLAC/window.c
index cd689c5..4387ef7 100644
--- a/src/libFLAC/window.c
+++ b/src/libFLAC/window.c
@@ -1,5 +1,6 @@
 /* libFLAC - Free Lossless Audio Codec library
- * Copyright (C) 2006,2007  Josh Coalson
+ * Copyright (C) 2006-2009  Josh Coalson
+ * Copyright (C) 2011-2014  Xiph.Org Foundation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -29,22 +30,18 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#if HAVE_CONFIG_H
+#ifdef HAVE_CONFIG_H
 #  include <config.h>
 #endif
 
 #include <math.h>
+#include "share/compat.h"
 #include "FLAC/assert.h"
 #include "FLAC/format.h"
 #include "private/window.h"
 
 #ifndef FLAC__INTEGER_ONLY_LIBRARY
 
-#ifndef M_PI
-/* math.h in VC++ doesn't seem to have this (how Microsoft is that?) */
-#define M_PI 3.14159265358979323846
-#endif
-
 
 void FLAC__window_bartlett(FLAC__real *window, const FLAC__int32 L)
 {
@@ -61,7 +58,7 @@
 		for (n = 0; n <= L/2-1; n++)
 			window[n] = 2.0f * n / (float)N;
 		for (; n <= N; n++)
-			window[n] = 2.0f - 2.0f * (N-n) / (float)N;
+			window[n] = 2.0f - 2.0f * n / (float)N;
 	}
 }
 
@@ -71,7 +68,7 @@
 	FLAC__int32 n;
 
 	for (n = 0; n < L; n++)
-		window[n] = (FLAC__real)(0.62f - 0.48f * fabs((float)n/(float)N+0.5f) + 0.38f * cos(2.0f * M_PI * ((float)n/(float)N+0.5f)));
+		window[n] = (FLAC__real)(0.62f - 0.48f * fabs((float)n/(float)N-0.5f) - 0.38f * cos(2.0f * M_PI * ((float)n/(float)N)));
 }
 
 void FLAC__window_blackman(FLAC__real *window, const FLAC__int32 L)
@@ -176,16 +173,16 @@
 	FLAC__int32 n;
 
 	if (L & 1) {
-		for (n = 1; n <= L+1/2; n++)
+		for (n = 1; n <= (L+1)/2; n++)
 			window[n-1] = 2.0f * n / ((float)L + 1.0f);
 		for (; n <= L; n++)
-			window[n-1] = - (float)(2 * (L - n + 1)) / ((float)L + 1.0f);
+			window[n-1] = (float)(2 * (L - n + 1)) / ((float)L + 1.0f);
 	}
 	else {
 		for (n = 1; n <= L/2; n++)
-			window[n-1] = 2.0f * n / (float)L;
+			window[n-1] = 2.0f * n / ((float)L + 1.0f);
 		for (; n <= L; n++)
-			window[n-1] = ((float)(2 * (L - n)) + 1.0f) / (float)L;
+			window[n-1] = (float)(2 * (L - n + 1)) / ((float)L + 1.0f);
 	}
 }
 
@@ -210,6 +207,66 @@
 	}
 }
 
+void FLAC__window_partial_tukey(FLAC__real *window, const FLAC__int32 L, const FLAC__real p, const FLAC__real start, const FLAC__real end)
+{
+	const FLAC__int32 start_n = (FLAC__int32)(start * L);
+	const FLAC__int32 end_n = (FLAC__int32)(end * L);
+	const FLAC__int32 N = end_n - start_n;
+	FLAC__int32 Np, n, i;
+
+	if (p <= 0.0f)
+		FLAC__window_partial_tukey(window, L, 0.05f, start, end);
+	else if (p >= 1.0f)
+		FLAC__window_partial_tukey(window, L, 0.95f, start, end);
+	else {
+
+		Np = (FLAC__int32)(p / 2.0f * N);
+
+		for (n = 0; n < start_n && n < L; n++)
+			window[n] = 0.0f;
+		for (i = 1; n < (start_n+Np) && n < L; n++, i++)
+			window[n] = (FLAC__real)(0.5f - 0.5f * cos(M_PI * i / Np));
+		for (; n < (end_n-Np) && n < L; n++)
+			window[n] = 1.0f;
+		for (i = Np; n < end_n && n < L; n++, i--)
+			window[n] = (FLAC__real)(0.5f - 0.5f * cos(M_PI * i / Np));
+		for (; n < L; n++)
+			window[n] = 0.0f;
+	}
+}
+
+void FLAC__window_punchout_tukey(FLAC__real *window, const FLAC__int32 L, const FLAC__real p, const FLAC__real start, const FLAC__real end)
+{
+	const FLAC__int32 start_n = (FLAC__int32)(start * L);
+	const FLAC__int32 end_n = (FLAC__int32)(end * L);
+	FLAC__int32 Ns, Ne, n, i;
+
+	if (p <= 0.0f)
+		FLAC__window_punchout_tukey(window, L, 0.05f, start, end);
+	else if (p >= 1.0f)
+		FLAC__window_punchout_tukey(window, L, 0.95f, start, end);
+	else {
+
+		Ns = (FLAC__int32)(p / 2.0f * start_n);
+		Ne = (FLAC__int32)(p / 2.0f * (L - end_n));
+
+		for (n = 0, i = 1; n < Ns && n < L; n++, i++)
+			window[n] = (FLAC__real)(0.5f - 0.5f * cos(M_PI * i / Ns));
+		for (; n < start_n-Ns && n < L; n++)
+			window[n] = 1.0f;
+		for (i = Ns; n < start_n && n < L; n++, i--)
+			window[n] = (FLAC__real)(0.5f - 0.5f * cos(M_PI * i / Ns));
+		for (; n < end_n && n < L; n++)
+			window[n] = 0.0f;
+		for (i = 1; n < end_n+Ne && n < L; n++, i++)
+			window[n] = (FLAC__real)(0.5f - 0.5f * cos(M_PI * i / Ne));
+		for (; n < L - (Ne) && n < L; n++)
+			window[n] = 1.0f;
+		for (i = Ne; n < L; n++, i--)
+			window[n] = (FLAC__real)(0.5f - 0.5f * cos(M_PI * i / Ne));
+	}
+}
+
 void FLAC__window_welch(FLAC__real *window, const FLAC__int32 L)
 {
 	const FLAC__int32 N = L - 1;
diff --git a/src/share/win_utf8_io/win_utf8_io.c b/src/share/win_utf8_io/win_utf8_io.c
new file mode 100644
index 0000000..dd22d5b
--- /dev/null
+++ b/src/share/win_utf8_io/win_utf8_io.c
@@ -0,0 +1,373 @@
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2013-2014  Xiph.Org Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/utime.h>
+#include <io.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h> /* for WideCharToMultiByte and MultiByteToWideChar */
+
+#include "share/win_utf8_io.h"
+
+#define UTF8_BUFFER_SIZE 32768
+
+static
+int local_vsnprintf(char *str, size_t size, const char *fmt, va_list va)
+{
+	int rc;
+
+#if defined _MSC_VER
+	if (size == 0)
+		return 1024;
+	rc = vsnprintf_s (str, size, _TRUNCATE, fmt, va);
+	if (rc < 0)
+		rc = size - 1;
+#elif defined __MINGW32__
+	rc = __mingw_vsnprintf (str, size, fmt, va);
+#else
+	rc = vsnprintf (str, size, fmt, va);
+#endif
+
+	return rc;
+}
+
+static UINT win_utf8_io_codepage = CP_ACP;
+
+/* convert WCHAR stored Unicode string to UTF-8. Caller is responsible for freeing memory */
+static
+char *utf8_from_wchar(const wchar_t *wstr)
+{
+	char *utf8str;
+	int len;
+
+	if (!wstr) return NULL;
+	if ((len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL)) == 0) return NULL;
+	if ((utf8str = (char *)malloc(++len)) == NULL) return NULL;
+	if (WideCharToMultiByte(CP_UTF8, 0, wstr, -1, utf8str, len, NULL, NULL) == 0) {
+		free(utf8str);
+		utf8str = NULL;
+	}
+
+	return utf8str;
+}
+
+/* convert UTF-8 back to WCHAR. Caller is responsible for freeing memory */
+static
+wchar_t *wchar_from_utf8(const char *str)
+{
+	wchar_t *widestr;
+	int len;
+
+	if (!str) return NULL;
+	len=(int)strlen(str)+1;
+	if ((widestr = (wchar_t *)malloc(len*sizeof(wchar_t))) != NULL) {
+		if (MultiByteToWideChar(win_utf8_io_codepage, 0, str, len, widestr, len) == 0) {
+			if (MultiByteToWideChar(CP_ACP, 0, str, len, widestr, len) == 0) { /* try conversion from Ansi in case the initial UTF-8 conversion had failed */
+				free(widestr);
+				widestr = NULL;
+			}
+		}
+	}
+
+	return widestr;
+}
+
+/* retrieve WCHAR commandline, expand wildcards and convert everything to UTF-8 */
+int get_utf8_argv(int *argc, char ***argv)
+{
+	typedef int (__cdecl *wgetmainargs_t)(int*, wchar_t***, wchar_t***, int, int*);
+	wgetmainargs_t wgetmainargs;
+	HMODULE handle;
+	int wargc;
+	wchar_t **wargv;
+	wchar_t **wenv;
+	char **utf8argv;
+	int ret, i;
+
+	if ((handle = LoadLibraryA("msvcrt.dll")) == NULL) return 1;
+	if ((wgetmainargs = (wgetmainargs_t)GetProcAddress(handle, "__wgetmainargs")) == NULL) return 1;
+	i = 0;
+	/* if __wgetmainargs expands wildcards then it also erroneously converts \\?\c:\path\to\file.flac to \\file.flac */
+	if (wgetmainargs(&wargc, &wargv, &wenv, 1, &i) != 0) return 1;
+	if ((utf8argv = (char **)calloc(wargc, sizeof(char*))) == NULL) return 1;
+	ret = 0;
+
+	for (i=0; i<wargc; i++) {
+		if ((utf8argv[i] = utf8_from_wchar(wargv[i])) == NULL) {
+			ret = 1;
+			break;
+		}
+	}
+
+	FreeLibrary(handle);
+
+	if (ret == 0) {
+		win_utf8_io_codepage = CP_UTF8;
+		*argc = wargc;
+		*argv = utf8argv;
+	} else {
+		for (i=0; i<wargc; i++)
+			free(utf8argv[i]);
+		free(utf8argv);
+	}
+
+	return ret;
+}
+
+/* return number of characters in the UTF-8 string */
+size_t strlen_utf8(const char *str)
+{
+	size_t len;
+	if ((len = MultiByteToWideChar(win_utf8_io_codepage, 0, str, -1, NULL, 0)) == 0)
+		len = strlen(str);
+	return len;
+}
+
+/* get the console width in characters */
+int win_get_console_width(void)
+{
+	int width = 80;
+	CONSOLE_SCREEN_BUFFER_INFO csbi;
+	HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
+	if (GetConsoleScreenBufferInfo(hOut, &csbi) != 0) width = csbi.dwSize.X;
+	return width;
+}
+
+/* print functions */
+
+int print_console(FILE *stream, const wchar_t *text, size_t len)
+{
+	static HANDLE hOut;
+	static HANDLE hErr;
+	DWORD out;
+	hOut = GetStdHandle(STD_OUTPUT_HANDLE);
+	hErr = GetStdHandle(STD_ERROR_HANDLE);
+	if (stream == stdout && hOut != INVALID_HANDLE_VALUE && GetFileType(hOut) == FILE_TYPE_CHAR) {
+		if (WriteConsoleW(hOut, text, len, &out, NULL) == 0) return -1;
+		return out;
+	} else if (stream == stderr && hErr != INVALID_HANDLE_VALUE && GetFileType(hErr) == FILE_TYPE_CHAR) {
+		if (WriteConsoleW(hErr, text, len, &out, NULL) == 0) return -1;
+		return out;
+	} else {
+		int ret = fputws(text, stream);
+		if (ret < 0) return ret;
+		return len;
+	}
+}
+
+int printf_utf8(const char *format, ...)
+{
+	char *utmp = NULL;
+	wchar_t *wout = NULL;
+	int ret = -1;
+
+	while (1) {
+		va_list argptr;
+		if (!(utmp = (char *)malloc(UTF8_BUFFER_SIZE*sizeof(char)))) break;
+		va_start(argptr, format);
+		ret = local_vsnprintf(utmp, UTF8_BUFFER_SIZE, format, argptr);
+		va_end(argptr);
+		if (ret < 0) break;
+		if (!(wout = wchar_from_utf8(utmp))) {
+			ret = -1;
+			break;
+		}
+		ret = print_console(stdout, wout, wcslen(wout));
+		break;
+	}
+	if (utmp) free(utmp);
+	if (wout) free(wout);
+
+	return ret;
+}
+
+int fprintf_utf8(FILE *stream, const char *format, ...)
+{
+	char *utmp = NULL;
+	wchar_t *wout = NULL;
+	int ret = -1;
+
+	while (1) {
+		va_list argptr;
+		if (!(utmp = (char *)malloc(UTF8_BUFFER_SIZE*sizeof(char)))) break;
+		va_start(argptr, format);
+		ret = local_vsnprintf(utmp, UTF8_BUFFER_SIZE, format, argptr);
+		va_end(argptr);
+		if (ret < 0) break;
+		if (!(wout = wchar_from_utf8(utmp))) {
+			ret = -1;
+			break;
+		}
+		ret = print_console(stream, wout, wcslen(wout));
+		break;
+	}
+	if (utmp) free(utmp);
+	if (wout) free(wout);
+
+	return ret;
+}
+
+int vfprintf_utf8(FILE *stream, const char *format, va_list argptr)
+{
+	char *utmp = NULL;
+	wchar_t *wout = NULL;
+	int ret = -1;
+
+	while (1) {
+		if (!(utmp = (char *)malloc(UTF8_BUFFER_SIZE*sizeof(char)))) break;
+		if ((ret = local_vsnprintf(utmp, UTF8_BUFFER_SIZE, format, argptr)) < 0) break;
+		if (!(wout = wchar_from_utf8(utmp))) {
+			ret = -1;
+			break;
+		}
+		ret = print_console(stream, wout, wcslen(wout));
+		break;
+	}
+	if (utmp) free(utmp);
+	if (wout) free(wout);
+
+	return ret;
+}
+
+/* file functions */
+
+FILE *fopen_utf8(const char *filename, const char *mode)
+{
+	wchar_t *wname = NULL;
+	wchar_t *wmode = NULL;
+	FILE *f = NULL;
+
+	while (1) {
+		if (!(wname = wchar_from_utf8(filename))) break;
+		if (!(wmode = wchar_from_utf8(mode))) break;
+		f = _wfopen(wname, wmode);
+		break;
+	}
+	if (wname) free(wname);
+	if (wmode) free(wmode);
+
+	return f;
+}
+
+int _stat64_utf8(const char *path, struct __stat64 *buffer)
+{
+	wchar_t *wpath;
+	int ret;
+
+	if (!(wpath = wchar_from_utf8(path))) return -1;
+	ret = _wstat64(wpath, buffer);
+	free(wpath);
+
+	return ret;
+}
+
+int chmod_utf8(const char *filename, int pmode)
+{
+	wchar_t *wname;
+	int ret;
+
+	if (!(wname = wchar_from_utf8(filename))) return -1;
+	ret = _wchmod(wname, pmode);
+	free(wname);
+
+	return ret;
+}
+
+int utime_utf8(const char *filename, struct utimbuf *times)
+{
+	wchar_t *wname;
+	struct __utimbuf64 ut;
+	int ret;
+
+	if (sizeof(*times) == sizeof(ut)) {
+		memcpy(&ut, times, sizeof(ut));
+	} else {
+		ut.actime = times->actime;
+		ut.modtime = times->modtime;
+	}
+
+	if (!(wname = wchar_from_utf8(filename))) return -1;
+	ret = _wutime64(wname, &ut);
+	free(wname);
+
+	return ret;
+}
+
+int unlink_utf8(const char *filename)
+{
+	wchar_t *wname;
+	int ret;
+
+	if (!(wname = wchar_from_utf8(filename))) return -1;
+	ret = _wunlink(wname);
+	free(wname);
+
+	return ret;
+}
+
+int rename_utf8(const char *oldname, const char *newname)
+{
+	wchar_t *wold = NULL;
+	wchar_t *wnew = NULL;
+	int ret = -1;
+
+	while (1) {
+		if (!(wold = wchar_from_utf8(oldname))) break;
+		if (!(wnew = wchar_from_utf8(newname))) break;
+		ret = _wrename(wold, wnew);
+		break;
+	}
+	if (wold) free(wold);
+	if (wnew) free(wnew);
+
+	return ret;
+}
+
+HANDLE WINAPI CreateFile_utf8(const char *lpFileName, DWORD dwDesiredAccess, DWORD dwShareMode, LPSECURITY_ATTRIBUTES lpSecurityAttributes, DWORD dwCreationDisposition, DWORD dwFlagsAndAttributes, HANDLE hTemplateFile)
+{
+	wchar_t *wname;
+	HANDLE handle = INVALID_HANDLE_VALUE;
+
+	if ((wname = wchar_from_utf8(lpFileName)) != NULL) {
+		handle = CreateFileW(wname, dwDesiredAccess, dwShareMode, lpSecurityAttributes, dwCreationDisposition, dwFlagsAndAttributes, hTemplateFile);
+		free(wname);
+	}
+
+	return handle;
+}