From 30c9359a639a9a3cc2bcc3390dd7801fe03e8979 Mon Sep 17 00:00:00 2001
From: Benjamin Drung <bdrung@debian.org>
Date: Thu, 24 Sep 2015 19:44:17 +0100
Subject: [PATCH] Import libsoxr_0.1.2.orig.tar.xz

[dgit import orig libsoxr_0.1.2.orig.tar.xz]
---
 AUTHORS                            |    1 +
 CMakeLists.txt                     |  298 +++++
 COPYING.LGPL                       |  502 ++++++++
 INSTALL                            |  123 ++
 LICENCE                            |   24 +
 NEWS                               |   37 +
 README                             |   53 +
 TODO                               |    3 +
 cmake/Modules/FindLibAVCodec.cmake |   23 +
 cmake/Modules/FindOpenMP.cmake     |  115 ++
 cmake/Modules/FindSIMD.cmake       |   94 ++
 cmake/Modules/TestBigEndian.cmake  |   15 +
 deinstall.cmake.in                 |   25 +
 examples/1-single-block.c          |   50 +
 examples/1a-lsr.c                  |   40 +
 examples/2-stream.C                |   78 ++
 examples/3-options-input-fn.c      |  113 ++
 examples/4-split-channels.c        |  148 +++
 examples/5-variable-rate.c         |   93 ++
 examples/CMakeLists.txt            |   37 +
 examples/README                    |   20 +
 examples/examples-common.h         |   45 +
 go                                 |   17 +
 go.bat                             |   27 +
 inst-check                         |   25 +
 inst-check-soxr                    |   52 +
 inst-check-soxr-lsr                |    1 +
 msvc/README                        |    9 +
 msvc/libsoxr.vcproj                |   80 ++
 msvc/soxr-config.h                 |   49 +
 soxr-config.h.in                   |   46 +
 src/CMakeLists.txt                 |  125 ++
 src/aliases.h                      |   37 +
 src/avfft32.c                      |   27 +
 src/avfft32s.c                     |   27 +
 src/ccrw2.h                        |   75 ++
 src/data-io.c                      |  252 ++++
 src/data-io.h                      |   39 +
 src/dbesi0.c                       |  149 +++
 src/fft4g.c                        | 1352 ++++++++++++++++++++++
 src/fft4g.h                        |   23 +
 src/fft4g32.c                      |   27 +
 src/fft4g32s.c                     |   26 +
 src/fft4g64.c                      |   29 +
 src/fft4g_cache.h                  |   92 ++
 src/fifo.h                         |  124 ++
 src/filter.c                       |  245 ++++
 src/filter.h                       |   39 +
 src/filters.h                      |  151 +++
 src/half-fir.h                     |   25 +
 src/half_coefs.h                   |   57 +
 src/internal.h                     |   46 +
 src/libsoxr-dev.src.in             |    2 +
 src/libsoxr.src.in                 |    1 +
 src/lsr.c                          |  114 ++
 src/pffft.c                        | 1729 ++++++++++++++++++++++++++++
 src/pffft.h                        |  177 +++
 src/pffft32.c                      |   32 +
 src/pffft32s.c                     |   27 +
 src/poly-fir.h                     |   98 ++
 src/poly-fir0.h                    |   32 +
 src/rate.h                         |  726 ++++++++++++
 src/rate32.c                       |    9 +
 src/rate32s.c                      |    9 +
 src/rate64.c                       |    9 +
 src/rdft.h                         |   31 +
 src/rint-clip.h                    |  153 +++
 src/rint.h                         |   68 ++
 src/samplerate.h                   |    1 +
 src/simd-dev.h                     |    5 +
 src/simd.c                         |   84 ++
 src/simd.h                         |   16 +
 src/soxr-lsr.h                     |   80 ++
 src/soxr-lsr.pc.in                 |    5 +
 src/soxr.c                         |  638 ++++++++++
 src/soxr.h                         |  348 ++++++
 src/soxr.pc.in                     |    5 +
 src/vr-coefs.c                     |  112 ++
 src/vr-coefs.h                     |   91 ++
 src/vr32.c                         |  657 +++++++++++
 tests/1-delay-clear.c              |   64 +
 tests/CMakeLists.txt               |   52 +
 tests/README                       |    1 +
 tests/bandwidth-test               |   40 +
 tests/cmp-test.cmake               |   30 +
 tests/eg-test                      |   47 +
 tests/io-test                      |   59 +
 tests/large-ratio-test             |   23 +
 tests/phase-test                   |   38 +
 tests/q-test                       |   72 ++
 tests/scripts                      |   13 +
 tests/time-test                    |   35 +
 tests/vector-cmp.c                 |   53 +
 tests/vector-gen.c                 |   56 +
 94 files changed, 11052 insertions(+)
 create mode 100644 AUTHORS
 create mode 100644 CMakeLists.txt
 create mode 100644 COPYING.LGPL
 create mode 100644 INSTALL
 create mode 100644 LICENCE
 create mode 100644 NEWS
 create mode 100644 README
 create mode 100644 TODO
 create mode 100644 cmake/Modules/FindLibAVCodec.cmake
 create mode 100644 cmake/Modules/FindOpenMP.cmake
 create mode 100644 cmake/Modules/FindSIMD.cmake
 create mode 100644 cmake/Modules/TestBigEndian.cmake
 create mode 100644 deinstall.cmake.in
 create mode 100644 examples/1-single-block.c
 create mode 100644 examples/1a-lsr.c
 create mode 100644 examples/2-stream.C
 create mode 100644 examples/3-options-input-fn.c
 create mode 100644 examples/4-split-channels.c
 create mode 100644 examples/5-variable-rate.c
 create mode 100644 examples/CMakeLists.txt
 create mode 100644 examples/README
 create mode 100644 examples/examples-common.h
 create mode 100755 go
 create mode 100644 go.bat
 create mode 100755 inst-check
 create mode 100755 inst-check-soxr
 create mode 120000 inst-check-soxr-lsr
 create mode 100644 msvc/README
 create mode 100644 msvc/libsoxr.vcproj
 create mode 100644 msvc/soxr-config.h
 create mode 100644 soxr-config.h.in
 create mode 100644 src/CMakeLists.txt
 create mode 100644 src/aliases.h
 create mode 100644 src/avfft32.c
 create mode 100644 src/avfft32s.c
 create mode 100644 src/ccrw2.h
 create mode 100644 src/data-io.c
 create mode 100644 src/data-io.h
 create mode 100644 src/dbesi0.c
 create mode 100644 src/fft4g.c
 create mode 100644 src/fft4g.h
 create mode 100644 src/fft4g32.c
 create mode 100644 src/fft4g32s.c
 create mode 100644 src/fft4g64.c
 create mode 100644 src/fft4g_cache.h
 create mode 100644 src/fifo.h
 create mode 100644 src/filter.c
 create mode 100644 src/filter.h
 create mode 100644 src/filters.h
 create mode 100644 src/half-fir.h
 create mode 100644 src/half_coefs.h
 create mode 100644 src/internal.h
 create mode 100644 src/libsoxr-dev.src.in
 create mode 100644 src/libsoxr.src.in
 create mode 100644 src/lsr.c
 create mode 100644 src/pffft.c
 create mode 100644 src/pffft.h
 create mode 100644 src/pffft32.c
 create mode 100644 src/pffft32s.c
 create mode 100644 src/poly-fir.h
 create mode 100644 src/poly-fir0.h
 create mode 100644 src/rate.h
 create mode 100644 src/rate32.c
 create mode 100644 src/rate32s.c
 create mode 100644 src/rate64.c
 create mode 100644 src/rdft.h
 create mode 100644 src/rint-clip.h
 create mode 100644 src/rint.h
 create mode 100644 src/samplerate.h
 create mode 100644 src/simd-dev.h
 create mode 100644 src/simd.c
 create mode 100644 src/simd.h
 create mode 100644 src/soxr-lsr.h
 create mode 100644 src/soxr-lsr.pc.in
 create mode 100644 src/soxr.c
 create mode 100644 src/soxr.h
 create mode 100644 src/soxr.pc.in
 create mode 100644 src/vr-coefs.c
 create mode 100644 src/vr-coefs.h
 create mode 100644 src/vr32.c
 create mode 100644 tests/1-delay-clear.c
 create mode 100644 tests/CMakeLists.txt
 create mode 100644 tests/README
 create mode 100755 tests/bandwidth-test
 create mode 100644 tests/cmp-test.cmake
 create mode 100755 tests/eg-test
 create mode 100755 tests/io-test
 create mode 100755 tests/large-ratio-test
 create mode 100755 tests/phase-test
 create mode 100755 tests/q-test
 create mode 100755 tests/scripts
 create mode 100755 tests/time-test
 create mode 100644 tests/vector-cmp.c
 create mode 100644 tests/vector-gen.c

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..2ba76d3
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Rob Sykes <robs@users.sourceforge.net>
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..406e826
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,298 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+cmake_minimum_required (VERSION 2.8 FATAL_ERROR) # refuse to configure with CMake older than 2.8
+
+project (soxr C) # PROJECT_NAME becomes "soxr"; C is the only language enabled here
+set (DESCRIPTION_SUMMARY "High quality, one-dimensional sample-rate conversion library")
+
+
+
+# Release versioning (joined further down as MAJOR.MINOR.PATCH to form PROJECT_VERSION):
+
+set (PROJECT_VERSION_MAJOR 0)
+set (PROJECT_VERSION_MINOR 1)
+set (PROJECT_VERSION_PATCH 2)
+
+# Shared-object versioning; update as follows if, since the last public release:
+#  * library code changed at all: ++revision
+#  * interfaces changed at all:   ++current, revision = 0
+#  * interfaces added:            ++age
+#  * interfaces removed:          age = 0
+
+set (SO_VERSION_CURRENT  1) # SO_VERSION major is computed as current - age
+set (SO_VERSION_REVISION 1) # becomes the SO_VERSION patch component
+set (SO_VERSION_AGE      1) # becomes the SO_VERSION minor component
+
+
+
+# Main options (all may be overridden with -D<name>=<value> at configure time):
+
+include (CMakeDependentOption) # provides cmake_dependent_option ()
+
+if (NOT CMAKE_BUILD_TYPE) # no build type requested: default to Release
+  set (CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+endif ()
+
+option (BUILD_TESTS "Build sanity-tests."  ON)
+option (BUILD_SHARED_LIBS "Build shared libraries." ON)
+option (BUILD_EXAMPLES "Build examples." OFF)
+option (WITH_OPENMP "Include OpenMP threading." ON)
+option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." ON)
+cmake_dependent_option (WITH_SINGLE_PRECISION "Build with single precision (for up to 20-bit accuracy)." ON
+  "WITH_DOUBLE_PRECISION" ON) # selectable only while double precision is on; otherwise forced ON
+cmake_dependent_option (WITH_DOUBLE_PRECISION "Build with double precision (for up to 32-bit accuracy)." ON
+  "WITH_SINGLE_PRECISION" ON) # selectable only while single precision is on; otherwise forced ON
+cmake_dependent_option (WITH_SIMD "Use SIMD (for faster single precision)." ON
+  "WITH_SINGLE_PRECISION" OFF) # forced OFF when single precision is not built
+cmake_dependent_option (WITH_AVFFT "Use libavcodec (LGPL) for SIMD DFT." OFF
+  "WITH_SIMD;NOT WITH_PFFFT" OFF) # offered only with SIMD on and PFFFT off
+cmake_dependent_option (WITH_PFFFT "Use PFFFT (BSD-like licence) for SIMD DFT." ON
+  "WITH_SIMD;NOT WITH_AVFFT" OFF) # offered only with SIMD on and AVFFT off
+if (UNIX)
+  if (EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests) # BUILD_LSR_TESTS is declared only when this directory is present
+    cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
+      "WITH_LSR_BINDINGS" OFF)
+  endif ()
+endif ()
+
+
+
+# Introspection:
+
+list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) # project-relative, so the bundled modules are also found when soxr is built via add_subdirectory ()
+
+include (CheckFunctionExists) # check_function_exists ()
+include (CheckIncludeFiles)   # check_include_files ()
+include (CheckLibraryExists)  # check_library_exists ()
+include (TestBigEndian)       # test_big_endian (); cmake/Modules carries its own TestBigEndian.cmake
+
+check_library_exists (m pow "" NEED_LIBM) # NEED_LIBM: true if pow () is found in a library named m
+if (NEED_LIBM)
+  set (CMAKE_REQUIRED_LIBRARIES "m;${CMAKE_REQUIRED_LIBRARIES}") # later configure checks link against m too
+  link_libraries (m) # applies to targets created after this point
+endif ()
+
+if (WITH_OPENMP)
+  find_package (OpenMP) # not REQUIRED: when absent, no OpenMP flags are added and configuration continues
+  if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
+  endif ()
+endif ()
+
+if (WITH_SIMD)
+  find_package (SIMD) # find-module shipped as cmake/Modules/FindSIMD.cmake
+  if (SIMD_FOUND)
+    set (HAVE_SIMD 1) # stays unset (later normalised to 0) when SIMD was requested but not found
+  endif ()
+endif ()
+
+if (WITH_SINGLE_PRECISION)
+  set (HAVE_SINGLE_PRECISION 1)
+endif ()
+
+if (WITH_DOUBLE_PRECISION)
+  set (HAVE_DOUBLE_PRECISION 1)
+endif ()
+
+if (WITH_AVFFT)
+  find_package (LibAVCodec) # find-module shipped as cmake/Modules/FindLibAVCodec.cmake
+  if (AVCODEC_FOUND)
+    include_directories (${AVCODEC_INCLUDE_DIRS})
+    link_libraries (${AVCODEC_LIBRARIES})
+    set (HAVE_AVFFT 1)
+  endif ()
+endif ()
+
+check_function_exists (lrint HAVE_LRINT) # is lrint () available?
+check_include_files (fenv.h HAVE_FENV_H) # is <fenv.h> available?
+test_big_endian (WORDS_BIGENDIAN)        # byte order of the target
+
+macro (make_exist) # make_exist (<var>...): give every listed variable a definite value
+  foreach (x ${ARGN})
+    if (NOT ${x}) # true when the variable is unset, empty, or otherwise evaluates false
+      set (${x} 0)
+    endif ()
+  endforeach ()
+endmacro ()
+
+make_exist (HAVE_LRINT HAVE_FENV_H WORDS_BIGENDIAN HAVE_SIMD) # presumably so that soxr-config.h.in substitutes 0 rather than nothing -- TODO confirm
+make_exist (HAVE_SINGLE_PRECISION HAVE_DOUBLE_PRECISION HAVE_AVFFT)
+
+
+
+# Compiler configuration:
+
+if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
+  set (PROJECT_CXX_FLAGS "-Wconversion -Wall -W -pedantic -Wundef -Wcast-align -Wpointer-arith -Wno-long-long")
+  set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes") # NOTE(review): only defined here, not applied; presumably consumed by the subdirectory CMakeLists -- confirm
+  if (CMAKE_BUILD_TYPE STREQUAL "Release")
+    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s") # strip symbols from shared libraries in Release builds
+  endif ()
+  cmake_dependent_option (VISIBILITY_HIDDEN "Build with -fvisibility=hidden." ON
+    "BUILD_SHARED_LIBS" OFF) # offered only for shared builds; forced OFF for static ones
+  if (VISIBILITY_HIDDEN)
+    add_definitions (-fvisibility=hidden -DSOXR_VISIBILITY) # passes both the compiler flag and the SOXR_VISIBILITY define to every target
+  endif ()
+endif ()
+
+if (MSVC)
+  add_definitions (-D_USE_MATH_DEFINES -D_CRT_SECURE_NO_WARNINGS) # enable the M_* constants in <math.h>; silence CRT "unsafe function" warnings
+  option (ENABLE_STATIC_RUNTIME "Visual Studio, link with runtime statically."  OFF)
+  if (ENABLE_STATIC_RUNTIME) # swap /MD for /MT; the C flag sets are rewritten too because this project enables only C
+    foreach (flag_var CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      string (REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") # also turns /MDd into /MTd
+    endforeach ()
+  endif ()
+  # By default, do not warn when built on machines using only VS Express:
+  if (NOT DEFINED CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS)
+    set (CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS ON)
+  endif ()
+endif ()
+
+
+
+# Build configuration:
+
+if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows) # Allow exes to find dlls:
+  set (BIN ${PROJECT_BINARY_DIR}/bin/) # one shared output directory for libraries and executables
+  set (EXAMPLES_BIN ${BIN})
+  set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${BIN})
+  set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BIN})
+else ()
+  set (BIN ./) # relative paths; presumably resolved by the tests that use them -- TODO confirm
+  set (EXAMPLES_BIN ../examples/)
+endif ()
+
+set (LIB_TYPE STATIC) # default; replaced just below for shared builds
+if (BUILD_SHARED_LIBS)
+  set (LIB_TYPE SHARED)
+  if (MSVC)
+    add_definitions (-DSOXR_DLL) # defined only for MSVC shared-library builds
+  endif ()
+endif ()
+
+
+
+# Installation configuration:
+
+if (NOT DEFINED BIN_INSTALL_DIR) # each *_INSTALL_DIR below is a default: a value defined beforehand is kept
+  set (BIN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/bin")
+endif ()
+if (NOT DEFINED LIB_INSTALL_DIR)
+  set (LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}") # LIB_SUFFIX is not set in this file; it expands to nothing unless supplied externally
+endif ()
+if (NOT DEFINED INCLUDE_INSTALL_DIR)
+  set (INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include")
+endif ()
+if (NOT DEFINED DOC_INSTALL_DIR)
+  if (UNIX)
+    set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/lib${PROJECT_NAME}")
+  else ()
+    set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/doc")
+  endif ()
+endif ()
+
+if (APPLE) # framework settings are declared on Apple platforms only
+  option (BUILD_FRAMEWORK "Build an OS X framework." OFF)
+  set (FRAMEWORK_INSTALL_DIR "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
+endif ()
+
+
+
+# Top-level:
+
+set (PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
+math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}") # current - age
+math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+set (SO_VERSION ${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH})
+
+configure_file ( # generates <PROJECT_NAME>-config.h in the build tree from its .in template
+  ${PROJECT_SOURCE_DIR}/${PROJECT_NAME}-config.h.in
+  ${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.h)
+include_directories (${PROJECT_BINARY_DIR}) # so that the generated header can be included
+
+if (BUILD_TESTS OR BUILD_LSR_TESTS)
+  enable_testing ()
+endif ()
+
+install (FILES # documentation installed alongside the library
+  ${CMAKE_CURRENT_SOURCE_DIR}/README
+  ${CMAKE_CURRENT_SOURCE_DIR}/LICENCE
+  ${CMAKE_CURRENT_SOURCE_DIR}/NEWS
+  DESTINATION ${DOC_INSTALL_DIR})
+
+
+
+# Subdirectories:
+
+include_directories (${PROJECT_SOURCE_DIR}/src) # src headers are on the include path of every subdirectory
+
+add_subdirectory (src) # always built
+if (BUILD_TESTS)
+  add_subdirectory (tests)
+endif ()
+if (BUILD_LSR_TESTS)
+  add_subdirectory (lsr-tests)
+endif ()
+if (BUILD_EXAMPLES OR BUILD_TESTS) # the examples are also built whenever the tests are
+  add_subdirectory (examples)
+endif ()
+
+
+
+# Rough-and-ready distclean for anyone still doing in-tree builds (removes the listed generated files):
+
+if (UNIX)
+  add_custom_target (distclean
+    COMMAND make clean && rm -rf
+      CMakeCache.txt
+      CMakeFiles
+      cmake_install.cmake
+      CPackConfig.cmake
+      CPackSourceConfig.cmake
+      deinstall.cmake
+      Makefile
+      soxr-config.h
+      src/CMakeFiles
+      src/cmake_install.cmake
+      src/libsoxr-dev.src
+      src/libsoxr-lsr.pc # NOTE(review): the pkg-config templates are named src/soxr-lsr.pc.in and src/soxr.pc.in -- confirm the generated file names
+      src/libsoxr.pc
+      src/libsoxr.src
+      src/Makefile)
+endif ()
+
+
+
+# Deinstallation:
+
+configure_file (
+  "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake"
+  IMMEDIATE @ONLY) # @ONLY: substitute @VAR@ references only, leaving ${VAR} in the script untouched
+
+add_custom_target (deinstall
+  COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake") # runs the generated script in CMake script mode
+
+
+
+# Packaging:
+
+if (UNIX)
+  set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
+  set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
+  set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
+
+  set (CPACK_SOURCE_GENERATOR "TGZ") # source packages are gzip-compressed tarballs
+  set (CPACK_SOURCE_IGNORE_FILES "dist;/lsr-tests/;/Debug/;/Release/;/cpack/;\\\\.swp$;\\\\.gitignore;/\\\\.git/") # patterns for paths left out of the source package
+
+  include (CPack) # reads the CPACK_* variables set above
+
+  if (IS_DIRECTORY ${PROJECT_SOURCE_DIR}/cpack) # optional directory; added only when present
+    add_subdirectory (cpack)
+  endif ()
+endif ()
diff --git a/COPYING.LGPL b/COPYING.LGPL
new file mode 100644
index 0000000..551cb4a
--- /dev/null
+++ b/COPYING.LGPL
@@ -0,0 +1,502 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..c2c7675
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,123 @@
+SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+
+INSTALLATION GUIDE CONTENTS
+
+* Standard build
+* Build customisation
+* Cross-compiling with mingw (linux host)
+* Integration with other build systems
+
+
+
+STANDARD BUILD
+
+1. Prerequisites:
+
+    Before you can build this library, you need to have available on your
+    system:
+
+    * A C-compiler with 64-bit integer support and, optionally, OpenMP, SIMD.
+
+    * A 'make' utility (most compiler installations already have one of these).
+
+    * CMake: http://www.cmake.org/cmake/resources/software.html
+
+
+2. Build:
+
+    At a command prompt, change directory (`cd') to the one containing this
+    file, then enter:
+
+        go                          (on MS-Windows with nmake)
+    or
+        ./go                        (on unix-like systems)
+
+    This should build the library and run a few sanity tests.
+
+
+3. Installation:
+
+    Note that this step may need to be performed by a system
+    administrator.  Enter:
+
+        nmake install               (on MS-Windows)
+    or
+        cd Release; make install    (on unix)
+
+
+4. Configuration:
+
+    To use the library you may need to set up appropriate paths to the
+    library and its header file in your development environment.
+
+
+5. Installation test
+
+    To test the installation, build and run some of the example programmes
+    (see examples/README).
+
+
+
+BUILD CUSTOMISATION
+
+If it is necessary to customise the build, then steps 2 and 3 above may be
+substituted as follows.  Change directory to the one containing this file,
+then enter commands along the lines of:
+
+    mkdir build
+    cd build
+    cmake [OPTIONS] ..
+    make
+    make test
+    sudo make install
+
+To list help on the available options, enter:
+
+    cmake -LH ..
+
+Options, if given, should be preceded with '-D', e.g.
+
+    cmake -DWITH_SIMD:BOOL=OFF ..
+
+
+
+CROSS-COMPILING WITH MINGW (LINUX HOST)
+
+For example:
+
+    mkdir build
+    cd build
+    cmake -DCMAKE_TOOLCHAIN_FILE=~/Toolchain-x86_64-mingw-w64-mingw32.cmake \
+          -DCMAKE_INSTALL_PREFIX=install \
+          -DHAVE_WORDS_BIGENDIAN_EXITCODE=1 \
+          -DBUILD_TESTS=0 \
+          -DBUILD_EXAMPLES=1 \
+          ..
+    make
+
+where ~/Toolchain-x86_64-mingw-w64-mingw32.cmake might contain:
+
+    SET(CMAKE_SYSTEM_NAME Windows)
+    SET(CMAKE_C_COMPILER /usr/bin/x86_64-w64-mingw32-gcc)
+    SET(CMAKE_CXX_COMPILER /usr/bin/x86_64-w64-mingw32-g++)
+    SET(CMAKE_RC_COMPILER /usr/bin/x86_64-w64-mingw32-windres)
+    SET(CMAKE_Fortran_COMPILER /usr/bin/x86_64-w64-mingw32-gfortran)
+    SET(CMAKE_AR:FILEPATH /usr/bin/x86_64-w64-mingw32-ar)
+    SET(CMAKE_RANLIB:FILEPATH /usr/bin/x86_64-w64-mingw32-ranlib)
+    SET(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32)
+    SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+    SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+    SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+    SET(QT_BINARY_DIR /usr/x86_64-w64-mingw32/bin /usr/bin)
+    SET(Boost_COMPILER -gcc47)
+
+
+
+INTEGRATION WITH OTHER BUILD SYSTEMS
+
+Autotools-based systems might find it useful to create a file called
+`configure' in the directory containing this file, consisting of the line:
+  cmake -DBUILD_SHARED_LIBS=OFF .
+(or with other build options as required).
+
+For MS visual studio, see msvc/README
diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..1c61878
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,24 @@
+SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+
+This library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at
+your option) any later version.
+
+This library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, write to the Free Software Foundation,
+Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+
+
+Notes
+
+1. Re software in the `examples' directory: works that are not resampling
+examples but are based on the given examples -- for example, applications using
+the library -- shall not be considered to be derivative works of the examples.
+
+2. If building with pffft.c, see the licence embedded in that file.
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..f388974
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,37 @@
+Version 0.1.2 (2015-09-05)
+  * Fix conversion failure when I/O types differ but I/O rates don't.
+  * Fix #defines for interpolation order selection.
+  * Fix ineffectual SOXR_MINIMUM_PHASE and SOXR_INTERMEDIATE_PHASE in
+    soxr_quality_spec recipe.
+  * Fix soxr_delay() returning a negative number after end-of-input has been
+    indicated.
+  * Fix crash when using soxr_process() after calling soxr_clear().
+  * Be more POSIX compliant w.r.t. errno in the examples; fixes erroneous
+    reporting of errors on FreeBSD.
+  * Quality improvement for variable-rate.
+  * Various fixes/improvements to build/tests/documentation.
+
+Version 0.1.1 (2013-03-03)
+  * Minor fixes/improvements to build/tests.
+  * Fix crash (e.g. with k3b) when null error pointer passed to src_create (lsr
+    bindings only).
+  * Fix broken resampling in many cases with SIMD and anti_aliasing_pc < 100.
+  * For clarity, renamed and slightly changed usage of three parameters in
+    soxr_quality_spec_t (ABI compatible, API incompatible).  An application not
+    setting these parameters directly need make no change; otherwise, changes
+    should be made per the following example (as shown, compatibility with both
+    old/new APIs is maintained).  See also the comments on these parameters in
+    soxr.h.  N.B. ABI compatibility with the 0.1.0 API may be removed in a
+    future release.
+      #if !defined SOXR_VERSION /* Deprecated, 0.1.0 API */
+        q_spec.phase = minimum_phase? 0 : 50;
+        q_spec.bw_pc = cutoff * 100;
+        q_spec.anti_aliasing_pc = anti_aliasing * 100;
+      #else /* 0.1.1 API */                            Explanation:
+        q_spec.phase_response = minimum_phase? 0 : 50;  Renamed.
+        q_spec.passband_end = cutoff;                   Renamed, no longer %.
+        q_spec.stopband_begin = 2 - anti_aliasing;      Renamed, no longer %, no
+      #endif                                            longer mirrored in Fs.
+
+Version 0.1.0 (2013-01-19)
+  * First public release.
diff --git a/README b/README
new file mode 100644
index 0000000..06f11e6
--- /dev/null
+++ b/README
@@ -0,0 +1,53 @@
+SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+
+The SoX Resampler library `libsoxr' performs one-dimensional sample-rate
+conversion -- it may be used, for example, to resample PCM-encoded audio.
+For higher-dimensional resampling, such as for visual-image processing, you
+should look elsewhere.
+
+It aims to give fast¹ and very high quality² results for any constant
+(rational or irrational) resampling ratio.  Phase-response, preserved
+bandwidth, aliasing, and rejection level parameters are all configurable;
+alternatively, simple `preset' configurations may be selected.  A
+variable-rate resampling mode of operation is also included.
+
+The resampler is currently available either as part of `libsox' (the audio
+file-format and effect library), or stand-alone as `libsoxr' (this package).
+The interfaces to libsox and libsoxr are slightly different, with that of
+libsoxr designed specifically for resampling.  An application requiring
+support for other effects, or for reading-from or writing-to audio files or
+devices, should use libsox (or other libraries such as libsndfile or
+libavformat).
+
+Libsoxr provides a simple API that allows interfacing using the most
+commonly-used sample formats and buffering schemes: sample-formats may be
+either floating-point or integer, and multiple channels either interleaved
+or split in separate buffers.  The API is documented in the header file
+`soxr.h', together with sample code found in the 'examples' directory.
+
+For compatibility with the popular `libsamplerate' library, the header file
+`soxr-lsr.h' is provided and may be used as an alternative API.³  Note
+however, that libsoxr does not provide a full emulation of libsamplerate
+and that using this approach, only a sub-set of libsoxr's features are
+available.
+
+The design was inspired by Laurent De Soras' paper `The Quest For The
+Perfect Resampler', http://ldesoras.free.fr/doc/articles/resampler-en.pdf;
+in essence, it combines Julius O. Smith's `Bandlimited Interpolation'
+technique (https://ccrma.stanford.edu/~jos/resample/resample.pdf) with FFT-
+based over-sampling.
+
+Note that for real-time resampling, libsoxr may have a higher latency
+than non-FFT based resamplers.  For example, when using the `High Quality'
+configuration to resample between 44100Hz and 48000Hz, the latency is
+around 1000 output samples, i.e. roughly 20ms (though passband and FFT-
+size configuration parameters may be used to reduce this figure).
+
+For build and installation instructions, see the file `INSTALL'; for
+copyright and licensing information, see the file `LICENCE'.
+
+For support and new versions, see http://soxr.sourceforge.net
+________
+¹ For example, multi-channel resampling can utilise multiple CPU-cores.
+² Bit-perfect within practical occupied-bandwidth limits.
+³ For details of that API, see http://www.mega-nerd.com/SRC/api.html.
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..1c4a31b
--- /dev/null
+++ b/TODO
@@ -0,0 +1,3 @@
+* SOXR_ALLOW_ALIASING
+* Explicit flush API fn, perhaps.
+* More SIMD.
diff --git a/cmake/Modules/FindLibAVCodec.cmake b/cmake/Modules/FindLibAVCodec.cmake
new file mode 100644
index 0000000..add33c3
--- /dev/null
+++ b/cmake/Modules/FindLibAVCodec.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Find AVCODEC
+# Locate the libavcodec header directory and library on this system.
+#
+#  AVCODEC_INCLUDES    - directory that contains libavcodec/avcodec.h.
+#  AVCODEC_LIBRARIES   - the avcodec library located by find_library.
+#  AVCODEC_FOUND       - True if both of the above were found.
+
+if (AVCODEC_INCLUDES)  # Header path already known: do not report it again.
+  set (AVCODEC_FIND_QUIETLY TRUE)
+endif (AVCODEC_INCLUDES)
+
+find_path (AVCODEC_INCLUDES libavcodec/avcodec.h)  # Header search.
+
+find_library (AVCODEC_LIBRARIES NAMES avcodec)  # Library search.
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+  AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDES)  # Both are required.
+
+mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDES)  # Keep out of the basic cache view.
diff --git a/cmake/Modules/FindOpenMP.cmake b/cmake/Modules/FindOpenMP.cmake
new file mode 100644
index 0000000..eef8422
--- /dev/null
+++ b/cmake/Modules/FindOpenMP.cmake
@@ -0,0 +1,115 @@
+# - Finds OpenMP support
+# This module can be used to detect OpenMP support in a compiler.
+# If the compiler supports OpenMP, the flags required to compile with
+# openmp support are set.
+#
+# The following variables are set:
+#   OpenMP_C_FLAGS - flags to add to the C compiler for OpenMP support
+#   OPENMP_FOUND - true if openmp is detected
+#
+# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/
+#
+# Modifications for soxr:
+#   * don't rely on presence of C++ compiler
+#   * support MINGW
+#
+#=============================================================================
+# Copyright 2009 Kitware, Inc.
+# Copyright 2008-2009 André Rigland Brodtkorb <Andre.Brodtkorb@ifi.uio.no>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#  * Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+#  * The names of Kitware, Inc., the Insight Consortium, or the names of
+#    any consortium members, or of any contributors, may not be used to
+#    endorse or promote products derived from this software without
+#    specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include (CheckCSourceCompiles)
+include (FindPackageHandleStandardArgs)
+
+set (OpenMP_C_FLAG_CANDIDATES  # Tried in the order listed; the first success is kept.
+  # GNU
+  "-fopenmp"
+  # Microsoft Visual Studio
+  "/openmp"
+  # Intel (Windows)
+  "-Qopenmp"
+  # Intel
+  "-openmp"
+  # Blank, for a compiler that accepts OpenMP without any flag
+  " "
+  # Sun
+  "-xopenmp"
+  # HP
+  "+Oopenmp"
+  # IBM XL C/C++
+  "-qsmp"
+  # Portland Group
+  "-mp"
+)
+
+# Test source: compiles only if <omp.h> is available and _OPENMP is defined.
+set (OpenMP_C_TEST_SOURCE
+"
+#include <omp.h>
+int main() {
+#ifdef _OPENMP
+  return 0;
+#else
+  breaks_on_purpose
+#endif
+}
+")
+# If OpenMP_C_FLAGS is already defined, empty the candidate list so that the
+# probing loop below performs no compile checks at all.
+if (DEFINED OpenMP_C_FLAGS)
+  set (OpenMP_C_FLAG_CANDIDATES)
+endif (DEFINED OpenMP_C_FLAGS)
+
+# Probe the C compiler with each candidate flag in turn.
+foreach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
+  set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")  # Saved; restored below.
+  set (CMAKE_REQUIRED_FLAGS "${FLAG}")
+  unset (OpenMP_FLAG_DETECTED CACHE)  # Discard the result cached for the previous candidate.
+  message (STATUS "Try OpenMP C flag = [${FLAG}]")
+  check_c_source_compiles ("${OpenMP_C_TEST_SOURCE}" OpenMP_FLAG_DETECTED)
+  set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
+  if (OpenMP_FLAG_DETECTED)
+    set (OpenMP_C_FLAGS_INTERNAL "${FLAG}")  # Remember the working flag and stop probing.
+    break ()
+  endif (OpenMP_FLAG_DETECTED)
+endforeach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
+
+set (OpenMP_C_FLAGS "${OpenMP_C_FLAGS_INTERNAL}"
+  CACHE STRING "C compiler flags for OpenMP parallization")  # Stored in the cache.
+
+# Report the outcome via the standard find_package machinery.
+find_package_handle_standard_args (OpenMP DEFAULT_MSG
+  OpenMP_C_FLAGS)
+
+if (MINGW)  # For MinGW the detected flag is also appended to the linker-flag variables.
+  set (OpenMP_SHARED_LINKER_FLAGS "${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+  set (OpenMP_EXE_LINKER_FLAGS "${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+endif ()
+
+mark_as_advanced (OpenMP_C_FLAGS OpenMP_SHARED_LINKER_FLAGS OpenMP_EXE_LINKER_FLAGS)
diff --git a/cmake/Modules/FindSIMD.cmake b/cmake/Modules/FindSIMD.cmake
new file mode 100644
index 0000000..6ac51cb
--- /dev/null
+++ b/cmake/Modules/FindSIMD.cmake
@@ -0,0 +1,94 @@
+# - Finds SIMD support
+#
+# The following variables are set:
+#   SIMD_C_FLAGS - flags to add to the C compiler for this package.
+#   SIMD_FOUND - true if support for this package is found.
+#
+#=============================================================================
+# Based on FindOpenMP.cmake, which is:
+#
+# Copyright 2009 Kitware, Inc.
+# Copyright 2008-2009 André Rigland Brodtkorb <Andre.Brodtkorb@ifi.uio.no>
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#  * Redistributions of source code must retain the above copyright notice,
+#    this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+#  * The names of Kitware, Inc., the Insight Consortium, or the names of
+#    any consortium members, or of any contributors, may not be used to
+#    endorse or promote products derived from this software without
+#    specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include (CheckCSourceCompiles)
+include (FindPackageHandleStandardArgs)
+
+if (WIN32) # Safety for when the lib and app compilers differ (at a performance cost)
+  set (GCC_WIN32_SIMD_OPTS "-mincoming-stack-boundary=2")
+endif ()
+
+set (SIMD_C_FLAG_CANDIDATES  # Tried in the order listed; the first success is kept.
+  # x64: no additional flags
+  " "
+  # Microsoft Visual Studio x86
+  "/arch:SSE /fp:fast -D__SSE__"
+  # Gcc x86 (plus the stack-boundary option above when on WIN32)
+  "-msse -mfpmath=sse ${GCC_WIN32_SIMD_OPTS}"
+  # Gcc x86 (old versions): the same flags without the extra WIN32 option
+  "-msse -mfpmath=sse"
+)
+
+set (SIMD_C_TEST_SOURCE  # Minimal SSE programme: unaligned load, add, unaligned store.
+"
+#include <xmmintrin.h>
+int main()
+{
+  __m128 a, b;
+  float vals[4] = {0};
+  a = _mm_loadu_ps (vals);
+  b = a;
+  b = _mm_add_ps (a,b);
+  _mm_storeu_ps (vals,b);
+  return 0;
+}
+")
+
+if (DEFINED SIMD_C_FLAGS)  # Already defined: empty the list so nothing is probed.
+  set (SIMD_C_FLAG_CANDIDATES)
+endif ()
+
+foreach (FLAG ${SIMD_C_FLAG_CANDIDATES})
+  set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")  # Saved; restored below.
+  set (CMAKE_REQUIRED_FLAGS "${FLAG}")
+  unset (SIMD_FLAG_DETECTED CACHE)  # Discard the result cached for the previous candidate.
+  message (STATUS "Try SIMD C flag = [${FLAG}]")
+  check_c_source_compiles ("${SIMD_C_TEST_SOURCE}" SIMD_FLAG_DETECTED)
+  set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
+  if (SIMD_FLAG_DETECTED)
+    set (SIMD_C_FLAGS_INTERNAL "${FLAG}")  # Remember the working flags and stop probing.
+    break ()
+  endif ()
+endforeach ()
+
+set (SIMD_C_FLAGS "${SIMD_C_FLAGS_INTERNAL}"
+  CACHE STRING "C compiler flags for SIMD vectorization")  # Stored in the cache.
+
+find_package_handle_standard_args (SIMD DEFAULT_MSG SIMD_C_FLAGS)  # One required variable.
+mark_as_advanced (SIMD_C_FLAGS)  # Keep out of the basic cache view.
diff --git a/cmake/Modules/TestBigEndian.cmake b/cmake/Modules/TestBigEndian.cmake
new file mode 100644
index 0000000..7f65cc0
--- /dev/null
+++ b/cmake/Modules/TestBigEndian.cmake
@@ -0,0 +1,15 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Macro to determine endian type
+#  test_big_endian (VARIABLE)
+#  VARIABLE - cache variable that receives the result (true if big endian).
+#  The run-time probe is skipped when HAVE_<VARIABLE> is already defined.
+macro (test_big_endian VARIABLE)
+  if (NOT DEFINED HAVE_${VARIABLE})  # Explicit test; no reliance on implicit dereference.
+    include (CheckCSourceRuns)
+    check_c_source_runs ("int main() {union {long i; char c[sizeof(long)];}
+      const u = {1}; return !!u.c[0];}" HAVE_${VARIABLE})  # Exits 0 if first byte of 1L is 0.
+    set (${VARIABLE} "${HAVE_${VARIABLE}}" CACHE INTERNAL "1 if system is big endian" FORCE)
+  endif ()
+endmacro ()
diff --git a/deinstall.cmake.in b/deinstall.cmake.in
new file mode 100644
index 0000000..307be50
--- /dev/null
+++ b/deinstall.cmake.in
@@ -0,0 +1,25 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+if (NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message (FATAL_ERROR "Cannot find install manifest")
+endif ()
+# Split the manifest text at newlines into a list, then remove each entry:
+file (READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+string (REGEX REPLACE "\n" ";" files "${files}")
+foreach (file ${files})
+  set (dest "$ENV{DESTDIR}${file}")
+  message (STATUS "Deinstalling \"${dest}\"")
+  if (EXISTS "${dest}" OR IS_SYMLINK "${dest}")
+    execute_process (
+      COMMAND "@CMAKE_COMMAND@" -E remove "${dest}"
+      OUTPUT_VARIABLE rm_out
+      RESULT_VARIABLE rm_retval
+    )
+    if (NOT "${rm_retval}" STREQUAL "0")  # Result may be an error string.
+      message (FATAL_ERROR "Problem when removing \"${dest}\"")
+    endif ()
+  else ()
+    message (STATUS "File \"${dest}\" does not exist.")
+  endif ()
+endforeach ()
diff --git a/examples/1-single-block.c b/examples/1-single-block.c
new file mode 100644
index 0000000..3fb9201
--- /dev/null
+++ b/examples/1-single-block.c
@@ -0,0 +1,50 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 1: `One-shot' resample a single block of data in memory.
+ *
+ * N.B. See example 2 for how to resample a stream (of blocks).
+ *
+ * Optional arguments are: INPUT-RATE OUTPUT-RATE
+ *
+ * With the default arguments, the output should produce lines similar to the
+ * following:
+ *
+ *  0.00  0.71  1.00  0.71 -0.00 -0.71 -1.00 -0.71
+ *
+ * Gibbs effect may be seen at the ends of the resampled signal; this is because
+ * unlike a `real-world' signal, the synthetic input signal is not band-limited.
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+const float in[] = {  /* Input: 12 cycles of a sine wave with freq. = irate/4 */
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1,
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1};
+
+int main(int argc, char const * arg[])
+{
+  /* Rates come from the command line; the default is 2x upsampling (1 -> 2). */
+  double irate = argc > 1? atof(arg[1]) : 1;
+  double orate = argc > 2? atof(arg[2]) : 2;
+  /* Output length = input length scaled by the rate ratio, rounded: */
+  size_t olen = (size_t)(AL(in) * orate / irate + .5);
+  float * out = malloc(sizeof(*out) * olen);
+  size_t odone;
+  /* Resample the whole block in one call, with the default configuration: */
+  soxr_error_t error = soxr_oneshot(
+      irate, orate, 1,                             /* Rates and # of chans. */
+      in, AL(in), NULL,                            /* Input. */
+      out, olen, &odone, NULL, NULL, NULL);        /* Output; default config. */
+  unsigned k;
+  for (k = 1; k <= odone; ++k)       /* Print the output, 8 samples per line, */
+    printf("%5.2f%c", out[k-1], " \n"[!(k&7) || k == odone]);
+  printf("%-26s %s\n", arg[0], soxr_strerror(error)); /* then the result text. */
+
+  if (argc > 3)                            /* Optionally report the versions: */
+    printf("runtime=%s API="SOXR_THIS_VERSION_STR"\n", soxr_version());
+
+  free(out);
+  return !!error;
+}
diff --git a/examples/1a-lsr.c b/examples/1a-lsr.c
new file mode 100644
index 0000000..e42e530
--- /dev/null
+++ b/examples/1a-lsr.c
@@ -0,0 +1,40 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 1a: Variant of example 1 using libsamplerate-like bindings. */
+
+#include <soxr-lsr.h>
+#include "examples-common.h"
+
+float in[] = {  /* Input: 12 cycles of a sine wave with freq. = irate/4 */
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1,
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1};
+
+int main(int argc, char const * arg[])
+{
+  /* Rates come from the command line; the default is 2x upsampling (1 -> 2). */
+  double irate = argc > 1? atof(arg[1]) : 1;
+  double orate = argc > 2? atof(arg[2]) : 2;
+  /* Output length = input length scaled by the rate ratio, rounded: */
+  size_t olen = (size_t)(AL(in) * orate / irate + .5);
+  float * out = (float *)malloc(sizeof(*out) * olen);
+  int error, k;
+  SRC_DATA data;
+
+  /* Describe the conversion, then run it in a single call: */
+  data.src_ratio = orate / irate;
+  data.data_in = in;
+  data.input_frames = AL(in);
+  data.data_out = out;
+  data.output_frames = (int)olen;
+  error = src_simple(&data, SRC_SINC_FASTEST, 1);
+
+  for (k = 1; k <= data.output_frames_gen; ++k)  /* Print 8 samples per line, */
+    printf("%5.2f%c", out[k-1], " \n"[!(k&7) || k == data.output_frames_gen]);
+  printf("%-26s %s\n", arg[0], src_strerror(error));  /* then the result text. */
+  if (argc > 3)                             /* Optionally report the version: */
+    printf("runtime=%s\n", src_get_version());
+
+  free(out);
+  return !!error;
+}
diff --git a/examples/2-stream.C b/examples/2-stream.C
new file mode 100644
index 0000000..29c1bf6
--- /dev/null
+++ b/examples/2-stream.C
@@ -0,0 +1,78 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 2: resample a raw, single-channel, floating-point data stream from
+ * stdin to stdout.
+ *
+ * The application uses the single function `soxr_process' for both input and
+ * output to/from the resampler; compared to the `input function' approach
+ * (illustrated in example 3) this requires that the application implements
+ * more logic, but one less function.
+ *
+ * Arguments are: INPUT-RATE OUTPUT-RATE
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+int main(int argc, char const * arg[])
+{
+  double const irate = argc > 1? atof(arg[1]) : 96000.;
+  double const orate = argc > 2? atof(arg[2]) : 44100.;
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates: */
+  #define buf_total_len 15000  /* In samples. */
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1); /* So ilen >= 1. */
+  size_t const ilen = buf_total_len - olen;
+  size_t const osize = sizeof(float), isize = osize;
+  void * obuf = malloc(osize * olen);
+  void * ibuf = malloc(isize * ilen);
+  size_t odone, written, need_input = 1;
+  soxr_error_t error;
+
+  /* Create a stream resampler: */
+  soxr_t soxr = soxr_create(
+      irate, orate, 1,             /* Input rate, output rate, # of channels. */
+      &error,                         /* To report any error during creation. */
+      NULL, NULL, NULL);                        /* Use configuration defaults.*/
+
+  if (!error) {                         /* If all is well, run the resampler: */
+    USE_STD_STDIO;
+                                                       /* Resample in blocks: */
+    do {
+      size_t ilen1 = 0;
+
+      if (need_input) {
+
+        /* Read one block into the buffer, ready to be resampled: */
+        ilen1 = fread(ibuf, isize, ilen, stdin);
+
+        if (!ilen1) {     /* If there is no (more) input data available, */
+          free(ibuf);     /* set ibuf to NULL, to indicate end-of-input */
+          ibuf = NULL;    /* to the resampler. */
+        }
+      }
+
+      /* Copy data from the input buffer into the resampler, and resample
+       * to produce as much output as is possible to the given output buffer: */
+      error = soxr_process(soxr, ibuf, ilen1, NULL, obuf, olen, &odone);
+
+      written = fwrite(obuf, osize, odone, stdout); /* Consume output.*/
+
+      /* If the actual amount of data output is less than that requested, and
+       * we have not already reached the end of the input data, then supply some
+       * more input next time round the loop: */
+      need_input = odone < olen && ibuf;
+
+    } while (!error && (need_input || written));
+  }
+                                                                  /* Tidy up: */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; I/O: %s\n", arg[0], soxr_strerror(error),
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/3-options-input-fn.c b/examples/3-options-input-fn.c
new file mode 100644
index 0000000..38fbb0d
--- /dev/null
+++ b/examples/3-options-input-fn.c
@@ -0,0 +1,113 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 3: extends example 2 with multiple channels, multiple datatypes,
+ * and other options.
+ *
+ * The application provides an input function, called on demand by libsoxr, in
+ * response to calls to soxr_output(); compared to the `process' approach
+ * (illustrated in example 2) this requires that the application implements
+ * less logic, but one more function.
+ *
+ * The 11 arguments (which are optional, from last to first) are:
+ *   INPUT-RATE       As example 2
+ *   OUTPUT-RATE      Ditto
+ *   NUM-CHANNELS     Number of interleaved channels
+ *   IN-DATATYPE#     0:float32 1:float64 2:int32 3:int16
+ *   OUT-DATATYPE#    Ditto
+ *   Q-RECIPE         Quality recipe (in hex) See soxr.h
+ *   Q-FLAGS          Quality flags  (in hex) See soxr.h
+ *   PASSBAND-END     %
+ *   STOPBAND-BEGIN   %
+ *   PHASE-RESPONSE   [0,100]
+ *   USE-THREADS      1 to use multi-threading (where available)
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+typedef struct {void * ibuf; size_t isize;} input_context_t;
+
+static size_t input_fn(input_context_t * p, soxr_cbuf_t * buf, size_t len)
+{
+  /* Fill the context's buffer from stdin; fewer than `len' items may arrive: */
+  size_t const got = fread(p->ibuf, p->isize, len, stdin);
+
+  /* Report the data's location via `buf'; it is NULL instead if nothing was
+   * read and the stream's error indicator is set: */
+  if (!got && ferror(stdin)) *buf = NULL;
+  else *buf = p->ibuf;
+  return got;                                   /* The count actually read. */
+}
+
+int main(int n, char const * arg[])
+{
+  char const *     const arg0 = n? --n, *arg++ : "";
+  double          const irate = n? --n, atof(*arg++) : 96000.;
+  double          const orate = n? --n, atof(*arg++) : 44100.;
+  unsigned        const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+  soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned        const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+  unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+  double   const passband_end = n? --n, atof(*arg++) : 0;
+  double const stopband_begin = n? --n, atof(*arg++) : 0;
+  double const phase_response = n? --n, atof(*arg++) : -1;
+  int       const use_threads = n? --n, atoi(*arg++) : 1;
+  soxr_datatype_t const otype = ospec & 3;  /* Low 2 bits give the datatype. */
+  /* Arguments parsed (absent ones defaulted); now build the option specs: */
+  soxr_quality_spec_t       q_spec = soxr_quality_spec(q_recipe, q_flags);
+  soxr_io_spec_t            io_spec = soxr_io_spec(itype, otype);
+  soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates: */
+  #define buf_total_len 15000  /* In samples per channel. */
+  size_t const osize = soxr_datatype_size(otype) * chans;
+  size_t const isize = soxr_datatype_size(itype) * chans;
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1); /* So ilen >= 1. */
+  size_t const ilen = buf_total_len - olen;
+  void * const obuf = malloc(osize * olen);
+  void * const ibuf = malloc(isize * ilen);
+
+  input_context_t icontext;
+  size_t odone, clips = 0;
+  soxr_error_t error;
+  soxr_t soxr;
+
+  /* Overrides (if given): */
+  if (passband_end   > 0) q_spec.passband_end   = passband_end / 100;
+  if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+  if (phase_response >=0) q_spec.phase_response = phase_response;
+  io_spec.flags = ospec & ~7u;    /* Bits above the low 3 become I/O flags. */
+
+  /* Create a stream resampler: */
+  soxr = soxr_create(
+      irate, orate, chans,         /* Input rate, output rate, # of channels. */
+      &error,                         /* To report any error during creation. */
+      &io_spec, &q_spec, &runtime_spec);
+
+  if (!error) {                      /* Register input_fn with the resampler: */
+    icontext.ibuf = ibuf, icontext.isize = isize;
+    error = soxr_set_input_fn(soxr, (soxr_input_fn_t)input_fn, &icontext, ilen);
+  }
+
+  if (!error) {                         /* If all is well, run the resampler: */
+    USE_STD_STDIO;
+                                /* Resample in blocks until fwrite returns 0: */
+    do odone = soxr_output(soxr, obuf, olen);
+    while (fwrite(obuf, osize, odone, stdout));            /* Consume output. */
+
+    error = soxr_error(soxr);            /* Check if any soxr error occurred. */
+    clips = *soxr_num_clips(soxr);     /* Can occur only with integer output. */
+  }
+                                                                  /* Tidy up: */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; %lu clips; I/O: %s\n",
+      arg0, soxr_strerror(error), (long unsigned)clips,
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/4-split-channels.c b/examples/4-split-channels.c
new file mode 100644
index 0000000..d6448aa
--- /dev/null
+++ b/examples/4-split-channels.c
@@ -0,0 +1,148 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 4: variant of examples 2 & 3, demonstrating I/O with split channels.
+ *
+ * Note that, for convenience of the demonstration, split-channel data is
+ * made available by deinterleaving data sourced from and sent to
+ * interleaved file-streams; this adds a lot of code to the example that,
+ * for purposes of understanding how to use split-channels, may safely be
+ * ignored.  In a real application, the channel-data might never be
+ * interleaved; for example, the split-channel data output from the
+ * resampler might be sent directly to digital-to-analogue converters.
+ *
+ * Note also (not shown in the examples) that split/interleaved channels may
+ * be used for input and output independently.
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+
+
+/* Copy `n' interleaved frames of `ch' channels (samples of type T) from src0
+ * into the `ch' separate per-channel arrays that dest0 points to: */
+#define DEINTERLEAVE(T) do { \
+  T * const * out = (T * const *)dest0; \
+  T const * from = src0; \
+  size_t frame; \
+  unsigned c; \
+  if (ch == 1) memcpy(out[0], from, n * sizeof(out[0][0])); \
+  else for (frame = 0; frame < n; ++frame) \
+    for (c = 0; c < ch; ++c) out[c][frame] = *from++; \
+} while (0)
+
+static void deinterleave(soxr_datatype_t data_type, void * const * dest0,
+    void const * src0, size_t n, unsigned ch)
+{
+  switch (data_type & 3) {      /* Only the low two bits select the type. */
+    case SOXR_FLOAT32: DEINTERLEAVE(float);   break;
+    case SOXR_FLOAT64: DEINTERLEAVE(double);  break;
+    case SOXR_INT32  : DEINTERLEAVE(int32_t); break;
+    case SOXR_INT16  : DEINTERLEAVE(int16_t); break;
+    default: break;
+  }
+}
+
+/* Inverse of DEINTERLEAVE: merge `ch' per-channel arrays into dest0. */
+#define INTERLEAVE(T) do { \
+  T * out = dest0; \
+  T const * const * from = (T const * const *)src0; \
+  size_t frame; unsigned c; \
+  if (ch == 1) memcpy(out, from[0], n * sizeof(out[0])); \
+  else for (frame = 0; frame < n; ++frame) \
+    for (c = 0; c < ch; ++c) *out++ = from[c][frame]; \
+} while (0)
+
+static void interleave(soxr_datatype_t data_type, void * dest0,
+  void * const * src0, size_t n, unsigned ch)
+{
+  switch (data_type & 3) {      /* Only the low two bits select the type. */
+    case SOXR_FLOAT32: INTERLEAVE(float);   break;
+    case SOXR_FLOAT64: INTERLEAVE(double);  break;
+    case SOXR_INT32  : INTERLEAVE(int32_t); break;
+    case SOXR_INT16  : INTERLEAVE(int16_t); break;
+    default: break;
+  }
+}
+
+int main(int n, char const * arg[])
+{
+  char const *     const arg0 = n? --n, *arg++ : "";
+  double          const irate = n? --n, atof(*arg++) : 96000.;
+  double          const orate = n? --n, atof(*arg++) : 44100.;
+  unsigned        const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+  soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  soxr_datatype_t const otype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+  unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+  int       const use_threads = n? --n, atoi(*arg++) : 1;
+
+  soxr_quality_spec_t const q_spec = soxr_quality_spec(q_recipe, q_flags);
+  soxr_io_spec_t const io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT);
+  soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates: */
+  #define buf_total_len 15000  /* In samples per channel. */
+  size_t const osize = soxr_datatype_size(otype) * chans;
+  size_t const isize = soxr_datatype_size(itype) * chans;
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1); /* So ilen >= 1. */
+  size_t const ilen = buf_total_len - olen;
+  /* For split channels: */
+  void * * const obuf_ptrs = malloc(sizeof(void *) * chans);
+  void * *       ibuf_ptrs = malloc(sizeof(void *) * chans);
+  char * const obufs = malloc(osize * olen), * optr = obufs;
+  char * const ibufs = malloc(isize * ilen), * iptr = ibufs;
+
+  /* For interleaved channels: */
+  char * const obuf = malloc(osize * olen);
+  char * const ibuf = malloc(isize * ilen);
+
+  size_t odone, written, need_input = 1, clips = 0;
+  soxr_error_t error;
+
+  soxr_t soxr = soxr_create(
+      irate, orate, chans, &error, &io_spec, &q_spec, &runtime_spec);
+  /* Point each channel at its own ilen-/olen-sample slice of ibufs/obufs: */
+  unsigned i;
+  for (i = 0; i < chans; ++i) {
+    ibuf_ptrs[i] = iptr;
+    obuf_ptrs[i] = optr;
+    iptr += ilen * soxr_datatype_size(itype);
+    optr += olen * soxr_datatype_size(otype);
+  }
+
+  if (!error) {
+    USE_STD_STDIO;
+
+    do {
+      size_t ilen1 = 0;
+
+      if (need_input) {
+        if (!(ilen1 = fread(ibuf, isize, ilen, stdin)))
+          free(ibuf_ptrs), ibuf_ptrs = 0; /* If none available, don't retry. */
+        else deinterleave(itype, ibuf_ptrs, ibuf, ilen1, chans);
+      }
+
+      error = soxr_process(soxr, ibuf_ptrs, ilen1, NULL, obuf_ptrs, olen, &odone);
+      interleave(otype, obuf, obuf_ptrs, odone, chans);  /* Consume output... */
+      written = fwrite(obuf, osize, odone, stdout);
+
+      need_input = odone < olen && ibuf_ptrs;
+
+    } while (!error && (need_input || written));
+
+    clips = *soxr_num_clips(soxr);     /* Can occur only with integer output. */
+  }
+                                                                  /* Tidy up: */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf), free(obufs), free(ibufs);
+  free(obuf_ptrs), free(ibuf_ptrs);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; %lu clips; I/O: %s\n",
+      arg0, soxr_strerror(error), (long unsigned)clips,
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/5-variable-rate.c b/examples/5-variable-rate.c
new file mode 100644
index 0000000..1a1c63f
--- /dev/null
+++ b/examples/5-variable-rate.c
@@ -0,0 +1,93 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 5:  Variable-rate resampling.  A test signal (held in a buffer) is
+ * resampled over a wide range of octaves.  Resampled data is sent to stdout as
+ * raw, float32 samples.  Choices of 2 test-signals and of 2 ways of varying
+ * the sample-rate are combined in a command-line option:
+ *
+ * Usage: ./5-variable-rate [0|1|2|3]
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+#define OCTAVES  5       /* Resampling range: +/- this many octaves. */
+#define OLEN     16      /* Output length in seconds. */
+#define FS       44100   /* Output sampling rate in Hz. */
+
+/* Maps an output pos in [0,1] to an ioratio in the 2^(+/-OCTAVES) range: */
+static double ioratio(double pos, int fm)
+{
+  double const p = !fm? pos :  /* fm == 0: slow sweep; else fast-changing: */
+      .5 - cos(pos * 2 * M_PI) * .4 + sin(pos * OLEN * 20 * M_PI) * .05;
+  return pow(2, 2 * OCTAVES * p - OCTAVES);
+}
+
+int main(int argc, char *arg[])
+{
+  int opt = argc <= 1? 2 : (atoi(arg[1]) & 3), saw = opt & 1, fm = opt & 2;
+  float ibuf[10 << OCTAVES], obuf[AL(ibuf)];
+  int i, wl = 2 << OCTAVES;
+  size_t ilen = AL(ibuf), need_input = 1, written;
+  size_t odone, total_odone, total_olen = OLEN * FS;
+  size_t olen1 = fm? 10 : AL(obuf); /* Small block-len if fast-changing ratio */
+  soxr_error_t error;
+
+  /* When creating a var-rate resampler, q_spec must be set as follows: */
+  soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_HQ, SOXR_VR);
+
+  /* The ratio of the given input rate and output rates must equate to the
+   * maximum I/O ratio that will be used: */
+  soxr_t soxr = soxr_create(1 << OCTAVES, 1, 1, &error, NULL, &q_spec, NULL);
+
+  if (!error) {
+    USE_STD_STDIO;
+
+    /* Generate input signal, sine or saw, with wave-length = wl: */
+    for (i = 0; i < (int)ilen; ++i)
+      ibuf[i] = (float)(saw? (i%wl)/(wl-1.)-.5 : .9 * sin(2 * M_PI * i / wl));
+
+    /* Set the initial resampling ratio (N.B. 3rd parameter = 0): */
+    soxr_set_io_ratio(soxr, ioratio(0, fm), 0);
+
+    /* Resample in blocks of size olen1 (stop early on soxr or write error): */
+    for (total_odone = 0; !error && !ferror(stdout) && total_odone < total_olen;) {
+
+      /* The last block might be shorter: */
+      size_t block_len = min(olen1, total_olen - total_odone);
+
+      /* Determine the position in [0,1] of the end of the current block: */
+      double pos = (double)(total_odone + block_len) / (double)total_olen;
+
+      /* Calculate an ioratio for this position and instruct the resampler to
+       * move smoothly to the new value, over the course of outputting the next
+       * 'block_len' samples (or give 0 for an instant change instead): */
+      soxr_set_io_ratio(soxr, ioratio(pos, fm), block_len);
+
+      /* Output the block of samples, supplying input samples as needed: */
+      do {
+        size_t len = need_input? ilen : 0;
+        error = soxr_process(soxr, ibuf, len, NULL, obuf, block_len, &odone);
+        written = fwrite(obuf, sizeof(float), odone, stdout);
+
+        /* Update counters for the current block and for the total length: */
+        block_len -= odone;
+        total_odone += odone;
+
+        /* If soxr_process did not provide the complete block, we must call it
+         * again, supplying more input samples: */
+        need_input = block_len != 0;
+
+      } while (need_input && !error && written == odone);
+
+      /* Now that the block for the current ioratio is complete, go back
+       * round the main `for' loop in order to process the next block. */
+    }
+    soxr_delete(soxr);
+  }
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; I/O: %s\n", arg[0], soxr_strerror(error),
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..862718a
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,37 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+if (${BUILD_EXAMPLES})
+  project (soxr) # Adds c++ compiler
+  file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/[1-9]-*.[cC])
+elseif (${BUILD_TESTS})
+  file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/3*.c)
+endif ()
+# The lsr example is added only when the lsr bindings are enabled:
+if (${BUILD_EXAMPLES} OR ${BUILD_TESTS})
+  if (${WITH_LSR_BINDINGS})
+    set (LSR_SOURCES 1a-lsr.c)
+  endif ()
+endif ()
+# NOTE(review): the C++ flags are given the OpenMP *C* flags here -- confirm.
+if (NOT BUILD_SHARED_LIBS AND OPENMP_FOUND)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_C_FLAGS}")
+endif ()
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}")
+link_libraries (${PROJECT_NAME})
+# One executable per source file, named after the file minus its extension:
+foreach (fe ${SOURCES} ${LSR_SOURCES})
+  get_filename_component (f ${fe} NAME_WE)
+  add_executable (${f} ${fe})
+  if (${f} STREQUAL "1a-lsr")
+    target_link_libraries (${f} soxr-lsr)
+  endif ()
+endforeach ()
+
+if (${BUILD_TESTS} AND ${WITH_LSR_BINDINGS})
+  add_test (lsr-bindings ${BIN}1a-lsr)
+endif ()
+# Install every example source and header, plus the README, under the doc dir:
+file (GLOB INSTALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[cCh])
+install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README DESTINATION ${DOC_INSTALL_DIR}/examples)
diff --git a/examples/README b/examples/README
new file mode 100644
index 0000000..a58939b
--- /dev/null
+++ b/examples/README
@@ -0,0 +1,20 @@
+SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+
+These simple examples show the different ways that an application may
+interface with soxr.  Note that real-world applications may also have to
+deal with file-formats, codecs, (more sophisticated) dithering, etc., which
+are not covered here.
+
+With the library installed, the examples may be built using commands similar
+to the following.  On unix-like systems:
+
+    cc 1-single-block.c -lsoxr
+    cc 1a-lsr.c -lsoxr-lsr
+
+or, with MSVC on MS-Windows:
+
+    cl 1-single-block.c -I"C:/Program Files/soxr/include" "C:/Program Files/soxr/lib/soxr.lib"
+    cl 1a-lsr.c -I"C:/Program Files/soxr/include" "C:/Program Files/soxr/lib/soxr-lsr.lib"
+
+IDEs may hide such commands behind configuration screens and build menus --
+where applicable, consult your IDE's user-manual.
diff --git a/examples/examples-common.h b/examples/examples-common.h
new file mode 100644
index 0000000..585fac3
--- /dev/null
+++ b/examples/examples-common.h
@@ -0,0 +1,45 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Common includes etc. for the examples.  */
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+  /* MS-Windows: put stdin and stdout into binary mode before doing raw I/O: */
+  #include <io.h>
+  #include <fcntl.h>
+  #define USE_STD_STDIO _setmode(_fileno(stdout), _O_BINARY), \
+                        _setmode(_fileno(stdin ), _O_BINARY);
+#else
+  #define USE_STD_STDIO
+#endif
+#ifndef M_PI  /* Not part of ISO C, so <math.h> may not have provided it: */
+  #define M_PI 3.14159265358979323846
+#endif
+/* Fixed-width integer names, provided here as macros over basic C types: */
+#undef int16_t
+#define int16_t short
+/* NOTE(review): when 'long' exceeds 32 bits, 'int' is assumed to be 32 -- confirm. */
+#undef int32_t
+#if LONG_MAX > 2147483647L
+  #define int32_t int
+#elif LONG_MAX < 2147483647L
+  #error this programme requires that 'long int' has at least 32-bits
+#else
+  #define int32_t long
+#endif
+/* N.B. min and max evaluate one of their arguments twice: */
+#undef min
+#undef max
+#define min(x,y) ((x)<(y)?(x):(y))
+#define max(x,y) ((x)>(y)?(x):(y))
+
+#define AL(a) (sizeof(a)/sizeof((a)[0]))  /* Array Length */
diff --git a/go b/go
new file mode 100755
index 0000000..10f3ceb
--- /dev/null
+++ b/go
@@ -0,0 +1,17 @@
+#!/bin/sh
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+case $1 in -j*) j=$1; shift;; esac    # Support -jX for parallel build/test
+
+build=$1
+test x"$build" = x && build=Release   # Build type defaults to Release
+
+rm -f CMakeCache.txt             # Prevent interference from any in-tree build
+
+mkdir -p "$build"
+cd "$build" || exit 1            # Never configure/build in the wrong directory
+
+cmake -DCMAKE_BUILD_TYPE="$build" -Wno-dev .. &&
+  make $j &&
+    (ctest $j || echo "FAILURE details in $build/Testing/Temporary/LastTest.log")
diff --git a/go.bat b/go.bat
new file mode 100644
index 0000000..c73d4c2
--- /dev/null
+++ b/go.bat
@@ -0,0 +1,27 @@
+@echo off
+rem SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+set build=%1
+if x%build% == x set build=Release
+
+rem Prevent interference from any in-tree build
+del/f CMakeCache.txt
+rem Enter the build directory; give up if that is not possible
+mkdir %build%
+cd %build% || goto end
+
+cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=%build% -Wno-dev ..
+if errorlevel 1 goto end
+
+nmake
+if errorlevel 1 goto end
+
+nmake test
+if errorlevel 1 goto error
+goto end
+
+:error
+echo FAILURE details in Testing\Temporary\LastTest.log
+
+:end
diff --git a/inst-check b/inst-check
new file mode 100755
index 0000000..8cf64b7
--- /dev/null
+++ b/inst-check
@@ -0,0 +1,25 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Sanity-check of library installed on unix-like system
+
+# This script checks the installation of the entire library (including lsr).
+#
+# Distros using three separate packages can do the following (in order):
+#
+# * Install soxr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr
+# * Install soxr-lsr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr-lsr
+# * Install the -dev pkg (i.e. examples, headers, & pkg-config)
+# * ./inst-check PATH-OF-INSTALLED-EXAMPLES-DIR (e.g. /usr/share/doc/libsoxr/examples)
+
+# Where are the example source files:
+src=$1
+test x"$src" = x && src=/usr/local/share/doc/libsoxr/examples
+
+dir="$(dirname "$(readlink -f "$0")")"  # Directory that holds this script
+"$dir"/inst-check-soxr "$src"
+"$dir"/inst-check-soxr-lsr "$src"
diff --git a/inst-check-soxr b/inst-check-soxr
new file mode 100755
index 0000000..5f923b8
--- /dev/null
+++ b/inst-check-soxr
@@ -0,0 +1,52 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Sanity-check of sub-library installed on unix-like system
+
+arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted
+dir="$(dirname "$(readlink -f "$0")")"
+
+# Find the examples:
+src="$arg"
+test x"$src" = x && src="$dir/examples"
+cd "$src"
+
+# Somewhere to put the binaries:
+tmp=`mktemp -d`
+trap 'rm -rf "$tmp"' EXIT   # Also clean up when `set -e' aborts the script
+build_examples() {
+  if [ x"$arg" = x ]; then
+    echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed
+    libs=-l$1
+    cflags=-I$dir/src
+  else
+    echo "Examples in `pwd`; using pkg-config:"
+    libs=$(pkg-config --libs $1)
+    cflags=$(pkg-config --cflags $1)
+  fi
+  for f in ?$2-*.[cC]; do
+    cc=cc; echo $f | grep -q C$ && cc=c++
+    out=$tmp/`echo $f | sed "s/.[cC]$//"`
+    cmd="$cc $cflags -o $out $f $libs -lm"
+    echo $cmd; $cmd
+  done
+}
+
+# Determine library:
+if [ "$(basename "$0")" = inst-check-soxr ]; then
+  build_examples soxr
+  gen="dd if=/dev/urandom count=1000"
+  $tmp/1-single-block 1 2 .
+  $gen 2> /dev/null | $tmp/2-stream                     2>&1 >$tmp/stdout
+  $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout
+  $gen 2> /dev/null | $tmp/4-split-channels   7 6 2 2 3 2>&1 >$tmp/stdout  # Clipping expected here
+  $gen 2> /dev/null | $tmp/5-variable-rate              2>&1 >$tmp/stdout
+else
+  build_examples soxr-lsr a # lsr has 'a' suffix on example number.
+  $tmp/1a-lsr 1 2 .
+fi
+
+# Tidy up:
+rm -rf "$tmp"
diff --git a/inst-check-soxr-lsr b/inst-check-soxr-lsr
new file mode 120000
index 0000000..ec971fb
--- /dev/null
+++ b/inst-check-soxr-lsr
@@ -0,0 +1 @@
+inst-check-soxr
\ No newline at end of file
diff --git a/msvc/README b/msvc/README
new file mode 100644
index 0000000..5a34eba
--- /dev/null
+++ b/msvc/README
@@ -0,0 +1,9 @@
+SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+
+CMake is able to configure, build (as either a DLL or a static library),
+and install libsoxr for general use on MS-Windows as on other OSs.
+However, for projects that prefer to maintain a more monolithic build
+structure using the MSVC compiler, the accompanying files may be useful.
+
+ * libsoxr.vcproj      Builds a static lib for MSVC ver >= 9 (2008).
+ * soxr-config.h       Pre-configured for a typical Win32 system.
diff --git a/msvc/libsoxr.vcproj b/msvc/libsoxr.vcproj
new file mode 100644
index 0000000..b1e1714
--- /dev/null
+++ b/msvc/libsoxr.vcproj
@@ -0,0 +1,80 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="libsoxr"
+	ProjectGUID="{af9ad75c-4785-4432-bac3-adab1e7f1192}"
+	RootNamespace="libsoxr"
+	TargetFrameworkVersion="131072"
+	>
+	<Platforms>
+		<Platform Name="Win32" />
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="4"
+			CharacterSet="2"
+			WholeProgramOptimization="0"
+			>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="."
+				PreprocessorDefinitions="_DEBUG;_USE_MATH_DEFINES;_CRT_SECURE_NO_WARNINGS;SOXR_LIB"
+				StringPooling="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				EnableFunctionLevelLinking="true"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+				CompileAs="0"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="4"
+			CharacterSet="2"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				AdditionalIncludeDirectories="."
+				PreprocessorDefinitions="NDEBUG;_USE_MATH_DEFINES;_CRT_SECURE_NO_WARNINGS;SOXR_LIB"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				WarningLevel="3"
+				CompileAs="0"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter Name="Source Files" >
+			<File RelativePath="../src/data-io.c" />
+			<File RelativePath="../src/dbesi0.c" />
+			<File RelativePath="../src/fft4g32.c" />
+			<File RelativePath="../src/fft4g64.c" />
+			<File RelativePath="../src/filter.c" />
+			<File RelativePath="../src/lsr.c" />
+			<File RelativePath="../src/pffft32s.c" />
+			<File RelativePath="../src/rate32.c" />
+			<File RelativePath="../src/rate32s.c" />
+			<File RelativePath="../src/rate64.c" />
+			<File RelativePath="../src/simd.c" />
+			<File RelativePath="../src/soxr.c" />
+			<File RelativePath="../src/vr32.c" />
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/msvc/soxr-config.h b/msvc/soxr-config.h
new file mode 100644
index 0000000..a158de4
--- /dev/null
+++ b/msvc/soxr-config.h
@@ -0,0 +1,49 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* N.B. Pre-configured for typical MS-Windows systems.  However, the normal
+ * procedure is to use the cmake configuration and build system. See INSTALL. */
+
+#if !defined soxr_config_included
+#define soxr_config_included
+
+#define HAVE_SINGLE_PRECISION 1 /* 1: build the float code paths. */
+#define HAVE_DOUBLE_PRECISION 1 /* 1: build the double code paths. */
+#define HAVE_AVFFT            0 /* 0: libavcodec FFT back-end not used. */
+#define HAVE_SIMD             1 /* SIMD sources are listed in libsoxr.vcproj. */
+#define HAVE_FENV_H           0 /* 0: <fenv.h> is not included. */
+#define HAVE_LRINT            0 /* Presumably: no lrint(); read outside this file. */
+#define WORDS_BIGENDIAN       0 /* 0: little-endian target. */
+
+#include <limits.h>
+
+#undef bool  /* Drop any existing bool/false/true macros, then use plain int. */
+#undef false
+#undef true
+#define bool int
+#define false 0
+#define true 1
+
+#undef int16_t /* Fixed-width names are provided as macros, not typedefs. */
+#undef int32_t
+#undef int64_t
+#undef uint32_t
+#undef uint64_t
+#define int16_t short
+#if LONG_MAX > 2147483647L /* long is wider than 32 bits. */
+  #define int32_t int
+  #define int64_t long
+#elif LONG_MAX < 2147483647L
+#error this library requires that 'long int' has at least 32-bits
+#else /* LONG_MAX == 2147483647L: long is the 32-bit type. */
+  #define int32_t long
+  #if defined _MSC_VER
+    #define int64_t __int64
+  #else
+    #define int64_t long long
+  #endif
+#endif
+#define uint32_t unsigned int32_t /* Expands through the int32_t macro above. */
+#define uint64_t unsigned int64_t /* Expands through the int64_t macro above. */
+
+#endif
diff --git a/soxr-config.h.in b/soxr-config.h.in
new file mode 100644
index 0000000..227bcfd
--- /dev/null
+++ b/soxr-config.h.in
@@ -0,0 +1,46 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_config_included
+#define soxr_config_included
+/* @NAME@ tokens below are placeholders, presumably filled in by CMake. */
+#define HAVE_SINGLE_PRECISION @HAVE_SINGLE_PRECISION@
+#define HAVE_DOUBLE_PRECISION @HAVE_DOUBLE_PRECISION@
+#define HAVE_AVFFT            @HAVE_AVFFT@
+#define HAVE_SIMD             @HAVE_SIMD@
+#define HAVE_FENV_H           @HAVE_FENV_H@
+#define HAVE_LRINT            @HAVE_LRINT@
+#define WORDS_BIGENDIAN       @WORDS_BIGENDIAN@
+
+#include <limits.h>
+
+#undef bool  /* bool/false/true become plain int macros. */
+#undef false
+#undef true
+#define bool int
+#define false 0
+#define true 1
+
+#undef int16_t /* Fixed-width names are macros here, chosen from LONG_MAX. */
+#undef int32_t
+#undef int64_t
+#undef uint32_t
+#undef uint64_t
+#define int16_t short
+#if LONG_MAX > 2147483647L /* long wider than 32 bits: int32_t = int. */
+  #define int32_t int
+  #define int64_t long
+#elif LONG_MAX < 2147483647L
+#error this library requires that 'long int' has at least 32-bits
+#else /* long is exactly 32 bits: int32_t = long. */
+  #define int32_t long
+  #if defined _MSC_VER
+    #define int64_t __int64
+  #else
+    #define int64_t long long
+  #endif
+#endif
+#define uint32_t unsigned int32_t /* Relies on int32_t being a macro. */
+#define uint64_t unsigned int64_t /* Relies on int64_t being a macro. */
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..56756bf
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,125 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+
+
+# Can generate vr-coefs.h but it complicates cross-compiling & non-cmake builds
+# so it is only generated when the source directory has no vr-coefs.h:
+if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vr-coefs.h)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+  set_property(SOURCE vr32.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
+  add_executable (vr-coefs vr-coefs.c)  # generator: its stdout becomes the header
+  ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
+    COMMAND vr-coefs > ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
+    DEPENDS vr-coefs)
+endif ()
+
+
+
+add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB)  # SOXR_LIB is tested by aliases.h, ccrw2.h
+
+
+
+# Libsoxr configuration:
+# RDFT32 = plain single-precision FFT source; RDFT32S = its SIMD counterpart.
+set (RDFT32 fft4g32)
+if (WITH_AVFFT AND AVCODEC_FOUND)
+  set (RDFT32 avfft32)
+  set (RDFT32S avfft32s)
+elseif (WITH_PFFFT)  # only the SIMD variant changes; RDFT32 keeps its default
+  #set (RDFT32 pffft32)
+  set (RDFT32S pffft32s)
+elseif (WITH_SIMD)
+  set (RDFT32S fft4g32s)
+endif ()
+
+if (WITH_DOUBLE_PRECISION)
+  set (DP_SOURCES rate64)
+endif ()
+
+if (WITH_SINGLE_PRECISION)
+  set (SP_SOURCES rate32 ${RDFT32})
+endif ()
+
+if (HAVE_SIMD)  # NB: keyed on HAVE_SIMD, whereas RDFT32S above is keyed on WITH_*
+  set (SIMD_SOURCES rate32s ${RDFT32S} simd)
+  foreach (source ${SIMD_SOURCES})  # only these files get the SIMD compiler flags
+    set_property (SOURCE ${source} PROPERTY COMPILE_FLAGS ${SIMD_C_FLAGS})
+  endforeach ()
+endif ()
+
+
+
+# Libsoxr:
+# (SP_SOURCES, DP_SOURCES and SIMD_SOURCES may each be empty; the rest is always built.)
+add_library (${PROJECT_NAME} ${LIB_TYPE} ${PROJECT_NAME}.c data-io dbesi0 filter fft4g64
+  ${SP_SOURCES} vr32 ${DP_SOURCES} ${SIMD_SOURCES})
+set_target_properties (${PROJECT_NAME} PROPERTIES
+  VERSION "${SO_VERSION}"
+  SOVERSION ${SO_VERSION_MAJOR}
+  INSTALL_NAME_DIR ${LIB_INSTALL_DIR}
+  LINK_INTERFACE_LIBRARIES ""
+  PUBLIC_HEADER "${PROJECT_NAME}.h")
+if (BUILD_FRAMEWORK)
+  set_target_properties (${PROJECT_NAME} PROPERTIES FRAMEWORK TRUE)
+elseif (NOT WIN32)  # pkg-config file: generated from the .pc.in template and installed
+  set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc)
+  configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}.pc.in ${TARGET_PCS})
+  install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+endif ()
+
+
+
+# LSR bindings:
+# An optional second library, ${PROJECT_NAME}-lsr, built from lsr.c and linked to libsoxr.
+if (WITH_LSR_BINDINGS)
+  set (LSR ${PROJECT_NAME}-lsr)
+  set (LSR_SO_VERSION 0.1.9)  # versioned separately from SO_VERSION
+  set (LSR_SO_VERSION_MAJOR 0)
+  add_library (${LSR} ${LIB_TYPE} lsr)
+  target_link_libraries (${LSR} ${PROJECT_NAME})
+  set_target_properties (${LSR} PROPERTIES
+    VERSION "${LSR_SO_VERSION}"
+    SOVERSION ${LSR_SO_VERSION_MAJOR}
+    INSTALL_NAME_DIR ${LIB_INSTALL_DIR}
+    LINK_INTERFACE_LIBRARIES ""
+    PUBLIC_HEADER "${LSR}.h")
+  if (BUILD_FRAMEWORK)
+    set_target_properties (${LSR} PROPERTIES FRAMEWORK TRUE)
+  elseif (NOT WIN32)  # NB: TARGET_PCS becomes one space-separated string, not a list
+    set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc")
+    configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc)
+    install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+  endif ()
+endif ()
+
+
+
+# Installation (from build from source):
+# (${LSR} is only set inside the WITH_LSR_BINDINGS block; otherwise it expands to nothing.)
+install (TARGETS ${PROJECT_NAME} ${LSR}
+  FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR}
+  LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+  RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+  ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+  PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR})
+
+
+
+# Packaging (for unix-like distributions):
+# NOTE(review): the LOCATION target property is restricted by policy CMP0026 in newer CMake - confirm.
+get_property (LIB1 TARGET ${PROJECT_NAME} PROPERTY LOCATION)
+if (BUILD_SHARED_LIBS)  # replace the bare path by its two version-suffixed names
+  set (LIB1 ${LIB1}.${SO_VERSION_MAJOR} ${LIB1}.${SO_VERSION})
+endif ()
+list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}.h")
+if (WITH_LSR_BINDINGS)
+  get_property (LIB2 TARGET ${LSR} PROPERTY LOCATION)
+  if (BUILD_SHARED_LIBS)
+    set (LIB2 ${LIB2}.${LSR_SO_VERSION_MAJOR} ${LIB2}.${LSR_SO_VERSION})
+  endif ()
+  list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.h")
+endif ()
+set (TARGET_LIBS ${LIB1} ${LIB2})  # TARGET_LIBS/TARGET_HEADERS/TARGET_PCS: presumably used by the templates below
+configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr.src)
+configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr-dev.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr-dev.src)
diff --git a/src/aliases.h b/src/aliases.h
new file mode 100644
index 0000000..eb42bdc
--- /dev/null
+++ b/src/aliases.h
@@ -0,0 +1,37 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined SOXR_LIB /* Only when building the library itself (-DSOXR_LIB). */
+
+#define lsx_bessel_I_0                 _soxr_bessel_I_0
+#define lsx_cdft_f                     _soxr_cdft_f
+#define lsx_cdft                       _soxr_cdft
+#define lsx_clear_fft_cache_f          _soxr_clear_fft_cache_f
+#define lsx_clear_fft_cache            _soxr_clear_fft_cache
+#define lsx_ddct_f                     _soxr_ddct_f
+#define lsx_ddct                       _soxr_ddct
+#define lsx_ddst_f                     _soxr_ddst_f
+#define lsx_ddst                       _soxr_ddst
+#define lsx_design_lpf                 _soxr_design_lpf
+#define lsx_dfct_f                     _soxr_dfct_f
+#define lsx_dfct                       _soxr_dfct
+#define lsx_dfst_f                     _soxr_dfst_f
+#define lsx_dfst                       _soxr_dfst
+#define lsx_fir_to_phase               _soxr_fir_to_phase
+#define lsx_init_fft_cache_f           _soxr_init_fft_cache_f
+#define lsx_init_fft_cache             _soxr_init_fft_cache
+#define lsx_kaiser_beta                _soxr_kaiser_beta
+#define lsx_kaiser_params              _soxr_kaiser_params
+#define lsx_make_lpf                   _soxr_make_lpf
+#define lsx_ordered_convolve_f         _soxr_ordered_convolve_f
+#define lsx_ordered_convolve           _soxr_ordered_convolve
+#define lsx_ordered_partial_convolve_f _soxr_ordered_partial_convolve_f
+#define lsx_ordered_partial_convolve   _soxr_ordered_partial_convolve
+#define lsx_rdft_f                     _soxr_rdft_f
+#define lsx_rdft                       _soxr_rdft
+#define lsx_safe_cdft_f                _soxr_safe_cdft_f
+#define lsx_safe_cdft                  _soxr_safe_cdft
+#define lsx_safe_rdft_f                _soxr_safe_rdft_f
+#define lsx_safe_rdft                  _soxr_safe_rdft
+/* NOTE(review): ISO C reserves file-scope names beginning with '_' (7.1.3). */
+#endif /* SOXR_LIB: every lsx_* name above is renamed to _soxr_* */
diff --git a/src/avfft32.c b/src/avfft32.c
new file mode 100644
index 0000000..5be13d2
--- /dev/null
+++ b/src/avfft32.c
@@ -0,0 +1,27 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <math.h>
+#include <libavcodec/avfft.h>
+#include "filter.h"
+/* Setup helpers hand av_rdft_init() log2(len), rounded to the nearest int. */
+static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
+static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
+static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
+static int multiplier(void) {return 2;}
+static void nothing(void) {}
+/* Callback table; slot order is presumably fixed by its user -- TODO confirm. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft32_cb[] = {
+  (fn_t)forward_setup,
+  (fn_t)backward_setup,
+  (fn_t)av_rdft_end,
+  (fn_t)rdft, /* The same in-place transform fills four consecutive slots. */
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)_soxr_ordered_convolve_f,
+  (fn_t)_soxr_ordered_partial_convolve_f,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+};
diff --git a/src/avfft32s.c b/src/avfft32s.c
new file mode 100644
index 0000000..75e485e
--- /dev/null
+++ b/src/avfft32s.c
@@ -0,0 +1,27 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <math.h>
+#include <libavcodec/avfft.h>
+#include "simd.h"
+/* Same as avfft32.c except for simd.h and the two *_simd convolve entries. */
+static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
+static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
+static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
+static int multiplier(void) {return 2;}
+static void nothing(void) {}
+/* Callback table; slot order is presumably fixed by its user -- TODO confirm. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft32s_cb[] = {
+  (fn_t)forward_setup,
+  (fn_t)backward_setup,
+  (fn_t)av_rdft_end,
+  (fn_t)rdft, /* The same in-place transform fills four consecutive slots. */
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)_soxr_ordered_convolve_simd,
+  (fn_t)_soxr_ordered_partial_convolve_simd,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+};
diff --git a/src/ccrw2.h b/src/ccrw2.h
new file mode 100644
index 0000000..b42185b
--- /dev/null
+++ b/src/ccrw2.h
@@ -0,0 +1,75 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Concurrent Control with "Readers" and "Writers", P.J. Courtois et al, 1971 */
+
+#if !defined ccrw2_included
+#define ccrw2_included
+
+#if defined SOXR_LIB
+#include "internal.h"
+#endif
+
+#if defined _OPENMP
+
+#include <omp.h>
+
+typedef struct {
+  int readcount, writecount; /* initial value = 0 */
+  omp_lock_t mutex_1, mutex_2, mutex_3, w, r; /* initial value = 1 */
+} ccrw2_t; /* Problem #2: `writers-preference' */
+/* Each macro takes a ccrw2_t lvalue (not a pointer); (p) is parenthesised. */
+#define ccrw2_become_reader(p) do {\
+  omp_set_lock(&(p).mutex_3);\
+    omp_set_lock(&(p).r);\
+      omp_set_lock(&(p).mutex_1);\
+        if (++(p).readcount == 1) omp_set_lock(&(p).w); /* 1st reader locks w */\
+      omp_unset_lock(&(p).mutex_1);\
+    omp_unset_lock(&(p).r);\
+  omp_unset_lock(&(p).mutex_3);\
+} while (0)
+#define ccrw2_cease_reading(p) do {\
+  omp_set_lock(&(p).mutex_1);\
+    if (!--(p).readcount) omp_unset_lock(&(p).w); /* last reader releases w */\
+  omp_unset_lock(&(p).mutex_1);\
+} while (0)
+#define ccrw2_become_writer(p) do {\
+  omp_set_lock(&(p).mutex_2);\
+    if (++(p).writecount == 1) omp_set_lock(&(p).r); /* 1st writer locks r */\
+  omp_unset_lock(&(p).mutex_2);\
+  omp_set_lock(&(p).w);\
+} while (0)
+#define ccrw2_cease_writing(p) do {\
+  omp_unset_lock(&(p).w);\
+  omp_set_lock(&(p).mutex_2);\
+    if (!--(p).writecount) omp_unset_lock(&(p).r); /* last writer releases r */\
+  omp_unset_lock(&(p).mutex_2);\
+} while (0)
+#define ccrw2_init(p) do { /* NB: the two counters are not reset here */\
+  omp_init_lock(&(p).mutex_1);\
+  omp_init_lock(&(p).mutex_2);\
+  omp_init_lock(&(p).mutex_3);\
+  omp_init_lock(&(p).w);\
+  omp_init_lock(&(p).r);\
+} while (0)
+#define ccrw2_clear(p) do { /* destroys the locks in reverse order of init */\
+  omp_destroy_lock(&(p).r);\
+  omp_destroy_lock(&(p).w);\
+  omp_destroy_lock(&(p).mutex_3);\
+  omp_destroy_lock(&(p).mutex_2);\
+  omp_destroy_lock(&(p).mutex_1);\
+} while (0)
+
+#else /* No OpenMP: every operation just evaluates its argument and discards it. */
+
+typedef int ccrw2_t;
+#define ccrw2_become_reader(x) (void)(x)
+#define ccrw2_cease_reading(x) (void)(x)
+#define ccrw2_become_writer(x) (void)(x)
+#define ccrw2_cease_writing(x) (void)(x)
+#define ccrw2_init(x) (void)(x)
+#define ccrw2_clear(x) (void)(x)
+
+#endif /* _OPENMP */
+
+#endif
diff --git a/src/data-io.c b/src/data-io.c
new file mode 100644
index 0000000..1cd8e7f
--- /dev/null
+++ b/src/data-io.c
@@ -0,0 +1,252 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <limits.h>
+#include <math.h>
+#include <string.h>
+
+#include "data-io.h"
+#include "internal.h"
+
+
+
+#define DEINTERLEAVE_FROM(T,flag) do { /* Uses caller's dest, src0, n, ch */ \
+  unsigned i; \
+  size_t j; \
+  T const * src = *src0; /* T: type of the samples being read */ \
+  if (ch > 1) /* Input is frame-major: one sample per channel in turn */ \
+    for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \
+  else if (flag) memcpy(dest[0], src, n * sizeof(T)), src = &src[n]; /* flag: T is the dest type */ \
+  else for (j = 0; j < n; dest[0][j++] = (DEINTERLEAVE_TO)*src++); \
+  *src0 = src; /* Advance caller's read pointer past the consumed input */ \
+} while (0)
+
+
+
+#if HAVE_DOUBLE_PRECISION /* Variant whose destination arrays are double */
+void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
+    soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
+{ /* Fills dest[0..ch-1][0..n-1] from *src0 and advances *src0 */
+#define DEINTERLEAVE_TO double /* Cast target read by DEINTERLEAVE_FROM */
+  switch (data_type & 3) { /* Only the low 2 bits select the sample format */
+    case SOXR_FLOAT32: DEINTERLEAVE_FROM(float, 0); break;
+    case SOXR_FLOAT64: DEINTERLEAVE_FROM(double, 1); break; /* Same type: memcpy if ch == 1 */
+    case SOXR_INT32:   DEINTERLEAVE_FROM(int32_t, 0); break;
+    case SOXR_INT16:   DEINTERLEAVE_FROM(int16_t, 0); break;
+    default: break;
+  }
+}
+#endif
+
+
+
+#if HAVE_SINGLE_PRECISION /* Variant whose destination arrays are float */
+void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
+    soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
+{ /* Fills dest[0..ch-1][0..n-1] from *src0 and advances *src0 */
+#undef DEINTERLEAVE_TO /* May still be defined as double by the function above */
+#define DEINTERLEAVE_TO float
+  switch (data_type & 3) { /* Only the low 2 bits select the sample format */
+    case SOXR_FLOAT32: DEINTERLEAVE_FROM(float, 1); break; /* Same type: memcpy if ch == 1 */
+    case SOXR_FLOAT64: DEINTERLEAVE_FROM(double, 0); break;
+    case SOXR_INT32:   DEINTERLEAVE_FROM(int32_t, 0); break;
+    case SOXR_INT16:   DEINTERLEAVE_FROM(int16_t, 0); break;
+    default: break;
+  }
+}
+#endif
+
+
+
+#include "rint.h"
+
+#if HAVE_FENV_H /* <fenv.h> available: thin wrappers around the standard calls */
+  #include <fenv.h>
+  #define fe_test_invalid() fetestexcept(FE_INVALID)
+  #define fe_clear_invalid() feclearexcept(FE_INVALID)
+#elif defined _MSC_VER /* No <fenv.h>: MSVC-specific equivalents */
+  #define FE_INVALID 1 /* Used below as a mask on the x87 status word (bit 0) */
+  #if defined _WIN64
+    #include <float.h>
+    #define fe_test_invalid() (_statusfp() & _SW_INVALID)
+    #define fe_clear_invalid _clearfp /* FIXME clears all */
+  #else /* 32-bit MSVC: inline assembly on the x87 status word */
+  static __inline int fe_test_invalid()
+  {
+    short status_word;
+    __asm fnstsw status_word
+    return status_word & FE_INVALID; /* Non-zero if the invalid flag is set */
+  }
+
+  static __inline int fe_clear_invalid()
+  {
+    int16_t status[14]; /* Buffer for the stored FPU environment (28 bytes) */
+    __asm fnstenv status
+    status[2] &= ~FE_INVALID; /* Word 2 is presumably the status word -- confirm */
+    __asm fldenv status
+    return 0;
+  }
+  #endif
+#endif /* Otherwise neither fe_test_invalid nor fe_clear_invalid is defined here */
+
+
+
+#if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__ /* C99+ only */
+  #if __STDC_VERSION__ >= 199901L /* FP status flags are presumably read below */
+    #pragma STDC FENV_ACCESS ON
+  #endif
+#endif
+
+#if HAVE_DOUBLE_PRECISION /* rint-clip.h acts as a template, included 3 times */
+#define FLOATX double /* Presumably the source sample type used by rint-clip.h */
+/* Instance 1 of 3: double -> int32_t. */
+#define LSX_RINT_CLIP_2 lsx_rint32_clip_2 /* Called for ch > 1 */
+#define LSX_RINT_CLIP lsx_rint32_clip /* Called for ch == 1 */
+#define RINT_CLIP rint32_clip
+#define RINT rint32
+#if defined FPU_RINT32
+  #define FPU_RINT
+#endif
+#define RINT_T int32_t
+#define RINT_MAX 2147483647L
+#include "rint-clip.h" /* Presumably #undefs the macros above: they are redefined next */
+/* Instance 2 of 3: double -> int16_t. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2
+#define LSX_RINT_CLIP lsx_rint16_clip
+#define RINT_CLIP rint16_clip
+#define RINT rint16
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#include "rint-clip.h"
+/* Instance 3 of 3: double -> int16_t with DITHER defined. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither
+#define LSX_RINT_CLIP lsx_rint16_clip_dither
+#define RINT_CLIP rint16_clip_dither
+#define RINT rint16
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#define DITHER
+#include "rint-clip.h"
+
+#undef FLOATX
+#endif
+
+
+
+#if HAVE_SINGLE_PRECISION /* Same three instances as for double, with _f names */
+#define FLOATX float /* Presumably the source sample type used by rint-clip.h */
+/* Instance 1 of 3: float -> int32_t. */
+#define LSX_RINT_CLIP_2 lsx_rint32_clip_2_f /* Called for ch > 1 */
+#define LSX_RINT_CLIP lsx_rint32_clip_f /* Called for ch == 1 */
+#define RINT_CLIP rint32_clip_f
+#define RINT rint32
+#if defined FPU_RINT32
+  #define FPU_RINT
+#endif
+#define RINT_T int32_t
+#define RINT_MAX 2147483647L
+#include "rint-clip.h" /* Presumably #undefs the macros above: they are redefined next */
+/* Instance 2 of 3: float -> int16_t. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_f
+#define LSX_RINT_CLIP lsx_rint16_clip_f
+#define RINT_CLIP rint16_clip_f
+#define RINT rint16
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#include "rint-clip.h"
+/* Instance 3 of 3: float -> int16_t with DITHER defined. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither_f
+#define LSX_RINT_CLIP lsx_rint16_clip_dither_f
+#define RINT_CLIP rint16_clip_dither_f
+#define RINT rint16
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#define DITHER
+#include "rint-clip.h"
+
+#undef FLOATX
+#endif
+
+#if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__ /* Same test as for ON */
+  #if __STDC_VERSION__ >= 199901L /* Closes the FENV_ACCESS region opened earlier */
+    #pragma STDC FENV_ACCESS OFF
+  #endif
+#endif
+
+
+
+#define INTERLEAVE_TO(T,flag) do { /* Uses caller's dest0, src, n, ch */ \
+  unsigned i; \
+  size_t j; \
+  T * dest = *dest0; /* T: type of the samples being written */ \
+  if (ch > 1) /* Frame-major output: one sample from each channel in turn */ \
+  for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) *dest++ = (T)src[i][j]; \
+  else if (flag) memcpy(dest, src[0], n * sizeof(T)), dest = &dest[n]; /* flag: T is the src type */ \
+  else for (j = 0; j < n; *dest++ = (T)src[0][j++]); \
+  *dest0 = dest; /* Advance caller's write pointer past the output */ \
+  return 0; /* NB: returns from the enclosing function */ \
+} while (0)
+
+#if HAVE_DOUBLE_PRECISION
+size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
+  double const * const * src, size_t n, unsigned ch, unsigned long * seed)
+{
+  /* Returns the clip count reported by the converter (always 0 for float output). */
+  int const mono = ch == 1;
+  switch (data_type & 3) {
+    case SOXR_FLOAT32: INTERLEAVE_TO(float, 0);  /* returns 0 from inside the macro */
+    case SOXR_FLOAT64: INTERLEAVE_TO(double, 1); /* likewise; memcpy when mono */
+
+    case SOXR_INT32:
+      if (mono) return lsx_rint32_clip(dest0, src[0], n);
+      return lsx_rint32_clip_2(dest0, src, ch, n);
+
+    case SOXR_INT16: /* A non-NULL seed selects the dithering variants. */
+      if (seed && mono) return lsx_rint16_clip_dither(dest0, src[0], n, seed);
+      if (seed) return lsx_rint16_clip_2_dither(dest0, src, ch, n, seed);
+      if (mono) return lsx_rint16_clip(dest0, src[0], n);
+      return lsx_rint16_clip_2(dest0, src, ch, n);
+
+    default: break;
+  }
+  return 0;
+}
+#endif
+
+#if HAVE_SINGLE_PRECISION
+size_t /* clips */ _soxr_interleave_f(soxr_datatype_t data_type, void * * dest0,
+  float const * const * src, size_t n, unsigned ch, unsigned long * seed)
+{
+  /* Float-source twin of _soxr_interleave: same dispatch, _f converters. */
+  int const single_channel = ch == 1;
+  switch (data_type & 3) {
+    case SOXR_FLOAT32: INTERLEAVE_TO(float, 1);  /* returns 0; memcpy when 1 channel */
+    case SOXR_FLOAT64: INTERLEAVE_TO(double, 0); /* returns 0 from inside the macro */
+
+    case SOXR_INT32:
+      if (single_channel) return lsx_rint32_clip_f(dest0, src[0], n);
+      return lsx_rint32_clip_2_f(dest0, src, ch, n);
+
+    case SOXR_INT16: /* A non-NULL seed selects the dithering variants. */
+      if (seed && single_channel) return lsx_rint16_clip_dither_f(dest0, src[0], n, seed);
+      if (seed) return lsx_rint16_clip_2_dither_f(dest0, src, ch, n, seed);
+      if (single_channel) return lsx_rint16_clip_f(dest0, src[0], n);
+      return lsx_rint16_clip_2_f(dest0, src, ch, n);
+
+    default: break;
+  }
+  return 0;
+}
+#endif
diff --git a/src/data-io.h b/src/data-io.h
new file mode 100644
index 0000000..83a0a13
--- /dev/null
+++ b/src/data-io.h
@@ -0,0 +1,39 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_data_io_included
+#define soxr_data_io_included
+
+#include "soxr.h"
+/* Split n samples per channel of interleaved input into ch double arrays. */
+void _soxr_deinterleave(
+    double * * dest,            /* out: dest[channel][sample] */
+    soxr_datatype_t data_type,  /* format of the input samples */
+    void const * * src0,        /* in/out: advanced past the input consumed */
+    size_t n,
+    unsigned ch);
+/* As _soxr_deinterleave, but into float arrays. */
+void _soxr_deinterleave_f(
+    float * * dest,
+    soxr_datatype_t data_type,
+    void const * * src0,
+    size_t n,
+    unsigned ch);
+/* Inverse: merge ch double arrays into interleaved output of type data_type. */
+size_t /* clips */ _soxr_interleave(
+    soxr_datatype_t data_type,
+    void * * dest,              /* in/out: write position */
+    double const * const * src, /* src[channel][sample] */
+    size_t n,
+    unsigned ch,
+    unsigned long * seed);      /* non-NULL: use dither for 16-bit output */
+/* As _soxr_interleave, but from float arrays. */
+size_t /* clips */ _soxr_interleave_f(
+    soxr_datatype_t data_type,
+    void * * dest,
+    float const * const * src,
+    size_t n,
+    unsigned ch,
+    unsigned long * seed);
+
+#endif
diff --git a/src/dbesi0.c b/src/dbesi0.c
new file mode 100644
index 0000000..654216e
--- /dev/null
+++ b/src/dbesi0.c
@@ -0,0 +1,149 @@
+/*  Copyright(C) 1996 Takuya OOURA
+
+You may use, copy, modify this code for any purpose and
+without fee.
+
+Package home:  http://www.kurims.kyoto-u.ac.jp/~ooura/bessel.html
+*/
+
+#include "filter.h"
+#define dbesi0 lsx_bessel_I_0
+
+/* Bessel I_0(x) function in double precision */
+
+#include <math.h>
+
+double dbesi0(double x) /* Bessel I_0(x); depends only on |x| */
+{
+    int k;          /* offset of the selected coefficient segment */
+    double w, t, y; /* w = |x|, t = polynomial argument, y = result */
+    static const double a[65] = { /* |x| < 8.5: 5 segments of 13 coefficients */
+        8.5246820682016865877e-11, 2.5966600546497407288e-9,
+        7.9689994568640180274e-8, 1.9906710409667748239e-6,
+        4.0312469446528002532e-5, 6.4499871606224265421e-4,
+        0.0079012345761930579108, 0.071111111109207045212,
+        0.444444444444724909, 1.7777777777777532045,
+        4.0000000000000011182, 3.99999999999999998,
+        1.0000000000000000001,
+        1.1520919130377195927e-10, 2.2287613013610985225e-9,
+        8.1903951930694585113e-8, 1.9821560631611544984e-6,
+        4.0335461940910133184e-5, 6.4495330974432203401e-4,
+        0.0079013012611467520626, 0.071111038160875566622,
+        0.44444450319062699316, 1.7777777439146450067,
+        4.0000000132337935071, 3.9999999968569015366,
+        1.0000000003426703174,
+        1.5476870780515238488e-10, 1.2685004214732975355e-9,
+        9.2776861851114223267e-8, 1.9063070109379044378e-6,
+        4.0698004389917945832e-5, 6.4370447244298070713e-4,
+        0.0079044749458444976958, 0.071105052411749363882,
+        0.44445280640924755082, 1.7777694934432109713,
+        4.0000055808824003386, 3.9999977081165740932,
+        1.0000004333949319118,
+        2.0675200625006793075e-10, -6.1689554705125681442e-10,
+        1.2436765915401571654e-7, 1.5830429403520613423e-6,
+        4.2947227560776583326e-5, 6.3249861665073441312e-4,
+        0.0079454472840953930811, 0.070994327785661860575,
+        0.44467219586283000332, 1.7774588182255374745,
+        4.0003038986252717972, 3.9998233869142057195,
+        1.0000472932961288324,
+        2.7475684794982708655e-10, -3.8991472076521332023e-9,
+        1.9730170483976049388e-7, 5.9651531561967674521e-7,
+        5.1992971474748995357e-5, 5.7327338675433770752e-4,
+        0.0082293143836530412024, 0.069990934858728039037,
+        0.44726764292723985087, 1.7726685170014087784,
+        4.0062907863712704432, 3.9952750700487845355,
+        1.0016354346654179322
+    };
+    static const double b[70] = { /* 8.5 <= |x| < 12.5: 5 segments of 14 */
+        6.7852367144945531383e-8, 4.6266061382821826854e-7,
+        6.9703135812354071774e-6, 7.6637663462953234134e-5,
+        7.9113515222612691636e-4, 0.0073401204731103808981,
+        0.060677114958668837046, 0.43994941411651569622,
+        2.7420017097661750609, 14.289661921740860534,
+        59.820609640320710779, 188.78998681199150629,
+        399.8731367825601118, 427.56411572180478514,
+        1.8042097874891098754e-7, 1.2277164312044637357e-6,
+        1.8484393221474274861e-5, 2.0293995900091309208e-4,
+        0.0020918539850246207459, 0.019375315654033949297,
+        0.15985869016767185908, 1.1565260527420641724,
+        7.1896341224206072113, 37.354773811947484532,
+        155.80993164266268457, 489.5211371158540918,
+        1030.9147225169564806, 1093.5883545113746958,
+        4.8017305613187493564e-7, 3.261317843912380074e-6,
+        4.9073137508166159639e-5, 5.3806506676487583755e-4,
+        0.0055387918291051866561, 0.051223717488786549025,
+        0.42190298621367914765, 3.0463625987357355872,
+        18.895299447327733204, 97.915189029455461554,
+        407.13940115493494659, 1274.3088990480582632,
+        2670.9883037012547506, 2815.7166284662544712,
+        1.2789926338424623394e-6, 8.6718263067604918916e-6,
+        1.3041508821299929489e-4, 0.001428224737372747892,
+        0.014684070635768789378, 0.13561403190404185755,
+        1.1152592585977393953, 8.0387088559465389038,
+        49.761318895895479206, 257.2684232313529138,
+        1066.8543146269566231, 3328.3874581009636362,
+        6948.8586598121634874, 7288.4893398212481055,
+        3.409350368197032893e-6, 2.3079025203103376076e-5,
+        3.4691373283901830239e-4, 0.003794994977222908545,
+        0.038974209677945602145, 0.3594948380414878371,
+        2.9522878893539528226, 21.246564609514287056,
+        131.28727387146173141, 677.38107093296675421,
+        2802.3724744545046518, 8718.5731420798254081,
+        18141.348781638832286, 18948.925349296308859
+    };
+    static const double c[45] = { /* |x| >= 12.5: 5 segments of 9 coefficients */
+        2.5568678676452702768e-15, 3.0393953792305924324e-14,
+        6.3343751991094840009e-13, 1.5041298011833009649e-11,
+        4.4569436918556541414e-10, 1.746393051427167951e-8,
+        1.0059224011079852317e-6, 1.0729838945088577089e-4,
+        0.05150322693642527738,
+        5.2527963991711562216e-15, 7.202118481421005641e-15,
+        7.2561421229904797156e-13, 1.482312146673104251e-11,
+        4.4602670450376245434e-10, 1.7463600061788679671e-8,
+        1.005922609132234756e-6, 1.0729838937545111487e-4,
+        0.051503226936437300716,
+        1.3365917359358069908e-14, -1.2932643065888544835e-13,
+        1.7450199447905602915e-12, 1.0419051209056979788e-11,
+        4.58047881980598326e-10, 1.7442405450073548966e-8,
+        1.0059461453281292278e-6, 1.0729837434500161228e-4,
+        0.051503226940658446941,
+        5.3771611477352308649e-14, -1.1396193006413731702e-12,
+        1.2858641335221653409e-11, -5.9802086004570057703e-11,
+        7.3666894305929510222e-10, 1.6731837150730356448e-8,
+        1.0070831435812128922e-6, 1.0729733111203704813e-4,
+        0.051503227360726294675,
+        3.7819492084858931093e-14, -4.8600496888588034879e-13,
+        1.6898350504817224909e-12, 4.5884624327524255865e-11,
+        1.2521615963377513729e-10, 1.8959658437754727957e-8,
+        1.0020716710561353622e-6, 1.073037119856927559e-4,
+        0.05150322383300230775
+    };
+    w = fabs(x);
+    if (!(w - w == 0)) return w; /* NaN -> NaN, Inf -> +Inf; avoids (int)NaN */
+    if (w < 8.5) { /* Polynomial in t = x^2/16; segment = (int)t, 0..4 */
+        t = w * w * 0.0625;
+        k = 13 * ((int) t);
+        y = (((((((((((a[k] * t + a[k + 1]) * t +
+            a[k + 2]) * t + a[k + 3]) * t + a[k + 4]) * t +
+            a[k + 5]) * t + a[k + 6]) * t + a[k + 7]) * t +
+            a[k + 8]) * t + a[k + 9]) * t + a[k + 10]) * t +
+            a[k + 11]) * t + a[k + 12];
+    } else if (w < 12.5) { /* Polynomial in frac(|x|); segment = (int)|x| - 8 */
+        k = (int) w;
+        t = w - k;
+        k = 14 * (k - 8);
+        y = ((((((((((((b[k] * t + b[k + 1]) * t +
+            b[k + 2]) * t + b[k + 3]) * t + b[k + 4]) * t +
+            b[k + 5]) * t + b[k + 6]) * t + b[k + 7]) * t +
+            b[k + 8]) * t + b[k + 9]) * t + b[k + 10]) * t +
+            b[k + 11]) * t + b[k + 12]) * t + b[k + 13];
+    } else { /* Polynomial in t = 60/|x| (segment = (int)t), times sqrt(t)*exp(|x|) */
+        t = 60 / w;
+        k = 9 * ((int) t);
+        y = ((((((((c[k] * t + c[k + 1]) * t +
+            c[k + 2]) * t + c[k + 3]) * t + c[k + 4]) * t +
+            c[k + 5]) * t + c[k + 6]) * t + c[k + 7]) * t +
+            c[k + 8]) * sqrt(t) * exp(w);
+    }
+    return y;
+}
diff --git a/src/fft4g.c b/src/fft4g.c
new file mode 100644
index 0000000..5fae8a6
--- /dev/null
+++ b/src/fft4g.c
@@ -0,0 +1,1352 @@
+/* Copyright Takuya OOURA, 1996-2001.
+
+You may use, copy, modify and distribute this code for any
+purpose (include commercial use) and without fee.  Please
+refer to this package when you modify this code.
+
+Package home:  http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :4, 2
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]),
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]),
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            cdft(2*n, -1, a, ip, w);
+        is
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            rdft(n, 1, a, ip, w);
+        is
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddct(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddst(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            dfst(n, a, t, ip, w);
+        is
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when a larger table is required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+#include <math.h>
+#include "fft4g.h"
+
+#ifdef FFT4G_FLOAT  /* single-precision build of this translation unit */
+  #define double float  /* every "double" below is compiled as float */
+  #define one_half 0.5f
+
+#if defined _MSC_VER  /* here sin/cos/atan still call the double routines; the result is cast */
+  #define sin   (float)sin
+  #define cos   (float)cos
+  #define atan  (float)atan
+#else  /* otherwise the float variants are called directly */
+  #define sin   sinf
+  #define cos   cosf
+  #define atan  atanf
+#endif
+
+  #define cdft  lsx_cdft_f  /* public entry points: lsx_ prefix plus _f suffix */
+  #define rdft  lsx_rdft_f
+  #define ddct  lsx_ddct_f
+  #define ddst  lsx_ddst_f
+  #define dfct  lsx_dfct_f
+  #define dfst  lsx_dfst_f
+#else  /* double-precision build */
+  #define one_half 0.5
+  #define cdft  lsx_cdft  /* public entry points: lsx_ prefix only */
+  #define rdft  lsx_rdft
+  #define ddct  lsx_ddct
+  #define ddst  lsx_ddst
+  #define dfct  lsx_dfct
+  #define dfst  lsx_dfst
+#endif
+
+static void bitrv2conj(int n, int *ip, double *a);  /* reorder a[] and negate every odd-index (imaginary) slot */
+static void bitrv2(int n, int *ip, double *a);  /* reorder a[] only */
+static void cft1st(int n, double *a, double const *w);  /* first pass of cftfsub/cftbsub when n > 8 */
+static void cftbsub(int n, double *a, double const *w);  /* backward counterpart of cftfsub */
+static void cftfsub(int n, double *a, double const *w);  /* forward butterfly passes */
+static void cftmdl(int n, int l, double *a, double const *w);  /* later passes, block length l */
+static void dctsub(int n, double *a, int nc, double const *c);
+static void dstsub(int n, double *a, int nc, double const *c);
+static void makect(int nc, int *ip, double *c);  /* fills c[0..nc-1], records nc in ip[1] */
+static void makewt(int nw, int *ip, double *w);  /* fills w[0..nw-1], records nw in ip[0] */
+static void rftbsub(int n, double *a, int nc, double const *c);
+static void rftfsub(int n, double *a, int nc, double const *c);
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    /* ip[0] holds the size of the cos/sin table currently stored in w[]. */
+    if ((ip[0] << 2) < n) {
+        makewt(n >> 2, ip, w);
+    }
+    if (n == 4) {
+        cftfsub(n, a, w);   /* isgn is not consulted on this path */
+    } else if (n > 4 && isgn < 0) {
+        bitrv2conj(n, ip + 2, a);
+        cftbsub(n, a, w);
+    } else if (n > 4) {
+        bitrv2(n, ip + 2, a);
+        cftfsub(n, a, w);
+    }
+    /* n < 4: a[] is left untouched. */
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    int wt_len, ct_len;   /* sizes of the two tables packed back to back in w[] */
+    double diff;
+
+    wt_len = ip[0];
+    if ((wt_len << 2) < n) {
+        wt_len = n >> 2;
+        makewt(wt_len, ip, w);
+    }
+    ct_len = ip[1];       /* read after makewt, which rewrites ip[1] */
+    if ((ct_len << 2) < n) {
+        ct_len = n >> 2;
+        makect(ct_len, ip, w + wt_len);
+    }
+    if (isgn < 0) {       /* backward direction: fold a[0]/a[1] first */
+        a[1] = one_half * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a, ct_len, w + wt_len);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    } else {              /* forward direction: unfold a[0]/a[1] last */
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, ct_len, w + wt_len);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        diff = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = diff;
+    }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w)
+{   /* In-place transform of a[0..n-1]; isgn selects which of the two paths around dctsub runs. */
+    int j, nw, nc;
+    double xr;
+
+    nw = ip[0];
+    if (n > (nw << 2)) {  /* first table too small for this n: rebuild it */
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {  /* second table (stored at w + nw) must hold n entries here */
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {  /* pre-processing that runs before dctsub */
+        xr = a[n - 1];  /* saved: a[n - 1] is overwritten by the first loop iteration */
+        for (j = n - 2; j >= 2; j -= 2) {  /* descending, so a[j - 1] is read before it is rewritten */
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dctsub(n, a, nc, w + nw);  /* runs for either sign */
+    if (isgn >= 0) {  /* post-processing that runs after dctsub */
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];  /* saved: a[1] is overwritten at j == 2 below */
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {  /* ascending, so a[j + 1] is read before it is rewritten */
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a, int *ip, double *w)
+{   /* In-place transform of a[0..n-1]; isgn selects which of the two paths around dstsub runs. */
+    int j, nw, nc;
+    double xr;
+
+    nw = ip[0];
+    if (n > (nw << 2)) {  /* first table too small for this n: rebuild it */
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {  /* second table (stored at w + nw) must hold n entries here */
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {  /* pre-processing that runs before dstsub */
+        xr = a[n - 1];  /* saved: a[n - 1] is overwritten by the first loop iteration */
+        for (j = n - 2; j >= 2; j -= 2) {  /* descending, so a[j - 1] is read before it is rewritten */
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dstsub(n, a, nc, w + nw);  /* runs for either sign */
+    if (isgn >= 0) {  /* post-processing that runs after dstsub */
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];  /* saved: a[1] is overwritten at j == 2 below */
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {  /* ascending, so a[j + 1] is read before it is rewritten */
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;  /* sign flipped relative to the value saved above */
+    }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w)
+{   /* In-place transform of a[0..n] (n + 1 values); t[] is scratch space. */
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+
+    nw = ip[0];
+    if (n > (nw << 3)) {  /* first table needs n/8 entries here */
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {  /* second table (stored at w + nw) needs n/2 entries here */
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    m = n >> 1;
+    yi = a[m];
+    xi = a[0] + a[n];
+    a[0] -= a[n];
+    t[0] = xi - yi;
+    t[m] = xi + yi;
+    if (n > 2) {
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {  /* differences stay in a[], sums are combined into t[] */
+            k = m - j;
+            xr = a[j] - a[n - j];
+            xi = a[j] + a[n - j];
+            yr = a[k] - a[n - k];
+            yi = a[k] + a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi - yi;
+            t[k] = xi + yi;
+        }
+        t[mh] = a[mh] + a[n - mh];
+        a[mh] -= a[n - mh];
+        dctsub(m, a, nc, w + nw);  /* length-m sub-transform on the front half of a[] */
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[0] - a[1];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {  /* spread the m results over the odd indices of a[] */
+            a[2 * j + 1] = a[j] + a[j + 1];
+            a[2 * j - 1] = a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {  /* repeat on t[]: m halves and the output stride l doubles each round */
+            dctsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[0] - t[1];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = t[j] - t[j + 1];
+                a[k + l] = t[j] + t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 0; j < mh; j++) {  /* fold t[m..2m] down into t[0..m] for the next round */
+                k = m - j;
+                t[j] = t[m + k] - t[m + j];
+                t[k] = t[m + k] + t[m + j];
+            }
+            t[mh] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+        a[n] = t[2] - t[1];
+        a[0] = t[2] + t[1];
+    } else {  /* n <= 2: only a[0..2] and t[0..1] are involved */
+        a[1] = a[0];
+        a[2] = t[0];
+        a[0] = t[1];
+    }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w)
+{   /* In-place transform of a[1..n-1]; a[0] and t[] are scratch space. */
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+
+    nw = ip[0];
+    if (n > (nw << 3)) {  /* first table needs n/8 entries here */
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {  /* second table (stored at w + nw) needs n/2 entries here */
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {  /* sums stay in a[], differences are combined into t[] */
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m];
+        dstsub(m, a, nc, w + nw);  /* length-m sub-transform on the front half of a[] */
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[1] - a[0];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {  /* spread the m results over the odd indices of a[] */
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {  /* repeat on t[]: m halves and the output stride l doubles each round */
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) {  /* fold the upper part of t[] down for the next round */
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+    }
+    a[0] = 0;  /* a[0] was only a work slot; it is cleared for every n */
+}
+
+
+/* -------- initializing routines -------- */
+
+
+static void makewt(int nw, int *ip, double *w)
+{
+    int i, half;
+    double step, c, s;
+
+    ip[0] = nw;   /* table size, checked by the public entry points */
+    ip[1] = 1;    /* NOTE(review): presumably marks the makect table as stale - confirm */
+    if (nw > 2) {
+        half = nw >> 1;
+        step = atan(1.0) / (double)half;
+        w[0] = 1;
+        w[1] = 0;
+        w[half] = cos(step * (double)half);
+        w[half + 1] = w[half];
+        if (half > 2) {
+            for (i = 2; i < half; i += 2) {
+                c = cos(step * (double)i);
+                s = sin(step * (double)i);
+                w[i] = c;
+                w[i + 1] = s;
+                w[nw - i] = s;       /* mirrored pair, cos/sin swapped */
+                w[nw - i + 1] = c;
+            }
+            bitrv2(nw, ip + 2, w);   /* table is stored in reordered form */
+        }
+    }
+}
+
+
+static void makect(int nc, int *ip, double *c)
+{
+    int i, half;
+    double step;
+
+    ip[1] = nc;   /* table size, checked by the public entry points */
+    if (nc > 1) {
+        half = nc >> 1;
+        step = atan(1.0) / (double)half;
+        c[0] = cos(step * (double)half);
+        c[half] = one_half * c[0];
+        for (i = 1; i < half; i++) {   /* halved cosines ascending, halved sines descending */
+            c[i] = one_half * cos(step * (double)i);
+            c[nc - i] = one_half * sin(step * (double)i);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+static void bitrv2(int n, int *ip0, double *a)
+{   /* Reorders the (re, im) pairs of a[0..n-1] in place by swapping them pairwise. */
+    int j, j1, k, k1, l, m, m2, ip[1024];  /* index table is local; m <= 1024 holds for power-of-two n <= 2^23 */
+    double xr, xi, yr, yi;
+
+    (void)ip0;  /* the caller-supplied work area is not used */
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {  /* doubles the table each round: ip[m..2m-1] = ip[0..m-1] + l */
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {  /* four swaps per (j, k) pair plus one per k */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            j1 = 2 * k + m2 + ip[k];  /* the extra swap for j == k */
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else {  /* two swaps per (j, k) pair, nothing extra per k */
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+        }
+    }
+}
+
+
+static void bitrv2conj(int n, int *ip0, double *a)
+{   /* Reorders the (re, im) pairs of a[0..n-1] in place and negates every imaginary part. */
+    int j, j1, k, k1, l, m, m2, ip[1024];  /* was 256: overran for n >= 2^20; now sized like bitrv2 (n <= 2^23) */
+    double xr, xi, yr, yi;
+
+    (void)ip0;  /* the caller-supplied work area is not used */
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {  /* doubles the table each round: ip[m..2m-1] = ip[0..m-1] + l */
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {  /* four conjugating swaps per (j, k) pair plus the per-k tail */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];  /* pairs that stay in place still get their imaginary part negated */
+            a[k1 + 1] = -a[k1 + 1];
+            j1 = k1 + m2;
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            k1 += m2;
+            a[k1 + 1] = -a[k1 + 1];
+        }
+    } else {  /* two conjugating swaps per (j, k) pair */
+        a[1] = -a[1];  /* the k == 0 pairs stay in place: negate only */
+        a[m2 + 1] = -a[m2 + 1];
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];  /* pairs that stay in place: negate only */
+            a[k1 + 1] = -a[k1 + 1];
+            a[k1 + m2 + 1] = -a[k1 + m2 + 1];
+        }
+    }
+}
+
+
+static void cftfsub(int n, double *a, double const *w)
+{   /* Butterfly passes over a[0..n-1] (interleaved re/im); used on the isgn >= 0 paths. */
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    l = 2;
+    if (n > 8) {  /* table-driven passes first; l ends as the block length they reached */
+        cft1st(n, a, w);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {  /* last pass combines four blocks of l doubles, no table lookups */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+        }
+    } else {  /* last pass combines two blocks of l doubles */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = a[j + 1] - a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] += a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+static void cftbsub(int n, double *a, double const *w)
+{   /* Same pass structure as cftfsub; the last pass reads odd-index slots with flipped sign. */
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    l = 2;
+    if (n > 8) {  /* table-driven passes first; l ends as the block length they reached */
+        cft1st(n, a, w);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {  /* last pass combines four blocks of l doubles */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];  /* negated relative to cftfsub */
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];  /* negated relative to cftfsub */
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i + x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i - x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i + x3r;
+        }
+    } else {  /* last pass combines two blocks of l doubles */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = -a[j + 1] + a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] = -a[j + 1] - a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+static void cft1st(int n, double *a, double const *w)
+{   /* First pass: works on a[] in groups of 16 doubles; reads a[0..15] unconditionally, so n >= 16. */
+    int j, k1, k2;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    x0r = a[0] + a[2];  /* a[0..7]: no table values needed */
+    x0i = a[1] + a[3];
+    x1r = a[0] - a[2];
+    x1i = a[1] - a[3];
+    x2r = a[4] + a[6];
+    x2i = a[5] + a[7];
+    x3r = a[4] - a[6];
+    x3i = a[5] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+    wk1r = w[2];  /* a[8..15]: only w[2] is needed */
+    x0r = a[8] + a[10];
+    x0i = a[9] + a[11];
+    x1r = a[8] - a[10];
+    x1i = a[9] - a[11];
+    x2r = a[12] + a[14];
+    x2i = a[13] + a[15];
+    x3r = a[12] - a[14];
+    x3i = a[13] - a[15];
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[12] = x2i - x0i;
+    a[13] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[10] = wk1r * (x0r - x0i);
+    a[11] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[14] = wk1r * (x0i - x0r);
+    a[15] = wk1r * (x0i + x0r);
+    k1 = 0;
+    for (j = 16; j < n; j += 16) {  /* remaining groups: factors come from w[k1], w[k2] and w[k2 + 2] */
+        k1 += 2;
+        k2 = 2 * k1;
+        wk2r = w[k1];
+        wk2i = w[k1 + 1];
+        wk1r = w[k2];
+        wk1i = w[k2 + 1];
+        wk3r = wk1r - 2 * wk2i * wk1i;  /* third factor derived from the two loaded ones */
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        x0r = a[j] + a[j + 2];  /* first half of the group: a[j..j+7] */
+        x0i = a[j + 1] + a[j + 3];
+        x1r = a[j] - a[j + 2];
+        x1i = a[j + 1] - a[j + 3];
+        x2r = a[j + 4] + a[j + 6];
+        x2i = a[j + 5] + a[j + 7];
+        x3r = a[j + 4] - a[j + 6];
+        x3i = a[j + 5] - a[j + 7];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 4] = wk2r * x0r - wk2i * x0i;
+        a[j + 5] = wk2r * x0i + wk2i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 2] = wk1r * x0r - wk1i * x0i;
+        a[j + 3] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 6] = wk3r * x0r - wk3i * x0i;
+        a[j + 7] = wk3r * x0i + wk3i * x0r;
+        wk1r = w[k2 + 2];  /* second half of the group: a[j+8..j+15] */
+        wk1i = w[k2 + 3];
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        x0r = a[j + 8] + a[j + 10];
+        x0i = a[j + 9] + a[j + 11];
+        x1r = a[j + 8] - a[j + 10];
+        x1i = a[j + 9] - a[j + 11];
+        x2r = a[j + 12] + a[j + 14];
+        x2i = a[j + 13] + a[j + 15];
+        x3r = a[j + 12] - a[j + 14];
+        x3i = a[j + 13] - a[j + 15];
+        a[j + 8] = x0r + x2r;
+        a[j + 9] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 12] = -wk2i * x0r - wk2r * x0i;  /* wk2 is reused with its parts swapped and negated */
+        a[j + 13] = -wk2i * x0i + wk2r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 10] = wk1r * x0r - wk1i * x0i;
+        a[j + 11] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 14] = wk3r * x0r - wk3i * x0i;
+        a[j + 15] = wk3r * x0i + wk3i * x0r;
+    }
+}
+/* Butterfly pass over a[0..n): each step combines four complex (re, im) pairs spaced l elements
+ * apart, in place.  Blocks are m = 4*l elements long; multipliers are read from table w. */
+static void cftmdl(int n, int l, double *a, double const *w)
+{
+    int j, j1, j2, j3, k, k1, k2, m, m2;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+    m = l << 2;                     /* length of one block of four l-spaced groups */
+    for (j = 0; j < l; j += 2) {    /* block 0: sums and differences only, no multiplies */
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x0r - x2r;
+        a[j2 + 1] = x0i - x2i;
+        a[j1] = x1r - x3i;
+        a[j1 + 1] = x1i + x3r;
+        a[j3] = x1r + x3i;
+        a[j3 + 1] = x1i - x3r;
+    }
+    wk1r = w[2];                    /* the only multiplier used by block 1 */
+    for (j = m; j < l + m; j += 2) {    /* block 1: starts at offset m */
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x2i - x0i;
+        a[j2 + 1] = x0r - x2r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j1] = wk1r * (x0r - x0i);
+        a[j1 + 1] = wk1r * (x0r + x0i);
+        x0r = x3i + x1r;
+        x0i = x3r - x1i;
+        a[j3] = wk1r * (x0i - x0r);
+        a[j3 + 1] = wk1r * (x0i + x0r);
+    }
+    k1 = 0;
+    m2 = 2 * m;
+    for (k = m2; k < n; k += m2) {  /* remaining blocks, handled two at a time (k and k + m) */
+        k1 += 2;
+        k2 = 2 * k1;
+        wk2r = w[k1];
+        wk2i = w[k1 + 1];
+        wk1r = w[k2];
+        wk1i = w[k2 + 1];
+        wk3r = wk1r - 2 * wk2i * wk1i;  /* third multiplier is derived, not read from w */
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        for (j = k; j < l + k; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = wk2r * x0r - wk2i * x0i;
+            a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+        wk1r = w[k2 + 2];               /* next table entry serves the block at k + m */
+        wk1i = w[k2 + 3];
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        for (j = k + m; j < l + (k + m); j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = -wk2i * x0r - wk2r * x0i;
+            a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+    }
+}
+/* For even j in [2, n/2), pairs a[j..j+1] with a[n-j..n-j+1] and adjusts both in place by a
+ * product formed from table c (indices kk and nc - kk, kk advancing by 2*nc/(n/2)). */
+static void rftfsub(int n, double *a, int nc, double const *c)
+{
+    int const half = n >> 1;
+    int const step = 2 * nc / half;
+    int lo, tab = 0;
+
+    for (lo = 2; lo < half; lo += 2) {
+        int const hi = n - lo;
+        double cr, ci, dr, di, tr, ti;
+
+        tab += step;
+        cr = one_half - c[nc - tab];
+        ci = c[tab];
+        dr = a[lo] - a[hi];
+        di = a[lo + 1] + a[hi + 1];
+        tr = cr * dr - ci * di;
+        ti = cr * di + ci * dr;
+        a[lo] -= tr;
+        a[lo + 1] -= ti;
+        a[hi] += tr;
+        a[hi + 1] -= ti;
+    }
+}
+/* Counterpart of rftfsub with the opposite sign on the c[kk] terms: also negates a[1] and
+ * a[n/2 + 1], and stores the odd-index results as (product - old value). */
+static void rftbsub(int n, double *a, int nc, double const *c)
+{
+    int const half = n >> 1;
+    int lo, step, tab = 0;
+
+    a[1] = -a[1];
+    step = 2 * nc / half;
+    for (lo = 2; lo < half; lo += 2) {
+        int const hi = n - lo;
+        double cr, ci, dr, di, tr, ti;
+
+        tab += step;
+        cr = one_half - c[nc - tab];
+        ci = c[tab];
+        dr = a[lo] - a[hi];
+        di = a[lo + 1] + a[hi + 1];
+        tr = cr * dr + ci * di;
+        ti = cr * di - ci * dr;
+        a[lo] -= tr;
+        a[lo + 1] = ti - a[lo + 1];
+        a[hi] += tr;
+        a[hi + 1] = ti - a[hi + 1];
+    }
+    a[half + 1] = -a[half + 1];
+}
+/* Mixes each pair a[j], a[n-j] (1 <= j < n/2) in place using the difference and sum of
+ * c[kk] and c[nc-kk] (kk advancing by nc/n), then scales the middle element a[n/2] by c[0]. */
+static void dctsub(int n, double *a, int nc, double const *c)
+{
+    int const half = n >> 1;
+    int const step = nc / n;
+    int lo, tab = 0;
+
+    for (lo = 1; lo < half; lo++) {
+        int const hi = n - lo;
+        double diff, sum, keep;
+
+        tab += step;
+        diff = c[tab] - c[nc - tab];
+        sum = c[tab] + c[nc - tab];
+        keep = sum * a[lo] - diff * a[hi];
+        a[lo] = diff * a[lo] + sum * a[hi];
+        a[hi] = keep;
+    }
+    a[half] *= c[0];
+}
+/* Same pairing and table walk as dctsub, but with the roles of a[j] and a[n-j] exchanged;
+ * the middle element a[n/2] is again scaled by c[0]. */
+static void dstsub(int n, double *a, int nc, double const *c)
+{
+    int const half = n >> 1;
+    int const step = nc / n;
+    int lo, tab = 0;
+
+    for (lo = 1; lo < half; lo++) {
+        int const hi = n - lo;
+        double diff, sum, keep;
+
+        tab += step;
+        diff = c[tab] - c[nc - tab];
+        sum = c[tab] + c[nc - tab];
+        keep = sum * a[hi] - diff * a[lo];
+        a[hi] = diff * a[hi] + sum * a[lo];
+        a[lo] = keep;
+    }
+    a[half] *= c[0];
+}
diff --git a/src/fft4g.h b/src/fft4g.h
new file mode 100644
index 0000000..0f906ab
--- /dev/null
+++ b/src/fft4g.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+/* Double-precision entry points; fft4g_cache.h calls cdft/rdft as (len, type, data, LSX_FFT_BR, LSX_FFT_SC). */
+void lsx_cdft(int, int, double *, int *, double *);
+void lsx_rdft(int, int, double *, int *, double *);
+void lsx_ddct(int, int, double *, int *, double *);
+void lsx_ddst(int, int, double *, int *, double *);
+void lsx_dfct(int, double *, double *, int *, double *);
+void lsx_dfst(int, double *, double *, int *, double *);
+/* float counterparts of the six declarations above (same argument order). */
+void lsx_cdft_f(int, int, float *, int *, float *);
+void lsx_rdft_f(int, int, float *, int *, float *);
+void lsx_ddct_f(int, int, float *, int *, float *);
+void lsx_ddst_f(int, int, float *, int *, float *);
+void lsx_dfct_f(int, float *, float *, int *, float *);
+void lsx_dfst_f(int, float *, float *, int *, float *);
+/* Element counts for the two work arrays at transform length l (needs <math.h>); arguments are parenthesised so expressions expand safely. */
+#define dft_br_len(l) (2ul + (1ul << (int)(log((l) / 2 + .5) / log(2.)) / 2))
+#define dft_sc_len(l) ((unsigned long)(l) / 2)
+
+/* Over-allocate h by 2 to use these macros; LSX_UNPACK keeps its historical trailing semicolon. */
+#define LSX_PACK(h, n)   (h)[1] = (h)[n]
+#define LSX_UNPACK(h, n) (h)[n] = (h)[1], (h)[(n) + 1] = (h)[1] = 0;
diff --git a/src/fft4g32.c b/src/fft4g32.c
new file mode 100644
index 0000000..8741394
--- /dev/null
+++ b/src/fft4g32.c
@@ -0,0 +1,27 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "filter.h"
+#define FFT4G_FLOAT
+#include "fft4g.c"
+/* Callbacks behind the rdft32 table; H is spelled float to match lsx_safe_rdft_f's prototype without relying on any redefinition of `double`. */
+static void * null(void) {return 0;}
+static void forward (int length, void * setup, float * H) {lsx_safe_rdft_f(length,  1, H); (void)setup;}
+static void backward(int length, void * setup, float * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
+static int multiplier(void) {return 2;}
+static void nothing(void) {}
+/* Type-erased callback table for the float engine; slot order is fixed by the consumer (not visible here), which must cast each entry back. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft32_cb[] = {
+  (fn_t)null,
+  (fn_t)null,
+  (fn_t)nothing,
+  (fn_t)forward,
+  (fn_t)forward,
+  (fn_t)backward,
+  (fn_t)backward,
+  (fn_t)_soxr_ordered_convolve_f,
+  (fn_t)_soxr_ordered_partial_convolve_f,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+};
diff --git a/src/fft4g32s.c b/src/fft4g32s.c
new file mode 100644
index 0000000..4a95a7d
--- /dev/null
+++ b/src/fft4g32s.c
@@ -0,0 +1,26 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "filter.h"
+#include "simd.h"
+/* Callbacks behind the rdft32s table; the setup pointer is accepted but never used. */
+static void nothing(void) {}
+static void * null(void) {return (void *)0;}
+static int multiplier(void) {return 2;}
+static void forward (int length, void * setup, float * H) {(void)setup; lsx_safe_rdft_f(length,  1, H);}
+static void backward(int length, void * setup, float * H) {(void)setup; lsx_safe_rdft_f(length, -1, H);}
+/* Same layout as the other rdft callback tables, but the two convolve slots point at the SIMD routines. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft32s_cb[] = {
+  (fn_t)null,
+  (fn_t)null,
+  (fn_t)nothing,
+  (fn_t)forward,
+  (fn_t)forward,
+  (fn_t)backward,
+  (fn_t)backward,
+  (fn_t)_soxr_ordered_convolve_simd,
+  (fn_t)_soxr_ordered_partial_convolve_simd,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+};
diff --git a/src/fft4g64.c b/src/fft4g64.c
new file mode 100644
index 0000000..48eaddd
--- /dev/null
+++ b/src/fft4g64.c
@@ -0,0 +1,29 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "filter.h"
+#include "fft4g.c"
+#include "soxr-config.h"
+
+#if HAVE_DOUBLE_PRECISION
+static void nothing(void) {}  /* callbacks behind the rdft64 table; setup is accepted but unused */
+static void * null(void) {return (void *)0;}
+static int multiplier(void) {return 2;}
+static void forward (int length, void * setup, double * H) {(void)setup; lsx_safe_rdft(length,  1, H);}
+static void backward(int length, void * setup, double * H) {(void)setup; lsx_safe_rdft(length, -1, H);}
+/* Type-erased callback table for the double engine; only compiled when HAVE_DOUBLE_PRECISION is set. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft64_cb[] = {
+  (fn_t)null,
+  (fn_t)null,
+  (fn_t)nothing,
+  (fn_t)forward,
+  (fn_t)forward,
+  (fn_t)backward,
+  (fn_t)backward,
+  (fn_t)_soxr_ordered_convolve,
+  (fn_t)_soxr_ordered_partial_convolve,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+};
+#endif
diff --git a/src/fft4g_cache.h b/src/fft4g_cache.h
new file mode 100644
index 0000000..d776c16
--- /dev/null
+++ b/src/fft4g_cache.h
@@ -0,0 +1,92 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+/* Cache state for one precision: two work arrays, the length they are sized for (-1 = not initialised), and the lock guarding them. */
+static int * LSX_FFT_BR;
+static DFT_FLOAT * LSX_FFT_SC;
+static int FFT_LEN = -1;
+static ccrw2_t FFT_CACHE_CCRW;
+/* One-time set-up of the cache lock; does nothing once FFT_LEN has left its -1 start value. */
+void LSX_INIT_FFT_CACHE(void)
+{
+  if (FFT_LEN < 0) {
+    assert(LSX_FFT_BR == NULL);
+    assert(LSX_FFT_SC == NULL);
+    assert(FFT_LEN == -1);
+    ccrw2_init(FFT_CACHE_CCRW);
+    FFT_LEN = 0;
+  }
+}
+/* Tear the cache down: destroy the lock, free both work arrays, and return to the uninitialised state. */
+void LSX_CLEAR_FFT_CACHE(void)
+{
+  assert(FFT_LEN >= 0);
+  ccrw2_clear(FFT_CACHE_CCRW);
+  free(LSX_FFT_BR);
+  LSX_FFT_BR = NULL;
+  free(LSX_FFT_SC);
+  LSX_FFT_SC = NULL;
+  FFT_LEN = -1;
+}
+/* Make the cache cover len and take a lock: returns true if this call grew it (writer lock still held), false if only a reader lock is held. */
+static bool UPDATE_FFT_CACHE(int len)
+{
+  LSX_INIT_FFT_CACHE();
+  assert(lsx_is_power_of_2(len));
+  assert(FFT_LEN >= 0);
+  ccrw2_become_reader(FFT_CACHE_CCRW);
+  if (len > FFT_LEN) {
+    ccrw2_cease_reading(FFT_CACHE_CCRW);   /* trade the reader lock for the writer lock */
+    ccrw2_become_writer(FFT_CACHE_CCRW);
+    if (len > FFT_LEN) {                   /* re-test: presumably FFT_LEN can change while no lock is held */
+      int old_n = FFT_LEN;
+      FFT_LEN = len;
+      LSX_FFT_BR = realloc(LSX_FFT_BR, dft_br_len(FFT_LEN) * sizeof(*LSX_FFT_BR)); /* NOTE(review): neither realloc result is checked */
+      LSX_FFT_SC = realloc(LSX_FFT_SC, dft_sc_len(FFT_LEN) * sizeof(*LSX_FFT_SC));
+      if (!old_n) {                        /* first growth since initialisation */
+        LSX_FFT_BR[0] = 0;
+#if SOXR_LIB
+        atexit(LSX_CLEAR_FFT_CACHE);       /* release the cache at process exit */
+#endif
+      }
+      return true;                         /* caller releases via DONE_WITH_FFT_CACHE(true) */
+    }
+    ccrw2_cease_writing(FFT_CACHE_CCRW);   /* already large enough: go back to reading */
+    ccrw2_become_reader(FFT_CACHE_CCRW);
+  }
+  return false;
+}
+/* Release whichever lock UPDATE_FFT_CACHE left held; is_writer is that
+ * function's return value. */
+static void DONE_WITH_FFT_CACHE(bool is_writer)
+{
+  if (!is_writer) { ccrw2_cease_reading(FFT_CACHE_CCRW); return; }
+  ccrw2_cease_writing(FFT_CACHE_CCRW);
+}
+/* LSX_RDFT on d using the shared work arrays, bracketed by the cache lock; len must be a power of two. */
+void LSX_SAFE_RDFT(int len, int type, DFT_FLOAT * d)
+{
+  bool const grew_cache = UPDATE_FFT_CACHE(len);
+  LSX_RDFT(len, type, d, LSX_FFT_BR, LSX_FFT_SC);
+  DONE_WITH_FFT_CACHE(grew_cache);
+}
+/* LSX_CDFT on d using the shared work arrays, bracketed by the cache lock; len must be a power of two. */
+void LSX_SAFE_CDFT(int len, int type, DFT_FLOAT * d)
+{
+  bool const grew_cache = UPDATE_FFT_CACHE(len);
+  LSX_CDFT(len, type, d, LSX_FFT_BR, LSX_FFT_SC);
+  DONE_WITH_FFT_CACHE(grew_cache);
+}
+
+#undef UPDATE_FFT_CACHE
+#undef LSX_SAFE_RDFT
+#undef LSX_SAFE_CDFT
+#undef LSX_RDFT
+#undef LSX_INIT_FFT_CACHE
+#undef LSX_FFT_SC
+#undef LSX_FFT_BR
+#undef LSX_CLEAR_FFT_CACHE
+#undef LSX_CDFT
+#undef FFT_LEN
+#undef FFT_CACHE_CCRW
+#undef DONE_WITH_FFT_CACHE
+#undef DFT_FLOAT
diff --git a/src/fifo.h b/src/fifo.h
new file mode 100644
index 0000000..b2bda43
--- /dev/null
+++ b/src/fifo.h
@@ -0,0 +1,124 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#ifndef fifo_included
+#define fifo_included
+/* Integer type used for item counts in the functions below; may be pre-defined by the includer. */
+#if !defined FIFO_SIZE_T
+#define FIFO_SIZE_T size_t
+#endif
+/* Allocator hooks: unless FIFO_REALLOC is pre-defined, all three map to the C library (the old-size argument is dropped). */
+#if !defined FIFO_REALLOC
+  #define FIFO_REALLOC(a,b,c) realloc(a,b)
+  #undef FIFO_FREE
+  #define FIFO_FREE free
+  #undef FIFO_MALLOC
+  #define FIFO_MALLOC malloc
+#endif
+/* FIFO over a single growable byte buffer; begin and end are byte offsets into data. */
+typedef struct {
+  char * data;
+  size_t allocation;   /* Number of bytes allocated for data. */
+  size_t item_size;    /* Size of each item in data */
+  size_t begin;        /* Offset of the first byte to read. */
+  size_t end;          /* 1 + Offset of the last byte to read. */
+} fifo_t;
+/* Initial allocation in bytes (fifo_create) and the read-offset threshold above which fifo_reserve compacts. */
+#if !defined FIFO_MIN
+  #define FIFO_MIN 0x4000
+#endif
+/* UNUSED prefixes every function in this header; it expands to nothing unless pre-defined. */
+#if !defined UNUSED
+  #define UNUSED
+#endif
+/* Mark the fifo empty by zeroing both offsets; the buffer itself is kept. */
+UNUSED static void fifo_clear(fifo_t * f)
+{
+  f->begin = f->end = 0;
+}
+/* Reserve room for n0 more items at the tail and return a pointer to it, or 0 if
+ * the buffer cannot be grown (the fifo is then left intact and still usable). */
+UNUSED static void * fifo_reserve(fifo_t * f, FIFO_SIZE_T n0)
+{
+  size_t n = (size_t)n0 * f->item_size;   /* request size in bytes */
+
+  if (f->begin == f->end)                 /* empty: restart from offset 0 */
+    fifo_clear(f);
+
+  while (1) {
+    char * grown;
+    if (f->end + n <= f->allocation) {    /* fits behind the current tail */
+      void *p = f->data + f->end;
+      f->end += n;
+      return p;
+    }
+    if (f->begin > FIFO_MIN) {            /* reclaim already-read space before growing */
+      memmove(f->data, f->data + f->begin, f->end - f->begin);
+      f->end -= f->begin;
+      f->begin = 0;
+      continue;
+    }
+    grown = FIFO_REALLOC(f->data, f->allocation + n, f->allocation);
+    if (!grown)                           /* old block is still owned by f: no leak, size unchanged */
+      return 0;
+    f->data = grown, f->allocation += n;
+  }
+}
+/* Append n0 items copied from data (or only reserve them when data is NULL); returns the slot, or 0 if the reservation failed. */
+UNUSED static void * fifo_write(fifo_t * f, FIFO_SIZE_T n0, void const * data)
+{
+  size_t n = (size_t)n0;
+  void * s = fifo_reserve(f, n0);
+  if (s && data)   /* never copy into a failed reservation */
+    memcpy(s, data, n * f->item_size);
+  return s;
+}
+/* Set the occupancy to exactly n0 items by repositioning the end offset (no range check). */
+UNUSED static void fifo_trim_to(fifo_t * f, FIFO_SIZE_T n0)
+{
+  size_t const bytes = (size_t)n0 * f->item_size;
+
+  f->end = f->begin + bytes;
+}
+/* Drop the n0 most recently written items by pulling the end offset back (no range check). */
+UNUSED static void fifo_trim_by(fifo_t * f, FIFO_SIZE_T n0)
+{
+  size_t const bytes = (size_t)n0 * f->item_size;
+
+  f->end -= bytes;
+}
+/* Number of whole items currently held between the begin and end offsets. */
+UNUSED static FIFO_SIZE_T fifo_occupancy(fifo_t * f)
+{
+  return (FIFO_SIZE_T)((f->end - f->begin) / f->item_size);
+}
+/* Consume n0 items from the head, copying them to data when it is non-NULL; returns their address, or NULL (nothing consumed) if fewer are held. */
+UNUSED static void * fifo_read(fifo_t * f, FIFO_SIZE_T n0, void * data)
+{
+  size_t const bytes = (size_t)n0 * f->item_size;
+  char * const head = f->data + f->begin;
+
+  if (bytes > f->end - f->begin)
+    return NULL;
+  if (data)
+    memcpy(data, head, bytes);
+  f->begin += bytes;
+  return head;
+}
+/* Peek: address of the next unread byte, obtained by reading zero items (nothing is consumed). */
+#define fifo_read_ptr(f) fifo_read(f, (FIFO_SIZE_T)0, NULL)
+/* Release the buffer; f->data is not reset, so the fifo must be re-created before any further use. */
+UNUSED static void fifo_delete(fifo_t * f)
+{
+  FIFO_FREE(f->data);
+}
+/* Initialise f for items of item_size bytes with a FIFO_MIN-byte buffer; returns non-zero if the allocation failed. */
+UNUSED static int fifo_create(fifo_t * f, FIFO_SIZE_T item_size)
+{
+  f->item_size = (size_t)item_size, f->allocation = FIFO_MIN;
+  f->begin = f->end = 0;
+  f->data = FIFO_MALLOC(f->allocation);
+  return f->data == NULL;
+}
+
+#endif
diff --git a/src/filter.c b/src/filter.c
new file mode 100644
index 0000000..ca146d2
--- /dev/null
+++ b/src/filter.c
@@ -0,0 +1,245 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "filter.h"
+
+#include <math.h>
+#if !defined M_PI
+#define M_PI    3.14159265358979323846
+#endif
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "fft4g.h"
+#include "ccrw2.h"
+/* Instantiate fft4g_cache.h for double: the macros below name the generated symbols and are #undef'd again by that header. */
+#if 1 || HAVE_DOUBLE_PRECISION /* Always need this, for lsx_fir_to_phase. */
+#define DFT_FLOAT double
+#define DONE_WITH_FFT_CACHE done_with_fft_cache
+#define FFT_CACHE_CCRW fft_cache_ccrw
+#define FFT_LEN fft_len
+#define LSX_CDFT lsx_cdft
+#define LSX_CLEAR_FFT_CACHE lsx_clear_fft_cache
+#define LSX_FFT_BR lsx_fft_br
+#define LSX_FFT_SC lsx_fft_sc
+#define LSX_INIT_FFT_CACHE lsx_init_fft_cache
+#define LSX_RDFT lsx_rdft
+#define LSX_SAFE_CDFT lsx_safe_cdft
+#define LSX_SAFE_RDFT lsx_safe_rdft
+#define UPDATE_FFT_CACHE update_fft_cache
+#include "fft4g_cache.h"
+#endif
+/* Second instantiation, for float (_f symbols); skipped when single precision is off or HAVE_AVFFT is set. */
+#if HAVE_SINGLE_PRECISION && !HAVE_AVFFT
+#define DFT_FLOAT float
+#define DONE_WITH_FFT_CACHE done_with_fft_cache_f
+#define FFT_CACHE_CCRW fft_cache_ccrw_f
+#define FFT_LEN fft_len_f
+#define LSX_CDFT lsx_cdft_f
+#define LSX_CLEAR_FFT_CACHE lsx_clear_fft_cache_f
+#define LSX_FFT_BR lsx_fft_br_f
+#define LSX_FFT_SC lsx_fft_sc_f
+#define LSX_INIT_FFT_CACHE lsx_init_fft_cache_f
+#define LSX_RDFT lsx_rdft_f
+#define LSX_SAFE_CDFT lsx_safe_cdft_f
+#define LSX_SAFE_RDFT lsx_safe_rdft_f
+#define UPDATE_FFT_CACHE update_fft_cache_f
+#include "fft4g_cache.h"
+#endif
+/* Instantiate rdft.h for double; presumably it defines the two named functions and #undefs these macros (rdft.h not visible here). */
+#if HAVE_DOUBLE_PRECISION || !SOXR_LIB
+#define DFT_FLOAT double
+#define ORDERED_CONVOLVE lsx_ordered_convolve
+#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve
+#include "rdft.h"
+#endif
+/* Instantiate rdft.h again for float, producing the _f-suffixed names. */
+#if HAVE_SINGLE_PRECISION
+#define DFT_FLOAT float
+#define ORDERED_CONVOLVE lsx_ordered_convolve_f
+#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve_f
+#include "rdft.h"
+#endif
+/* Beta for attenuation att: for att >= 60 a cubic in att, interpolated between table rows picked by log2(tr_bw/.0005); simpler formulas below 60; 0 at or below 20.96. */
+double lsx_kaiser_beta(double att, double tr_bw)
+{
+  if (att >= 60) {
+    static const double coefs[][4] = {
+      {-6.784957e-10,1.02856e-05,0.1087556,-0.8988365+.001},
+      {-6.897885e-10,1.027433e-05,0.10876,-0.8994658+.002},
+      {-1.000683e-09,1.030092e-05,0.1087677,-0.9007898+.003},
+      {-3.654474e-10,1.040631e-05,0.1087085,-0.8977766+.006},
+      {8.106988e-09,6.983091e-06,0.1091387,-0.9172048+.015},
+      {9.519571e-09,7.272678e-06,0.1090068,-0.9140768+.025},
+      {-5.626821e-09,1.342186e-05,0.1083999,-0.9065452+.05},
+      {-9.965946e-08,5.073548e-05,0.1040967,-0.7672778+.085},
+      {1.604808e-07,-5.856462e-05,0.1185998,-1.34824+.1},
+      {-1.511964e-07,6.363034e-05,0.1064627,-0.9876665+.18},
+    };
+    double const octave = log(tr_bw/.0005)/log(2.);
+    double const * lo = coefs[range_limit(  (int)octave, 0, (int)array_length(coefs)-1)];
+    double const * hi = coefs[range_limit(1+(int)octave, 0, (int)array_length(coefs)-1)];
+    double const beta_lo = ((lo[0]*att + lo[1])*att + lo[2])*att + lo[3];
+    double const beta_hi = ((hi[0]*att + hi[1])*att + hi[2])*att + hi[3];
+    return beta_lo + (beta_hi - beta_lo) * (octave - (int)octave);
+  }
+  if (att > 50   ) return .1102 * (att - 8.7);
+  if (att > 20.96) return .58417 * pow(att -20.96, .4) + .07886 * (att - 20.96);
+  return 0;
+}
+/* Build num_taps symmetric coefficients: sin(Fc*x)/x weighted by a Bessel-I0 window (beta, rho) and scale; returns a malloc'd array (caller frees) or NULL. */
+double * lsx_make_lpf(
+    int num_taps, double Fc, double beta, double rho, double scale)
+{
+  int i, m = num_taps - 1;
+  double * h = malloc((size_t)num_taps * sizeof(*h));
+  double mult = scale / lsx_bessel_I_0(beta), mult1 = 1 / (.5 * m + rho);
+  assert(Fc >= 0 && Fc <= 1);
+  lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)", /* ASCII only: portable across source/terminal encodings */
+      num_taps, Fc, beta, rho, scale);
+
+  if (h) for (i = 0; i <= m / 2; ++i) {   /* fill the first half and mirror it */
+    double z = i - .5 * m, x = z * M_PI, y = z * mult1;
+    h[i] = x? sin(Fc * x) / x : Fc;       /* x == 0 is the centre tap: use the limit value */
+    h[i] *= lsx_bessel_I_0(beta * sqrt(1 - y * y)) * mult;
+    if (m - i != i)
+      h[m - i] = h[i];
+  }
+  return h;
+}
+/* Fill in *beta when it is negative and *num_taps when it is zero, from att, Fc and tr_bw; values already supplied are left alone. */
+void lsx_kaiser_params(double att, double Fc, double tr_bw, double * beta, int * num_taps)
+{
+  if (*beta < 0) *beta = lsx_kaiser_beta(att, tr_bw * .5 / Fc);
+  att = att < 60? (att - 7.95) / (2.285 * M_PI * 2) :
+    ((.0007528358 - 1.577737e-05 * *beta) * *beta + .6248022) * *beta + .06186902;
+  if (!*num_taps) *num_taps = (int)ceil(att/tr_bw + 1);
+}
+/* Derive Fc, transition width, beta and tap count from the band edges, then build the filter; with Fn < 0 only *num_taps is computed and 0 is returned. */
+double * lsx_design_lpf(
+    double Fp,      /* End of pass-band */
+    double Fs,      /* Start of stop-band */
+    double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI */
+    double att,     /* Stop-band attenuation in dB */
+    int * num_taps, /* 0: value will be estimated */
+    int k,          /* >0: number of phases; <0: num_taps is congruent to 1 (mod -k) */
+    double beta)    /* <0: value will be estimated */
+{
+  int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
+  double tr_bw, Fc, rho = phases == 1? .5 : att < 120? .63 : .75;
+
+  Fp /= fabs(Fn), Fs /= fabs(Fn);        /* Normalise to Fn = 1 */
+  tr_bw = .5 * (Fs - Fp); /* Transition band-width: 6dB to stop points */
+  tr_bw /= phases, Fs /= phases;
+  tr_bw = min(tr_bw, .5 * Fs);
+  Fc = Fs - tr_bw;
+  assert(Fc - tr_bw >= 0);
+  lsx_kaiser_params(att, Fc, tr_bw, &beta, num_taps);
+  if (!n)   /* only an estimated count is rounded: to phases*j - 1, or to modulo*j + 1 */
+    *num_taps = phases > 1? *num_taps / phases * phases + phases - 1 :
+      (*num_taps + modulo - 2) / modulo * modulo + 1;
+  return Fn < 0? 0 : lsx_make_lpf(*num_taps, Fc, beta, rho, (double)phases);
+}
+/* Natural log for non-negative x.
+ * A zero argument is reported through lsx_debug and mapped to -26
+ * instead of being passed to log(). */
+static double safe_log(double x)
+{
+  assert(x >= 0);
+  if (x == 0) { lsx_debug("log(0)"); return -26; }
+  return log(x);
+}
+/* Replace the FIR *h (length *len) with a version whose phase is set by `phase`; may realloc *h and change *len; *post_len receives the tap count after the peak. */
+void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
+{
+  double * pi_wraps, * work, phase1 = (phase > 50 ? 100 - phase : phase) / 50;
+  int i, work_len, begin, end, imp_peak = 0, peak = 0;
+  double imp_sum = 0, peak_imp_sum = 0;
+  double prev_angle2 = 0, cum_2pi = 0, prev_angle1 = 0, cum_1pi = 0;
+
+  for (i = *len, work_len = 2 * 2 * 8; i > 1; work_len <<= 1, i >>= 1); /* 32 doubled once per halving of *len: a power of two */
+
+  work = calloc((size_t)work_len + 2, sizeof(*work)); /* +2: (UN)PACK */
+  pi_wraps = malloc((((size_t)work_len + 2) / 2) * sizeof(*pi_wraps)); /* NOTE(review): neither allocation is checked before use */
+
+  memcpy(work, *h, (size_t)*len * sizeof(*work));
+  lsx_safe_rdft(work_len, 1, work); /* Cepstral: */
+  LSX_UNPACK(work, work_len);
+
+  for (i = 0; i <= work_len; i += 2) {   /* per bin: record accumulated phase wraps, replace the bin by log-magnitude */
+    double angle = atan2(work[i + 1], work[i]);
+    double detect = 2 * M_PI;
+    double delta = angle - prev_angle2;
+    double adjust = detect * ((delta < -detect * .7) - (delta > detect * .7));
+    prev_angle2 = angle;
+    cum_2pi += adjust;
+    angle += cum_2pi;
+    detect = M_PI;
+    delta = angle - prev_angle1;
+    adjust = detect * ((delta < -detect * .7) - (delta > detect * .7));
+    prev_angle1 = angle;
+    cum_1pi += fabs(adjust); /* fabs for when 2pi and 1pi have combined */
+    pi_wraps[i >> 1] = cum_1pi;
+
+    work[i] = safe_log(sqrt(sqr(work[i]) + sqr(work[i + 1])));
+    work[i + 1] = 0;
+  }
+  LSX_PACK(work, work_len);
+  lsx_safe_rdft(work_len, -1, work);
+  for (i = 0; i < work_len; ++i) work[i] *= 2. / work_len;
+
+  for (i = 1; i < work_len / 2; ++i) { /* Window to reject acausal components */
+    work[i] *= 2;
+    work[i + work_len / 2] = 0;
+  }
+  lsx_safe_rdft(work_len, 1, work);
+
+  for (i = 2; i < work_len; i += 2) /* Interpolate between linear & min phase */
+    work[i + 1] = phase1 * i / work_len * pi_wraps[work_len >> 1] +
+        (1 - phase1) * (work[i + 1] + pi_wraps[i >> 1]) - pi_wraps[i >> 1];
+
+  work[0] = exp(work[0]), work[1] = exp(work[1]);   /* the two packed real bins */
+  for (i = 2; i < work_len; i += 2) {               /* log-magnitude and phase back to (re, im) */
+    double x = exp(work[i]);
+    work[i    ] = x * cos(work[i + 1]);
+    work[i + 1] = x * sin(work[i + 1]);
+  }
+
+  lsx_safe_rdft(work_len, -1, work);
+  for (i = 0; i < work_len; ++i) work[i] *= 2. / work_len;
+
+  /* Find peak pos. */
+  for (i = 0; i <= (int)(pi_wraps[work_len >> 1] / M_PI + .5); ++i) {
+    imp_sum += work[i];
+    if (fabs(imp_sum) > fabs(peak_imp_sum)) {
+      peak_imp_sum = imp_sum;
+      peak = i;
+    }
+    if (work[i] > work[imp_peak]) /* For debug check only */
+      imp_peak = i;
+  }
+  while (peak && fabs(work[peak-1]) > fabs(work[peak]) && work[peak-1] * work[peak] > 0)
+    --peak;
+
+  if (!phase1)              /* phase of 0 or 100: keep *len, start at the first sample */
+    begin = 0;
+  else if (phase1 == 1)     /* phase of 50: keep *len, centre on the peak */
+    begin = peak - *len / 2;
+  else {                    /* in between: new length and start, both rounded to multiples of 4 */
+    begin = (int)((.997 - (2 - phase1) * .22) * *len + .5);
+    end   = (int)((.997 + (0 - phase1) * .22) * *len + .5);
+    begin = peak - (begin & ~3);
+    end   = peak + 1 + ((end + 3) & ~3);
+    *len = end - begin;
+    *h = realloc(*h, (size_t)*len * sizeof(**h));   /* NOTE(review): result not checked */
+  }
+  for (i = 0; i < *len; ++i) (*h)[i] =   /* copy out with wrap-around; reversed when phase > 50 */
+    work[(begin + (phase > 50 ? *len - 1 - i : i) + work_len) & (work_len - 1)];
+  *post_len = phase > 50 ? peak - begin : begin + *len - (peak + 1);
+
+  lsx_debug("nPI=%g peak-sum@%i=%g (val@%i=%g); len=%i post=%i (%g%%)",
+      pi_wraps[work_len >> 1] / M_PI, peak, peak_imp_sum, imp_peak,
+      work[imp_peak], *len, *post_len, 100 - 100. * *post_len / (*len - 1));
+  free(pi_wraps), free(work);
+}
diff --git a/src/filter.h b/src/filter.h
new file mode 100644
index 0000000..435303b
--- /dev/null
+++ b/src/filter.h
@@ -0,0 +1,39 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_filter_included
+#define soxr_filter_included
+
+#include "aliases.h"
+/* Bessel helper, plus set-up/tear-down of the FFT work-array caches (_f: the float cache). */
+double lsx_bessel_I_0(double x);
+void lsx_init_fft_cache(void);
+void lsx_clear_fft_cache(void);
+void lsx_init_fft_cache_f(void);
+void lsx_clear_fft_cache_f(void);
+#define lsx_is_power_of_2(x) (!((x) < 2 || ((x) & ((x) - 1)))) /* non-zero for 2, 4, 8, ...; x is evaluated more than once */
+void lsx_safe_rdft(int len, int type, double * d);   /* lock-guarded transforms on the shared cache; len must be a power of 2 */
+void lsx_safe_cdft(int len, int type, double * d);
+void lsx_safe_rdft_f(int len, int type, float * d);  /* float versions */
+void lsx_safe_cdft_f(int len, int type, float * d);
+void lsx_ordered_convolve(int n, void * not_used, double * a, const double * b);   /* note: takes an extra, unused pointer */
+void lsx_ordered_convolve_f(int n, void * not_used, float * a, const float * b);
+void lsx_ordered_partial_convolve(int n, double * a, const double * b);
+void lsx_ordered_partial_convolve_f(int n, float * a, const float * b);
+/* Filter design helpers; lsx_make_lpf and lsx_design_lpf return malloc'd coefficient arrays. */
+double lsx_kaiser_beta(double att, double tr_bw);
+double * lsx_make_lpf(int num_taps, double Fc, double beta, double rho,
+    double scale);
+void lsx_kaiser_params(double att, double Fc, double tr_bw, double * beta, int * num_taps);
+double * lsx_design_lpf(
+    double Fp,      /* End of pass-band */
+    double Fs,      /* Start of stop-band */
+    double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
+    double att,     /* Stop-band attenuation in dB */
+    int * num_taps, /* 0: value will be estimated */
+    int k,          /* >0: number of phases; <0: num_taps is congruent to 1 (mod -k) */
+    double beta);   /* <0: value will be estimated */
+void lsx_fir_to_phase(double * * h, int * len,
+    int * post_len, double phase0);
+
+#endif
diff --git a/src/filters.h b/src/filters.h
new file mode 100644
index 0000000..e9a8011
--- /dev/null
+++ b/src/filters.h
@@ -0,0 +1,151 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "half_coefs.h"
+/* Instantiate half-fir.h six times (h8..h13): CONVOLVE carries one `_` step per coefficient, matching hN_l and COEFS. */
+#define FUNCTION h8
+#define CONVOLVE _ _ _ _ _ _ _ _
+#define h8_l 8
+#define COEFS half_fir_coefs_8
+#include "half-fir.h"
+
+#define FUNCTION h9
+#define CONVOLVE _ _ _ _ _ _ _ _ _
+#define h9_l 9
+#define COEFS half_fir_coefs_9
+#include "half-fir.h"
+
+#define FUNCTION h10
+#define CONVOLVE _ _ _ _ _ _ _ _ _ _
+#define h10_l 10
+#define COEFS half_fir_coefs_10
+#include "half-fir.h"
+
+#define FUNCTION h11
+#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _
+#define h11_l 11
+#define COEFS half_fir_coefs_11
+#include "half-fir.h"
+
+#define FUNCTION h12
+#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _
+#define h12_l 12
+#define COEFS half_fir_coefs_12
+#include "half-fir.h"
+
+#define FUNCTION h13
+#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _ _
+#define h13_l 13
+#define COEFS half_fir_coefs_13
+#include "half-fir.h"
+/* Available half-band stages in order of tap count; att is presumably the attenuation (dB) each one reaches -- confirm with the selecting code. */
+static struct {int num_coefs; stage_fn_t fn; float att;} const half_firs[] = {
+  { 8, h8 , 136.51f},
+  { 9, h9 , 152.32f},
+  {10, h10, 168.07f},
+  {11, h11, 183.78f},
+  {12, h12, 199.44f},
+  {13, h13, 212.75f},
+};
+/* Variable-length poly-phase stages vpoly0..vpoly3: length and phase bits are read from the stage (p->n, p->phase_bits) at run time. */
+#define HI_PREC_CLOCK
+
+#define VAR_LENGTH p->n
+#define VAR_CONVOLVE while (j < FIR_LENGTH) _
+#define VAR_POLY_PHASE_BITS p->phase_bits
+
+#define FUNCTION vpoly0
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE VAR_CONVOLVE
+#include "poly-fir0.h"
+
+#define FUNCTION vpoly1
+#define COEF_INTERP 1
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE VAR_CONVOLVE
+#include "poly-fir.h"
+
+#define FUNCTION vpoly2
+#define COEF_INTERP 2
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE VAR_CONVOLVE
+#include "poly-fir.h"
+
+#define FUNCTION vpoly3
+#define COEF_INTERP 3
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE VAR_CONVOLVE
+#include "poly-fir.h"
+
+#undef HI_PREC_CLOCK
+/* Fixed-length stages: U100 (42 steps) and u100 (11 steps); with RATE_SIMD_POLY they are padded by 2 and 1 extra steps respectively. */
+#define U100_l 42
+#if RATE_SIMD_POLY
+  #define U100_l_EXTRA _ _
+  #define u100_l_EXTRA _
+  #define U100_l_EXTRA_LENGTH 2
+  #define u100_l_EXTRA_LENGTH 1
+#else
+  #define U100_l_EXTRA
+  #define u100_l_EXTRA
+  #define U100_l_EXTRA_LENGTH 0
+  #define u100_l_EXTRA_LENGTH 0
+#endif
+#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ U100_l_EXTRA
+#define FUNCTION U100_0
+#define FIR_LENGTH (U100_l + U100_l_EXTRA_LENGTH)
+#define CONVOLVE poly_fir_convolve_U100
+#include "poly-fir0.h"
+
+#define u100_l 11
+#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _ u100_l_EXTRA
+#define FUNCTION u100_0
+#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
+#define CONVOLVE poly_fir_convolve_u100
+#include "poly-fir0.h"
+
+#define FUNCTION u100_1
+#define COEF_INTERP 1
+#define PHASE_BITS 8
+#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
+#define CONVOLVE poly_fir_convolve_u100
+#include "poly-fir.h"
+#define u100_1_b 8
+
+#define FUNCTION u100_2
+#define COEF_INTERP 2
+#define PHASE_BITS 6
+#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
+#define CONVOLVE poly_fir_convolve_u100
+#include "poly-fir.h"
+#define u100_2_b 6
+/* Poly-phase stage menu: each row holds a beta (-1 in the vpoly rows) and three {scalar, function} choices; the scalar's meaning is set by the consumer (not visible here). */
+typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
+typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
+
+static poly_fir_t const poly_firs[] = {
+  {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
+  {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
+  {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
+  {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
+  {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
+
+  {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
+  {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
+  {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
+  {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
+
+  {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
+  {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
+  {-1, {{0, vpoly0}, {   9, vpoly1}, {  6, vpoly2}}},
+  {-1, {{0, vpoly0}, {  11, vpoly1}, {  7, vpoly2}}},
+  {-1, {{0, vpoly0}, {  13, vpoly1}, {  8, vpoly2}}},
+  {-1, {{0, vpoly0}, {  10, vpoly2}, {  8, vpoly3}}},
+  {-1, {{0, vpoly0}, {  12, vpoly2}, {  9, vpoly3}}},
+};
diff --git a/src/half-fir.h b/src/half-fir.h
new file mode 100644
index 0000000..0a8ee97
--- /dev/null
+++ b/src/half-fir.h
@@ -0,0 +1,25 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Down-sample by a factor of 2 using a FIR with odd length (LEN).*/
+/* Input must be preceded and followed by LEN >> 1 samples. */
+
+#define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j; /* One tap pair: the samples at -(2j+1) and +(2j+1) share COEFS[j]; then j advances. */
+static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+{
+  sample_t const * input = stage_read_p(p);
+  int i, num_out = (stage_occupancy(p) + 1) / 2; /* Half the stage occupancy, rounded up. */
+  sample_t * output = fifo_reserve(output_fifo, num_out);
+
+  for (i = 0; i < num_out; ++i, input += 2) { /* The input pointer moves two samples per output sample. */
+    int j = 0;
+    sample_t sum = input[0] * .5f; /* The centre sample is weighted by 0.5. */
+    CONVOLVE /* Defined by the including file; presumably a run of `_` terms, one per coefficient. */
+    output[i] = sum;
+  }
+  fifo_read(&p->fifo, 2 * num_out, NULL); /* Reads 2 * num_out samples from the stage fifo with a NULL destination. */
+}
+#undef _ /* The template parameters are undefined here, presumably so the header can be included again. */
+#undef COEFS
+#undef CONVOLVE
+#undef FUNCTION
diff --git a/src/half_coefs.h b/src/half_coefs.h
new file mode 100644
index 0000000..aac7769
--- /dev/null
+++ b/src/half_coefs.h
@@ -0,0 +1,57 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined __GNUC__ /* Compiler-specific pragmas that lower warning output for the tables below. */
+  #pragma GCC system_header
+#elif defined __SUNPRO_C
+  #pragma disable_warn
+#elif defined _MSC_VER
+  #pragma warning(push, 1)
+#endif
+
+static const sample_t half_fir_coefs_8[] = { /* 8 coefficients; the numeric suffix of each table equals its entry count. */
+  0.3115465451887802, -0.08734497241282892, 0.03681452335604365,
+  -0.01518925831569441, 0.005454118437408876, -0.001564400922162005,
+  0.0003181701445034203, -3.48001341225749e-5,
+};
+
+static const sample_t half_fir_coefs_9[] = { /* 9 coefficients. */
+  0.3122703613711853, -0.08922155288172305, 0.03913974805854332,
+  -0.01725059723447163, 0.006858970092378141, -0.002304518467568703,
+  0.0006096426006051062, -0.0001132393923815236, 1.119795386287666e-5,
+};
+
+static const sample_t half_fir_coefs_10[] = { /* 10 coefficients. */
+  0.3128545521327376, -0.09075671986104322, 0.04109637155154835,
+  -0.01906629512749895, 0.008184039342054333, -0.0030766775017262,
+  0.0009639607022414314, -0.0002358552746579827, 4.025184282444155e-5,
+  -3.629779111541012e-6,
+};
+
+static const sample_t half_fir_coefs_11[] = { /* 11 coefficients. */
+  0.3133358837508807, -0.09203588680609488, 0.04276515428384758,
+  -0.02067356614745591, 0.00942253142371517, -0.003856330993895144,
+  0.001363470684892284, -0.0003987400965541919, 9.058629923971627e-5,
+  -1.428553070915318e-5, 1.183455238783835e-6,
+};
+
+static const sample_t half_fir_coefs_12[] = { /* 12 coefficients. */
+  0.3137392991811407, -0.0931182192961332, 0.0442050575271454,
+  -0.02210391200618091, 0.01057473015666001, -0.00462766983973885,
+  0.001793630226239453, -0.0005961819959665878, 0.0001631475979359577,
+  -3.45557865639653e-5, 5.06188341942088e-6, -3.877010943315563e-7,
+};
+
+static const sample_t half_fir_coefs_13[] = { /* 13 coefficients. */
+  0.3140822554324578, -0.0940458550886253, 0.04545990399121566,
+  -0.02338339450796002, 0.01164429409071052, -0.005380686021429845,
+  0.002242915773871009, -0.000822047600000082, 0.0002572510962395222,
+  -6.607320708956279e-5, 1.309926399120154e-5, -1.790719575255006e-6,
+  1.27504961098836e-7,
+};
+
+#if defined __SUNPRO_C /* Undo the warning pragmas where the compiler offers a counterpart. */
+  #pragma enable_warn
+#elif defined _MSC_VER
+  #pragma warning(pop)
+#endif
diff --git a/src/internal.h b/src/internal.h
new file mode 100644
index 0000000..f29e29f
--- /dev/null
+++ b/src/internal.h
@@ -0,0 +1,46 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_internal_included
+#define soxr_internal_included
+
+#include "soxr-config.h"
+
+#undef min /* Drop any previous min/max macros before defining the local versions. */
+#undef max
+#define min(a, b) ((a) <= (b) ? (a) : (b)) /* Note: one argument is evaluated twice. */
+#define max(a, b) ((a) >= (b) ? (a) : (b)) /* Note: one argument is evaluated twice. */
+
+#define range_limit(x, lower, upper) (min(max(x, lower), upper)) /* Clamp x to [lower, upper]. */
+#define linear_to_dB(x) (log10(x) * 20) /* 20 * log10(x); requires a declaration of log10 at the point of use. */
+#define array_length(a) (sizeof(a)/sizeof(a[0])) /* Element count of a true array. NOTE(review): `a` is not parenthesised in a[0]. */
+#define AL(a) array_length(a)
+#define iAL(a) (int)AL(a) /* array_length cast to int. */
+#define sqr(a) ((a) * (a)) /* Note: the argument is evaluated twice. */
+
+#ifdef __GNUC__ /* UNUSED marks a definition as intentionally unused; it expands to nothing elsewhere. */
+  #define UNUSED __attribute__ ((unused))
+#else
+  #define UNUSED
+#endif
+
+#if defined NDEBUG /* With NDEBUG, lsx_debug(...) is still parsed but sits behind if(0). */
+  #ifdef __GNUC__
+    void lsx_dummy(char const *, ...); /* Declaration only; no definition is visible in this header. */
+  #else
+    static __inline void lsx_dummy(char const * x, ...) {}
+  #endif
+  #define lsx_debug if(0) lsx_dummy
+#else
+  #include <stdarg.h>
+  #include <stdio.h>
+  UNUSED static void lsx_debug(char const * fmt, ...) /* printf-style message written to stderr, followed by a newline. */
+  {
+    va_list args;
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    fputc('\n', stderr);
+    va_end(args);
+  }
+#endif
+#endif
diff --git a/src/libsoxr-dev.src.in b/src/libsoxr-dev.src.in
new file mode 100644
index 0000000..ce879f9
--- /dev/null
+++ b/src/libsoxr-dev.src.in
@@ -0,0 +1,2 @@
+set(TARGET_HEADERS "@TARGET_HEADERS@")
+set(TARGET_PCS "@TARGET_PCS@")
diff --git a/src/libsoxr.src.in b/src/libsoxr.src.in
new file mode 100644
index 0000000..1c926ff
--- /dev/null
+++ b/src/libsoxr.src.in
@@ -0,0 +1 @@
+set(TARGET_LIBS "@TARGET_LIBS@")
diff --git a/src/lsr.c b/src/lsr.c
new file mode 100644
index 0000000..64b5798
--- /dev/null
+++ b/src/lsr.c
@@ -0,0 +1,114 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Wrapper mostly compatible with `libsamplerate'. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include "soxr.h"
+
+/* Runtime casts: */
+typedef struct io_t { /* Per-call I/O descriptor: buffers, lengths, done counts, end-of-input flag and output/input ratio. */
+  float *in,*out; long ilen,olen,idone,odone; int eoi; double oi_ratio;} io_t;
+#define SRC_DATA io_t /* The SRC_* names are defined before soxr-lsr.h is included, presumably so its declarations use these types. */
+typedef struct  soxr SRC_STATE;
+#define src_callback_t soxr_input_fn_t
+#define SRC_ERROR soxr_error_t
+#define SRC_SRCTYPE unsigned
+
+#include "soxr-lsr.h"
+#include "rint.h"
+
+soxr_error_t src_simple(io_t * p, unsigned id, int channels)
+{ /* Converts p->in into p->out with a single soxr_oneshot call and records the counts in p->idone / p->odone. */
+  size_t idone, odone;
+  soxr_error_t error;
+  soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0); /* Quality recipe is SOXR_LSR0Q offset by the converter id. */
+  char const * e = getenv("SOXR_LSR_NUM_THREADS");
+  soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1)); /* Argument is 1 unless the variable is set to a value other than 1, then 0. */
+  assert (channels > 0);
+  assert (p->ilen >= 0);
+  assert (p->olen >= 0);
+  error = soxr_oneshot(1, p->oi_ratio, (unsigned)channels, /* The first two arguments are 1 and the output/input ratio. */
+      p->in, (size_t)p->ilen, &idone, p->out, (size_t)p->olen, &odone,
+      0, &q_spec, &r_spec);
+  p->idone = (long)idone, p->odone = (long)odone;
+  return error;
+}
+
+soxr_t src_callback_new(soxr_input_fn_t fn, unsigned id, int channels, SRC_ERROR * error0, void * p)
+{ /* Creates a resampler, installs fn as its input function and reports any error through error0 (if non-null). */
+  soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0); /* Quality recipe is SOXR_LSR0Q offset by the converter id. */
+  char const * e = getenv("SOXR_LSR_NUM_THREADS");
+  soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1)); /* Argument is 1 unless the variable is set to a value other than 1, then 0. */
+  soxr_error_t error;
+  soxr_t soxr = 0;
+  assert (channels > 0);
+  /* To minimise latency e.g. for real-time playback:
+  if (id == 2)
+    r_spec.log2_large_dft_size = r_spec.log2_min_dft_size = 8;
+    */
+  soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec); /* Both rate arguments are 0 at creation time. */
+  if (soxr)
+    error = soxr_set_input_fn(soxr, fn, p, 0);
+  if (error0)
+    *(int *)error0 = (int)(ptrdiff_t)error; /* The error value is stored through error0 as an int. */
+  return soxr;
+}
+
+soxr_error_t src_process(soxr_t p, io_t * io)
+{ /* Updates the conversion ratio from io, runs soxr_process once and stores the counts in io->idone / io->odone. */
+  if (!p || !io) return "null pointer";
+  soxr_set_error(p, soxr_set_io_ratio(p, 1/io->oi_ratio, (size_t)io->olen)); /* soxr is given the reciprocal of the output/input ratio. */
+
+  { size_t idone , odone;
+  soxr_process(p, io->in, (size_t)(io->eoi? ~io->ilen : io->ilen), /* hack */
+      &idone, io->out, (size_t)io->olen, &odone); /* With eoi set, the bitwise complement of ilen is passed as the input length. */
+  io->idone = (long)idone, io->odone = (long)odone;
+  return soxr_error(p); }
+}
+
+long src_callback_read(soxr_t p, double oi_ratio, long olen, float * obuf)
+{ /* Sets the ratio, then returns the result of soxr_output for up to olen samples into obuf. */
+  if (!p || olen < 0) return -1; /* -1 for a null handle or a negative length. */
+  soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen)); /* soxr is given the reciprocal of the output/input ratio. */
+  return (long)soxr_output(p, obuf, (size_t)olen);
+}
+
+void src_float_to_short_array(float const * src, short * dest, int len)
+{ /* Converts len floats to shorts, working from the last element down. NOTE(review): a negative len is not guarded. */
+  double d, N = 1. + SHRT_MAX;
+  assert (src && dest);
+  while (len--) d = src[len] * N, dest[len] = (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d)); /* Scale by N; above N-1 gives N-1, below -N gives -N, otherwise rint16. */
+}
+
+void src_short_to_float_array(short const * src, float * dest, int len)
+{ /* Converts len shorts to floats, working from the last element down. NOTE(review): a negative len is not guarded. */
+  assert (src && dest);
+  while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX))); /* Scale by 1 / (1 + SHRT_MAX). */
+}
+
+void src_float_to_int_array(float const * src, int * dest, int len)
+{ /* Converts len floats to ints, working from the last element down. NOTE(review): a negative len is not guarded. */
+  double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also next fn.) */
+  assert (src && dest);
+  while (len--) d = src[len] * N, dest[len] = d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d); /* Scale by N; at or above N-1 gives N-1, below -N gives -N, otherwise rint32. */
+}
+
+void src_int_to_float_array(int const * src, float * dest, int len)
+{ /* Converts len ints to floats, working from the last element down. NOTE(review): a negative len is not guarded. */
+  assert (src && dest);
+  while (len--) dest[len] = (float)(src[len] * (1 / (32768. * 65536.))); /* Scale by 1 / (32768 * 65536). */
+}
+
+static char const * const names[] = {"LSR best sinc", "LSR medium sinc", "LSR fastest sinc", "LSR ZOH", "LSR linear", "SoX VHQ"}; /* Six entries, indexed by converter id. */
+char const * src_get_name(unsigned n)         {return n < 5u + !getenv("SOXR_LSR_STRICT")? names[n] : 0;} /* Index 5 is only reachable when SOXR_LSR_STRICT is unset; out of range gives 0. */
+char const * src_get_description(unsigned id) {return src_get_name(id);}
+char const * src_get_version(void)            {return soxr_version();}
+char const * src_strerror(soxr_error_t error) {return error == (soxr_error_t)1? "Placeholder." : sizeof(int) >= sizeof(char *) || !error ? soxr_strerror(error) : "soxr error";} /* soxr_strerror is used only for a null error or when int is at least pointer-sized. */
+int src_is_valid_ratio(double oi_ratio)       {return getenv("SOXR_LSR_STRICT")? oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0;} /* Strict mode accepts [1/256, 256]; otherwise any positive ratio. */
+soxr_error_t src_error(soxr_t p)              {return soxr_error(p);}
+soxr_error_t src_reset(soxr_t p)              {return soxr_clear(p);}
+soxr_t src_delete(soxr_t p)                   {soxr_delete(p); return 0;} /* Always returns 0 after deleting. */
+soxr_error_t src_set_ratio(soxr_t p, double oi_ratio) {return soxr_set_io_ratio(p, 1/oi_ratio, 0);} /* soxr is given the reciprocal of the output/input ratio. */
+soxr_t src_new(unsigned id, int channels, SRC_ERROR * error) {return src_callback_new(0, id, channels, error, 0);} /* Same as src_callback_new with no input function or user pointer. */
diff --git a/src/pffft.c b/src/pffft.c
new file mode 100644
index 0000000..9b4f59d
--- /dev/null
+++ b/src/pffft.c
@@ -0,0 +1,1729 @@
+/* Copyright (c) 2011  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largely based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+  ChangeLog:
+  - 2011/10/02, version 1: This is the very first release of this file.
+*/
+
+#if !defined PFFT_MACROS_ONLY
+#include "pffft.h"
+#include "simd.h"
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+
+#define pffft_aligned_free    _soxr_simd_aligned_free
+#define pffft_aligned_malloc  _soxr_simd_aligned_malloc
+#define pffft_aligned_calloc  _soxr_simd_aligned_calloc
+#endif
+
+/*
+   vector support macros: the rest of the code is independent of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros
+*/
+
+
+/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */
+/*#define PFFFT_SIMD_DISABLE */
+
+/* detect compiler flavour */
+#if defined(_MSC_VER) /* _MSC_VER is tested first, so it takes precedence over __GNUC__ when both are defined. */
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#if defined(COMPILER_GCC) /* ALWAYS_INLINE / NEVER_INLINE wrap the return type of a function definition. */
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+/*#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; */
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+/*#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (v4sf*)_alloca(size__ * sizeof(type__)) */
+#endif /* NOTE(review): no fallback branch is visible here for other compilers; confirm these macros are defined elsewhere if needed. */
+
+/*
+   Altivec support macros
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+typedef vector float v4sf;
+#  define SIMD_SZ 4
+#  define VZERO() ((vector float) vec_splat_u8(0))
+#  define VMUL(a,b) vec_madd(a,b, VZERO()) /* Multiply written as a multiply-add with a zero addend. */
+#  define VADD(a,b) vec_add(a,b)
+#  define VMADD(a,b,c) vec_madd(a,b,c)
+#  define VSUB(a,b) vec_sub(a,b)
+inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } /* Helper behind LD_PS1 in this branch. */
+#  define LD_PS1(p) ld_ps1(&p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
+    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+  }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
+    v4sf y0 = vec_mergeh(x0, x2);               \
+    v4sf y1 = vec_mergel(x0, x2);               \
+    v4sf y2 = vec_mergeh(x1, x3);               \
+    v4sf y3 = vec_mergel(x1, x3);               \
+    x0 = vec_mergeh(y0, y2);                    \
+    x1 = vec_mergel(y0, y2);                    \
+    x2 = vec_mergeh(y1, y3);                    \
+    x3 = vec_mergel(y1, y3);                    \
+  }
+#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0) /* True when the low four address bits are zero (16-byte alignment). */
+
+/*
+  SSE1 support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+#  define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+#  define VZERO() _mm_setzero_ps()
+#  define VMUL(a,b) _mm_mul_ps(a,b)
+#  define VADD(a,b) _mm_add_ps(a,b)
+#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) /* a*b + c as a separate multiply and add. */
+#  define VSUB(a,b) _mm_sub_ps(a,b)
+#  define LD_PS1(p) _mm_set1_ps(p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0) /* True when the low four address bits are zero (16-byte alignment). */
+
+/*
+  ARM NEON support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
+#  include <arm_neon.h>
+typedef float32x4_t v4sf;
+#  define SIMD_SZ 4
+#  define VZERO() vdupq_n_f32(0)
+#  define VMUL(a,b) vmulq_f32(a,b)
+#  define VADD(a,b) vaddq_f32(a,b)
+#  define VMADD(a,b,c) vmlaq_f32(c,a,b) /* Note the argument order: the addend comes first for vmlaq_f32. */
+#  define VSUB(a,b) vsubq_f32(a,b)
+#  define LD_PS1(p) vld1q_dup_f32(&(p))
+#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define VTRANSPOSE4_(x0,x1,x2,x3) {                                    \
+    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
+    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
+    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
+    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
+    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
+  }
+/* marginally faster version */
+#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) /* Only the low two address bits are checked in this branch (4-byte alignment). */
+#else
+#  if !defined(PFFFT_SIMD_DISABLE)
+#    warning "building with simd disabled !\n";
+#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
+#  endif
+#endif
+
+/* fallback mode for situations where SSE/Altivec are not available, use scalar mode instead */
+#ifdef PFFFT_SIMD_DISABLE
+typedef float v4sf; /* In scalar mode a "vector" is a single float. */
+#  define SIMD_SZ 1
+#  define VZERO() 0.f
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) /* True when the low two address bits are zero (4-byte alignment). */
+#endif
+
+/* shortcuts for complex multiplications; both update (ar,ai) in place and declare a local `tmp` */
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } /* ar' = ar*br - ai*bi, ai' = ai*br + ar*bi */
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } /* ar' = ar*br + ai*bi, ai' = ai*br - ar*bi */
+
+#if !defined(PFFFT_SIMD_DISABLE)
+typedef union v4sf_union { /* Overlays a v4sf with four floats for per-lane access. */
+  v4sf  v;
+  float f[4];
+} v4sf_union;
+
+#if 0 /* Disabled self-test of the vector macros; never compiled. */
+#include <string.h>
+
+#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+
+/* detect bugs with the vector support macros */
+void validate_pffft_simd() {
+  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
+  v4sf_union a0, a1, a2, a3, t, u;
+  memcpy(a0.f, f, 4*sizeof(float)); /* a0..a3 hold 0:3, 4:7, 8:11 and 12:15 respectively. */
+  memcpy(a1.f, f+4, 4*sizeof(float));
+  memcpy(a2.f, f+8, 4*sizeof(float));
+  memcpy(a3.f, f+12, 4*sizeof(float));
+
+  t = a0; u = a1; t.v = VZERO();
+  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
+  t.v = VADD(a1.v, a2.v);
+  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
+  t.v = VMUL(a1.v, a2.v);
+  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
+  t.v = VMADD(a1.v, a2.v,a0.v);
+  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
+
+  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
+  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
+
+  t.v=LD_PS1(f[15]);
+  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 15, 15, 15, 15);
+  t.v = VSWAPHL(a1.v, a2.v);
+  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 8, 9, 6, 7);
+  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
+  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
+         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
+         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
+  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+}
+#endif
+#endif /*!PFFFT_SIMD_DISABLE */
+
+#if !defined PFFT_MACROS_ONLY
+
+
+#if defined (COMPILER_MSVC) /* Remap sin/cos for the rest of this file to float-valued forms. */
+  #define sin   (float)sin
+  #define cos   (float)cos
+#else
+  #define sin   sinf
+  #define cos   cosf
+#endif
+
+/*
+int pffft_simd_size() { return SIMD_SZ; }
+*/
+
+/*
+  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
+*/
+static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
+  int k, i;
+  int l1ido = l1*ido; /* Offset in ch between the sum outputs and the difference outputs. */
+  if (ido <= 2) { /* Short blocks: sums and differences only; wa1 and fsign are not read. */
+    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
+      ch[0]         = VADD(cc[0], cc[ido+0]);
+      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
+      ch[1]         = VADD(cc[1], cc[ido+1]);
+      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
+    }
+  } else {
+    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
+      for (i=0; i<ido-1; i+=2) { /* Elements are handled in (real, imaginary) pairs at i and i+1. */
+        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
+        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
+        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1])); /* Twiddle is (wa1[i], fsign*wa1[i+1]). */
+        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
+        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
+        VCPLXMUL(tr2, ti2, wr, wi); /* Only the difference term is multiplied by the twiddle. */
+        ch[i+l1ido]   = tr2;
+        ch[i+l1ido+1] = ti2;
+      }
+    }
+  }
+}
+
+/*
+  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
+*/
+static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, float fsign) {
+  static const float taur = -0.5f;
+  float taui = 0.866025403784439f*fsign; /* Approximately sqrt(3)/2, with its sign taken from fsign. */
+  int i, k;
+  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
+  int l1ido = l1*ido; /* Offset in ch between consecutive output groups. */
+  float wr1, wi1, wr2, wi2;
+  assert(ido > 2); /* Unlike passf2_ps and passf4_ps, there is no short-block branch here. */
+  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
+    for (i=0; i<ido-1; i+=2) { /* Elements are handled in (real, imaginary) pairs at i and i+1. */
+      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
+      cr2 = VADD(cc[i], VMUL(LD_PS1(taur),tr2));
+      ch[i]    = VADD(cc[i], tr2);
+      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
+      ci2 = VADD(cc[i    +1], VMUL(LD_PS1(taur),ti2));
+      ch[i+1]  = VADD(cc[i+1], ti2);
+      cr3 = VMUL(LD_PS1(taui), VSUB(cc[i+ido], cc[i+2*ido]));
+      ci3 = VMUL(LD_PS1(taui), VSUB(cc[i+ido+1], cc[i+2*ido+1]));
+      dr2 = VSUB(cr2, ci3);
+      dr3 = VADD(cr2, ci3);
+      di2 = VADD(ci2, cr3);
+      di3 = VSUB(ci2, cr3);
+      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; /* Imaginary twiddle parts are scaled by fsign. */
+      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+      ch[i+l1ido] = dr2;
+      ch[i+l1ido + 1] = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+      ch[i+2*l1ido] = dr3;
+      ch[i+2*l1ido+1] = di3;
+    }
+  }
+} /* passf3 */
+
+static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
+  /* fsign == -1 for forward transform and +1 for backward transform */
+
+  int i, k;
+  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  int l1ido = l1*ido; /* Offset in ch between consecutive output groups. */
+  if (ido == 2) { /* Short blocks: the twiddle arrays are not read in this branch. */
+    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
+      tr1 = VSUB(cc[0], cc[2*ido + 0]);
+      tr2 = VADD(cc[0], cc[2*ido + 0]);
+      ti1 = VSUB(cc[1], cc[2*ido + 1]);
+      ti2 = VADD(cc[1], cc[2*ido + 1]);
+      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
+      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
+      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
+      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
+
+      ch[0*l1ido + 0] = VADD(tr2, tr3);
+      ch[0*l1ido + 1] = VADD(ti2, ti3);
+      ch[1*l1ido + 0] = VADD(tr1, tr4);
+      ch[1*l1ido + 1] = VADD(ti1, ti4);
+      ch[2*l1ido + 0] = VSUB(tr2, tr3);
+      ch[2*l1ido + 1] = VSUB(ti2, ti3);
+      ch[3*l1ido + 0] = VSUB(tr1, tr4);
+      ch[3*l1ido + 1] = VSUB(ti1, ti4);
+    }
+  } else {
+    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
+      for (i=0; i<ido-1; i+=2) { /* Elements are handled in (real, imaginary) pairs at i and i+1. */
+        float wr1, wi1, wr2, wi2, wr3, wi3;
+        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
+        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
+        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
+        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
+        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
+        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
+        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
+        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
+
+        ch[i] = VADD(tr2, tr3); /* The first output group is stored without a twiddle multiply. */
+        cr3    = VSUB(tr2, tr3);
+        ch[i + 1] = VADD(ti2, ti3);
+        ci3 = VSUB(ti2, ti3);
+
+        cr2 = VADD(tr1, tr4);
+        cr4 = VSUB(tr1, tr4);
+        ci2 = VADD(ti1, ti4);
+        ci4 = VSUB(ti1, ti4);
+        wr1=wa1[i], wi1=fsign*wa1[i+1]; /* Imaginary twiddle parts are scaled by fsign. */
+        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
+        wr2=wa2[i], wi2=fsign*wa2[i+1];
+        ch[i + l1ido] = cr2;
+        ch[i + l1ido + 1] = ci2;
+
+        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
+        wr3=wa3[i], wi3=fsign*wa3[i+1];
+        ch[i + 2*l1ido] = cr3;
+        ch[i + 2*l1ido + 1] = ci3;
+
+        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
+        ch[i + 3*l1ido] = cr4;
+        ch[i + 3*l1ido + 1] = ci4;
+      }
+    }
+  }
+} /* passf4 */
+
+static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
+  static const float minus_one = -1.f;
+  int i, k, l1ido = l1*ido; /* l1ido is the offset in cc between the two input halves. */
+  for (k=0; k < l1ido; k += ido) { /* First element of each block: plain sum and difference, no twiddle. */
+    v4sf a = cc[k], b = cc[k + l1ido];
+    ch[2*k] = VADD(a, b);
+    ch[2*(k+ido)-1] = VSUB(a, b);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k=0; k < l1ido; k += ido) {
+      for (i=2; i<ido; i+=2) {
+        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
+        v4sf br = cc[i - 1 + k], bi = cc[i + k];
+        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); /* Second-half pair is multiplied by the conjugated twiddle. */
+        ch[i + 2*k] = VADD(bi, ti2);
+        ch[2*(k+ido) - i] = VSUB(ti2, bi);
+        ch[i - 1 + 2*k] = VADD(br, tr2);
+        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
+      }
+    }
+    if (ido % 2 == 1) return; /* Odd ido: the final loop below is skipped. */
+  }
+  for (k=0; k < l1ido; k += ido) { /* Reached only for even ido: handles the element at index ido-1 of each block. */
+    ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]);
+    ch[2*k + ido-1] = cc[k + ido-1];
+  }
+} /* radf2 */
+
+
+static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
+  static const float minus_two=-2;
+  int i, k, l1ido = l1*ido; /* l1ido is the offset in ch between the two output halves. */
+  v4sf a,b,c,d, tr2, ti2;
+  for (k=0; k < l1ido; k += ido) { /* First element of each block: plain sum and difference, no twiddle. */
+    a = cc[2*k]; b = cc[2*(k+ido) - 1];
+    ch[k] = VADD(a, b);
+    ch[k + l1ido] =VSUB(a, b);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      for (i = 2; i < ido; i += 2) {
+        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
+        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
+        ch[i-1 + k] = VADD(a, b);
+        tr2 = VSUB(a, b);
+        ch[i+0 + k] = VSUB(c, d);
+        ti2 = VADD(c, d);
+        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); /* Only the second-half outputs are multiplied by the twiddle. */
+        ch[i-1 + k + l1ido] = tr2;
+        ch[i+0 + k + l1ido] = ti2;
+      }
+    }
+    if (ido % 2 == 1) return; /* Odd ido: the final loop below is skipped. */
+  }
+  for (k = 0; k < l1ido; k += ido) { /* Reached only for even ido: handles the element at index ido-1 of each block. */
+    a = cc[2*k + ido-1]; b = cc[2*k + ido];
+    ch[k + ido-1] = VADD(a,a);
+    ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b);
+  }
+} /* radb2 */
+
+static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                     const float *wa1, const float *wa2) {
+  static const float taur = -0.5f;
+  static const float taui = 0.866025403784439f; /* Approximately sqrt(3)/2. */
+  int i, k, ic;
+  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
+  for (k=0; k<l1; k++) { /* First element of each block: no twiddle multiply. */
+    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
+    ch[3*k*ido] = VADD(cc[k*ido], cr2);
+    ch[(3*k+2)*ido] = VMUL(LD_PS1(taui), VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
+    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(LD_PS1(taur), cr2));
+  }
+  if (ido == 1) return;
+  for (k=0; k<l1; k++) {
+    for (i=2; i<ido; i+=2) {
+      ic = ido - i; /* Mirrored index used for the outputs written from the end of the block. */
+      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
+      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
+      VCPLXMULCONJ(dr2, di2, wr1, wi1); /* Second input group times the conjugated wa1 twiddle. */
+
+      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
+      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
+      VCPLXMULCONJ(dr3, di3, wr2, wi2); /* Third input group times the conjugated wa2 twiddle. */
+
+      cr2 = VADD(dr2, dr3);
+      ci2 = VADD(di2, di3);
+      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
+      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
+      tr2 = VADD(cc[i - 1 + k*ido], VMUL(LD_PS1(taur), cr2));
+      ti2 = VADD(cc[i + k*ido], VMUL(LD_PS1(taur), ci2));
+      tr3 = VMUL(LD_PS1(taui), VSUB(di2, di3));
+      ti3 = VMUL(LD_PS1(taui), VSUB(dr3, dr2));
+      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
+      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
+      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
+      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
+    }
+  }
+} /* radf3 */
+
+
+static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+                     const float *wa1, const float *wa2)
+{
+  static const float taur = -0.5f;
+  static const float taui = 0.866025403784439f; /* Approximately sqrt(3)/2. */
+  static const float taui_2 = 0.866025403784439f*2; /* Twice taui; used only in the first loop. */
+  int i, k, ic;
+  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+  for (k=0; k<l1; k++) { /* First element of each block: no twiddle multiply. */
+    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
+    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
+    ch[k*ido] = VADD(cc[3*k*ido], tr2);
+    ci3 = VMUL(LD_PS1(taui_2), cc[(3*k + 2)*ido]);
+    ch[(k + l1)*ido] = VSUB(cr2, ci3);
+    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
+  }
+  if (ido == 1) return;
+  for (k=0; k<l1; k++) {
+    for (i=2; i<ido; i+=2) {
+      ic = ido - i; /* Mirrored index used for the inputs read from the end of the block. */
+      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
+      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
+      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
+      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
+      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
+      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
+      cr3 = VMUL(LD_PS1(taui), VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
+      ci3 = VMUL(LD_PS1(taui), VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
+      dr2 = VSUB(cr2, ci3);
+      dr3 = VADD(cr2, ci3);
+      di2 = VADD(ci2, cr3);
+      di3 = VSUB(ci2, cr3);
+      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); /* Second output group times the wa1 twiddle. */
+      ch[i - 1 + (k + l1)*ido] = dr2;
+      ch[i + (k + l1)*ido] = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); /* Third output group times the wa2 twiddle. */
+      ch[i - 1 + (k + 2*l1)*ido] = dr3;
+      ch[i + (k + 2*l1)*ido] = di3;
+    }
+  }
+} /* radb3 */
+
+
+static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
+                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
+{ /* radix-4 forward pass on real data: reads cc, writes ch; invoked by rfftf1_ps for factors ip == 4 */
+  static const float minus_hsqt2 = (float)-0.7071067811865475; /* -sqrt(2)/2 to the printed precision */
+  int i, k, l1ido = l1*ido;
+  { /* first loop: cc and ch are advanced in place and restored from cc_/ch_ once it finishes */
+    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
+    v4sf * RESTRICT ch_ = ch;
+    while (cc < cc_end) {
+      /* this loop represents between 25% and 40% of total radf4_ps cost ! */
+      v4sf a0 = cc[0], a1 = cc[l1ido];
+      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
+      v4sf tr1 = VADD(a1, a3);
+      v4sf tr2 = VADD(a0, a2);
+      ch[2*ido-1] = VSUB(a0, a2);
+      ch[2*ido  ] = VSUB(a3, a1);
+      ch[0      ] = VADD(tr1, tr2);
+      ch[4*ido-1] = VSUB(tr2, tr1);
+      cc += ido; ch += 4*ido; /* input advances one row (ido), output advances four rows */
+    }
+    cc = cc_; ch = ch_;
+  }
+  if (ido < 2) return; /* no further elements per row when ido < 2 */
+  if (ido != 2) { /* ido == 2 goes directly to the final loop after this block */
+    for (k = 0; k < l1ido; k += ido) { /* k is the element offset of the row, not a row index */
+      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
+      for (i=2; i<ido; i += 2, pc += 2) {
+        int ic = ido - i; /* index counted back from the end of the row */
+        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
+        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
+
+        cr2 = pc[1*l1ido+0];
+        ci2 = pc[1*l1ido+1];
+        wr=LD_PS1(wa1[i - 2]);
+        wi=LD_PS1(wa1[i - 1]);
+        VCPLXMULCONJ(cr2,ci2,wr,wi);
+
+        cr3 = pc[2*l1ido+0];
+        ci3 = pc[2*l1ido+1];
+        wr = LD_PS1(wa2[i-2]);
+        wi = LD_PS1(wa2[i-1]);
+        VCPLXMULCONJ(cr3, ci3, wr, wi);
+
+        cr4 = pc[3*l1ido];
+        ci4 = pc[3*l1ido+1];
+        wr = LD_PS1(wa3[i-2]);
+        wi = LD_PS1(wa3[i-1]);
+        VCPLXMULCONJ(cr4, ci4, wr, wi);
+
+        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
+
+        tr1 = VADD(cr2,cr4);
+        tr4 = VSUB(cr4,cr2);
+        tr2 = VADD(pc[0],cr3);
+        tr3 = VSUB(pc[0],cr3);
+        ch[i - 1 + 4*k] = VADD(tr1,tr2);
+        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); /* at this point tr1 and tr2 can be disposed */
+        ti1 = VADD(ci2,ci4);
+        ti4 = VSUB(ci2,ci4);
+        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
+        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); /* dispose tr3, ti4 */
+        ti2 = VADD(pc[1],ci3);
+        ti3 = VSUB(pc[1],ci3);
+        ch[i + 4*k] = VADD(ti1, ti2);
+        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
+        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
+        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
+      }
+    }
+    if (ido % 2 == 1) return; /* odd ido: the final loop below is skipped */
+  }
+  for (k=0; k<l1ido; k += ido) { /* final loop: element ido-1 of each row, scaled by minus_hsqt2 */
+    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
+    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
+    v4sf ti1 = VMUL(LD_PS1(minus_hsqt2), VADD(a, b));
+    v4sf tr1 = VMUL(LD_PS1(minus_hsqt2), VSUB(b, a));
+    ch[ido-1 + 4*k] = VADD(tr1, c);
+    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
+    ch[4*k + 1*ido] = VSUB(ti1, d);
+    ch[4*k + 3*ido] = VADD(ti1, d);
+  }
+} /* radf4 */
+
+
+static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
+{ /* radix-4 backward pass on real data: reads cc, writes ch; invoked by rfftb1_ps for factors ip == 4 */
+  static const float minus_sqrt2 = (float)-1.414213562373095; /* -sqrt(2) to the printed precision */
+  static const float two = 2.f;
+  int i, k, l1ido = l1*ido;
+  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  { /* first loop: cc and ch are advanced in place and restored from cc_/ch_ once it finishes */
+    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
+    v4sf *ch_ = ch;
+    while (ch < ch_end) {
+      v4sf a = cc[0], b = cc[4*ido-1];
+      v4sf c = cc[2*ido], d = cc[2*ido-1];
+      tr3 = VMUL(LD_PS1(two),d);
+      tr2 = VADD(a,b);
+      tr1 = VSUB(a,b);
+      tr4 = VMUL(LD_PS1(two),c);
+      ch[0*l1ido] = VADD(tr2, tr3);
+      ch[2*l1ido] = VSUB(tr2, tr3);
+      ch[1*l1ido] = VSUB(tr1, tr4);
+      ch[3*l1ido] = VADD(tr1, tr4);
+
+      cc += 4*ido; ch += ido; /* input advances four rows, output advances one row (ido) */
+    }
+    cc = cc_; ch = ch_;
+  }
+  if (ido < 2) return; /* no further elements per row when ido < 2 */
+  if (ido != 2) { /* ido == 2 goes directly to the final loop after this block */
+    for (k = 0; k < l1ido; k += ido) { /* k is the element offset of the row, not a row index */
+      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
+      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
+      for (i = 2; i < ido; i += 2) {
+
+        tr1 = VSUB(pc[i], pc[4*ido - i]);
+        tr2 = VADD(pc[i], pc[4*ido - i]);
+        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
+        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
+        ph[0] = VADD(tr2, tr3);
+        cr3 = VSUB(tr2, tr3);
+
+        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
+        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
+        cr2 = VSUB(tr1, tr4);
+        cr4 = VADD(tr1, tr4);
+
+        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
+        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
+
+        ph[1] = VADD(ti2, ti3); ph += l1ido; /* ph steps by l1ido to the next of the four output sections */
+        ci3 = VSUB(ti2, ti3);
+        ci2 = VADD(ti1, ti4);
+        ci4 = VSUB(ti1, ti4);
+        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+        ph[0] = cr2;
+        ph[1] = ci2; ph += l1ido;
+        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+        ph[0] = cr3;
+        ph[1] = ci3; ph += l1ido;
+        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
+        ph[0] = cr4;
+        ph[1] = ci4; ph = ph - 3*l1ido + 2; /* rewind the three section steps and move on two elements */
+      }
+    }
+    if (ido % 2 == 1) return; /* odd ido: the final loop below is skipped */
+  }
+  for (k=0; k < l1ido; k+=ido) { /* final loop: writes element ido-1 of each row in all four output sections */
+    int i0 = 4*k + ido;
+    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
+    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
+    tr1 = VSUB(c,d);
+    tr2 = VADD(c,d);
+    ti1 = VADD(b,a);
+    ti2 = VSUB(b,a);
+    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
+    ch[ido-1 + k + 1*l1ido] = VMUL(LD_PS1(minus_sqrt2), VSUB(ti1, tr1));
+    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
+    ch[ido-1 + k + 3*l1ido] = VMUL(LD_PS1(minus_sqrt2), VADD(ti1, tr1));
+  }
+} /* radb4 */
+
+static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
+                                      const float *wa, const int *ifac) { /* forward real driver: runs one radf*_ps pass per factor, taking the ifac[1] factors last-to-first */
+  v4sf *in  = (v4sf*)input_readonly;
+  v4sf *out = (in == work2 ? work1 : work2); /* first pass writes to whichever work buffer is not the input */
+  int nf = ifac[1], k1;
+  int l2 = n;
+  int iw = n-1; /* offset into wa; decremented before each pass */
+  assert(in != out && work1 != work2);
+  for (k1 = 1; k1 <= nf; ++k1) {
+    int kh = nf - k1;
+    int ip = ifac[kh + 2]; /* factors are stored from ifac[2] on; kh counts down from the last one */
+    int l1 = l2 / ip;
+    int ido = n / l2;
+    iw -= (ip - 1)*ido;
+    switch (ip) {
+      case 4: {
+        int ix2 = iw + ido;
+        int ix3 = ix2 + ido;
+        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
+      } break;
+      case 3: {
+        int ix2 = iw + ido;
+        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
+      } break;
+      case 2:
+        radf2_ps(ido, l1, in, out, &wa[iw]);
+        break;
+      default: /* only radices 2, 3 and 4 have a pass routine here */
+        assert(0);
+        break;
+    }
+    l2 = l1;
+    if (out == work2) { /* swap roles: this pass' output becomes the next pass' input */
+      out = work1; in = work2;
+    } else {
+      out = work2; in = work1;
+    }
+  }
+  return in; /* this is in fact the output .. */
+} /* rfftf1 */
+
+static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
+                                      const float *wa, const int *ifac) { /* backward real driver: runs one radb*_ps pass per factor, taking the ifac[1] factors first-to-last */
+  v4sf *in  = (v4sf*)input_readonly;
+  v4sf *out = (in == work2 ? work1 : work2); /* first pass writes to whichever work buffer is not the input */
+  int nf = ifac[1], k1;
+  int l1 = 1;
+  int iw = 0; /* offset into wa; advanced after each pass */
+  assert(in != out);
+  for (k1=1; k1<=nf; k1++) {
+    int ip = ifac[k1 + 1]; /* factors are stored from ifac[2] on */
+    int l2 = ip*l1;
+    int ido = n / l2;
+    switch (ip) {
+      case 4: {
+        int ix2 = iw + ido;
+        int ix3 = ix2 + ido;
+        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
+      } break;
+      case 3: {
+        int ix2 = iw + ido;
+        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
+      } break;
+      case 2:
+        radb2_ps(ido, l1, in, out, &wa[iw]);
+        break;
+      default: /* only radices 2, 3 and 4 have a pass routine here */
+        assert(0);
+        break;
+    }
+    l1 = l2;
+    iw += (ip - 1)*ido;
+
+    if (out == work2) { /* swap roles: this pass' output becomes the next pass' input */
+      out = work1; in = work2;
+    } else {
+      out = work2; in = work1;
+    }
+  }
+  return in; /* this is in fact the output .. */
+}
+
+static int decompose(int n, int *ifac, const int ntryh[3]) { /* factors n by the three trial divisors in ntryh, tried in order; returns the factor count */
+  int nl = n, nf = 0, i, j = 0; /* nl is the part of n not yet factored */
+  for (j=0; j < 3; ++j) {
+    int ntry = ntryh[j];
+    while (nl != 1) { /* keep dividing by ntry for as long as it divides nl exactly */
+      int nq = nl / ntry;
+      int nr = nl - ntry * nq; /* remainder of nl / ntry */
+      if (nr == 0) {
+        ifac[2+nf++] = ntry; /* factors are appended starting at ifac[2] */
+        nl = nq;
+        if (ntry == 2 && nf != 1) { /* a factor 2 that is not the first factor is moved to the front */
+          for (i = 2; i <= nf; ++i) {
+            int ib = nf - i + 2;
+            ifac[ib + 1] = ifac[ib]; /* shift the earlier factors up by one slot */
+          }
+          ifac[2] = 2;
+        }
+      } else break;
+    }
+  }
+  ifac[0] = n; /* NOTE: nl is not checked here, so a leftover cofactor (nl != 1) goes unreported */
+  ifac[1] = nf;
+  return nf;
+}
+
+
+
+static void rffti1_ps(int n, float *wa, int *ifac)
+{ /* factors n into ifac (via decompose) and fills wa with cos/sin pairs for every factor except the last */
+  static const int ntryh[3] = { 4,2,3 }; /* trial divisors, in the order decompose tries them */
+  int k1, j, ii;
+
+  int nf = decompose(n,ifac,ntryh);
+  float argh = (float)((2*M_PI) / n);
+  int is = 0; /* start offset in wa of the current group of ido entries */
+  int nfm1 = nf - 1;
+  int l1 = 1;
+  if (nfm1 == 0) return; /* a single factor: nothing is written to wa */
+  for (k1 = 1; k1 <= nfm1; k1++) {
+    int ip = ifac[k1 + 1];
+    int ld = 0;
+    int l2 = l1*ip;
+    int ido = n / l2;
+    int ipm = ip - 1;
+    for (j = 1; j <= ipm; ++j) { /* ip-1 groups per factor, each advancing is by ido */
+      float argld;
+      int i = is, fi=0;
+      ld += l1;
+      argld = (float)ld*argh;
+      for (ii = 3; ii <= ido; ii += 2) {
+        i += 2;
+        fi += 1;
+        wa[i - 2] = cos((float)fi*argld); /* pair (cos, sin) of fi*argld, computed by the double-precision cos/sin and stored as float */
+        wa[i - 1] = sin((float)fi*argld);
+      }
+      is += ido;
+    }
+    l1 = l2;
+  }
+} /* rffti1 */
+
+static void cffti1_ps(int n, float *wa, int *ifac)
+{ /* factors n into ifac (via decompose) and fills wa with cos/sin pairs for every factor */
+  static const int ntryh[3] = { 3,4,2 }; /* trial divisors, in the order decompose tries them */
+  int k1, j, ii;
+
+  int nf = decompose(n,ifac,ntryh);
+  float argh = (float)((2*M_PI)/n);
+  int i = 1; /* running write position: each pair goes to wa[i-1], wa[i] */
+  int l1 = 1;
+  for (k1=1; k1<=nf; k1++) {
+    int ip = ifac[k1+1];
+    int ld = 0;
+    int l2 = l1*ip;
+    int ido = n / l2;
+    int idot = ido + ido + 2;
+    int ipm = ip - 1;
+    for (j=1; j<=ipm; j++) {
+      float argld;
+      int i1 = i, fi = 0;
+      wa[i-1] = 1; /* the pair at the current position is first set to (1, 0) */
+      wa[i] = 0;
+      ld += l1;
+      argld = (float)ld*argh;
+      for (ii = 4; ii <= idot; ii += 2) {
+        i += 2;
+        fi += 1;
+        wa[i-1] = cos((float)fi*argld);
+        wa[i] = sin((float)fi*argld);
+      }
+      if (ip > 5) { /* cannot trigger with ntryh = {3,4,2}: every stored factor is one of those divisors */
+        wa[i1-1] = wa[i-1];
+        wa[i1] = wa[i];
+      }
+    }
+    l1 = l2;
+  }
+} /* cffti1 */
+
+
+static v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { /* complex driver: one passf*_ps call per factor; isign is forwarded to each pass as a float */
+  v4sf *in  = (v4sf*)input_readonly;
+  v4sf *out = (in == work2 ? work1 : work2); /* first pass writes to whichever work buffer is not the input */
+  int nf = ifac[1], k1;
+  int l1 = 1;
+  int iw = 0; /* offset into wa; advanced after each pass */
+  assert(in != out && work1 != work2);
+  for (k1=2; k1<=nf+1; k1++) { /* k1 indexes ifac directly: factors occupy ifac[2..nf+1] */
+    int ip = ifac[k1];
+    int l2 = ip*l1;
+    int ido = n / l2;
+    int idot = ido + ido; /* twice ido; this is the length handed to the pass routines */
+    switch (ip) {
+      case 4: {
+        int ix2 = iw + idot;
+        int ix3 = ix2 + idot;
+        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], (float)isign);
+      } break;
+      case 2: {
+        passf2_ps(idot, l1, in, out, &wa[iw], (float)isign);
+      } break;
+      case 3: {
+        int ix2 = iw + idot;
+        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], (float)isign);
+      } break;
+      default: /* only radices 2, 3 and 4 have a pass routine here */
+        assert(0);
+    }
+    l1 = l2;
+    iw += (ip - 1)*idot;
+    if (out == work2) { /* swap roles: this pass' output becomes the next pass' input */
+      out = work1; in = work2;
+    } else {
+      out = work2; in = work1;
+    }
+  }
+
+  return in; /* this is in fact the output .. */
+}
+
+
+struct PFFFT_Setup { /* per-transform state built by pffft_new_setup and released by pffft_destroy_setup */
+  int     N; /* transform size as passed to pffft_new_setup */
+  int     Ncvec; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
+  int ifac[15]; /* filled by decompose: [0] = size factored, [1] = number of factors, [2..] = the factors */
+  pffft_transform_t transform; /* PFFFT_REAL or PFFFT_COMPLEX, as passed to pffft_new_setup */
+  v4sf *data; /* allocated room for twiddle coefs */
+  float *e;    /* points into 'data' , N/4*3 elements */
+  float *twiddle; /* points into 'data', N/4 elements */
+};
+
+PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { /* allocates and fills a setup for size N; returns a null pointer if either allocation fails */
+  int k, m;
+  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
+  if (!s)
+    return s; /* malloc failed: s is null here */
+  if (transform == PFFFT_REAL) { assert(N >= 32); }
+  if (transform == PFFFT_COMPLEX) { assert(N >= 16); }
+  /*assert((N % 32) == 0); */
+  s->N = N;
+  s->transform = transform;
+  /* nb of complex simd vectors */
+  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
+  s->data = (v4sf*)pffft_aligned_malloc(2*(size_t)s->Ncvec * sizeof(v4sf)); /* one block of 2*Ncvec vectors shared by e and twiddle */
+  if (!s->data) {
+    free(s); /* do not leak the struct when the coefficient block cannot be allocated */
+    return 0;
+  }
+  s->e = (float*)s->data; /* e starts at the beginning of the block */
+  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); /* twiddle starts (2*Ncvec*(SIMD_SZ-1))/SIMD_SZ vectors into the block */
+
+  if (transform == PFFFT_REAL) {
+    for (k=0; k < s->Ncvec; ++k) { /* this e-filling loop is identical in both branches; only the twiddle initializer differs */
+      int i = k/SIMD_SZ;
+      int j = k%SIMD_SZ;
+      for (m=0; m < SIMD_SZ-1; ++m) {
+        float A = (float)(-2*M_PI*(m+1)*k / N);
+        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A); /* cos in one vector slot, sin in the next, at lane j */
+        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
+      }
+    }
+    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); /* also fills s->ifac with the factors of N/SIMD_SZ */
+  } else {
+    for (k=0; k < s->Ncvec; ++k) {
+      int i = k/SIMD_SZ;
+      int j = k%SIMD_SZ;
+      for (m=0; m < SIMD_SZ-1; ++m) {
+        float A = (float)(-2*M_PI*(m+1)*k / N);
+        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
+        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
+      }
+    }
+    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); /* also fills s->ifac with the factors of N/SIMD_SZ */
+  }
+  return s;
+}
+
+
+static void pffft_destroy_setup(PFFFT_Setup *s) { /* releases a setup obtained from pffft_new_setup */
+  if(s){ /* a null setup is accepted and ignored */
+    pffft_aligned_free(s->data); /* data came from pffft_aligned_malloc, so it needs the matching free */
+    free(s);
+  }
+}
+
+#if !defined(PFFFT_SIMD_DISABLE)
+
+/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
+static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { /* reads N vector pairs (in[0], in[1]) spaced in_stride apart and stores 2*N vectors downward from out */
+  v4sf g0, g1;
+  int k;
+  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
+
+  *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */
+  for (k=1; k < N; ++k) { /* two stores per remaining input pair */
+    v4sf h0, h1;
+    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
+    *--out = VSWAPHL(g1, h0);
+    *--out = VSWAPHL(h0, h1);
+    g1 = h1; /* carry the second half of this pair into the next iteration */
+  }
+  *--out = VSWAPHL(g1, g0); /* the last store combines the final g1 with the g0 kept from the first pair */
+}
+
+static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { /* reads 2*N consecutive vectors from in and writes N pairs (out[0], out[1]), stepping out by out_stride between pairs */
+  v4sf g0, g1, h0, h1;
+  int k;
+  g0 = g1 = in[0]; ++in; /* the first vector is kept in g0 for the final pair */
+  for (k=1; k < N; ++k) {
+    h0 = *in++; h1 = *in++;
+    g1 = VSWAPHL(g1, h0);
+    h0 = VSWAPHL(h0, h1);
+    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
+    g1 = h1; /* carry the second vector into the next iteration */
+  }
+  h0 = *in++; h1 = g0; /* last pair: one more input vector combined with the saved first vector */
+  g1 = VSWAPHL(g1, h0);
+  h0 = VSWAPHL(h0, h1);
+  UNINTERLEAVE2(h0, g1, out[0], out[1]);
+}
+
+static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { /* permutes the vectors of in into out; the two directions apply mirrored index mappings */
+  int k, N = setup->N, Ncvec = setup->Ncvec;
+  const v4sf *vin = (const v4sf*)in;
+  v4sf *vout = (v4sf*)out;
+  assert(in != out); /* not an in-place operation */
+  if (setup->transform == PFFFT_REAL) {
+    int k, dk = N/32; /* this k shadows the function-level k inside the REAL branch */
+    if (direction == PFFFT_FORWARD) {
+      for (k=0; k < dk; ++k) {
+        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
+        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
+      }
+      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); /* reversed_copy stores downward, so out + N/2 is an end pointer */
+      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); /* likewise, ending at out + N */
+    } else { /* any direction other than PFFFT_FORWARD */
+      for (k=0; k < dk; ++k) {
+        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
+        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
+      }
+      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); /* negative stride: output pairs are written from high addresses down */
+      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
+    }
+  } else {
+    if (direction == PFFFT_FORWARD) {
+      for (k=0; k < Ncvec; ++k) {
+        int kk = (k/4) + (k%4)*(Ncvec/4); /* pair k of the input goes to pair kk of the output */
+        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
+      }
+    } else { /* any direction other than PFFFT_FORWARD */
+      for (k=0; k < Ncvec; ++k) {
+        int kk = (k/4) + (k%4)*(Ncvec/4); /* same mapping, used the other way round: pair kk of the input goes to pair k */
+        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
+      }
+    }
+  }
+}
+
+static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { /* last stage of the forward complex transform: called after cfftf1_ps in pffft_transform_internal */
+  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  assert(in != out);
+  for (k=0; k < dk; ++k) { /* each block consumes 8 input vectors and 6 vectors of e, and emits 8 output vectors */
+    r0 = in[8*k+0]; i0 = in[8*k+1];
+    r1 = in[8*k+2]; i1 = in[8*k+3];
+    r2 = in[8*k+4]; i2 = in[8*k+5];
+    r3 = in[8*k+6]; i3 = in[8*k+7];
+    VTRANSPOSE4(r0,r1,r2,r3);
+    VTRANSPOSE4(i0,i1,i2,i3);
+    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
+    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
+    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
+
+    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
+    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
+    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
+    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+
+    /*
+      transformation for each column is:
+
+      [1   1   1   1   0   0   0   0]   [r0]
+      [1   0  -1   0   0  -1   0   1]   [r1]
+      [1  -1   1  -1   0   0   0   0]   [r2]
+      [1   0  -1   0   0   1   0  -1]   [r3]
+      [0   0   0   0   1   1   1   1] * [i0]
+      [0   1   0  -1   1   0  -1   0]   [i1]
+      [0   0   0   0   1  -1   1  -1]   [i2]
+      [0  -1   0   1   1   0  -1   0]   [i3]
+    */
+
+    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
+    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); /* NOTE(review): rows 2/4 and 6/8 of the matrix above carry the opposite cross-term signs from these statements; the matrix as printed matches pffft_cplx_preprocess */
+    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
+    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
+
+    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
+    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
+  }
+}
+
+static void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { /* first stage of the backward complex transform: called before cfftf1_ps in pffft_transform_internal */
+  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  assert(in != out);
+  for (k=0; k < dk; ++k) { /* each block consumes 8 input vectors and 6 vectors of e, and emits 8 output vectors */
+    r0 = in[8*k+0]; i0 = in[8*k+1];
+    r1 = in[8*k+2]; i1 = in[8*k+3];
+    r2 = in[8*k+4]; i2 = in[8*k+5];
+    r3 = in[8*k+6]; i3 = in[8*k+7];
+
+    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
+    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
+    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
+    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+
+    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
+    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); /* cross terms have the opposite sign from pffft_cplx_finalize */
+    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
+    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
+
+    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); /* uses the CONJ macro where pffft_cplx_finalize uses VCPLXMUL with the same e entries */
+    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
+    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
+
+    VTRANSPOSE4(r0,r1,r2,r3); /* transpose comes last here, whereas pffft_cplx_finalize transposes first */
+    VTRANSPOSE4(i0,i1,i2,i3);
+
+    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
+    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
+  }
+}
+
+
+static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
+                            const v4sf *e, v4sf *out) { /* one 4x4 block of pffft_real_finalize: r0/i0 come from in0/in1, the other six vectors from in; writes 8 vectors to out */
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  r0 = *in0; i0 = *in1;
+  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
+  VTRANSPOSE4(r0,r1,r2,r3);
+  VTRANSPOSE4(i0,i1,i2,i3);
+
+  /*
+    transformation for each column is:
+
+    [1   1   1   1   0   0   0   0]   [r0]
+    [1   0  -1   0   0  -1   0   1]   [r1]
+    [1   0  -1   0   0   1   0  -1]   [r2]
+    [1  -1   1  -1   0   0   0   0]   [r3]
+    [0   0   0   0   1   1   1   1] * [i0]
+    [0  -1   0   1  -1   0   1   0]   [i1]
+    [0  -1   0   1   1   0  -1   0]   [i2]
+    [0   0   0   0  -1   1  -1   1]   [i3]
+  */
+
+  /*cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
+  /*cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */
+
+  VCPLXMUL(r1,i1,e[0],e[1]);
+  VCPLXMUL(r2,i2,e[2],e[3]);
+  VCPLXMUL(r3,i3,e[4],e[5]);
+
+  /*cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; */
+  /*cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; */
+
+  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
+  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); /* note the operand order: dr1 is r3 - r1 */
+  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
+  si1 = VADD(i1,i3); di1 = VSUB(i3,i1); /* likewise di1 is i3 - i1 */
+
+  r0 = VADD(sr0, sr1);
+  r3 = VSUB(sr0, sr1);
+  i0 = VADD(si0, si1);
+  i3 = VSUB(si1, si0);
+  r1 = VADD(dr0, di1);
+  r2 = VSUB(dr0, di1);
+  i1 = VSUB(dr1, di0);
+  i2 = VADD(dr1, di0);
+
+  *out++ = r0;
+  *out++ = i0;
+  *out++ = r1;
+  *out++ = i1;
+  *out++ = r2;
+  *out++ = i2;
+  *out++ = r3;
+  *out++ = i3;
+
+}
+
+static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { /* last stage of the forward real transform: called after rfftf1_ps in pffft_transform_internal */
+  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+  v4sf_union cr, ci, *uout = (v4sf_union*)out;
+  v4sf save = in[7], zero=VZERO(); /* save holds in[8*k+7] of the previous block and is fed to the next block as its in0 */
+  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; /* staging temporaries for the lane-0 stores below */
+  static const float s = (float)(M_SQRT2/2);
+
+  cr.v = in[0]; ci.v = in[Ncvec*2-1]; /* first and last input vectors, read before anything is written to out */
+  assert(in != out);
+  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); /* block 0 runs with zeros for r0/i0; lane 0 of its outputs is overwritten just below */
+
+  /*
+    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
+
+    [Xr(1)   ] [1   1   1   1   0   0   0   0]
+    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
+    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
+    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
+    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
+    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
+    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
+    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
+  */
+
+  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
+  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
+  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
+  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
+  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
+  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
+  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
+  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;
+
+  for (k=1; k < dk; ++k) {
+    v4sf save_next = in[8*k+7]; /* fetched before the call so the next iteration can use it as in0 */
+    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
+                           e + k*6, out + k*8);
+    save = save_next;
+  }
+
+}
+
+static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
+                                             const v4sf *e, v4sf *out, int first) { /* one 4x4 block of pffft_real_preprocess: reads 8 vectors; writes 8, or only 6 when first is nonzero */
+  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
+  /*
+    transformation for each column is:
+
+    [1   1   1   1   0   0   0   0]   [r0]
+    [1   0   0  -1   0  -1  -1   0]   [r1]
+    [1  -1  -1   1   0   0   0   0]   [r2]
+    [1   0   0  -1   0   1   1   0]   [r3]
+    [0   0   0   0   1  -1   1  -1] * [i0]
+    [0  -1   1   0   1   0   0   1]   [i1]
+    [0   0   0   0   1   1  -1  -1]   [i2]
+    [0   1  -1   0   1   0   0   1]   [i3]
+  */
+
+  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); /* sums and differences pair index 0 with 3 and 1 with 2 */
+  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
+  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
+  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
+
+  r0 = VADD(sr0, sr1);
+  r2 = VSUB(sr0, sr1);
+  r1 = VSUB(dr0, si1);
+  r3 = VADD(dr0, si1);
+  i0 = VSUB(di0, di1);
+  i2 = VADD(di0, di1);
+  i1 = VSUB(si0, dr1);
+  i3 = VADD(si0, dr1);
+
+  VCPLXMULCONJ(r1,i1,e[0],e[1]);
+  VCPLXMULCONJ(r2,i2,e[2],e[3]);
+  VCPLXMULCONJ(r3,i3,e[4],e[5]);
+
+  VTRANSPOSE4(r0,r1,r2,r3);
+  VTRANSPOSE4(i0,i1,i2,i3);
+
+  if (!first) { /* when first is nonzero, r0 and i0 are not stored and out starts with r1 */
+    *out++ = r0;
+    *out++ = i0;
+  }
+  *out++ = r1;
+  *out++ = i1;
+  *out++ = r2;
+  *out++ = i2;
+  *out++ = r3;
+  *out++ = i3;
+}
+
+static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { /* first stage of the backward real transform: called before rfftb1_ps in pffft_transform_internal */
+  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
+  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; /* staging temporaries for the stores at the end */
+  static const float s = (float)M_SQRT2;
+  assert(in != out);
+  for (k=0; k < 4; ++k) { /* gather floats 0, 8, 16, 24 of in into Xr and floats 4, 12, 20, 28 into Xi */
+    Xr.f[k] = ((float*)in)[8*k];
+    Xi.f[k] = ((float*)in)[8*k+4];
+  }
+
+  pffft_real_preprocess_4x4(in, e, out+1, 1); /* will write only 6 values */
+
+  /*
+    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
+
+    [cr0] [1   0   2   0   1   0   0   0]
+    [cr1] [1   0   0   0  -1   0  -2   0]
+    [cr2] [1   0  -2   0   1   0   0   0]
+    [cr3] [1   0   0   0  -1   0   2   0]
+    [ci0] [0   2   0   2   0   0   0   0]
+    [ci1] [0   s   0  -s   0  -s   0  -s]
+    [ci2] [0   0   0   0   0  -2   0   2]
+    [ci3] [0  -s   0   s   0  -s   0  -s]
+  */
+  for (k=1; k < dk; ++k) {
+    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); /* block k writes 8 vectors starting at out[8*k-1] */
+  }
+
+  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; /* the four lanes of the first output vector */
+  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
+  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
+  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
+  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0; /* the four lanes of the last output vector */
+  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
+  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
+  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
+}
+
+
+static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
+                             pffft_direction_t direction, int ordered) { /* runs one transform, alternating between foutput and scratch as buffers; the result ends up in foutput */
+  int k, Ncvec   = setup->Ncvec;
+  int nf_odd = (setup->ifac[1] & 1); /* low bit of the factor count stored in ifac[1] */
+
+  /* stack fallback for a NULL scratch pointer is disabled (the two lines below are commented out): scratch is used exactly as passed */
+  /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */
+  /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */
+
+  int ib = (nf_odd ^ ordered ? 1 : 0); /* ib selects the current buffer in buff[]; !ib is the other one */
+  const v4sf *vinput = (const v4sf*)finput;
+  v4sf *voutput      = (v4sf*)foutput;
+  v4sf *buff[2];
+  buff[0] = voutput, buff[1] = scratch /*? scratch : scratch_on_stack*/;
+
+  /*if (scratch == 0) scratch = scratch_on_stack; */
+
+  assert(VALIGNED(finput) && VALIGNED(foutput));
+
+  /*assert(finput != foutput); */
+  if (direction == PFFFT_FORWARD) {
+    ib = !ib;
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); /* ib becomes the index of the buffer the passes returned */
+      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
+    } else {
+      v4sf *tmp = buff[ib];
+      for (k=0; k < Ncvec; ++k) {
+        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
+      }
+      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
+                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); /* isign = -1 on the forward path */
+      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
+    }
+    if (ordered) {
+      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
+    } else ib = !ib; /* without reordering, the finalize output in buff[!ib] is the result */
+  } else { /* any direction other than PFFFT_FORWARD */
+    if (vinput == buff[ib]) {
+      ib = !ib; /* may happen when finput == foutput */
+    }
+    if (ordered) {
+      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
+      vinput = buff[ib]; ib = !ib;
+    }
+    if (setup->transform == PFFFT_REAL) {
+      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
+      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
+      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
+                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); /* isign = +1 on the backward path */
+      for (k=0; k < Ncvec; ++k) {
+        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); /* source and destination are the same two vectors */
+      }
+    }
+  }
+
+  if (buff[ib] != voutput) {
+    /* extra copy required -- this situation should only happen when finput == foutput */
+    assert(finput==foutput);
+    for (k=0; k < Ncvec; ++k) {
+      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
+      voutput[2*k] = a; voutput[2*k+1] = b;
+    }
+    ib = !ib;
+  }
+  assert(buff[ib] == voutput);
+}
+
+#if 0 /* disabled: nothing down to the matching #endif is compiled */
+static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
+  int i, Ncvec = s->Ncvec;
+  const v4sf * RESTRICT va = (const v4sf*)a;
+  const v4sf * RESTRICT vb = (const v4sf*)b;
+  v4sf * RESTRICT vab = (v4sf*)ab;
+
+#ifdef __arm__
+  __builtin_prefetch(va);
+  __builtin_prefetch(vb);
+  __builtin_prefetch(vab);
+  __builtin_prefetch(va+2);
+  __builtin_prefetch(vb+2);
+  __builtin_prefetch(vab+2);
+  __builtin_prefetch(va+4);
+  __builtin_prefetch(vb+4);
+  __builtin_prefetch(vab+4);
+  __builtin_prefetch(va+6);
+  __builtin_prefetch(vb+6);
+  __builtin_prefetch(vab+6);
+#endif
+
+  float ar, ai, br, bi, abr, abi;
+  v4sf vscal = LD_PS1(scaling);
+
+  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+  ar = ((v4sf_union*)va)[0].f[0]; /* lane 0 of the first two vectors of a, b and ab is saved before the loops run */
+  ai = ((v4sf_union*)va)[1].f[0];
+  br = ((v4sf_union*)vb)[0].f[0];
+  bi = ((v4sf_union*)vb)[1].f[0];
+  abr = ((v4sf_union*)vab)[0].f[0];
+  abi = ((v4sf_union*)vab)[1].f[0];
+
+#ifdef __arm__
+#  if 1 /* inline asm version */
+  const float *a_ = a, *b_ = b; float *ab_ = ab;
+  int N = Ncvec;
+  asm volatile("mov         r8, %2                  \n"
+               "vdup.f32    q15, %4                 \n"
+               "1:                                  \n"
+               "pld         [%0,#64]                \n"
+               "pld         [%1,#64]                \n"
+               "pld         [%2,#64]                \n"
+               "pld         [%0,#96]                \n"
+               "pld         [%1,#96]                \n"
+               "pld         [%2,#96]                \n"
+               "vld1.f32    {q0,q1},   [%0,:128]!         \n"
+               "vld1.f32    {q4,q5},   [%1,:128]!         \n"
+               "vld1.f32    {q2,q3},   [%0,:128]!         \n"
+               "vld1.f32    {q6,q7},   [%1,:128]!         \n"
+               "vld1.f32    {q8,q9},   [r8,:128]!          \n"
+
+               "vmul.f32    q10, q0, q4             \n"
+               "vmul.f32    q11, q0, q5             \n"
+               "vmul.f32    q12, q2, q6             \n"
+               "vmul.f32    q13, q2, q7             \n"
+               "vmls.f32    q10, q1, q5             \n"
+               "vmla.f32    q11, q1, q4             \n"
+               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
+               "vmls.f32    q12, q3, q7             \n"
+               "vmla.f32    q13, q3, q6             \n"
+               "vmla.f32    q8, q10, q15            \n"
+               "vmla.f32    q9, q11, q15            \n"
+               "vmla.f32    q0, q12, q15            \n"
+               "vmla.f32    q1, q13, q15            \n"
+               "vst1.f32    {q8,q9},[%2,:128]!    \n"
+               "vst1.f32    {q0,q1},[%2,:128]!    \n"
+               "subs        %3, #2                  \n"
+               "bne         1b                      \n"
+               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
+
+#  else /* neon intrinsics version, 30% slower than the asm one with gcc 4.6 */
+  v4sf a1r, a1i, b1r, b1i;
+  v4sf a2r, a2i, b2r, b2i;
+  v4sf ab1r, ab1i, ab2r, ab2i;
+  for (i=0; i < Ncvec; i += 2) {
+    __builtin_prefetch(va+8);
+    __builtin_prefetch(va+10);
+
+    a1r = *va++; a1i = *va++;
+    a2r = *va++; a2i = *va++;
+    b1r = *vb++; b1i = *vb++;
+    b2r = *vb++; b2i = *vb++;
+    ab1r = vab[0]; ab1i = vab[1];
+    ab2r = vab[2]; ab2i = vab[3];
+
+    v4sf z1r = VMUL(a1r, b1r);
+    v4sf z2r = VMUL(a2r, b2r);
+    v4sf z1i = VMUL(a1r, b1i);
+    v4sf z2i = VMUL(a2r, b2i);
+
+    __builtin_prefetch(vb+4);
+    __builtin_prefetch(vb+6);
+
+    z1r = vmlsq_f32(z1r, a1i, b1i);
+    z2r = vmlsq_f32(z2r, a2i, b2i);
+    z1i = vmlaq_f32(z1i, a1i, b1r);
+    z2i = vmlaq_f32(z2i, a2i, b2r);
+
+    __builtin_prefetch(vab+4);
+    __builtin_prefetch(vab+6);
+
+    ab1r = vmlaq_f32(ab1r, z1r, vscal);
+    ab2r = vmlaq_f32(ab2r, z2r, vscal);
+    ab1i = vmlaq_f32(ab1i, z1i, vscal);
+    ab2i = vmlaq_f32(ab2i, z2i, vscal);
+
+    *vab++ = ab1r; *vab++ = ab1i;
+    *vab++ = ab2r; *vab++ = ab2i;
+  }
+#  endif
+
+#else /* not ARM, no need to use a special routine */
+  for (i=0; i < Ncvec; i += 2) { /* two vector pairs per iteration */
+    v4sf ar, ai, br, bi; /* these vectors shadow the scalar ar/ai/br/bi saved above */
+    ar = va[2*i+0]; ai = va[2*i+1];
+    br = vb[2*i+0]; bi = vb[2*i+1];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
+    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
+    ar = va[2*i+2]; ai = va[2*i+3];
+    br = vb[2*i+2]; bi = vb[2*i+3];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
+    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
+  }
+#endif
+  if (s->transform == PFFFT_REAL) { /* real transforms: lane 0 of the first two outputs is recomputed from the saved scalars */
+    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
+    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
+  }
+}
+#endif
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+  /* ab = a * b, complex product applied to every pair of (re, im) vectors.
+     Lane 0 of vectors 0 and 1 is read before the loop (which may overwrite it
+     when ab aliases a or b); a real transform stores their plain products. */
+  int const Ncvec = s->Ncvec;
+  const v4sf * /*RESTRICT*/ va = (const v4sf*)a;
+  const v4sf * RESTRICT vb = (const v4sf*)b;
+  v4sf * /*RESTRICT*/ vab = (v4sf*)ab;
+  float a_re0, a_im0, b_re0, b_im0;
+  int k;
+
+#ifdef __arm__
+#error
+#endif
+  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+  a_re0 = ((v4sf_union*)va)[0].f[0];
+  a_im0 = ((v4sf_union*)va)[1].f[0];
+  b_re0 = ((v4sf_union*)vb)[0].f[0];
+  b_im0 = ((v4sf_union*)vb)[1].f[0];
+
+  for (k = 0; k < 2*Ncvec; k += 4) { /* two (re, im) vector pairs per pass */
+    v4sf xr = va[k], xi = va[k+1];
+    v4sf yr = vb[k], yi = vb[k+1];
+    VCPLXMUL(xr, xi, yr, yi);
+    vab[k] = xr; vab[k+1] = xi;
+    xr = va[k+2]; xi = va[k+3];
+    yr = vb[k+2]; yi = vb[k+3];
+    VCPLXMUL(xr, xi, yr, yi);
+    vab[k+2] = xr; vab[k+3] = xi;
+  }
+  if (s->transform == PFFFT_REAL) {
+    ((v4sf_union*)vab)[0].f[0] = a_re0*b_re0;
+    ((v4sf_union*)vab)[1].f[0] = a_im0*b_im0;
+  }
+}
+
+
+
+#else /* defined(PFFFT_SIMD_DISABLE) */
+
+/* standard routine using scalar floats, without SIMD stuff. */
+
+#define pffft_zreorder_nosimd pffft_zreorder
+static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+  /* Complex spectra are copied through unchanged.  For real ones, FORWARD puts
+     in[N-1] at out[1] and moves in[1..N-2] up one slot; BACKWARD reverses it. */
+  int const N = setup->N;
+  int k;
+  float moved; /* fetched before any store is made */
+  if (setup->transform == PFFFT_COMPLEX) {
+    for (k = 0; k < 2*N; ++k) out[k] = in[k];
+  } else if (direction != PFFFT_FORWARD) {
+    moved = in[1];
+    for (k = 2; k < N; ++k) out[k-1] = in[k];
+    out[0] = in[0]; out[N-1] = moved;
+  } else {
+    moved = in[N-1];
+    for (k = N-2; k > 0; --k) out[k+1] = in[k];
+    out[0] = in[0]; out[1] = moved;
+  }
+}
+
+#define pffft_transform_internal_nosimd pffft_transform_internal
+static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
+                                    pffft_direction_t direction, int ordered) {
+  int Ncvec   = setup->Ncvec;
+  int nf_odd = (setup->ifac[1] & 1);
+  /* Runs the FFT passes between 'output' and 'scratch'; the result always ends up in 'output'. */
+  /* The stack-allocated fallback for a NULL scratch pointer is disabled in
+     this copy (it used VLA_ARRAY_ON_STACK with Ncvec*2 elements), so
+     'scratch' is used exactly as passed in.
+     NOTE(review): callers presumably must always supply a buffer -- confirm. */
+
+  int ib;
+  float *buff[2];
+  buff[0] = output, buff[1] = scratch;
+  if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
+  ib = (nf_odd ^ ordered ? 1 : 0); /* which of the two buffers to start with */
+
+  if (direction == PFFFT_FORWARD) {
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
+    }
+    if (ordered) {
+      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
+    }
+  } else {
+    if (input == buff[ib]) {
+      ib = !ib; /* may happen when input == output */
+    }
+    if (ordered) {
+      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
+      input = buff[!ib];
+    }
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
+    }
+  }
+  if (buff[ib] != output) {
+    int k;
+    /* extra copy required -- this situation should happen only when input == output */
+    assert(input==output);
+    for (k=0; k < Ncvec; ++k) {
+      float a = buff[ib][2*k], b = buff[ib][2*k+1];
+      output[2*k] = a; output[2*k+1] = b;
+    }
+    ib = !ib;
+  }
+  assert(buff[ib] == output);
+}
+
+#if 0
+#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
+static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
+                                       float *ab, float scaling) {
+  int i, Ncvec = s->Ncvec;
+
+  if (s->transform == PFFFT_REAL) {
+    /* take care of the fftpack ordering */
+    ab[0] += a[0]*b[0]*scaling;
+    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
+    ++ab; ++a; ++b; --Ncvec;
+  }
+  for (i=0; i < Ncvec; ++i) {
+    float ar, ai, br, bi;
+    ar = a[2*i+0]; ai = a[2*i+1];
+    br = b[2*i+0]; bi = b[2*i+1];
+    VCPLXMUL(ar, ai, br, bi);
+    ab[2*i+0] += ar*scaling;
+    ab[2*i+1] += ai*scaling;
+  }
+}
+#endif
+
+#define pffft_zconvolve_nosimd pffft_zconvolve
+static void pffft_zconvolve_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+  /* Scalar ab = a * b on unordered spectra.  ab may be the same array as a or
+     b: every output value is stored only after its inputs have been read. */
+  int pairs = s->Ncvec;
+
+  if (s->transform == PFFFT_REAL) {
+    /* fftpack ordering: the first and last floats are multiplied on their own */
+    int const last = 2*pairs - 1;
+    ab[0] = a[0]*b[0];
+    ab[last] = a[last]*b[last];
+    ++ab, ++a, ++b, --pairs;
+  }
+  for (; pairs > 0; --pairs, a += 2, b += 2, ab += 2) {
+    float ar = a[0], ai = a[1], br = b[0], bi = b[1];
+    VCPLXMUL(ar, ai, br, bi);
+    ab[0] = ar; ab[1] = ai;
+  }
+}
+
+#endif /* defined(PFFFT_SIMD_DISABLE) */
+
+static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); /* ordered = 0 */
+}
+
+static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); /* ordered = 1 */
+}
+
+
+static void pffft_reorder_back(int length, void * setup, float * data, float * work)
+{
+  memcpy(work, data, (unsigned)length * sizeof(*work)); /* copy 'length' floats aside so source and destination differ */
+  pffft_zreorder(setup, work, data, PFFFT_BACKWARD); /* reorder from the copy back into 'data' */
+}
+#endif
diff --git a/src/pffft.h b/src/pffft.h
new file mode 100644
index 0000000..78d936b
--- /dev/null
+++ b/src/pffft.h
@@ -0,0 +1,177 @@
+/* Copyright (c) 2011  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.
+
+   Restrictions:
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b), a >= 5 and b >=0 (32, 48, 64, 96, 128, 144 etc
+   are all acceptable lengths). Performance is best for 128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have an "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads.
+  */
+  static PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  static void pffft_destroy_setup(PFFFT_Setup *);
+  /*
+     Perform a Fourier transform. The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered, or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. [del]If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384).[/del]
+
+     input and output may alias.
+  */
+  static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+
+     input and output may alias.
+  */
+  static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+
+     input and output should not alias.
+  */
+  static void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /*
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+  */
+
+  /*
+     the operation performed is: dft_ab = (dft_a * dft_b)
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  static void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab);
+
+  /* return 4 or 1 depending on whether SSE/Altivec support was enabled when building pffft.c */
+  int pffft_simd_size(void);
+
+  static void pffft_reorder_back(int length, void * setup, float * data, float * work);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/pffft32.c b/src/pffft32.c
new file mode 100644
index 0000000..21bd845
--- /dev/null
+++ b/src/pffft32.c
@@ -0,0 +1,32 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define _soxr_simd_aligned_free free
+#define _soxr_simd_aligned_malloc malloc
+#define PFFFT_SIMD_DISABLE
+#include "pffft.c"
+#include "filter.h"
+
+static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
+static void delete_setup(void * setup) {pffft_destroy_setup(setup);}
+static void forward  (int length, void * setup, float * h, float * scratch) {pffft_transform        (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void oforward (int length, void * setup, float * h, float * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void backward (int length, void * setup, float * H, float * scratch) {pffft_transform        (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H);  (void)length;}
+static int multiplier(void) {return 1;}
+/* The wrappers above work in place (same buffer as input and output) and ignore 'length'. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft32_cb[] = { /* slot order presumably has to match the RDFT_CB[n] accessors in rate.h */
+  (fn_t)setup,                            /* [0]  rdft_forward_setup */
+  (fn_t)setup,                            /* [1]  rdft_backward_setup */
+  (fn_t)delete_setup,                     /* [2]  rdft_delete_setup */
+  (fn_t)forward,                          /* [3]  rdft_forward */
+  (fn_t)oforward,                         /* [4]  rdft_oforward */
+  (fn_t)backward,                         /* [5]  rdft_backward */
+  (fn_t)obackward,                        /* [6]  rdft_obackward */
+  (fn_t)convolve,                         /* [7]  rdft_convolve */
+  (fn_t)_soxr_ordered_partial_convolve_f, /* [8]  rdft_convolve_portion */
+  (fn_t)multiplier,                       /* [9]  rdft_multiplier */
+  (fn_t)pffft_reorder_back,               /* [10] rdft_reorder_back */
+};
diff --git a/src/pffft32s.c b/src/pffft32s.c
new file mode 100644
index 0000000..d049990
--- /dev/null
+++ b/src/pffft32s.c
@@ -0,0 +1,27 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "pffft.c"
+
+static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
+static void forward  (int length, void * setup, float * h, float * scratch) {pffft_transform        (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void oforward (int length, void * setup, float * h, float * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void backward (int length, void * setup, float * H, float * scratch) {pffft_transform        (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H);                  (void)length;}
+static int multiplier(void) {return 1;}
+/* The wrappers above work in place (same buffer as input and output) and ignore 'length'. */
+typedef void (* fn_t)(void);
+fn_t _soxr_rdft32s_cb[] = { /* slot order presumably has to match the RDFT_CB[n] accessors in rate.h */
+  (fn_t)setup,                               /* [0]  rdft_forward_setup */
+  (fn_t)setup,                               /* [1]  rdft_backward_setup */
+  (fn_t)pffft_destroy_setup,                 /* [2]  rdft_delete_setup (no wrapper: called directly) */
+  (fn_t)forward,                             /* [3]  rdft_forward */
+  (fn_t)oforward,                            /* [4]  rdft_oforward */
+  (fn_t)backward,                            /* [5]  rdft_backward */
+  (fn_t)obackward,                           /* [6]  rdft_obackward */
+  (fn_t)convolve,                            /* [7]  rdft_convolve */
+  (fn_t)_soxr_ordered_partial_convolve_simd, /* [8]  rdft_convolve_portion */
+  (fn_t)multiplier,                          /* [9]  rdft_multiplier */
+  (fn_t)pffft_reorder_back,                  /* [10] rdft_reorder_back */
+};
diff --git a/src/poly-fir.h b/src/poly-fir.h
new file mode 100644
index 0000000..f7b4261
--- /dev/null
+++ b/src/poly-fir.h
@@ -0,0 +1,98 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Resample using an interpolated poly-phase FIR with length FIR_LENGTH. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
+
+#define a (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 0,j))
+#define b (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 1,j))
+#define c (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 2,j))
+#define d (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 3,j))
+#if COEF_INTERP == 0
+  #define _ sum += a *in[j], ++j;
+#elif COEF_INTERP == 1
+  #define _ sum += (b *x + a)*in[j], ++j;
+#elif COEF_INTERP == 2
+  #define _ sum += ((c *x + b)*x + a)*in[j], ++j;
+#elif COEF_INTERP == 3
+  #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
+#else
+  #error COEF_INTERP
+#endif
+
+static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+{
+  sample_t const * input = stage_read_p(p);
+  int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
+  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+  /* Up to three clock variants are compiled below; each writes one output[i] per step of p->at. */
+#if defined HI_PREC_CLOCK
+#if FLOAT_HI_PREC_CLOCK
+  if (p->use_hi_prec_clock) {
+    float_step_t at = p->at.flt;
+    for (i = 0; (int)at < num_in; ++i, at += p->step.flt) {
+      sample_t const * in = input + (int)at;
+      float_step_t frac = at - (int)at;
+      int phase = (int)(frac * (1 << PHASE_BITS));
+#if COEF_INTERP > 0
+      sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase);
+#endif
+      sample_t sum = 0;
+      int j = 0;
+      CONVOLVE
+      output[i] = sum;
+    }
+    fifo_read(&p->fifo, (int)at, NULL);
+    p->at.flt = at - (int)at; /* keep only the fractional position */
+  } else
+#else
+  if (p->use_hi_prec_clock) {
+    for (i = 0; p->at.integer < num_in; ++i,
+        p->at.fix.ls.all += p->step.fix.ls.all,
+        p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) { /* + carry out of the low word */
+      sample_t const * in = input + p->at.integer;
+      uint32_t frac = p->at.fraction;
+      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
+#if COEF_INTERP > 0              /* low-order bits, scaled to [0,1) */
+      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
+#endif
+      sample_t sum = 0;
+      int j = 0;
+      CONVOLVE
+      output[i] = sum;
+    }
+    fifo_read(&p->fifo, p->at.integer, NULL);
+    p->at.integer = 0; /* fraction is kept for the next call */
+  } else
+#endif
+#endif
+  {
+    for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
+      sample_t const * in = input + p->at.integer;
+      uint32_t frac = p->at.fraction;
+      int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
+#if COEF_INTERP > 0              /* low-order bits, scaled to [0,1) */
+      sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
+#endif
+      sample_t sum = 0;
+      int j = 0;
+      CONVOLVE
+      output[i] = sum;
+    }
+    fifo_read(&p->fifo, p->at.integer, NULL);
+    p->at.integer = 0; /* fraction is kept for the next call */
+  }
+  assert(max_num_out - i >= 0);
+  fifo_trim_by(output_fifo, max_num_out - i); /* i outputs written; presumably returns the unused part of the reservation */
+}
+
+#undef _
+#undef a
+#undef b
+#undef c
+#undef d
+#undef COEF_INTERP
+#undef CONVOLVE
+#undef FIR_LENGTH
+#undef FUNCTION
+#undef PHASE_BITS
diff --git a/src/poly-fir0.h b/src/poly-fir0.h
new file mode 100644
index 0000000..52d85b3
--- /dev/null
+++ b/src/poly-fir0.h
@@ -0,0 +1,32 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Resample using a non-interpolated poly-phase FIR with length FIR_LENGTH. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
+
+#define _ sum += (coef(p->shared->poly_fir_coefs, 0, FIR_LENGTH, rem, 0, j)) *at[j], ++j;
+
+static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+{
+  sample_t const * input = stage_read_p(p);
+  int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
+  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+  /* p->at.integer counts in units of 1/p->L input samples: div selects the sample, rem the filter phase. */
+  for (i = 0; p->at.integer < num_in * p->L; ++i, p->at.integer += p->step.integer) {
+    int div = p->at.integer / p->L, rem = p->at.integer % p->L;
+    sample_t const * at = input + div;
+    sample_t sum = 0;
+    int j = 0;
+    CONVOLVE
+    output[i] = sum;
+  }
+  assert(max_num_out - i >= 0);
+  fifo_trim_by(output_fifo, max_num_out - i); /* i outputs written; presumably returns the unused part of the reservation */
+  fifo_read(&p->fifo, p->at.integer / p->L, NULL);
+  p->at.integer = p->at.integer % p->L; /* keep only the phase remainder for the next call */
+}
+
+#undef _
+#undef CONVOLVE
+#undef FIR_LENGTH
+#undef FUNCTION
diff --git a/src/rate.h b/src/rate.h
new file mode 100644
index 0000000..f6d055a
--- /dev/null
+++ b/src/rate.h
@@ -0,0 +1,726 @@
+/* SoX Resampler Library      Copyright (c) 2007-14 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+#include "internal.h"
+
+typedef void (* fn_t)(void);
+extern fn_t RDFT_CB[11];
+
+#define rdft_forward_setup    (*(void * (*)(int))RDFT_CB[0])
+#define rdft_backward_setup   (*(void * (*)(int))RDFT_CB[1])
+#define rdft_delete_setup     (*(void (*)(void *))RDFT_CB[2])
+#define rdft_forward          (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[3])
+#define rdft_oforward         (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[4])
+#define rdft_backward         (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[5])
+#define rdft_obackward        (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[6])
+#define rdft_convolve         (*(void (*)(int, void *, sample_t *, sample_t const *))RDFT_CB[7])
+#define rdft_convolve_portion (*(void (*)(int, sample_t *, sample_t const *))RDFT_CB[8])
+#define rdft_multiplier       (*(int (*)(void))RDFT_CB[9])
+#define rdft_reorder_back     (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[10])
+
+#endif
+
+#if RATE_SIMD /* Align for SIMD: */
+  #include "simd.h"
+#if 0 /* Not using this yet. */
+  #define RATE_SIMD_POLY 1
+  #define num_coefs4 ((num_coefs + 3) & ~3)
+  #define coefs4_check(i) ((i) < num_coefs)
+#else
+  #define RATE_SIMD_POLY 0
+  #define num_coefs4 num_coefs
+  #define coefs4_check(i) 1
+#endif
+
+  #define aligned_free    _soxr_simd_aligned_free
+  #define aligned_malloc  _soxr_simd_aligned_malloc
+  #define aligned_calloc  _soxr_simd_aligned_calloc
+#if 0
+  #define FIFO_REALLOC    aligned_realloc
+  #define FIFO_MALLOC     aligned_malloc
+  #define FIFO_FREE       aligned_free
+
+  static void * aligned_realloc(void * q, size_t nb_bytes, size_t copy_bytes) {
+    void * p = aligned_malloc(nb_bytes);
+    if (p) memcpy(p, q, copy_bytes);
+    aligned_free(q);
+    return p;
+  }
+#endif
+#else
+  #define RATE_SIMD_POLY 0
+  #define num_coefs4 num_coefs
+  #define coefs4_check(i) 1
+
+  #define aligned_free    free
+  #define aligned_malloc  malloc
+  #define aligned_calloc  calloc
+#endif
+
+#define  FIFO_SIZE_T int
+#include "fifo.h"
+
+typedef union { /* Int64 in parts */
+  #if WORDS_BIGENDIAN
+  struct {int32_t ms; uint32_t ls;} parts;
+  #else
+  struct {uint32_t ls; int32_t ms;} parts;
+  #endif
+  int64_t all;
+} int64p_t;
+
+typedef union { /* Uint64 in parts */
+  #if WORDS_BIGENDIAN
+  struct {uint32_t ms, ls;} parts;
+  #else
+  struct {uint32_t ls, ms;} parts;
+  #endif
+  uint64_t all;
+} uint64p_t;
+
+#define FLOAT_HI_PREC_CLOCK 0    /* Non-float hi-prec has ~96 bits. */
+#define float_step_t long double /* __float128 is also a (slow) option */
+
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)]
+
+#define raw_coef_t double
+
+static sample_t * prepare_coefs(raw_coef_t const * coefs, int num_coefs,
+    int num_phases, int interp_order, double multiplier)
+{
+  int i, j, length = num_coefs4 * num_phases;
+  sample_t * result = malloc((size_t)(length * (interp_order + 1)) * sizeof(*result));
+  double fm1 = coefs[0], f1 = 0, f2 = 0;
+  /* Taps and phases are walked from last to first; f2, f1, f0, fm1 are four successive values of that walk. */
+  for (i = num_coefs4 - 1; i >= 0; --i)
+    for (j = num_phases - 1; j >= 0; --j) {
+      double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
+      int pos = i * num_phases + j - 1;
+      fm1 = coefs4_check(i) && pos > 0 ? coefs[pos - 1] * multiplier : 0; /* next value of the walk, or 0 once pos <= 0 */
+      switch (interp_order) { /* b, c, d: interpolation terms stored next to f0 */
+        case 1: b = f1 - f0; break;
+        case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
+        case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
+        default: if (interp_order) assert(0); /* only orders 0..3 are supported */
+      }
+      #define coef_coef(x) \
+        coef(result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
+      coef_coef(0) = (sample_t)f0;
+      if (interp_order > 0) coef_coef(1) = (sample_t)b;
+      if (interp_order > 1) coef_coef(2) = (sample_t)c;
+      if (interp_order > 2) coef_coef(3) = (sample_t)d;
+      #undef coef_coef
+      f2 = f1, f1 = f0; /* slide the window for the next iteration */
+    }
+  return result; /* malloc'd here and handed to the caller */
+}
+
+typedef struct {
+  int        dft_length, num_taps, post_peak;
+  void       * dft_forward_setup, * dft_backward_setup;
+  sample_t   * coefs;
+} dft_filter_t;
+
+typedef struct { /* So generated filter coefs may be shared between channels */
+  sample_t   * poly_fir_coefs;
+  dft_filter_t dft_filter[2];
+} rate_shared_t;
+
+typedef enum {
+  irrational_stage = 1,
+  cubic_stage,
+  dft_stage,
+  half_stage,
+  rational_stage
+} stage_type_t;
+
+struct stage;
+typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
+#define MULT32 (65536. * 65536.)
+
+typedef union { /* Fixed point arithmetic */
+  struct {uint64p_t ls; int64p_t ms;} fix;
+  float_step_t flt;
+} step_t;
+
+typedef struct stage {
+  /* Common to all stage types: */
+  stage_type_t type;
+  stage_fn_t fn;
+  fifo_t     fifo;
+  int        pre;       /* Number of past samples to store */
+  int        pre_post;  /* pre + number of future samples to store */
+  int        preload;   /* Number of zero samples to pre-load the fifo */
+  double     out_in_ratio; /* For buffer management. */
+
+  /* For a stage with variable (run-time generated) filter coefs: */
+  rate_shared_t * shared;
+  unsigned   dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
+  sample_t   * dft_scratch, * dft_out;
+
+  /* For a stage with variable L/M: */
+  step_t     at, step;
+  bool       use_hi_prec_clock;
+  int        L, remM;
+  int        n, phase_bits, block_len;
+  double     mult, phase0;
+} stage_t;
+
+#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
+#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
+
+static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+  int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
+  sample_t const * input = stage_read_p(p);
+  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+  /* These three macros are not #undef'd in this function; code further down (e.g. dft_stage_fn) uses them. */
+#define integer  fix.ms.parts.ms
+#define fraction fix.ms.parts.ls
+#define whole    fix.ms.all
+  for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
+    sample_t const * s = input + p->at.integer;
+    double x = p->at.fraction * (1 / MULT32); /* 32-bit fraction scaled to [0,1) */
+    double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
+    double c = s[1]-*s-a-b;
+    output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s)); /* cubic in x through s[-1..2], times p->mult */
+  }
+  assert(max_num_out - i >= 0);
+  fifo_trim_by(output_fifo, max_num_out - i); /* i outputs written; presumably returns the unused part of the reservation */
+  fifo_read(&p->fifo, p->at.integer, NULL);
+  p->at.integer = 0; /* fraction is kept for the next call */
+}
+
+#if RATE_SIMD
+  #define dft_out p->dft_out
+#else
+  #define dft_out output
+#endif
+
+static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+  sample_t * output;
+  int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
+  rate_shared_t const * s = p->shared;
+  dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
+  int const overlap = f->num_taps - 1;
+  /* One DFT block per pass, for as long as the queued input (times L) reaches dft_length. */
+  while (p->at.integer + p->L * num_in >= f->dft_length) {
+    div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
+    sample_t const * input = fifo_read_ptr(&p->fifo);
+    fifo_read(&p->fifo, divd.quot, NULL); /* 'input' is still read below, so this presumably leaves the data in place */
+    num_in -= divd.quot;
+
+    output = fifo_reserve(output_fifo, f->dft_length);
+
+    if (lsx_is_power_of_2(p->L)) { /* F-domain */
+      int portion = f->dft_length / p->L;
+      memcpy(dft_out, input, (unsigned)portion * sizeof(*dft_out));
+      rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
+      for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+        dft_out[i] = dft_out[(portion << 1) - i],
+        dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+      dft_out[portion] = dft_out[1];
+      dft_out[portion + 1] = 0;
+      dft_out[1] = dft_out[0];
+
+      for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) { /* keep doubling the filled part */
+        memcpy(dft_out + i, dft_out, (size_t)portion * sizeof(*dft_out));
+        dft_out[i + 1] = 0;
+      }
+      if (p->step.integer > 0)
+        rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+    } else {
+      if (p->L == 1)
+        memcpy(dft_out, input, (size_t)f->dft_length * sizeof(*dft_out));
+      else {
+        memset(dft_out, 0, (size_t)f->dft_length * sizeof(*dft_out));
+        for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+          dft_out[i] = input[j]; /* one input sample every L slots, zeros in between */
+        p->at.integer = p->L - 1 - divd.rem;
+      }
+      if (p->step.integer > 0)
+        rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+      else
+        rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+    }
+
+    if (p->step.integer > 0) {
+      rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
+      rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+#if RATE_SIMD
+      if (p->step.integer == 1)
+        memcpy(output, dft_out, (size_t)f->dft_length * sizeof(sample_t));
+#endif
+      if (p->step.integer != 1) {
+        for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+            i += p->step.integer)
+          output[j] = dft_out[i]; /* keep every step.integer-th sample */
+        p->remM = i - (f->dft_length - overlap); /* starting offset for the next block */
+        fifo_trim_by(output_fifo, f->dft_length - j);
+      }
+      else fifo_trim_by(output_fifo, overlap);
+    }
+    else { /* F-domain */
+      int m = -p->step.integer; /* step.integer <= 0 here; m is used as a right-shift count */
+      rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
+      rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
+#if RATE_SIMD
+      memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof(sample_t));
+#endif
+      fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
+    }
+  }
+}
+
+#undef dft_out
+
+/* Set to 4 x nearest power of 2, or half of that if danger of causing too */
+/* many cache misses: 2^range_limit(log2+2.77, min, max(log2+1.77, large)). */
+static int set_dft_length(int num_taps, int min, int large)
+{
+  double const log2_taps = log((double)num_taps) / log(2.);
+  return 1 << range_limit((int)(2.77 + log2_taps), min, max((int)(1.77 + log2_taps), large));
+}
+
+static void dft_stage_init(
+    unsigned instance, double Fp, double Fs, double Fn, double att,
+    double phase, stage_t * p, int L, int M, double * multiplier,
+    int min_dft_size, int large_dft_size)
+{ /* Configures stage `p' as a DFT stage using p->shared->dft_filter[instance]. */
+  dft_filter_t * f = &p->shared->dft_filter[instance];
+  int num_taps = 0, dft_length = f->dft_length, i;
+  bool f_domain_m = abs(3-M) == 1 && Fs <= 1;    /* Needs M == 2 or M == 4. */
+
+  if (!dft_length) {      /* Shared filter not built yet: design its coefs. */
+    int k = phase == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
+    double * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
+
+    if (phase != 50)
+      lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase);
+    else f->post_peak = num_taps / 2;   /* phase == 50: peak is the middle tap. */
+
+    dft_length = set_dft_length(num_taps, min_dft_size, large_dft_size);
+    f->coefs = aligned_calloc((size_t)dft_length, sizeof(*f->coefs));
+    for (i = 0; i < num_taps; ++i)  /* Scaled taps; index wraps mod dft_length: */
+      f->coefs[(i + dft_length - num_taps + 1) & (dft_length - 1)]
+        = (sample_t)(h[i] * ((1. / dft_length) * rdft_multiplier() * L * *multiplier));
+    free(h);
+  }
+
+#if RATE_SIMD
+  p->dft_out = aligned_malloc(sizeof(sample_t) * (size_t)dft_length);
+#endif
+#if 1 /* In fact, currently, only pffft needs this. */
+  p->dft_scratch = aligned_malloc(2 * sizeof(sample_t) * (size_t)dft_length);
+#endif
+
+  if (!f->dft_length) {    /* Still unset: this call completes the shared setup. */
+    void * coef_setup = rdft_forward_setup(dft_length);
+    int Lp = lsx_is_power_of_2(L)? L : 1;
+    int Mp = f_domain_m? M : 1;
+    f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
+    f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
+    if (Mp == 1)              /* The coefs are transformed once, here. */
+      rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+    else
+      rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+    rdft_delete_setup(coef_setup);
+    f->num_taps = num_taps;
+    f->dft_length = dft_length;
+    lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
+        num_taps, dft_length, Fp, Fs, Fn, att, L, M);
+  }
+  *multiplier = 1;      /* Gain is applied via the coefs, so mark it consumed. */
+  p->out_in_ratio = (double)L / M;
+  p->type = dft_stage;
+  p->fn = dft_stage_fn;
+  p->preload = f->post_peak / L;
+  p->at.integer = f->post_peak % L;
+  p->L = L;
+  p->step.integer = f_domain_m? -M/2 : M;  /* < 0 selects dft_stage_fn's F-domain path. */
+  p->dft_filter_num = instance;
+  p->block_len = f->dft_length - (f->num_taps - 1);
+  p->phase0 = p->at.integer / p->L;
+}
+
+#include "filters.h"
+
+typedef struct {                 /* Resampler state for one audio channel. */
+  double     factor;             /* Input rate / output rate, from rate_init. */
+  uint64_t   samples_in, samples_out;  /* Totals kept by rate_input/rate_output. */
+  int        num_stages;         /* stages[] has num_stages + 1 entries. */
+  stage_t    * stages;           /* calloc'd in rate_init; freed in rate_close. */
+} rate_t;
+
+#define pre_stage       p->stages[shift]
+#define arb_stage       p->stages[shift + have_pre_stage]
+#define post_stage      p->stages[shift + have_pre_stage + have_arb_stage]
+#define have_pre_stage  (preM  * preL  != 1)
+#define have_arb_stage  (arbM  * arbL  != 1)
+#define have_post_stage (postM * postL != 1)
+/* The above use rate_init's locals; a stage is present iff its L * M != 1. */
+#define TO_3dB(a)       ((1.6e-6*(a)-7.5e-4)*(a)+.646)
+#define LOW_Q_BW0       (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
+
+typedef enum {   /* Pass-band roll-off choice: the `rolloff' argument of rate_init. */
+  rolloff_none, rolloff_small /* <= 0.01 dB */, rolloff_medium /* <= 0.35 dB */
+} rolloff_t;
+
+
+static char const * rate_init(
+  /* Private work areas (to be supplied by the client):                       */
+  rate_t * p,                /* Per audio channel.                            */
+  rate_shared_t * shared,    /* Between channels (undergoing same rate change)*/
+
+  /* Public parameters:                                             Typically */
+  double factor,             /* Input rate divided by output rate.            */
+  double bits,               /* Required bit-accuracy (pass + stop)  16|20|28 */
+  double phase,              /* Linear/minimum etc. filter phase.       50    */
+  double passband_end,       /* 0dB pt. bandwidth to preserve; nyquist=1 0.913*/
+  double stopband_begin,     /* Aliasing/imaging control; > passband_end  1   */
+  rolloff_t rolloff,         /* Pass-band roll-off                    small   */
+  bool maintain_3dB_pt,      /*                                        true   */
+  double multiplier,         /* Linear gain to apply during conversion.   1   */
+
+  /* Primarily for test/development purposes:                                 */
+  bool use_hi_prec_clock,    /* Increase irrational ratio accuracy.   false   */
+  int interpolator,          /* Force a particular coef interpolator.   -1    */
+  size_t max_coefs_size,     /* k bytes of coefs to try to keep below.  400   */
+  bool noSmallIntOpt,        /* Disable small integer optimisations.  false   */
+  int log2_min_dft_size,
+  int log2_large_dft_size)
+{ /* Plans and builds this channel's stage chain; returns 0, or an error text. */
+  double att = (bits + 1) * linear_to_dB(2.), attArb = att;    /* pass + stop */
+  double tbw0 = 1 - passband_end, Fs_a = stopband_begin;
+  double arbM = factor, tbw_tighten = 1;
+  int n = 0, i, preL = 1, preM = 1, shift = 0, arbL = 1, postL = 1, postM = 1;
+  bool upsample = false, rational = false, iOpt = !noSmallIntOpt;
+  int mode = rolloff > rolloff_small? factor > 1 || passband_end > LOW_Q_BW0:
+    (int)ceil(2 + (bits - 17) / 4);
+  stage_t * s;
+
+  assert(factor > 0);
+  assert(!bits || (15 <= bits && bits <= 33));
+  assert(0 <= phase && phase <= 100);
+  assert(.53 <= passband_end);
+  assert(stopband_begin <= 1.2);
+  assert(passband_end + .005 < stopband_begin);
+
+  p->factor = factor;
+  if (bits) while (!n++) {                               /* Determine stages: */
+    int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
+      (int)ceil((double)max_coefs_size * 1000. / (U100_l * sizeof(sample_t)));
+    double d, epsilon = 0, frac;
+    upsample = arbM < 1;
+    for (i = (int)(arbM * .5), shift = 0; i >>= 1; arbM *= .5, ++shift);
+    preM = upsample || (arbM > 1.5 && arbM < 2);
+    postM = 1 + (arbM > 1 && preM), arbM /= postM;
+    preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
+    if ((frac = arbM - (int)arbM))
+      epsilon = fabs((uint32_t)(frac * MULT32 + .5) / (frac * MULT32) - 1);
+    for (i = 1, rational = !frac; i <= maxL && !rational; ++i) {
+      d = frac * i, try = (int)(d + .5);
+      if ((rational = fabs(try / d - 1) <= epsilon)) {    /* No long doubles! */
+        if (try == i)
+          arbM = ceil(arbM), shift += arbM > 2, arbM /= 1 + (arbM > 2);
+        else arbM = i * (int)arbM + try, arbL = i;
+      }
+    }
+    L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
+    if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
+      for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
+      arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
+    } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
+      preL = L, preM = M, arbM = arbL = postM = 1;
+    if (!mode && (!rational || !n))
+      ++mode, n = 0;
+  }
+
+  p->num_stages = shift + have_pre_stage + have_arb_stage + have_post_stage;
+  if (!p->num_stages && multiplier != 1) {
+    bits = arbL = 0;                         /* Use cubic_stage in this case. */
+    ++p->num_stages;
+  }
+  if (!(p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages))))
+    return "malloc failed";       /* Nothing else has been allocated yet. */
+  for (i = 0; i < p->num_stages; ++i) p->stages[i].shared = shared;
+
+  if ((n = p->num_stages) > 1) {                              /* Att. budget: */
+    if (have_arb_stage)
+      att += linear_to_dB(2.), attArb = att, --n;
+    att += linear_to_dB((double)n);
+  }
+
+  for (n = 0; (size_t)n + 1 < array_length(half_firs) && att > half_firs[n].att; ++n);
+  for (i = 0, s = p->stages; i < shift; ++i, ++s) {
+    s->type = half_stage;
+    s->fn = half_firs[n].fn;
+    s->pre_post = 4 * half_firs[n].num_coefs;
+    s->preload = s->pre = s->pre_post >> 1;
+  }
+
+  if (have_pre_stage) {
+    if (maintain_3dB_pt && have_post_stage) {    /* Trans. bands overlapping. */
+      double tbw3 = tbw0 * TO_3dB(att);                /* FFS: consider Fs_a. */
+      double x = ((2.1429e-4 - 5.2083e-7 * att) * att - .015863) * att + 3.95;
+      x = att * pow((tbw0 - tbw3) / (postM / (factor * postL) - 1 + tbw0), x);
+      if (x > .035) {
+        tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
+        lsx_debug("x=%g tbw_tighten=%g", x, tbw_tighten);
+      }
+    }
+    dft_stage_init(0, 1 - tbw0 * tbw_tighten, Fs_a, preM? max(preL, preM) :
+        arbM / arbL, att, phase, &pre_stage, preL, max(preM, 1), &multiplier,
+        log2_min_dft_size, log2_large_dft_size);
+  }
+
+  if (!bits && have_arb_stage) {                  /* `Quick' cubic arb stage: */
+    arb_stage.type = cubic_stage;
+    arb_stage.fn = cubic_stage_fn;
+    arb_stage.mult = multiplier, multiplier = 1;
+    arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
+    arb_stage.pre_post = max(3, arb_stage.step.integer);
+    arb_stage.preload = arb_stage.pre = 1;
+    arb_stage.out_in_ratio = MULT32 / (double)arb_stage.step.whole;
+  }
+  else if (have_arb_stage) {                     /* Higher quality arb stage: */
+    poly_fir_t const * f = &poly_firs[6*(upsample + !!preM) + mode - !upsample];
+    int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
+    size_t coefs_size;
+    double x = .5, at, Fp, Fs, Fn, mult = upsample? 1 : arbL / arbM;
+    poly_fir1_t const * f1;
+
+    Fn = !upsample && preM? x = arbM / arbL : 1;
+    Fp = !preM? mult : mode? .5 : 1;
+    Fs = 2 - Fp;           /* Ignore Fs_a; it would have little benefit here. */
+    Fp *= 1 - tbw0;
+    if (rolloff > rolloff_small && mode)
+      Fp = !preM? mult * .5 - .125 : mult * .05 + .1;
+    else if (rolloff == rolloff_small)
+      Fp = Fs - (Fs - .148 * x - Fp * .852) * (.00813 * bits + .973);
+
+    i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
+    do {
+      f1 = &f->interp[++i];
+      assert(f1->fn);
+      if (i)
+        arbM /= arbL, arbL = 1, rational = false;
+      phase_bits = (int)ceil(f1->scalar + log(mult)/log(2.));
+      phases = !rational? (1 << phase_bits) : arbL;
+      if (!f->interp[0].scalar) {
+        int phases0 = max(phases, 19), n0 = 0;
+        lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
+        num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
+      }
+      if ((num_coefs & 1) && rational && (arbL & 1))
+        phases <<= 1, arbL <<= 1, arbM *= 2;
+      at = arbL * (arb_stage.phase0 = .5 * (num_coefs & 1));
+      order = i + (i && mode > 4);
+      coefs_size = (size_t)(num_coefs4 * phases * (order + 1)) * sizeof(sample_t);
+    } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
+        coefs_size / 1000 > max_coefs_size);
+
+    if (!arb_stage.shared->poly_fir_coefs) {
+      int num_taps = num_coefs * phases - 1;
+      raw_coef_t * coefs = lsx_design_lpf(
+          Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
+      arb_stage.shared->poly_fir_coefs = prepare_coefs(
+          coefs, num_coefs, phases, order, multiplier);
+      lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
+          num_coefs, phases, order, (double)coefs_size / 1000.);
+      free(coefs);
+    }
+    multiplier = 1;
+    arb_stage.type = rational? rational_stage : irrational_stage;
+    arb_stage.fn = f1->fn;
+    arb_stage.pre_post = num_coefs4 - 1;
+    arb_stage.preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
+    arb_stage.n = num_coefs4;
+    arb_stage.phase_bits = phase_bits;
+    arb_stage.L = arbL;
+    arb_stage.use_hi_prec_clock = mode > 1 && use_hi_prec_clock && !rational;
+#if FLOAT_HI_PREC_CLOCK
+    if (arb_stage.use_hi_prec_clock) {
+      arb_stage.at.flt = at;
+      arb_stage.step.flt = arbM;
+      arb_stage.out_in_ratio = (double)(arbL / arb_stage.step.flt);
+    } else
+#endif
+    {
+      arb_stage.at.whole = (int64_t)(at * MULT32 + .5);
+#if !FLOAT_HI_PREC_CLOCK
+      if (arb_stage.use_hi_prec_clock) {
+        arb_stage.at.fix.ls.parts.ms = 0x80000000ul;
+        arbM *= MULT32;
+        arb_stage.step.whole = (int64_t)arbM;
+        arbM -= (double)arb_stage.step.whole;
+        arbM *= MULT32 * MULT32;
+        arb_stage.step.fix.ls.all = (uint64_t)arbM;
+      } else
+#endif
+        arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
+      arb_stage.out_in_ratio = MULT32 * arbL / (double)arb_stage.step.whole;
+    }
+  }
+
+  if (have_post_stage)
+    dft_stage_init(1, 1 - (1 - (1 - tbw0) *
+        (upsample? factor * postL / postM : 1)) * tbw_tighten, Fs_a,
+        (double)max(postL, postM), att, phase, &post_stage, postL, postM,
+        &multiplier, log2_min_dft_size, log2_large_dft_size);
+
+
+  lsx_debug("%g: Â»%iâ%i/%iâ%i/%gâ%i/%i",
+      1/factor, shift, preL, preM, arbL, arbM, postL, postM);
+  for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
+    fifo_create(&s->fifo, (int)sizeof(sample_t));
+    memset(fifo_reserve(&s->fifo, s->preload), 0, sizeof(sample_t) * (size_t)s->preload);
+    lsx_debug("%5i|%-5i preload=%i remL=%i o/i=%g",
+        s->pre, s->pre_post - s->pre, s->preload, s->at.integer, s->out_in_ratio);
+  }
+  fifo_create(&s->fifo, (int)sizeof(sample_t)); /* Extra entry: the output fifo. */
+  return 0;
+}
+
+static void rate_process(rate_t * p)
+{
+  /* Run each stage once, in order: stage k's output goes to stage k+1's fifo. */
+  int k;
+  for (k = 0; k < p->num_stages; ++k)
+    p->stages[k].fn(&p->stages[k], &p->stages[k + 1].fifo);
+}
+
+static sample_t * rate_input(rate_t * p, sample_t const * samples, size_t n)
+{ /* Appends n samples to the first stage's fifo; tallies them in samples_in. */
+  fifo_t * const first = &p->stages->fifo;
+  return p->samples_in += n, fifo_write(first, (int)n, samples);
+}
+
+static sample_t const * rate_output(rate_t * p, sample_t * samples, size_t * n)
+{ /* Reads up to *n samples from the final fifo; *n becomes the actual count. */
+  fifo_t * const out = &(p->stages + p->num_stages)->fifo;
+  size_t const avail = (size_t)fifo_occupancy(out), take = min(*n, avail);
+  return p->samples_out += (*n = take), fifo_read(out, (int)take, samples);
+}
+
+static void rate_flush(rate_t * p)
+{ /* End of input: feeds zeros until the output owed for samples_in is ready. */
+  fifo_t * fifo = &p->stages[p->num_stages].fifo;
+#if defined _MSC_VER && _MSC_VER == 1200
+  uint64_t samples_out = (uint64_t)(int64_t)((double)(int64_t)p->samples_in / p->factor + .5);
+#else
+  uint64_t samples_out = (uint64_t)((double)p->samples_in / p->factor + .5);
+#endif
+  size_t remaining =         /* Guard the unsigned subtraction against wrap. */
+    samples_out > p->samples_out? (size_t)(samples_out - p->samples_out) : 0;
+  if ((size_t)fifo_occupancy(fifo) < remaining) {
+    uint64_t samples_in = p->samples_in;
+    sample_t * buff = calloc(1024, sizeof(*buff));
+    if (!buff) return;          /* Out of memory: no zeros to flush with. */
+    while ((size_t)fifo_occupancy(fifo) < remaining) {
+      rate_input(p, buff, 1024);
+      rate_process(p);
+    }
+    fifo_trim_to(fifo, (int)remaining);
+    p->samples_in = samples_in;        /* The padding must not count as input. */
+    free(buff);
+  }
+}
+
+static void rate_close(rate_t * p)
+{ /* Frees the channel's stages and, via stage 0, the shared data; NULL-safe. */
+  rate_shared_t * shared = p->stages? p->stages[0].shared : 0;
+  int i;
+
+  for (i = 0; p->stages && i <= p->num_stages; ++i) {  /* Incl. the extra entry. */
+    stage_t * s = &p->stages[i];
+    aligned_free(s->dft_scratch);
+    aligned_free(s->dft_out);
+    fifo_delete(&s->fifo);
+  }
+  if (shared) {
+    for (i = 0; i < 2; ++i) {
+      dft_filter_t * f= &shared->dft_filter[i];
+      aligned_free(f->coefs);
+      rdft_delete_setup(f->dft_forward_setup);
+      rdft_delete_setup(f->dft_backward_setup);
+    }
+    free(shared->poly_fir_coefs);
+    memset(shared, 0, sizeof(*shared));   /* Leave no dangling pointers behind. */
+  }
+  free(p->stages);
+}
+
+#if defined SOXR_LIB
+static double rate_delay(rate_t * p)
+{ /* Output samples owed for the input so far but not yet read out; never < 0. */
+#if defined _MSC_VER && _MSC_VER == 1200
+  double const n_in = (double)(int64_t)p->samples_in, n_out = (double)(int64_t)p->samples_out;
+#else
+  double const n_in = (double)p->samples_in, n_out = (double)p->samples_out;
+#endif
+  double const expected = n_in / p->factor;
+  return max(0, expected - n_out);
+}
+
+static void rate_sizes(size_t * shared, size_t * channel)
+{ /* Reports the sizes of the two work areas that rate_init expects. */
+  *shared = sizeof(rate_shared_t);
+  *channel = sizeof(rate_t);
+}
+
+#include "soxr.h"
+
+static char const * rate_create(
+    void * channel,
+    void * shared,
+    double io_ratio,
+    soxr_quality_spec_t * q_spec,
+    soxr_runtime_spec_t * r_spec,
+    double scale)
+{ /* Engine entry point: unpacks the soxr specs into rate_init's arguments. */
+  return rate_init(
+      channel, shared,
+      io_ratio,
+      q_spec->precision,
+      q_spec->phase_response,
+      q_spec->passband_end,
+      q_spec->stopband_begin,
+      "\1\2\0"[q_spec->flags & 3],  /* rolloff: 0 small, 1 medium, 2 or 3 none. */
+      !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT),
+      scale,
+      !!(q_spec->flags & SOXR_HI_PREC_CLOCK),
+      (int)(r_spec->flags & 3) - 1,  /* interpolator: -1 when both bits are 0. */
+      r_spec->coef_size_kbytes,
+      !!(r_spec->flags & SOXR_NOSMALLINTOPT),
+      (int)r_spec->log2_min_dft_size,
+      (int)r_spec->log2_large_dft_size);
+}
+
+static char const * id(void)
+{ /* Names this engine build; RATE_ID is defined by the including .c file. */
+  return RATE_ID;
+}
+
+fn_t RATE_CB[] = {     /* Slot order follows the resampler_* macros in soxr.c: */
+  (fn_t)rate_input,    /* [0] resampler_input */
+  (fn_t)rate_process,  /* [1] resampler_process */
+  (fn_t)rate_output,   /* [2] resampler_output */
+  (fn_t)rate_flush,    /* [3] resampler_flush */
+  (fn_t)rate_close,    /* [4] resampler_close */
+  (fn_t)rate_delay,    /* [5] resampler_delay */
+  (fn_t)rate_sizes,    /* [6] resampler_sizes */
+  (fn_t)rate_create,   /* [7] resampler_create */
+  (fn_t)0,             /* [8] resampler_set_io_ratio: not provided here. */
+  (fn_t)id,            /* [9] resampler_id */
+};
+#endif
diff --git a/src/rate32.c b/src/rate32.c
new file mode 100644
index 0000000..d6dd3b9
--- /dev/null
+++ b/src/rate32.c
@@ -0,0 +1,9 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define sample_t   float
+#define RATE_SIMD  0
+#define RDFT_CB    _soxr_rdft32_cb
+#define RATE_CB    _soxr_rate32_cb
+#define RATE_ID    "single-precision"
+#include "rate.h"
diff --git a/src/rate32s.c b/src/rate32s.c
new file mode 100644
index 0000000..26a371a
--- /dev/null
+++ b/src/rate32s.c
@@ -0,0 +1,9 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define sample_t   float
+#define RATE_SIMD  1
+#define RDFT_CB    _soxr_rdft32s_cb
+#define RATE_CB    _soxr_rate32s_cb
+#define RATE_ID    "single-precision-SIMD"
+#include "rate.h"
diff --git a/src/rate64.c b/src/rate64.c
new file mode 100644
index 0000000..6289911
--- /dev/null
+++ b/src/rate64.c
@@ -0,0 +1,9 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define sample_t   double
+#define RATE_SIMD  0
+#define RDFT_CB    _soxr_rdft64_cb
+#define RATE_CB    _soxr_rate64_cb
+#define RATE_ID    "double-precision"
+#include "rate.h"
diff --git a/src/rdft.h b/src/rdft.h
new file mode 100644
index 0000000..59ba174
--- /dev/null
+++ b/src/rdft.h
@@ -0,0 +1,31 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b)
+{ /* In place a *= b: a[0], a[1] as plain products; a[2..n) as (re, im) pairs. */
+  int k;
+  (void)not_used;
+  a[0] *= b[0];
+  a[1] *= b[1];
+  for (k = 2; k < n; k += 2) {
+    DFT_FLOAT const re = a[k], im = a[k+1];
+    a[k  ] = b[k  ] * re - b[k+1] * im;
+    a[k+1] = b[k+1] * re + b[k  ] * im;
+  }
+}
+
+void ORDERED_PARTIAL_CONVOLVE(int n, DFT_FLOAT * a, const DFT_FLOAT * b)
+{ /* As ORDERED_CONVOLVE for a[2..n), but a[1] is derived from the pair at n. */
+  int i;
+  a[0] *= b[0];
+  for (i = 2; i < n; i += 2) {    /* Complex multiply of each (re, im) pair. */
+    DFT_FLOAT tmp = a[i];
+    a[i  ] = b[i  ] * tmp - b[i+1] * a[i+1];
+    a[i+1] = b[i+1] * tmp + b[i  ] * a[i+1];
+  }
+  a[1] = b[i] * a[i] - b[i+1] * a[i+1];  /* Uses i as left by the loop (>= n). */
+}
+
+#undef ORDERED_CONVOLVE
+#undef ORDERED_PARTIAL_CONVOLVE
+#undef DFT_FLOAT
diff --git a/src/rint-clip.h b/src/rint-clip.h
new file mode 100644
index 0000000..06764a8
--- /dev/null
+++ b/src/rint-clip.h
@@ -0,0 +1,153 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined DITHER
+/* Dither = (a - b) / 32; a, b are 5-bit values taken from ran1, ran2 (>>= 3). */
+#define DITHERING (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31))
+#define DITHER_RAND (seed = 1664525UL * seed + 1013904223UL) >> 3
+#define DITHER_VARS unsigned long ran1 = DITHER_RAND, ran2 = DITHER_RAND
+#define SEED_ARG , unsigned long * seed0
+#define SAVE_SEED *seed0 = seed
+#define COPY_SEED unsigned long seed = *seed0;
+#define COPY_SEED1 unsigned long seed1 = seed
+#define PASS_SEED1 , &seed1
+#define PASS_SEED  , &seed
+
+#else
+/* No dither: the seed parameter and all seed handling expand to nothing. */
+#define DITHERING 0
+#define DITHER_VARS
+#define SEED_ARG
+#define SAVE_SEED
+#define COPY_SEED
+#define COPY_SEED1
+#define PASS_SEED1
+#define PASS_SEED
+
+#endif
+
+
+
+#if defined FE_INVALID && defined FPU_RINT
+static void RINT_CLIP(RINT_T * const dest, FLOATX const * const src,
+    unsigned stride, size_t i, size_t const n, size_t * const clips SEED_ARG)
+{ /* Converts src[i..n) singly into dest[stride * i], saturating on failure. */
+  COPY_SEED
+  DITHER_VARS;
+  for (; i < n; ++i) {
+    double d = src[i] + DITHERING;
+    dest[stride * i] = RINT(d);
+    if (fe_test_invalid()) {      /* This conversion was flagged as invalid: */
+      fe_clear_invalid();
+      dest[stride * i] = d > 0? RINT_MAX : -RINT_MAX - 1;  /* use the limit, */
+      ++*clips;                                            /* and count it. */
+    }
+  }
+  SAVE_SEED;
+}
+#endif
+
+
+
+static size_t LSX_RINT_CLIP(void * * const dest0, FLOATX const * const src,
+    size_t const n SEED_ARG)
+{ /* Converts src[0..n) to RINT_T at *dest0; advances *dest0; returns # clips. */
+  size_t i, clips = 0;
+  RINT_T * dest = *dest0;
+  COPY_SEED
+#if defined FE_INVALID && defined FPU_RINT
+#define _ dest[i] = RINT(src[i] + DITHERING), ++i,
+  fe_clear_invalid();
+  for (i = 0; i < (n & ~7u);) {       /* Whole groups of 8 samples. */
+    COPY_SEED1;                       /* Seed as it was before this group. */
+    DITHER_VARS;
+    _ _ _ _ _ _ _ _ (void)0;
+    if (fe_test_invalid()) {          /* Redo the group one sample at a time. */
+      fe_clear_invalid();
+      RINT_CLIP(dest, src, 1, i - 8, i, &clips PASS_SEED1);
+    }
+  }
+  RINT_CLIP(dest, src, 1, i, n, &clips PASS_SEED);      /* Remaining samples. */
+#else
+#define _ d = src[i] + DITHERING, dest[i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5),
+  const double N = 1. + RINT_MAX;     /* Values are saturated to -N .. N-1. */
+  double d;
+  for (i = 0; i < (n & ~7u);) {       /* Whole groups of 8 samples. */
+    DITHER_VARS;
+    _ _ _ _ _ _ _ _ (void)0;
+  }
+  {
+    DITHER_VARS;
+    for (; i < n; _ (void)0);         /* Remaining samples, one per pass. */
+  }
+#endif
+  SAVE_SEED;
+  *dest0 = dest + n;
+  return clips;
+}
+#undef _
+
+
+
+static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs,
+    unsigned const stride, size_t const n SEED_ARG)
+{ /* As LSX_RINT_CLIP, but interleaves `stride' sources: srcs[j][i] -> [stride*i+j]. */
+  unsigned j;
+  size_t i, clips = 0;
+  RINT_T * dest = *dest0;
+  COPY_SEED
+#if defined FE_INVALID && defined FPU_RINT
+#define _ dest[stride * i] = RINT(src[i] + DITHERING), ++i,
+  fe_clear_invalid();
+  for (j = 0; j < stride; ++j, ++dest) {   /* One source per pass; dest += 1. */
+    FLOATX const * const src = srcs[j];
+    for (i = 0; i < (n & ~7u);) {          /* Whole groups of 8 samples. */
+      COPY_SEED1;                          /* Seed as it was before this group. */
+      DITHER_VARS;
+      _ _ _ _ _ _ _ _ (void)0;
+      if (fe_test_invalid()) {             /* Redo the group one at a time. */
+        fe_clear_invalid();
+        RINT_CLIP(dest, src, stride, i - 8, i, &clips PASS_SEED1);
+      }
+    }
+    RINT_CLIP(dest, src, stride, i, n, &clips PASS_SEED);  /* Remaining samples. */
+  }
+#else
+#define _ d = src[i] + DITHERING, dest[stride * i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5),
+  const double N = 1. + RINT_MAX;          /* Values are saturated to -N .. N-1. */
+  double d;
+  for (j = 0; j < stride; ++j, ++dest) {   /* One source per pass; dest += 1. */
+    FLOATX const * const src = srcs[j];
+    for (i = 0; i < (n & ~7u);) {
+      DITHER_VARS;
+      _ _ _ _ _ _ _ _ (void)0;
+    }
+    {
+      DITHER_VARS;
+      for (; i < n; _ (void)0);
+    }
+  }
+#endif
+  SAVE_SEED;
+  *dest0 = dest + stride * (n - 1);  /* dest already moved on by `stride' above. */
+  return clips;
+}
+#undef _
+
+#undef PASS_SEED
+#undef PASS_SEED1
+#undef COPY_SEED1
+#undef COPY_SEED
+#undef SAVE_SEED
+#undef SEED_ARG
+#undef DITHER_VARS
+#undef DITHERING
+#undef DITHER
+
+#undef RINT_MAX
+#undef RINT_T
+#undef FPU_RINT
+#undef RINT
+#undef RINT_CLIP
+#undef LSX_RINT_CLIP
+#undef LSX_RINT_CLIP_2
diff --git a/src/rint.h b/src/rint.h
new file mode 100644
index 0000000..529e4bb
--- /dev/null
+++ b/src/rint.h
@@ -0,0 +1,68 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_rint_included
+#define soxr_rint_included
+
+#include "soxr-config.h"
+
+
+
+#if HAVE_LRINT && LONG_MAX == 2147483647L /* rint32: double -> int32_t rounding. */
+  #include <math.h>
+  #define FPU_RINT32
+  #define rint32 lrint
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+  #define FPU_RINT32
+  static __inline int32_t rint32(double input) {
+    int32_t result;
+    __asm__ __volatile__("fistpl %0": "=m"(result): "t"(input): "st");
+    return result;
+  }
+#elif defined __GNUC__ && defined __arm__
+  #define FPU_RINT32
+  static __inline int32_t rint32(double input) {
+    register int32_t result;
+    __asm__ __volatile__ ("ftosid %0, %P1": "=w"(result): "w"(input));
+    return result;
+  }
+#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */
+  #define FPU_RINT32
+  static __inline int32_t rint32(double input) {
+    int32_t result;
+    _asm {
+      fld input
+      fistp result
+    }
+    return result;
+  }
+#else /* Portable fallback (FPU_RINT32 stays undefined); x is evaluated twice. */
+  #define rint32(x) ((int32_t)((x) < 0? (x) - .5 : (x) + .5))
+#endif
+
+
+
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__) /* rint16: */
+  #define FPU_RINT16                       /* double -> int16_t rounding. */
+  static __inline int16_t rint16(double input) {
+    int16_t result;
+    __asm__ __volatile__("fistps %0": "=m"(result): "t"(input): "st");
+    return result;
+  }
+#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */
+  #define FPU_RINT16
+  static __inline int16_t rint16(double input) {
+    int16_t result;
+    _asm {
+      fld input
+      fistp result
+    }
+    return result;
+  }
+#else /* Portable fallback (FPU_RINT16 stays undefined); x is evaluated twice. */
+  #define rint16(x) ((int16_t)((x) < 0? (x) - .5 : (x) + .5))
+#endif
+
+
+
+#endif
diff --git a/src/samplerate.h b/src/samplerate.h
new file mode 100644
index 0000000..911cc5d
--- /dev/null
+++ b/src/samplerate.h
@@ -0,0 +1 @@
+#include "soxr-lsr.h"
diff --git a/src/simd-dev.h b/src/simd-dev.h
new file mode 100644
index 0000000..019325c
--- /dev/null
+++ b/src/simd-dev.h
@@ -0,0 +1,5 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFT_MACROS_ONLY
+#include "pffft.c"
diff --git a/src/simd.c b/src/simd.c
new file mode 100644
index 0000000..7659ab9
--- /dev/null
+++ b/src/simd.c
@@ -0,0 +1,84 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include "simd.h"
+#include "simd-dev.h"
+
+#define SIMD_ALIGNMENT (sizeof(float) * 4)
+
+/* malloc() variant returning a SIMD_ALIGNMENT-aligned pointer (0 on failure). */
+void * _soxr_simd_aligned_malloc(size_t size)
+{
+  char * p1 = 0, * p;
+  if (size > (size_t)-1 - SIMD_ALIGNMENT)      /* size + padding would wrap. */
+    return 0;
+  if ((p = malloc(size + SIMD_ALIGNMENT))) {
+    p1 = (char *)((size_t)(p + SIMD_ALIGNMENT) & ~(SIMD_ALIGNMENT - 1));
+    *((void * *)p1 - 1) = p;       /* Stash the raw pointer for ..._free(). */
+  }
+  return p1;
+}
+
+/* calloc() variant: zeroed, aligned; 0 on failure or if nmemb * size wraps. */
+void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
+{
+  size_t const total = nmemb * size;
+  void * p = size && total / size != nmemb? 0 : _soxr_simd_aligned_malloc(total);
+  if (p) memset(p, 0, total);
+  return p;
+}
+
+/* Releases a pointer obtained from either allocator above; 0 is ignored. */
+void _soxr_simd_aligned_free(void * p1)
+{
+  if (p1) free(*((void * *)p1 - 1));
+}
+
+
+
+void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b)
+{ /* Vector form of the ordered convolve: a *= b in place; needs aligned a, b. */
+  int i;
+  float ab0, ab1;
+  v4sf       * /*RESTRICT*/ va = (v4sf       *)a;
+  v4sf const *   RESTRICT   vb = (v4sf const *)b;
+  assert(VALIGNED(a) && VALIGNED(b));
+  ab0 = a[0] * b[0], ab1 = a[1] * b[1];  /* Plain products, saved before the loop. */
+  for (i = 0; i < n / 4; i += 2) {       /* Two v4sf vectors per pass. */
+    v4sf a1r = va[i+0], a1i = va[i+1];
+    v4sf b1r = vb[i+0], b1i = vb[i+1];
+    UNINTERLEAVE2(a1r, a1i, a1r, a1i);
+    UNINTERLEAVE2(b1r, b1i, b1r, b1i);
+    VCPLXMUL(a1r, a1i, b1r, b1i);
+    INTERLEAVE2(a1r, a1i, a1r, a1i);
+    va[i+0] = a1r, va[i+1] = a1i;
+  }
+  a[0] = ab0, a[1] = ab1;     /* Replace what the loop computed for a[0], a[1]. */
+  (void)not_used;
+}
+
+
+
+void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b)
+{ /* Vector form of the partial convolve; a[1] is derived from the pair at n. */
+  int i;
+  float ab0;
+  v4sf       * /*RESTRICT*/ va = (v4sf       *)a;
+  v4sf const *   RESTRICT   vb = (v4sf const *)b;
+  assert(VALIGNED(a) && VALIGNED(b));
+  ab0 = a[0] * b[0];                 /* Plain product, saved before the loop. */
+  for (i = 0; i < n / 4; i += 2) {   /* Two v4sf vectors per pass. */
+    v4sf a1r = va[i+0], a1i = va[i+1];
+    v4sf b1r = vb[i+0], b1i = vb[i+1];
+    UNINTERLEAVE2(a1r, a1i, a1r, a1i);
+    UNINTERLEAVE2(b1r, b1i, b1r, b1i);
+    VCPLXMUL(a1r, a1i, b1r, b1i);
+    INTERLEAVE2(a1r, a1i, a1r, a1i);
+    va[i+0] = a1r, va[i+1] = a1i;
+  }
+  a[0] = ab0;
+  a[1] = b[n] * a[n] - b[n+1] * a[n+1];  /* Reads a[n], a[n+1]: just past n. */
+}
diff --git a/src/simd.h b/src/simd.h
new file mode 100644
index 0000000..71eefc6
--- /dev/null
+++ b/src/simd.h
@@ -0,0 +1,16 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined simd_included
+#define simd_included
+
+#include <stddef.h>
+
+void * _soxr_simd_aligned_malloc(size_t);
+void * _soxr_simd_aligned_calloc(size_t, size_t);
+void _soxr_simd_aligned_free(void *);
+
+void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b);
+void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b);
+
+#endif
diff --git a/src/soxr-lsr.h b/src/soxr-lsr.h
new file mode 100644
index 0000000..c0923aa
--- /dev/null
+++ b/src/soxr-lsr.h
@@ -0,0 +1,80 @@
+/* SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Wrapper compatible with `libsamplerate' (constant-rate).
+ * (Libsoxr's native API can be found in soxr.h).  */
+
+#if !defined SAMPLERATE_H
+#define SAMPLERATE_H
+#if defined __cplusplus
+  extern "C" {
+#endif
+
+#if defined SOXR_DLL
+  #if defined soxr_lsr_EXPORTS
+    #define SOXR __declspec(dllexport)
+  #else
+    #define SOXR __declspec(dllimport)
+  #endif
+#elif defined SOXR_VISIBILITY && defined __GNUC__ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+  #define SOXR __attribute__ ((visibility("default")))
+#else
+  #define SOXR
+#endif
+
+typedef float   SRC_SAMPLE;
+#if !defined SOXR_LIB
+enum SRC_SRCTYPE_e {SRC_SINC_BEST_QUALITY, SRC_SINC_MEDIUM_QUALITY,
+                    SRC_SINC_FASTEST, SRC_ZERO_ORDER_HOLD, SRC_LINEAR};
+typedef int     SRC_SRCTYPE;
+typedef int     SRC_ERROR;
+typedef long    (* src_callback_t)(void *, SRC_SAMPLE * *);
+typedef struct  SRC_STATE SRC_STATE;
+typedef struct  SRC_DATA {
+  SRC_SAMPLE    * data_in, * data_out;
+  long          input_frames, output_frames;
+  long          input_frames_used, output_frames_gen;
+  int           end_of_input;
+  double        src_ratio;
+} SRC_DATA;
+#endif
+SOXR SRC_STATE *   src_new(SRC_SRCTYPE, int num_channels, SRC_ERROR *);
+SOXR SRC_ERROR     src_process  (SRC_STATE *, SRC_DATA *);
+SOXR SRC_ERROR     src_set_ratio(SRC_STATE *, double);
+SOXR SRC_ERROR     src_reset    (SRC_STATE *);
+SOXR SRC_ERROR     src_error    (SRC_STATE *);
+SOXR SRC_STATE *   src_delete   (SRC_STATE *);
+SOXR SRC_STATE *   src_callback_new(
+                    src_callback_t, SRC_SRCTYPE, int, SRC_ERROR *, void *);
+SOXR long          src_callback_read(
+                    SRC_STATE *, double src_ratio, long, SRC_SAMPLE *);
+SOXR SRC_ERROR     src_simple(SRC_DATA *, SRC_SRCTYPE, int);
+SOXR char const *  src_get_name(SRC_SRCTYPE);
+SOXR char const *  src_get_description(SRC_SRCTYPE);
+SOXR char const *  src_get_version(void);
+SOXR char const *  src_strerror(SRC_ERROR);
+SOXR int           src_is_valid_ratio(double);
+SOXR void          src_short_to_float_array(short const *, float *, int);
+SOXR void          src_float_to_short_array(float const *, short *, int);
+SOXR void          src_int_to_float_array(int const *, float *, int);
+SOXR void          src_float_to_int_array(float const *, int *, int);
+
+#undef SOXR
+#if defined __cplusplus
+  }
+#endif
+#endif
diff --git a/src/soxr-lsr.pc.in b/src/soxr-lsr.pc.in
new file mode 100644
index 0000000..7b75757
--- /dev/null
+++ b/src/soxr-lsr.pc.in
@@ -0,0 +1,5 @@
+Name: ${LSR}
+Description: ${DESCRIPTION_SUMMARY} (with libsamplerate-like bindings)
+Version: ${PROJECT_VERSION}
+Libs: -L${LIB_INSTALL_DIR} -l${LSR}
+Cflags: -I${INCLUDE_INSTALL_DIR}
diff --git a/src/soxr.c b/src/soxr.c
new file mode 100644
index 0000000..dc61ad9
--- /dev/null
+++ b/src/soxr.c
@@ -0,0 +1,638 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "soxr.h"
+#include "data-io.h"
+#include "internal.h"
+
+
+
+char const * soxr_version(void)
+{ /* "libsoxr-" followed by the version string macro SOXR_THIS_VERSION_STR. */
+  return "libsoxr-" SOXR_THIS_VERSION_STR;
+}
+
+
+
+typedef void sample_t; /* float or double */
+typedef void (* fn_t)(void);
+typedef fn_t control_block_t[10];        /* An engine's table of functions. */
+/* Typed views of control-block slots 0-9; all need a soxr_t `p' in scope: */
+#define resampler_input        (*(sample_t * (*)(void *, sample_t * samples, size_t   n))p->control_block[0])
+#define resampler_process      (*(void (*)(void *, size_t))p->control_block[1])
+#define resampler_output       (*(sample_t const * (*)(void *, sample_t * samples, size_t * n))p->control_block[2])
+#define resampler_flush        (*(void (*)(void *))p->control_block[3])
+#define resampler_close        (*(void (*)(void *))p->control_block[4])
+#define resampler_delay        (*(double (*)(void *))p->control_block[5])
+#define resampler_sizes        (*(void (*)(size_t * shared, size_t * channel))p->control_block[6])
+#define resampler_create       (*(char const * (*)(void * channel, void * shared, double io_ratio, soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale))p->control_block[7])
+#define resampler_set_io_ratio (*(void (*)(void *, double io_ratio, size_t len))p->control_block[8])
+#define resampler_id           (*(char const * (*)(void))p->control_block[9])
+
+typedef void * resampler_t; /* For one channel. */
+typedef void * resampler_shared_t; /* Between channels. */
+typedef void (* deinterleave_t)(sample_t * * dest,
+    soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch);
+typedef size_t (* interleave_t)(soxr_datatype_t data_type, void * * dest,
+    sample_t const * const * src, size_t, unsigned, unsigned long *);
+
+struct soxr {
+  unsigned num_channels;            /* Number of per-channel resamplers. */
+  double io_ratio;                  /* Input rate / output rate. */
+  soxr_error_t error;               /* Current error; 0 if none. */
+  soxr_quality_spec_t q_spec;
+  soxr_io_spec_t io_spec;
+  soxr_runtime_spec_t runtime_spec;
+
+  void * input_fn_state;            /* These 3: per soxr_set_input_fn. */
+  soxr_input_fn_t input_fn;
+  size_t max_ilen;
+
+  resampler_shared_t shared;        /* State shared between channels. */
+  resampler_t * resamplers;         /* One entry per channel. */
+  control_block_t control_block;    /* Copy of the chosen engine's table. */
+  deinterleave_t deinterleave;      /* Used by soxr_input[_1ch]. */
+  interleave_t interleave;          /* Its result is added to `clips'. */
+
+  void * * channel_ptrs;            /* One buffer pointer per channel. */
+  size_t clips;                     /* Exposed by soxr_num_clips. */
+  unsigned long seed;               /* Given to interleave unless NO_DITHER. */
+  int flushing;                     /* Set once end-of-input is signalled. */
+};
+
+
+
+#define RESET_ON_CLEAR   (1u<<31)
+
+/* TODO: these should not be here. */
+#define TO_3dB(a)       ((1.6e-6*a-7.5e-4)*a+.646)
+#define LOW_Q_BW0       (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
+
+soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags)
+{ /* Expand a recipe (quality in bits 0-3, options above) into a full spec. */
+  soxr_quality_spec_t spec, * p = &spec;
+  unsigned quality = recipe & 0xf;
+  double rej;
+  memset(p, 0, sizeof(*p));
+  if (quality > 13) {                   /* Reported via the spec's `e' field. */
+    p->e = "invalid quality type";
+    return spec;
+  }
+  flags |= quality < SOXR_LSR0Q? RESET_ON_CLEAR : 0; /* Read by soxr_clear. */
+  if (quality == 13)                    /* 13 is treated as quality 6. */
+    quality = 6;
+  else if (quality > 10)                /* 11 & 12 are treated as quality 0. */
+    quality = 0;
+  p->phase_response = "\62\31\144"[(recipe & 0x30) >> 4]; /* 50, 25, 100 or 0 */
+  p->stopband_begin = 1;
+  p->precision = !quality? 0: quality < 3? 16 : quality < 8? 4 + quality * 4 : 55 - quality * 4;
+  rej = p->precision * linear_to_dB(2.);
+  p->flags = flags;
+  if (quality < 8) {                    /* Qualities 0 to 7. */
+    p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / TO_3dB(rej);
+    if (quality <= 2)
+      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
+  }
+  else {                                /* Qualities 8 to 10 (SOXR_LSRnQ). */
+    static float const bw[] = {.931f, .832f, .663f};
+    p->passband_end = bw[quality - 8];
+    if (quality - 8 == 2)
+      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
+  }
+  if (recipe & SOXR_STEEP_FILTER)       /* Overrides the passband set above. */
+    p->passband_end = 1 - .01 / TO_3dB(rej);
+  return spec;
+}
+
+
+
+char const * soxr_engine(soxr_t p)
+{ /* Calls the id function held in slot 9 of p's control block. */
+  return resampler_id();
+}
+
+
+
+size_t * soxr_num_clips(soxr_t p)
+{ /* Address of p's clip counter (so the caller can both read & write it). */
+  return &p->clips;
+}
+
+
+
+soxr_error_t soxr_error(soxr_t p)
+{ /* p's current error, or 0 if none; N.B. p is dereferenced unchecked. */
+  return p->error;
+}
+
+
+
+soxr_runtime_spec_t soxr_runtime_spec(unsigned num_threads)
+{ /* Default run-time settings, with the given number of threads. */
+  soxr_runtime_spec_t defaults;
+  memset(&defaults, 0, sizeof defaults);  /* Zeroes `e' and `flags' too. */
+  defaults.num_threads = num_threads;
+  defaults.coef_size_kbytes = 400;
+  defaults.log2_large_dft_size = 17;
+  defaults.log2_min_dft_size = 10;
+  return defaults;
+}
+
+
+
+soxr_io_spec_t soxr_io_spec(
+  soxr_datatype_t itype,
+  soxr_datatype_t otype)
+{ /* An I/O spec. with a scale of 1 or, for bad types, with `e' set. */
+  soxr_io_spec_t result;
+  memset(&result, 0, sizeof result);
+  if ((itype | otype) < SOXR_SPLIT * 2) {   /* Both datatypes are in range. */
+    result.itype = itype;
+    result.otype = otype;
+    result.scale = 1;
+  }
+  else
+    result.e = "invalid io datatype(s)";
+  return result;
+}
+
+
+
+#if HAVE_SIMD
+static bool cpu_has_simd(void)
+{ /* Run-time SIMD test; unconditionally true when built for x86-64. */
+#if defined __x86_64__ || defined _M_X64
+  return true;
+#elif defined __GNUC__ && defined i386
+  uint32_t eax, ebx, ecx, edx;
+  __asm__ __volatile__ (            /* CPUID with eax = 1; ebx is preserved. */
+      "pushl %%ebx   \n\t"
+      "cpuid         \n\t"
+      "movl %%ebx, %1\n\t"
+      "popl %%ebx    \n\t"
+      : "=a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx)
+      : "a"(1)
+      : "cc" );
+  return !!(edx & 0x06000000); /* EDX bits 25|26: SSE/SSE2 per Intel's docs. */
+#elif defined _MSC_VER && defined _M_IX86
+  uint32_t d;
+  __asm {                           /* Same query, in MSVC inline-asm syntax. */
+    xor     eax, eax
+    inc     eax
+    push    ebx
+    cpuid
+    pop     ebx
+    mov     d, edx
+  }
+  return !!(d & 0x06000000);
+#endif
+  return false;                     /* No test available for this target. */
+}
+#endif
+
+extern control_block_t _soxr_rate32s_cb, _soxr_rate32_cb, _soxr_rate64_cb, _soxr_vr32_cb;
+
+
+
+soxr_t soxr_create(
+  double input_rate, double output_rate,
+  unsigned num_channels,
+  soxr_error_t * error0,
+  soxr_io_spec_t const * io_spec,
+  soxr_quality_spec_t const * q_spec,
+  soxr_runtime_spec_t const * runtime_spec)
+{ /* io_ratio: in/out if both rates are given; 0 if neither is; else -1. */
+  double io_ratio = output_rate? input_rate? input_rate / output_rate : -1 : input_rate? -1 : 0;
+  static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768};
+  soxr_t p = 0;
+  soxr_error_t error = 0;
+
+  if (q_spec && q_spec->e)  error = q_spec->e;
+  else if (io_spec && (io_spec->itype | io_spec->otype) >= SOXR_SPLIT * 2)
+    error = "invalid io datatype(s)";
+
+  if (!error && !(p = calloc(sizeof(*p), 1))) error = "malloc failed";
+
+  if (p) {
+    p->q_spec = q_spec? *q_spec : soxr_quality_spec(SOXR_HQ, 0);
+
+    if (q_spec) { /* Backwards compatibility with original API: */
+      if (p->q_spec.passband_end > 2)
+        p->q_spec.passband_end /= 100;
+      if (p->q_spec.stopband_begin > 2)
+        p->q_spec.stopband_begin = 2 - p->q_spec.stopband_begin / 100;
+    }
+
+    p->io_ratio = io_ratio;
+    p->num_channels = num_channels;
+    if (io_spec)
+      p->io_spec = *io_spec;
+    else
+      p->io_spec.scale = 1;         /* Other fields stay 0, per the calloc. */
+
+    p->runtime_spec = runtime_spec? *runtime_spec : soxr_runtime_spec(1);
+    p->io_spec.scale *= datatype_full_scale[p->io_spec.otype & 3] /
+                        datatype_full_scale[p->io_spec.itype & 3];
+    p->seed = (unsigned long)time(0) ^ (unsigned long)(size_t)p;
+
+#if HAVE_SINGLE_PRECISION
+    if (!HAVE_DOUBLE_PRECISION || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
+        || (p->q_spec.flags & SOXR_VR)) {
+      p->deinterleave = (deinterleave_t)_soxr_deinterleave_f;
+      p->interleave = (interleave_t)_soxr_interleave_f;
+      memcpy(&p->control_block,     /* Pick VR, SIMD or plain 32-bit engine. */
+          (p->q_spec.flags & SOXR_VR)? &_soxr_vr32_cb :
+#if HAVE_SIMD
+          cpu_has_simd()? &_soxr_rate32s_cb :
+#endif
+          &_soxr_rate32_cb, sizeof(p->control_block));
+    }
+#if HAVE_DOUBLE_PRECISION
+    else
+#endif
+#endif
+#if HAVE_DOUBLE_PRECISION
+    {                               /* Otherwise, use the 64-bit engine. */
+      p->deinterleave = (deinterleave_t)_soxr_deinterleave;
+      p->interleave = (interleave_t)_soxr_interleave;
+      memcpy(&p->control_block, &_soxr_rate64_cb, sizeof(p->control_block));
+    }
+#endif
+
+    if (p->num_channels && io_ratio) /* Else, await soxr_set_* calls. */
+      error = soxr_set_io_ratio(p, io_ratio, 0);
+  }
+  if (error)                        /* p may be 0 here; soxr_delete allows it. */
+    soxr_delete(p), p = 0;
+  if (error0)
+    *error0 = error;
+  return p;
+}
+
+
+
+soxr_error_t soxr_set_input_fn(soxr_t p,
+    soxr_input_fn_t input_fn, void * input_fn_state, size_t max_ilen)
+{ /* Register the app's input function, for later use by soxr_output. */
+  if (!p) return "invalid soxr_t pointer";   /* As soxr_set_io_ratio does. */
+  p->input_fn = input_fn, p->input_fn_state = input_fn_state;
+  p->max_ilen = max_ilen? max_ilen : (size_t)-1;   /* 0 means `no limit'. */
+  return 0;
+}
+
+
+
+static void soxr_delete0(soxr_t p)
+{ /* Release all that *p owns, then zero *p; p itself is not freed. */
+  unsigned ch = 0;
+  while (p->resamplers && ch < p->num_channels) {
+    resampler_t channel = p->resamplers[ch++];
+    if (channel)
+      resampler_close(channel);
+    free(channel);
+  }
+  free(p->shared);
+  free(p->channel_ptrs);
+  free(p->resamplers);
+
+  memset(p, 0, sizeof(*p));
+}
+
+
+
+double soxr_delay(soxr_t p)
+{ /* Delay as given by channel 0's resampler; 0 if there is none or an error. */
+  return (!p || p->error || !p->resamplers)? 0 : resampler_delay(p->resamplers[0]);
+}
+
+
+
+static soxr_error_t fatal_error(soxr_t p, soxr_error_t error)
+{ /* Tear down *p (which zeroes it), then record `error' in it. */
+  soxr_delete0(p), p->error = error;
+  return error;
+}
+
+
+
+static soxr_error_t initialise(soxr_t p)
+{ /* Allocate the shared & per-channel state, then create each resampler. */
+  size_t shared_bytes, per_channel_bytes;
+  unsigned ch;
+
+  resampler_sizes(&shared_bytes, &per_channel_bytes);
+  p->resamplers = calloc(sizeof(*p->resamplers), p->num_channels);
+  p->channel_ptrs = calloc(sizeof(*p->channel_ptrs), p->num_channels);
+  p->shared = calloc(shared_bytes, 1);
+  if (!(p->resamplers && p->channel_ptrs && p->shared))
+    return fatal_error(p, "malloc failed");
+
+  for (ch = 0; ch < p->num_channels; ++ch) {
+    soxr_error_t failure;
+    resampler_t channel = calloc(per_channel_bytes, 1);
+
+    p->resamplers[ch] = channel; /* Stored first, so fatal_error can free it. */
+    if (!channel)
+      return fatal_error(p, "malloc failed");
+    failure = resampler_create(
+        channel, p->shared, p->io_ratio,
+        &p->q_spec, &p->runtime_spec,
+        p->io_spec.scale);
+    if (failure)
+      return fatal_error(p, failure);
+  }
+  return 0;
+}
+
+
+
+soxr_error_t soxr_set_num_channels(soxr_t p, unsigned num_channels)
+{ /* Set the channel count; possible only before the resamplers exist. */
+  if (p == NULL)                       return "invalid soxr_t pointer";
+  if (p->num_channels == num_channels) return p->error;     /* No change. */
+  if (num_channels == 0)               return "invalid # of channels";
+  if (p->resamplers != NULL)           return "# of channels can't be changed";
+  p->num_channels = num_channels;
+  return soxr_set_io_ratio(p, p->io_ratio, 0);
+}
+
+
+
+soxr_error_t soxr_set_io_ratio(soxr_t p, double io_ratio, size_t slew_len)
+{ /* Set the initial ratio or, where the engine allows, vary the ratio. */
+  unsigned ch;
+
+  if (!p)               return "invalid soxr_t pointer";
+  if (p->error)         return p->error;
+  if (!p->num_channels) return "must set # channels before O/I ratio";
+  if (io_ratio <= 0)    return "I/O ratio out-of-range";
+  if (!p->channel_ptrs) {               /* Nothing allocated yet: do so now. */
+    p->io_ratio = io_ratio;
+    return initialise(p);
+  }
+  if (!p->control_block[8])     /* Engine has no set_io_ratio function. */
+    return fabs(p->io_ratio - io_ratio) < 1e-15? 0 :
+      "Varying O/I ratio is not supported with this quality level";
+
+  for (ch = 0; ch < p->num_channels; ++ch)
+    resampler_set_io_ratio(p->resamplers[ch], io_ratio, slew_len);
+  return 0;
+}
+
+
+
+void soxr_delete(soxr_t p)
+{ /* Free the resampler and everything that it owns; a null p is ignored. */
+  if (!p) return;
+  soxr_delete0(p), free(p);
+}
+
+
+
+soxr_error_t soxr_clear(soxr_t p) /* TODO: this, properly. */
+{ /* Discard all signal state, but keep the configuration & input function. */
+  struct soxr tmp;
+  if (!p) return "invalid soxr_t pointer";
+  tmp = *p;                  /* Shallow copy, taken before the tear-down. */
+  soxr_delete0(p);
+  memset(p, 0, sizeof(*p));
+  p->input_fn = tmp.input_fn;
+  p->input_fn_state = tmp.input_fn_state;
+  p->max_ilen = tmp.max_ilen; /* Else soxr_output would request 0 samples. */
+  p->runtime_spec = tmp.runtime_spec;
+  p->q_spec = tmp.q_spec;
+  p->io_spec = tmp.io_spec;
+  p->num_channels = tmp.num_channels;
+  memcpy(p->control_block, tmp.control_block, sizeof(p->control_block));
+  p->deinterleave = tmp.deinterleave;
+  p->interleave = tmp.interleave;
+  return (p->q_spec.flags & RESET_ON_CLEAR)?
+    soxr_set_io_ratio(p, tmp.io_ratio, 0) : 0;
+}
+
+
+
+static void soxr_input_1ch(soxr_t p, unsigned i, soxr_cbuf_t src, size_t len)
+{ /* Deinterleave (as 1 channel) len samples of src into resampler i's input. */
+  sample_t * into = resampler_input(p->resamplers[i], NULL, len);
+  p->deinterleave(&into, p->io_spec.itype, &src, len, 1);
+}
+
+
+
+static size_t soxr_input(soxr_t p, void const * in, size_t len)
+{ /* Hand len samples per channel to the resamplers; len = 0 starts a flush. */
+  unsigned i;
+  if (!p || p->error) return 0;
+  if (!in && len) {p->error = "null input buffer pointer"; return 0;}
+  if (!len) {
+    p->flushing = true;
+    return 0;
+  }
+  /* N.B. io_spec is read only now that p is known to be non-null: */
+  if (p->io_spec.itype & SOXR_SPLIT)
+    for (i = 0; i < p->num_channels; ++i)
+      soxr_input_1ch(p, i, ((soxr_cbufs_t)in)[i], len);
+  else {
+    for (i = 0; i < p->num_channels; ++i)
+      p->channel_ptrs[i] = resampler_input(p->resamplers[i], NULL, len);
+    (*p->deinterleave)(
+        (sample_t **)p->channel_ptrs, p->io_spec.itype, &in, len, p->num_channels);
+  }
+  return len;
+}
+
+
+
+static size_t soxr_output_1ch(soxr_t p, unsigned i, soxr_buf_t dest, size_t len, bool separated)
+{ /* Run resampler i; returns len as updated by the engine's output fn. */
+  resampler_t const channel = p->resamplers[i];
+  sample_t const * produced;
+  if (p->flushing) resampler_flush(channel);
+  resampler_process(channel, len);
+  produced = resampler_output(channel, NULL, &len);
+  if (!separated)       /* Interleaved: just note where the samples are. */
+    p->channel_ptrs[i] = (void /* const */ *)produced;
+  else p->clips += (p->interleave)(p->io_spec.otype, &dest, &produced,
+      len, 1, (p->io_spec.flags & SOXR_NO_DITHER)? 0 : &p->seed);
+  return len;
+}
+
+
+
+static size_t soxr_output_no_callback(soxr_t p, soxr_buf_t out, size_t len)
+{ /* N.B. `out' is indexed as an array of buffers only if output is split. */
+  unsigned u;
+  size_t done = 0;
+  bool separated = !!(p->io_spec.otype & SOXR_SPLIT);
+#if defined _OPENMP
+  int i;
+  if (!p->runtime_spec.num_threads && p->num_channels > 1)
+#pragma omp parallel for
+  for (i = 0; i < (int)p->num_channels; ++i) {
+    soxr_buf_t dest = separated? ((soxr_bufs_t)out)[i] : 0;
+    size_t done1 = soxr_output_1ch(p, (unsigned)i, dest, len, separated);
+    if (!i)
+      done = done1;
+  } else
+#endif
+  for (u = 0; u < p->num_channels; ++u)
+    done = soxr_output_1ch(p, u, separated? ((soxr_bufs_t)out)[u] : 0, len, separated);
+
+  if (!separated)
+    p->clips += (p->interleave)(p->io_spec.otype, &out, (sample_t const * const *)p->channel_ptrs,
+        done, p->num_channels, (p->io_spec.flags & SOXR_NO_DITHER)? 0 : &p->seed);
+  return done;
+}
+
+
+
+size_t soxr_output(soxr_t p, void * out, size_t len0)
+{ /* Output up to len0 samples, fetching input via the input fn. if set. */
+  size_t odone, odone0 = 0, olen = len0, osize, idone, ilen;
+  void const * in = out; /* Set to !=0, so that caller may leave unset. */
+  bool was_flushing;
+
+  if (!p || p->error) return 0;
+  if (!out && len0) {p->error = "null output buffer pointer"; return 0;}
+  ilen = min(p->max_ilen, (size_t)ceil((double)olen *p->io_ratio)); /* p!=0 */
+
+  do {
+    odone = soxr_output_no_callback(p, out, olen);
+    odone0 += odone;
+    if (odone0 == len0 || !p->input_fn || p->flushing)
+      break;
+
+    osize = soxr_datatype_size(p->io_spec.otype) * p->num_channels;
+    out = (char *)out + osize * odone;
+    olen -= odone;
+    idone = p->input_fn(p->input_fn_state, &in, ilen);
+    was_flushing = p->flushing;
+    if (!in)
+      p->error = "input function reported failure";
+    else soxr_input(p, in, idone);
+  } while (odone || idone || (!was_flushing && p->flushing));
+  return odone0;
+}
+
+
+
+static size_t soxr_i_for_o(soxr_t p, size_t olen, size_t ilen)
+{ /* Returns min(ceil(olen * io_ratio), ilen); the 1st branch is disabled. */
+  size_t result;
+#if 0
+  if (p->runtime_spec.flags & SOXR_STRICT_BUFFERING)
+    result = rate_i_for_o(p->resamplers[0], olen);
+  else
+#endif
+    result = (size_t)ceil((double)olen * p->io_ratio);
+  return min(result, ilen);
+}
+
+
+
+#if 0 /* Disabled: its one call, in soxr_process, is commented out. */
+static size_t soxr_o_for_i(soxr_t p, size_t ilen, size_t olen)
+{ /* Returns min(ceil(ilen / io_ratio), olen). */
+  size_t result = (size_t)ceil((double)ilen / p->io_ratio);
+  return min(result, olen);
+}
+#endif
+
+
+
+soxr_error_t soxr_process(soxr_t p,
+    void const * in , size_t ilen0, size_t * idone0,
+    void       * out, size_t olen , size_t * odone0)
+{ /* Take in up to ilen0 samples and/or put out up to olen samples. */
+  size_t ilen, idone, odone = 0;
+  unsigned u;
+  bool flush_requested = false;
+
+  if (!p) return "null pointer";
+
+  if (!in)                              /* Null `in' signals end-of-input. */
+    flush_requested = true, ilen = ilen0 = 0;
+  else {
+    if ((ptrdiff_t)ilen0 < 0)           /* ~length, as sent by soxr_oneshot. */
+      flush_requested = true, ilen0 = ~ilen0;
+    if (idone0 && (1 || flush_requested)) /* I.e. whenever idone0 is given. */
+      ilen = soxr_i_for_o(p, olen, ilen0);
+    else
+      ilen = ilen0/*, olen = soxr_o_for_i(p, ilen, olen)*/;
+  }
+  p->flushing |= ilen == ilen0 && flush_requested; /* Only if all i/p taken. */
+
+  if (!out && !in)                      /* Nothing to do but note the flush. */
+    idone = ilen;
+  else if (p->io_spec.itype & p->io_spec.otype & SOXR_SPLIT) { /* Both i & o */
+#if defined _OPENMP
+    int i;
+    if (!p->runtime_spec.num_threads && p->num_channels > 1)
+#pragma omp parallel for
+    for (i = 0; i < (int)p->num_channels; ++i) {
+      size_t done;
+      if (in)
+        soxr_input_1ch(p, (unsigned)i, ((soxr_cbufs_t)in)[i], ilen);
+      done = soxr_output_1ch(p, (unsigned)i, ((soxr_bufs_t)out)[i], olen, true);
+      if (!i)                           /* Channel 0's count is reported. */
+        odone = done;
+    } else
+#endif
+    for (u = 0; u < p->num_channels; ++u) {
+      if (in)
+        soxr_input_1ch(p, u, ((soxr_cbufs_t)in)[u], ilen);
+      odone = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], olen, true);
+    }
+    idone = ilen;
+  }
+  else {                                /* Either or both are interleaved. */
+    idone = ilen? soxr_input (p, in , ilen) : 0;
+    odone = soxr_output(p, out, olen);
+  }
+  if (idone0) *idone0 = idone;
+  if (odone0) *odone0 = odone;
+  return p->error;
+}
+
+
+
+soxr_error_t soxr_oneshot(
+    double irate, double orate,
+    unsigned num_channels,
+    void const * in , size_t ilen, size_t * idone,
+    void * out, size_t olen, size_t * odone,
+    soxr_io_spec_t const * io_spec,
+    soxr_quality_spec_t const * q_spec,
+    soxr_runtime_spec_t const * runtime_spec)
+{ /* Create a resampler, process `in' as the entire signal, then delete it. */
+  soxr_quality_spec_t lq_default;
+  soxr_t resampler;
+  soxr_error_t error = q_spec? q_spec->e : 0;
+  if (error) return error;
+  if (!q_spec) {                  /* The default here is SOXR_LQ. */
+    lq_default = soxr_quality_spec(SOXR_LQ, 0);
+    q_spec = &lq_default;
+  }
+  resampler = soxr_create(irate, orate, num_channels,
+      &error, io_spec, q_spec, runtime_spec);
+  if (error) return error;
+  error = soxr_process(resampler, in, ~ilen, idone, out, olen, odone);
+  soxr_delete(resampler);
+  return error;
+}
+
+
+
+soxr_error_t soxr_set_error(soxr_t p, soxr_error_t error)
+{ /* Record `error' in p, unless a different error is already recorded. */
+  if (!p) return "null pointer";
+  if (p->error && p->error != error) return p->error; /* Keep the earlier. */
+  p->error = error;
+  return 0;
+}
diff --git a/src/soxr.h b/src/soxr.h
new file mode 100644
index 0000000..8d9622d
--- /dev/null
+++ b/src/soxr.h
@@ -0,0 +1,348 @@
+/* SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+
+
+/* -------------------------------- Gubbins --------------------------------- */
+
+#if !defined soxr_included
+#define soxr_included
+
+
+#if defined __cplusplus
+  #include <cstddef>
+  extern "C" {
+#else
+  #include <stddef.h>
+#endif
+
+#if defined SOXR_DLL
+  #if defined soxr_EXPORTS
+    #define SOXR __declspec(dllexport)
+  #else
+    #define SOXR __declspec(dllimport)
+  #endif
+#elif defined SOXR_VISIBILITY && defined __GNUC__ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+  #define SOXR __attribute__ ((visibility("default")))
+#else
+  #define SOXR
+#endif
+
+typedef struct soxr_io_spec soxr_io_spec_t;
+typedef struct soxr_quality_spec soxr_quality_spec_t;
+typedef struct soxr_runtime_spec soxr_runtime_spec_t;
+
+
+
+/* ---------------------------- API conventions --------------------------------
+
+Buffer lengths (and occupancies) are expressed as the number of contained
+samples per channel.
+
+Parameter names for buffer lengths have the suffix `len'.
+
+A single-character `i' or 'o' is often used in names to give context as
+input or output (e.g. ilen, olen).                                            */
+
+
+
+/* --------------------------- Version management --------------------------- */
+
+/* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ...                      */
+
+#define SOXR_VERSION(x,y,z)     (((x)<<16)|((y)<<8)|(z))
+#define SOXR_THIS_VERSION       SOXR_VERSION(0,1,2)
+#define SOXR_THIS_VERSION_STR               "0.1.2"
+
+
+
+/* --------------------------- Type declarations ---------------------------- */
+
+typedef struct soxr * soxr_t;          /* A resampler for 1 or more channels. */
+typedef char const * soxr_error_t;                /* 0:no-error; non-0:error. */
+
+typedef void       * soxr_buf_t;  /* 1 buffer of channel-interleaved samples. */
+typedef void const * soxr_cbuf_t;                        /* Ditto; read-only. */
+
+typedef soxr_buf_t const  * soxr_bufs_t;/* Or, a separate buffer for each ch. */
+typedef soxr_cbuf_t const * soxr_cbufs_t;                /* Ditto; read-only. */
+
+typedef void const * soxr_in_t;      /* Either a soxr_cbuf_t or soxr_cbufs_t,
+                                        depending on itype in soxr_io_spec_t. */
+typedef void       * soxr_out_t;     /* Either a soxr_buf_t or soxr_bufs_t,
+                                        depending on otype in soxr_io_spec_t. */
+
+
+
+/* --------------------------- API main functions --------------------------- */
+
+SOXR char const * soxr_version(void);  /* Query library version: "libsoxr-x.y.z" */
+
+#define soxr_strerror(e)               /* Soxr counterpart to strerror. */     \
+    ((e)?(e):"no error")
+
+
+/* Create a stream resampler: */
+
+SOXR soxr_t soxr_create(
+    double      input_rate,      /* Input sample-rate. */
+    double      output_rate,     /* Output sample-rate. */
+    unsigned    num_channels,    /* Number of channels to be used. */
+        /* All following arguments are optional (may be set to NULL). */
+    soxr_error_t *,              /* To report any error during creation. */
+    soxr_io_spec_t const *,      /* To specify non-default I/O formats. */
+    soxr_quality_spec_t const *, /* To specify non-default resampling quality.*/
+    soxr_runtime_spec_t const *);/* To specify non-default runtime resources.
+
+    Default io_spec      is per soxr_io_spec(SOXR_FLOAT32_I, SOXR_FLOAT32_I)
+    Default quality_spec is per soxr_quality_spec(SOXR_HQ, 0)
+    Default runtime_spec is per soxr_runtime_spec(1)                          */
+
+
+
+/* If not using an app-supplied input function, after creating a stream
+ * resampler, repeatedly call: */
+
+SOXR soxr_error_t soxr_process(
+    soxr_t      resampler,      /* As returned by soxr_create. */
+                            /* Input (to be resampled): */
+    soxr_in_t   in,             /* Input buffer(s); may be NULL (see below). */
+    size_t      ilen,           /* Input buf. length (samples per channel). */
+    size_t      * idone,        /* To return actual # samples used (<= ilen). */
+                            /* Output (resampled): */
+    soxr_out_t  out,            /* Output buffer(s).*/
+    size_t      olen,           /* Output buf. length (samples per channel). */
+    size_t      * odone);       /* To return actual # samples out (<= olen).
+
+    Note that no special meaning is associated with ilen or olen equal to
+    zero.  End-of-input (i.e. no data is available nor shall be available)
+    may be indicated by setting `in' to NULL.                                 */
+
+
+
+/* If using an app-supplied input function, it must look and behave like this:*/
+
+typedef size_t /* data_len */
+  (* soxr_input_fn_t)(         /* Supply data to be resampled. */
+    void * input_fn_state,     /* As given to soxr_set_input_fn (below). */
+    soxr_in_t * data,          /* Returned data; see below. N.B. ptr to ptr(s)*/
+    size_t requested_len);     /* Samples per channel, >= returned data_len.
+
+  data_len  *data     Indicates    Meaning
+   ------- -------   ------------  -------------------------
+     !=0     !=0       Success     *data contains data to be
+                                   input to the resampler.
+      0    !=0 (or   End-of-input  No data is available nor
+           not set)                shall be available.
+      0       0        Failure     An error occurred whilst trying to
+                                   source data to be input to the resampler.  */
+
+/* and be registered with a previously created stream resampler using: */
+
+SOXR soxr_error_t soxr_set_input_fn(/* Set (or reset) an input function.*/
+    soxr_t resampler,            /* As returned by soxr_create. */
+    soxr_input_fn_t,             /* Function to supply data to be resampled.*/
+    void * input_fn_state,       /* If needed by the input function. */
+    size_t max_ilen);            /* Maximum value for input fn. requested_len.*/
+
+/* then repeatedly call: */
+
+SOXR size_t /*odone*/ soxr_output(/* Resample and output a block of data.*/
+    soxr_t resampler,            /* As returned by soxr_create. */
+    soxr_out_t data,             /* App-supplied buffer(s) for resampled data.*/
+    size_t olen);                /* Amount of data to output; >= odone. */
+
+
+
+/* Common stream resampler operations: */
+
+SOXR soxr_error_t soxr_error(soxr_t);   /* Query error status. */
+SOXR size_t   * soxr_num_clips(soxr_t); /* Query int. clip counter (for R/W). */
+SOXR double     soxr_delay(soxr_t);  /* Query current delay in output samples.*/
+SOXR char const * soxr_engine(soxr_t p); /* Query resampling engine name. */
+
+SOXR soxr_error_t soxr_clear(soxr_t); /* Ready for fresh signal, same config. */
+SOXR void         soxr_delete(soxr_t);  /* Free resources. */
+
+
+
+/* `Short-cut', single call to resample a (probably short) signal held entirely
+ * in memory.  See soxr_create and soxr_process above for parameter details.
+ * Note that unlike soxr_create however, the default quality spec. for
+ * soxr_oneshot is per soxr_quality_spec(SOXR_LQ, 0). */
+
+SOXR soxr_error_t soxr_oneshot(
+    double         input_rate,
+    double         output_rate,
+    unsigned       num_channels,
+    soxr_in_t    in , size_t ilen, size_t * idone,
+    soxr_out_t   out, size_t olen, size_t * odone,
+    soxr_io_spec_t const *,
+    soxr_quality_spec_t const *,
+    soxr_runtime_spec_t const *);
+
+
+
+/* For variable-rate resampling. See example # 5 for how to create a
+ * variable-rate resampler and how to use this function. */
+
+SOXR soxr_error_t soxr_set_io_ratio(soxr_t, double io_ratio, size_t slew_len);
+
+
+
+/* -------------------------- API type definitions -------------------------- */
+
+typedef enum {          /* Datatypes supported for I/O to/from the resampler: */
+  /* Internal; do not use: */
+  SOXR_FLOAT32, SOXR_FLOAT64, SOXR_INT32, SOXR_INT16, SOXR_SPLIT = 4,
+
+  /* Use for interleaved channels: */
+  SOXR_FLOAT32_I = SOXR_FLOAT32, SOXR_FLOAT64_I, SOXR_INT32_I, SOXR_INT16_I,
+
+  /* Use for split channels: */
+  SOXR_FLOAT32_S = SOXR_SPLIT  , SOXR_FLOAT64_S, SOXR_INT32_S, SOXR_INT16_S
+
+} soxr_datatype_t;
+
+#define soxr_datatype_size(x)  /* Returns `sizeof' a soxr_datatype_t sample. */\
+  ((unsigned char *)"\4\10\4\2")[(x)&3]
+
+
+
+struct soxr_io_spec {                                            /* Typically */
+  soxr_datatype_t itype;     /* Input datatype.                SOXR_FLOAT32_I */
+  soxr_datatype_t otype;     /* Output datatype.               SOXR_FLOAT32_I */
+  double scale;              /* Linear gain to apply during resampling.  1    */
+  void * e;                  /* Reserved for internal use                0    */
+  unsigned long flags;       /* Per the following #defines.              0    */
+};
+
+#define SOXR_TPDF              0     /* Applicable only if otype is INT16. */
+#define SOXR_NO_DITHER         8u    /* Disable the above. */
+
+
+
+struct soxr_quality_spec {                                       /* Typically */
+  double precision;         /* Conversion precision (in bits).           20   */
+  double phase_response;    /* 0=minimum, ... 50=linear, ... 100=maximum 50   */
+  double passband_end;      /* 0dB pt. bandwidth to preserve; nyquist=1  0.913*/
+  double stopband_begin;    /* Aliasing/imaging control; > passband_end   1   */
+  void * e;                 /* Reserved for internal use.                 0   */
+  unsigned long flags;      /* Per the following #defines.                0   */
+};
+
+#define SOXR_ROLLOFF_SMALL     0u    /* <= 0.01 dB */
+#define SOXR_ROLLOFF_MEDIUM    1u    /* <= 0.35 dB */
+#define SOXR_ROLLOFF_NONE      2u    /* For Chebyshev bandwidth. */
+
+#define SOXR_MAINTAIN_3DB_PT   4u  /* Reserved for internal use. */
+#define SOXR_HI_PREC_CLOCK     8u  /* Increase `irrational' ratio accuracy. */
+#define SOXR_DOUBLE_PRECISION 16u  /* Use D.P. calcs even if precision <= 20. */
+#define SOXR_VR               32u  /* Variable-rate resampling. */
+
+
+
+struct soxr_runtime_spec {                                       /* Typically */
+  unsigned log2_min_dft_size;/* For DFT efficiency. [8,15]              10    */
+  unsigned log2_large_dft_size;/* For DFT efficiency. [16,20]           17    */
+  unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below).      400   */
+  unsigned num_threads;      /* If built so. 0 means `automatic'.        1    */
+  void * e;                  /* Reserved for internal use.               0    */
+  unsigned long flags;       /* Per the following #defines.              0    */
+};
+                                   /* For `irrational' ratios only: */
+#define SOXR_COEF_INTERP_AUTO  0u    /* Auto select coef. interpolation. */
+#define SOXR_COEF_INTERP_LOW   2u    /* Man. select: less CPU, more memory. */
+#define SOXR_COEF_INTERP_HIGH  3u    /* Man. select: more CPU, less memory. */
+
+#define SOXR_STRICT_BUFFERING  4u  /* Reserved for future use. */
+#define SOXR_NOSMALLINTOPT     8u  /* For test purposes only. */
+
+
+
+/* -------------------------- API type constructors ------------------------- */
+
+/* These functions allow setting of the most commonly-used structure
+ * parameters, with other parameters being given default values.  The default
+ * values may then be overridden, directly in the structure, if needed.  */
+
+SOXR soxr_quality_spec_t soxr_quality_spec(
+    unsigned long recipe,       /* Per the #defines immediately below. */
+    unsigned long flags);       /* As soxr_quality_spec_t.flags. */
+
+                                  /* The 5 standard qualities found in SoX: */
+#define SOXR_QQ                 0   /* 'Quick' cubic interpolation. */
+#define SOXR_LQ                 1   /* 'Low' 16-bit with larger rolloff. */
+#define SOXR_MQ                 2   /* 'Medium' 16-bit with medium rolloff. */
+#define SOXR_HQ                 SOXR_20_BITQ /* 'High quality'. */
+#define SOXR_VHQ                SOXR_28_BITQ /* 'Very high quality'. */
+
+#define SOXR_16_BITQ            3
+#define SOXR_20_BITQ            4
+#define SOXR_24_BITQ            5
+#define SOXR_28_BITQ            6
+#define SOXR_32_BITQ            7
+                                    /* Libsamplerate equivalent qualities: */
+#define SOXR_LSR0Q              8     /* 'Best sinc'. */
+#define SOXR_LSR1Q              9     /* 'Medium sinc'. */
+#define SOXR_LSR2Q              10    /* 'Fast sinc'. */
+
+#define SOXR_LINEAR_PHASE       0x00
+#define SOXR_INTERMEDIATE_PHASE 0x10
+#define SOXR_MINIMUM_PHASE      0x30
+#define SOXR_STEEP_FILTER       0x40
+#define SOXR_ALLOW_ALIASING     0x80  /* Reserved for future use. */
+
+
+
+SOXR soxr_runtime_spec_t soxr_runtime_spec(
+    unsigned num_threads);
+
+
+
+SOXR soxr_io_spec_t soxr_io_spec(
+    soxr_datatype_t itype,
+    soxr_datatype_t otype);
+
+
+
+/* --------------------------- Advanced use only ---------------------------- */
+
+/* For new designs, the following functions/usage will probably not be needed.
+ * They might be useful when adding soxr into an existing design where values
+ * for the resampling-rate and/or number-of-channels parameters to soxr_create
+ * are not available when that function will be called.  In such cases, the
+ * relevant soxr_create parameter(s) can be given as 0, then one or both of the
+ * following (as appropriate) later invoked (but prior to calling soxr_process
+ * or soxr_output):
+ *
+ * soxr_set_error(soxr, soxr_set_io_ratio(soxr, io_ratio, 0));
+ * soxr_set_error(soxr, soxr_set_num_channels(soxr, num_channels));
+ */
+
+SOXR soxr_error_t soxr_set_error(soxr_t, soxr_error_t);
+SOXR soxr_error_t soxr_set_num_channels(soxr_t, unsigned);
+
+
+
+#undef SOXR
+
+#if defined __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/soxr.pc.in b/src/soxr.pc.in
new file mode 100644
index 0000000..69d225b
--- /dev/null
+++ b/src/soxr.pc.in
@@ -0,0 +1,5 @@
+Name: ${PROJECT_NAME}
+Description: ${DESCRIPTION_SUMMARY}
+Version: ${PROJECT_VERSION}
+Libs: -L${LIB_INSTALL_DIR} -l${PROJECT_NAME}
+Cflags: -I${INCLUDE_INSTALL_DIR}
diff --git a/src/vr-coefs.c b/src/vr-coefs.c
new file mode 100644
index 0000000..14886df
--- /dev/null
+++ b/src/vr-coefs.c
@@ -0,0 +1,112 @@
+/* SoX Resampler Library         Copyright (c) 2013 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Generate the filter coefficients for variable-rate resampling. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#define PI 3.14159265358979323846            /* Since M_PI can't be relied on */
+
+static void print(double * h, int m, double l, char const * name)
+{                   /* Emit h as a C `static float const name[]', then free h. */
+  int i, N = l? (int)(l*m)-(l>1) : m, R=(N+1)/2; /* N taps (l==0: m); R = half. */
+  int a = !l||l>1? 0:N-R, b = l>1? R:N;     /* h[a..b-1] is what gets printed. */
+  printf("static float const %s[] = {\n", name);
+  if (l>1) printf(" 0.f,"); else if (!l) l=1; /* l>1: leading 0; l==0: gain 1. */
+  for (i=a; h && i<b; ++i, printf("% .9gf,%c",l*h[i-1],"\n "[(i-a)&3 && i<b]));
+  puts("};\n");                   /* A NULL h yields an empty initialiser list. */
+  free(h);                           /* This function takes ownership of h. */
+}
+          /* Parks McClellan FIR LPF (these macros use the locals of fir()): */
+#define even_adj(f) ((N&1)? 1 : cos(PI*.5*(f)))     /* 1 when N is odd. */
+#define W(f) (((f) < Fp+1e-9? weight : 1) * even_adj(f))      /* Weighting fn */
+#define D(f) (((f) < Fp+1e-9) / even_adj(f))           /* Desired response fn */
+#define F(i) ((i) <= end[0]? (i)*inc[0] : 1-(end[1]-(i))*inc[1]) /* Grid->freq */
+#define EE(x,z) (_1 != x 1 && x E[i] > 0 && x E[i] >= x E[i z 1]) /* x-peak? */
+#define PEAK do {if (k<NP+1) peak[k]=i; ++k,_1=(E[i]>0)-(E[i]<0);} while (0)
+
+typedef struct {double x, beta, gamma;} coef_t; /* Node abscissa, weight, value */
+
+static double amp_response(coef_t * coef, int R, double f, int i)
+{      /* Interpolate at x=cos(PI*f) over nodes coef[i..R-1]; w = beta/(x-x_k): */
+  double n = 0, d = 0, x = cos(PI*f), t;      /* n = sum(gamma*w), d = sum(w). */
+  for (; i < R; d += t = coef[i].beta / t, n += coef[i].gamma * t, ++i)
+    if (fabs(t = x - coef[i].x) < 1e-9) return coef[i].gamma; /* x is a node. */
+  return n/d;
+}
+
+static void fir(int m, double l, double Fp0, double Fs0,
+    double weight0, int density, char const * name) /* Design, then print(). */
+{         /* Band edges are Fp0/l and Fs0/l; N below is the number of taps. */
+  double Fp=Fp0/l, Fs=Fs0/l, weight=1/weight0, inc[2], Ws=1-Fs;
+  int N = (int)(l*m)-(l>1), R=(N+1)/2, NP=R+1, grid_size=1+density*R+1, pass=0;
+  int n1 = Ws>=(2*R-1)*Fp? 1:(int)(R*Fp/(Fp+Ws)+.5), n2=NP-n1, _1, i, j, k;
+  int    * peak = calloc((size_t)(NP+1), sizeof(*peak)), * P=peak, end[2];
+  coef_t * coef = calloc((size_t)(NP), sizeof(*coef));
+  float  * E    = calloc((size_t)(grid_size), sizeof(*E));
+  double d, n, e, f, mult, delta, sum, hi, lo, * A = (double*)E, *h=0;
+
+  if (!P || !coef || !E) goto END;
+  end[0] = n1 * density, end[1] = grid_size-1;     /* Create prototype peaks: */
+  inc[0] = Fp/end[0],    inc[1] = n2==1? 0 : Ws / ((n2-1)*density);
+  for (i=0; i<n1; P[n1-1-i] = end[0] - i*density,++i);
+  for (i=0; i<n2; P[n1+i] = 1+end[0] + i*density,++i);
+
+  do {                                               /* Coefs for amp. resp.: */
+    for (i = 0; i<NP; coef[i].x = cos(PI*F(P[i])), ++i);
+    for (_1=-1, n=d=i=0; i < NP; ++i) {
+      for (mult = 1, j = 0; j < R; ++j) if (j != i) mult *= coef[i].x-coef[j].x;
+      if (mult) coef[i].beta = 1/mult; else goto END;
+      if (i != R) mult *= coef[i].x - coef[R].x;
+      f = F(P[i]), n += D(f)/mult, d += (_1=-_1)/(W(f)*mult);
+    }
+    for (delta = n/d, _1 = -1, i = 0; i < R; ++i)
+      f = F(P[i]), coef[i].gamma = D(f)-(_1=-_1)*delta/W(f);
+    for (i = 0; i <= end[1]; ++i)            /* Amplitude response and error: */
+      f = F(i), E[i] = (float)(W(f)*(D(f) - amp_response(coef, R, f, 0)));
+
+    i = k = _1 = 0;                                        /* Find new peaks: */
+    if (end[0]) if (EE(+,+) || EE(-,+)) PEAK;                       /* At F=0 */
+    for (++i, j = 0; j < 2; ++j) {                              /* In band j: */
+      for (; i < end[j]; ++i)
+        if ((EE(+,-) && E[i]>E[i+1]) || (EE(-,-) && E[i]<E[i+1])) PEAK;
+      if (!j) {PEAK; ++i; PEAK; ++i;}                           /* At Fp & Fs */
+    }
+    if (i==end[1]) if (EE(+,-) || EE(-,-)) PEAK;                    /* At F=1 */
+    if ((unsigned)(k = k-NP) > 1) goto END;                  /* Too many/few? */
+    P = peak + k * (fabs(E[peak[0]]) < fabs(E[peak[NP]]));         /* rm 1st? */
+
+    for (lo = hi = fabs(E[P[0]]), i=1; i<NP; ++i)              /* Converged?: */
+      e = fabs(E[P[i]]), lo = e<lo? e:lo, hi = e>hi? e:hi;
+  } while ((hi-lo)/hi > .001 && ++pass < 20);          /* At most 20 passes. */
+                      /* Create impulse response from final amp. resp. coefs: */
+  if (!(h = malloc(sizeof(*h)*(size_t)N))) goto END;
+  for (i = 0; i < R; f = 2.*i/N, A[i++] = amp_response(coef,R,f,0)*even_adj(f));
+  for (i = 0; i < R; h[N-1-i] = h[i] = sum/N, ++i)
+    for (sum=*A, j=1; j<R; sum += 2*cos(2*PI*(i-(N-1)/2.)/N*j)*A[j], ++j);
+  END: free(coef), free(E), free(peak);
+  print(h, m, l, name);   /* h is still NULL after any failure; print frees h. */
+}
+                                  /* Half-band IIR LPF (Mitra DSP 3/e, 13_9): */
+static void iir(int N, double Fp, char const * name) /* Prints N coefficients. */
+{
+  double d=tan(PI*.5*Fp), r=d*d, t=sqrt(1-r*r), n=(1-sqrt(t))/(1+sqrt(t))*.5;
+  double x=(n*n)*(n*n), Q=(((150*x+15)*x+2)*x+1)*n, q=pow(Q,.25), *h;
+  int i=0, j, _1;                    /* _1 alternates +1, -1 in the series. */
+  if (!(h = malloc(sizeof(*h)*(size_t)N))) goto END; /* Empty table printed. */
+  for (; i<N; t=n*q/d, t=t*t, t=sqrt((1-t*r)*(1-t/r))/(1+t), h[i++]=(1-t)/(1+t))
+    for (_1=1, d=-.5, n=j=0, x=(i+1)*PI/(N+.5); j<7; ++j, _1=-_1)
+      n += _1*pow(Q,j*(j+1))*sin(x*(j+.5)), d += _1*pow(Q,j*j)*cos(x*j);
+  END: print(h, N, 0, name);      /* l == 0: print h[0..N-1] unscaled, free h. */
+}
+
+int main(int argc, char **argv)   /* Writes all the tables to standard output. */
+{ /*   m    l  Fp0   Fs0  weight0 density  name  (see the signature of fir()) */
+  fir(241,  1, .45,  .5, 160, 32, "half_fir_coefs");
+  fir( 24, .5, .25,  .5,   1, 31, "fast_half_fir_coefs");
+  fir( 20, 12, .9 , 1.5, 160, 58, "coefs0_d");
+  fir( 12,  6, .45, 1.5,  80, 29, "coefs0_u");
+  iir( 15, .492, "iir_coefs");                                /* N, Fp, name. */
+  return 0*argc*!argv;        /* Always 0; merely references the unused args. */
+}
diff --git a/src/vr-coefs.h b/src/vr-coefs.h
new file mode 100644
index 0000000..9790ec0
--- /dev/null
+++ b/src/vr-coefs.h
@@ -0,0 +1,91 @@
+static float const half_fir_coefs[] = {
+ 0.471112154f,  0.316907549f,  0.0286963396f, -0.101927032f,
+-0.0281272982f,  0.0568029535f,  0.027196876f, -0.0360795942f,
+-0.0259313561f,  0.023641162f,  0.0243660538f, -0.0151238564f,
+-0.0225440668f,  0.00886927471f,  0.0205146088f, -0.00411434209f,
+-0.0183312132f,  0.000458525335f,  0.0160497772f,  0.00233248286f,
+-0.0137265989f, -0.0044106884f,  0.011416442f,  0.005885487f,
+-0.00917074467f, -0.00684373006f,  0.00703601669f,  0.00736018933f,
+-0.00505250698f, -0.00750298261f,  0.00325317131f,  0.00733618346f,
+-0.00166298445f, -0.00692082025f,  0.000298598848f,  0.00631493711f,
+ 0.000831644129f, -0.0055731438f, -0.00172737872f,  0.00474591812f,
+ 0.0023955814f, -0.0038788491f, -0.00284969263f,  0.00301194082f,
+ 0.00310854264f, -0.00217906496f, -0.00319514679f,  0.00140761062f,
+ 0.00313542959f, -0.000718361916f, -0.00295694328f,  0.000125607323f,
+ 0.00268763625f,  0.000362527878f, -0.00235472525f, -0.000743552559f,
+ 0.00198371228f,  0.00101991741f, -0.0015975797f, -0.00119820218f,
+ 0.00121618271f,  0.0012882279f, -0.000855849209f, -0.00130214036f,
+ 0.000529184474f,  0.00125350876f, -0.000245067778f, -0.00115647977f,
+ 8.82118676e-06f,  0.00102502052f,  0.000177478031f, -0.000872275256f,
+-0.000314572995f,  0.000710055602f,  0.000405526007f, -0.000548470439f,
+-0.000455174442f,  0.000395698685f,  0.000469579667f, -0.000257895884f,
+-0.000455495078f,  0.000139222702f,  0.000419883982f, -4.19753541e-05f,
+-0.00036950051f, -3.32020844e-05f,  0.000310554015f,  8.7050045e-05f,
+-0.000248456595f, -0.000121389974f,  0.000187662656f,  0.000138813233f,
+-0.000131587954f, -0.000142374865f,  8.26090549e-05f,  0.000135318039f,
+-4.21208043e-05f, -0.000120830917f,  1.06505085e-05f,  0.00010185819f,
+ 1.20015129e-05f, -8.09558888e-05f, -2.65925299e-05f,  6.02101571e-05f,
+ 3.42775752e-05f, -4.11911155e-05f, -3.64462477e-05f,  2.49654252e-05f,
+ 3.46090513e-05f, -1.21078107e-05f, -3.03027209e-05f,  2.73562006e-06f,
+ 2.51329043e-05f,  3.66157998e-06f, -2.0990973e-05f, -9.38752332e-06f,
+ 2.07133365e-05f,  3.2060847e-05f,  1.98462364e-05f,  4.90328648e-06f,
+-5.28550107e-07f,
+};
+
+static float const fast_half_fir_coefs[] = {
+ 0.309418476f, -0.0819805418f,  0.0305513441f, -0.0101582224f,
+ 0.00251293175f, -0.000346895324f,
+};
+
+static float const coefs0_d[] = {
+ 0.f, 1.40520362e-05f,  2.32939994e-05f,  4.00699869e-05f,  6.18938797e-05f,
+ 8.79406317e-05f,  0.000116304226f,  0.000143862785f,  0.000166286173f,
+ 0.000178229431f,  0.00017374107f,  0.00014689118f,  9.25928444e-05f,
+ 7.55567388e-06f, -0.000108723934f, -0.000253061416f, -0.000417917952f,
+-0.000591117466f, -0.000756082504f, -0.000892686881f, -0.000978762367f,
+-0.000992225841f, -0.00091370246f, -0.000729430325f, -0.000434153678f,
+-3.36489703e-05f,  0.000453499646f,  0.000995243588f,  0.00154683724f,
+ 0.00205322353f,  0.00245307376f,  0.0026843294f,  0.0026908874f,
+ 0.00242986868f,  0.00187874742f,  0.00104150259f, -4.70759945e-05f,
+-0.00131972748f, -0.00267834298f, -0.00399923407f, -0.00514205849f,
+-0.00596200535f, -0.00632441105f, -0.00612058374f, -0.00528328869f,
+-0.00380015804f, -0.0017232609f,  0.000826765169f,  0.0036632503f,
+ 0.00654337507f,  0.00918536843f,  0.0112922007f,  0.0125801323f,
+ 0.0128097433f,  0.0118164904f,  0.00953750551f,  0.00603133188f,
+ 0.00148762708f, -0.00377544588f, -0.009327395f, -0.014655127f,
+-0.0192047839f, -0.0224328082f, -0.0238620596f, -0.0231377935f,
+-0.0200777417f, -0.0147104883f, -0.00729690011f,  0.0016694689f,
+ 0.0114853672f,  0.02128446f,  0.0301054204f,  0.03697694f,
+ 0.0410129138f,  0.0415093321f,  0.0380333749f,  0.0304950299f,
+ 0.0191923285f,  0.00482304203f, -0.0115416941f, -0.0285230397f,
+-0.0445368533f, -0.0579264573f, -0.0671158215f, -0.070770308f,
+-0.0679502076f, -0.0582416438f, -0.0418501969f, -0.0196448429f,
+ 0.00685658762f,  0.0355644891f,  0.0639556622f,  0.0892653703f,
+ 0.108720484f,  0.11979613f,  0.120474745f,  0.109484562f,
+ 0.0864946948f,  0.0522461633f,  0.00860233712f, -0.041491734f,
+-0.0941444939f, -0.144742955f, -0.188255118f, -0.219589829f,
+-0.233988169f, -0.227416437f, -0.196929062f, -0.140970726f,
+-0.0595905561f,  0.0454527813f,  0.170708227f,  0.311175511f,
+ 0.460568159f,  0.61168037f,  0.756833088f,  0.888367707f,
+ 0.999151395f,  1.08305644f,  1.13537741f,  1.15315438f,
+};
+
+static float const coefs0_u[] = {
+ 0.f, 2.4378013e-05f,  9.70782157e-05f,  0.000256572953f,  0.000527352928f,
+ 0.000890796838f,  0.00124949518f,  0.00140604793f,  0.00107945998f,
+-2.15586031e-05f, -0.00206589462f, -0.00493342625f, -0.00807135101f,
+-0.0104515787f, -0.0107039866f, -0.00746258988f,  0.000109078838f,
+ 0.0117345872f,  0.0255795186f,  0.0381690155f,  0.0448461522f,
+ 0.0408218138f,  0.0226797758f, -0.00999595371f, -0.0533441602f,
+-0.0987927774f, -0.133827418f, -0.144042973f, -0.116198269f,
+-0.0416493482f,  0.0806808506f,  0.242643854f,  0.427127981f,
+ 0.610413245f,  0.766259257f,  0.8708884f,  0.907742029f,
+};
+
+static float const iir_coefs[] = {
+ 0.0262852045f,  0.0998310478f,  0.206865061f,  0.330224134f,
+ 0.454420362f,  0.568578357f,  0.666944466f,  0.747869771f,
+ 0.812324404f,  0.8626001f,  0.901427744f,  0.931486057f,
+ 0.955191529f,  0.974661783f,  0.991776305f,
+};
+
diff --git a/src/vr32.c b/src/vr32.c
new file mode 100644
index 0000000..65eed3f
--- /dev/null
+++ b/src/vr32.c
@@ -0,0 +1,657 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Variable-rate resampling. */
+
+#include <assert.h>
+#include <math.h>
+#if !defined M_PI
+#define M_PI    3.14159265358979323846
+#endif
+#if !defined M_LN2
+#define M_LN2   0.69314718055994530942
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "internal.h"
+#define FIFO_SIZE_T int
+#define FIFO_MIN 0x8000
+#include "fifo.h"
+#include "vr-coefs.h"
+
+#define FADE_LEN_BITS     9    /* fade_coefs[] has (2 << 9) + 1 entries. */
+#define PHASE_BITS_D      10   /* log2 of PHASES_D (is_d streams). */
+#define PHASE_BITS_U      9    /* log2 of PHASES_U (!is_d streams). */
+
+#define PHASES0_D         12   /* `phases0' given to prepare_coefs w/ coefs0_d. */
+#define POLY_FIR_LEN_D    20   /* Taps per phase; poly_fir1_d is unrolled 20x. */
+#define PHASES0_U         6    /* `phases0' given to prepare_coefs w/ coefs0_u. */
+#define POLY_FIR_LEN_U    12   /* Taps per phase; poly_fir1_u is unrolled 12x. */
+
+#define MULT32            (65536. * 65536.)        /* 2^32, as a double. */
+#define PHASES_D          (1 << PHASE_BITS_D)
+#define PHASES_U          (1 << PHASE_BITS_U)
+
+#define CONVOLVE \
+    _ _ _ _ _ _ _ _ _ _  _ _ _ _ _ _ _ _ _ _ \
+    _ _ _ _ _ _ _ _ _ _  _ _ _ _ _ _ _ _ _ _ \
+    _ _ _ _ _ _ _ _ _ _  _ _ _ _ _ _ _ _ _ _    /* 60 copies of macro `_'. */
+
+#define HALF_FIR_LEN_2 (iAL(half_fir_coefs) - 1)   /* Taps each side of centre */
+#define HALF_FIR_LEN_4 (HALF_FIR_LEN_2 / 2)
+
+#define _ sum += (input[-i] + input[i]) * half_fir_coefs[i], ++i;
+static float half_fir(float const * input)  /* Symmetric FIR about input[0]. */
+{       /* Reads input[-HALF_FIR_LEN_2 .. +HALF_FIR_LEN_2]; returns one sample. */
+  long i = 1;
+  float sum = input[0] * half_fir_coefs[0];                   /* Centre tap. */
+  CONVOLVE CONVOLVE                      /* 2 x 60 unrolled tap pairs (`_'). */
+  assert(i == HALF_FIR_LEN_2 + 1);       /* Unroll count must match the table. */
+  return (float)sum;
+}
+#undef _
+
+#define _ sum += (input[-i] + input[i]) * half_fir_coefs[2*i], ++i;
+static float double_fir0(float const * input) /* 1st sample of each x2 pair. */
+{       /* Uses the even-indexed half_fir_coefs, symmetric about input[0]. */
+  int i = 1;
+  float sum = input[0] * half_fir_coefs[0];
+  CONVOLVE                                  /* 60 unrolled tap pairs (`_'). */
+  assert(i == HALF_FIR_LEN_4 + 1);
+  return (float)(sum * 2);
+}
+#undef _
+
+#define _ sum += (input[-i] + input[1+i]) * half_fir_coefs[2*i+1], ++i;
+static float double_fir1(float const * input) /* 2nd sample of each x2 pair. */
+{   /* Uses the odd-indexed half_fir_coefs, symmetric about input[0]/input[1]. */
+  int i = 0;
+  float sum = 0;
+  CONVOLVE                                  /* 60 unrolled tap pairs (`_'). */
+  assert(i == HALF_FIR_LEN_4 + 0);
+  return (float)(sum * 2);
+}
+#undef _
+
+static float fast_half_fir(float const * input)  /* Short alternative filter. */
+{        /* Centre tap is .5; the 6 table taps apply at offsets +/-1,3,..,11. */
+  int i = 0;
+  float sum = input[0] * .5f;
+#define _ sum += (input[-(2*i+1)] + input[2*i+1]) * fast_half_fir_coefs[i], ++i;
+  _ _ _ _ _ _                   /* One `_' per fast_half_fir_coefs[] entry. */
+#undef _
+  return (float)sum;
+}
+
+#define IIR_FILTER _ _ _ _ _ _ _    /* 7 x `_'; also reused by half_phase. */
+#define _ in1=(in1-p->y[i])*iir_coefs[i]+tmp1;tmp1=p->y[i],p->y[i]=in1;++i;\
+          in0=(in0-p->y[i])*iir_coefs[i]+tmp0;tmp0=p->y[i],p->y[i]=in0;++i;
+
+typedef struct {float x[2], y[AL(iir_coefs)];} half_iir_t; /* Filter memory. */
+
+static float half_iir1(half_iir_t * p, float in0, float in1)
+{     /* in1 runs through sections y[0],y[2],..; in0 through y[1],y[3],.. */
+  int i = 0;
+  float tmp0, tmp1;                /* Previous input of the current section. */
+  tmp0 = p->x[0], p->x[0] = in0;
+  tmp1 = p->x[1], p->x[1] = in1;
+  IIR_FILTER                                    /* Sections 0..13, in pairs. */
+  p->y[i] = in1 = (in1 - p->y[i]) * iir_coefs[i] + tmp1;   /* Last section. */
+  return in1 + in0;                        /* Sum of the two filtered paths. */
+}
+#undef _
+
+static void half_iir(half_iir_t * p, float * obuf, float const * ibuf, int olen)
+{        /* Each of the olen outputs consumes one consecutive pair of inputs. */
+  for (; olen > 0; --olen, ibuf += 2)
+    *obuf++ = (float)half_iir1(p, ibuf[0], ibuf[1]);
+}
+
+static void half_phase(half_iir_t * p, float * buf, int len)
+{               /* Filter buf in place, feeding a constant 0 as the in1 input. */
+  float const small_normal = 1/MULT32/MULT32; /* To quash denormals on path 0.*/
+  int i;
+  for (i = 0; i < len; buf[i] = (float)half_iir1(p, buf[i], 0), ++i);
+#define _ p->y[i] += small_normal, i += 2;
+  i = 0, _ IIR_FILTER         /* 8 x `_': touches y[0], y[2], ..., y[14]. */
+#undef _
+#define _ p->y[i] -= small_normal, i += 2;
+  i = 0, _ IIR_FILTER         /* Then removes the same offset again. */
+#undef _
+}
+
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, \
+    fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + \
+    ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)]
+
+#define COEF(h,l,i) ((i)<0||(i)>=(l)?0:(h)[(i)>(l)/2?(l)-(i):(i)])
+static void prepare_coefs(float * coefs, int n, int phases0, int phases,
+    float const * coefs0, double multiplier)  /* Fills coefs[n * phases * 2]. */
+{
+  double k[6];                /* Sliding window over the prototype coefs0[]. */
+  int length0 = n * phases0, length = n * phases, K0 = iAL(k)/2 - 1, i, j, pos;
+  float * coefs1 = malloc(((size_t)length / 2  + 1) * sizeof(*coefs1));
+  float * p = coefs1, f0, f1 = 0; /* NOTE(review): malloc result is unchecked. */
+
+  for (j = 0; j < iAL(k); k[j] = COEF(coefs0, length0, j - K0), ++j);
+  for (pos = i = 0; i < length0 / 2; ++i) {  /* Per prototype interval: */
+    double b=(1/24.)*(k[0]+k[4]+6*k[2]-4*(k[1]+k[3])),d=.5*(k[1]+k[3])-k[2]-b;
+    double a=(1/120.)*(k[5]-k[2]-9*(9*b+d)+2.5*(k[3]-k[1])-2*(k[4]-k[0]));
+    double c=(1/12.)*(k[4]-k[0]-2*(k[3]-k[1])-60*a),e=.5*(k[3]-k[1])-a-c;
+    for (; pos / phases == i; pos += phases0) {  /* Points in this interval. */
+      double x = (double)(pos % phases) / phases;
+      *p++ = (float)(k[K0] + ((((a*x + b)*x + c)*x + d)*x + e)*x); /* Quintic */
+    }
+    for (j = 0; j < iAL(k) - 1; k[j] = k[j + 1], ++j);     /* Slide window. */
+    k[j] = COEF(coefs0, length0, i + iAL(k) / 2 + 1);
+  }
+  if (!(length & 1))
+    *p++ = (float)k[K0];
+  assert(p - coefs1 == length / 2  + 1);
+
+  for (i = 0; i < n; ++i) for (j = phases - 1; j >= 0; --j, f1 = f0) {
+    pos = (n - 1 - i) * phases + j;
+    f0 = COEF(coefs1, length, pos) * (float)multiplier;
+    coef(coefs, 1, n, j, 0, i) = (float)f0;          /* Coefficient itself. */
+    coef(coefs, 1, n, j, 1, i) = (float)(f1 - f0);   /* Delta from previous f0. */
+  }
+  free(coefs1);
+}
+
+#define _ sum += (b *x + a)*input[i], ++i;   /* One tap; also used by .._u. */
+#define a (coef(poly_fir_coefs_d, 1, POLY_FIR_LEN_D, phase, 0,i))
+#define b (coef(poly_fir_coefs_d, 1, POLY_FIR_LEN_D, phase, 1,i))
+static float poly_fir_coefs_d[POLY_FIR_LEN_D * PHASES_D * 2]; /* Set in vr_init */
+
+static float poly_fir1_d(float const * input, uint32_t frac)
+{            /* One output sample from input[0..POLY_FIR_LEN_D-1] at `frac'. */
+  int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_D)); /* Top bits of frac. */
+  float sum = 0, x = (float)(frac << PHASE_BITS_D) * (float)(1 / MULT32);
+  _ _ _ _ _  _ _ _ _ _  _ _ _ _ _  _ _ _ _ _     /* x: remaining bits, / 2^32. */
+  assert(i == POLY_FIR_LEN_D);
+  return (float)sum;
+}
+#undef a
+#undef b
+#define a (coef(poly_fir_coefs_u, 1, POLY_FIR_LEN_U, phase, 0,i))
+#define b (coef(poly_fir_coefs_u, 1, POLY_FIR_LEN_U, phase, 1,i))
+static float poly_fir_coefs_u[POLY_FIR_LEN_U * PHASES_U * 2]; /* Set in vr_init */
+
+static float poly_fir1_u(float const * input, uint32_t frac)
+{            /* One output sample from input[0..POLY_FIR_LEN_U-1] at `frac'. */
+  int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_U)); /* Top bits of frac. */
+  float sum = 0, x = (float)(frac << PHASE_BITS_U) * (float)(1 / MULT32);
+  _ _ _ _ _  _ _ _ _ _  _ _                      /* x: remaining bits, / 2^32. */
+  assert(i == POLY_FIR_LEN_U);
+  return (float)sum;
+}
+#undef a
+#undef b
+#undef _
+
+#define ADD_TO(x,y)           x.all += y.all       /* On the whole 64 bits. */
+#define SUBTRACT_FROM(x,y)    x.all -= y.all
+#define FRAC(x)               x.part.frac          /* Unsigned 32-bit half. */
+#define INT(x)                x.part.integer       /* Signed 32-bit half. */
+
+typedef struct {
+  union {                  /* 64-bit value, also viewable as integer.frac. */
+    int64_t all;
+#if WORDS_BIGENDIAN
+    struct {int32_t integer; uint32_t frac;} part;
+#else
+    struct {uint32_t frac; int32_t integer;} part;
+#endif
+  } at, step, step_step; /* Read position; its increment; step's increment. */
+  float const * input;           /* Samples that `at' indexes into. */
+  int len, stage_num;            /* Reading stops once INT(at) reaches len. */
+  bool is_d; /* true: downsampling at x2 rate; false: upsampling at 1x rate. */
+  double step_mult;              /* set_step: step = io_ratio * step_mult. */
+} stream_t;
+
+static int poly_fir_d(stream_t * s, float * output, int olen)
+{     /* Makes output samples two at a time; returns how many were written. */
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
+    output[i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    if (!(INT(s->at) < s->len)) {    /* No input for the pair's 2nd sample: */
+      SUBTRACT_FROM(s->at, s->step); /* undo; output[i] is not counted. */
+      break;
+    }
+    output[++i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);   /* step changes once per pair. */
+  }
+  return i;
+}
+
+static int poly_fir_fade_d(
+    stream_t * s, float const * vol, int step, float * output, int olen)
+{   /* As poly_fir_d, but ADDS each sample scaled by *vol; vol moves by step. */
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; ++i, vol += step) {
+    output[i] += *vol * poly_fir1_d(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    if (!(INT(s->at) < s->len)) {    /* No input for the pair's 2nd sample: */
+      SUBTRACT_FROM(s->at, s->step); /* undo the position; i is not advanced. */
+      break;
+    }
+    output[++i] += *(vol += step) * poly_fir1_d(input + INT(s->at),FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);   /* step changes once per pair. */
+  }
+  return i;
+}
+
+static int poly_fir_u(stream_t * s, float * output, int olen)
+{      /* Makes up to olen samples, one per step; returns the number written. */
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
+    output[i] = poly_fir1_u(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);   /* step changes once per sample. */
+  }
+  return i;
+}
+
+static int poly_fir_fade_u(
+    stream_t * s, float const * vol, int step, float * output, int olen)
+{   /* ADDS scaled samples to every 2nd slot of output; returns 2 x the count. */
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; i += 2, vol += step) {
+    output[i] += *vol * poly_fir1_u(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);
+  }
+  return i;
+}
+
+#define shiftr(x,by) ((by) < 0? (x) << (-(by)) : (x) >> (by)) /* by may be < 0 */
+#define shiftl(x,by) shiftr(x,-(by))
+#define stage_occupancy(s) (fifo_occupancy(&(s)->fifo) - 4*HALF_FIR_LEN_2)
+#define stage_read_p(s) ((float *)fifo_read_ptr(&(s)->fifo) + 2*HALF_FIR_LEN_2)
+#define stage_preload(s) memset(fifo_reserve(&(s)->fifo, (s)->preload), \
+    0, sizeof(float) * (size_t)(s)->preload);  /* N.B. supplies its own `;'. */
+
+typedef struct {
+  fifo_t fifo;                       /* This stage's float samples. */
+  double step_mult;                  /* Copied to a stream by enter_new_stage. */
+  int is_fast, x_fade_len, preload;  /* x_fade_len: cross-fade samples left. */
+} stage_t;
+
+typedef struct {
+  int num_stages0, num_stages, flushing; /* As requested; max(requested, 1). */
+  int fade_len, slew_len, xfade, stage_inc, switch_stage_num;
+  double new_io_ratio, default_io_ratio; /* default: nonzero until 1st set. */
+  stage_t * stages;                  /* Valid indices: -1 .. num_stages-1. */
+  fifo_t output_fifo;
+  half_iir_t halfer;
+  stream_t current, fadeout; /* Current/fade-in, fadeout streams. */
+} rate_t;
+
+static float fade_coefs[(2 << FADE_LEN_BITS) + 1];   /* Set in vr_init: 1..0. */
+
+static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double mult)
+{   /* Zero *p, create its FIFOs and, on first use, the shared static tables. */
+  int i;
+  assert(num_stages >= 0);
+  memset(p, 0, sizeof(*p));
+  /* NOTE(review): the calloc result below is used without a NULL check. */
+  p->num_stages0 = num_stages;
+  p->num_stages = num_stages = max(num_stages, 1);
+  p->stages = (stage_t *)calloc((unsigned)num_stages + 1, sizeof(*p->stages)) + 1;
+  for (i = -1; i < p->num_stages; ++i) {     /* `+ 1' above makes [-1] valid. */
+    stage_t * s = &p->stages[i];
+    fifo_create(&s->fifo, sizeof(float));
+    s->step_mult = 2 * MULT32 / shiftl(2, i);             /* = MULT32 / 2^i. */
+    s->preload = i < 0? 0 : i == 0? 2 * HALF_FIR_LEN_2 : 3 * HALF_FIR_LEN_2 / 2;
+    stage_preload(s);                      /* Zero-filled initial samples. */
+    s->is_fast = true;
+    lsx_debug("%-3i preload=%i", i, s->preload);
+  }
+  fifo_create(&p->output_fifo, sizeof(float));
+  p->default_io_ratio = default_io_ratio;
+  if (!fade_coefs[0]) { /* NOTE(review): done once, unguarded; later mult unused */
+    for (i = 0; i < iAL(fade_coefs); ++i)  /* Raised cosine from 1 down to 0. */
+      fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1))));
+    prepare_coefs(poly_fir_coefs_u, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult);
+    prepare_coefs(poly_fir_coefs_d, POLY_FIR_LEN_D, PHASES0_D, PHASES_D, coefs0_d, mult *.5);
+  }
+  assert(fade_coefs[0]);
+}
+
+static void enter_new_stage(rate_t * p, int occupancy0)
+{        /* Point p->current at the stage named by p->current.stage_num. */
+  p->current.len = shiftr(occupancy0, p->current.stage_num);
+  p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
+
+  p->current.step_mult = p->stages[p->current.stage_num].step_mult;
+  p->current.is_d = p->current.stage_num >= 0;   /* Stage -1 is the !is_d one. */
+  if (p->current.is_d)
+    p->current.step_mult *= .5;
+}
+
+static void set_step(stream_t * p, double io_ratio)
+{                        /* step = io_ratio scaled by step_mult, plus .5. */
+  p->step.all = (int64_t)(io_ratio * p->step_mult + .5);
+}
+
+static bool set_step_step(stream_t * p, double io_ratio, int slew_len)
+{  /* Set step_step so step moves towards io_ratio's step over slew_len adds. */
+  int64_t dif;
+  int difi;
+  stream_t tmp = *p;                 /* Scratch copy: p->step is left alone. */
+  set_step(&tmp, io_ratio);
+  dif = tmp.step.all - p->step.all;
+  dif = dif < 0? dif - (slew_len >> 1) : dif + (slew_len >> 1); /* To nearest. */
+  difi = (int)dif;   /* Try to avoid int64_t div. */
+  p->step_step.all = difi == dif? difi / slew_len : dif / slew_len;
+  return p->step_step.all != 0;      /* false: the change rounds to nothing. */
+}
+
+static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
+{   /* slew_len != 0: ramp to io_ratio over slew_len; else apply it at once. */
+  assert(io_ratio > 0);
+  if (slew_len) {
+    if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len))
+      p->slew_len = 0, p->new_io_ratio = 0, p->fadeout.step_step.all = 0;
+    else {
+      p->new_io_ratio = io_ratio;    /* Applied exactly once the slew ends. */
+      if (p->fade_len)
+        set_step_step(&p->fadeout, io_ratio, p->slew_len);
+    }
+  }
+  else {
+    if (p->default_io_ratio) { /* Then this is the first call to this fn. */
+      int octave = (int)floor(log(io_ratio) / M_LN2);      /* floor(log2). */
+      p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1);
+      enter_new_stage(p, 0);
+    }
+    else if (p->fade_len)
+      set_step(&p->fadeout, io_ratio);
+    set_step(&p->current, io_ratio);
+    if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
+    p->default_io_ratio = 0;         /* Marks the first call as done. */
+  }
+}
+
+static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_num)
+{   /* Fill stage stage_num from stage stage_num - sign; false if nothing new. */
+  int i = 0;
+  float * dest;
+  stage_t * s = &p->stages[stage_num];             /* Destination stage. */
+  stage_t * s1 = &p->stages[stage_num - sign];     /* Source stage. */
+  float const * src = (float *)fifo_read_ptr(&s1->fifo) + HALF_FIR_LEN_2;
+  int len = shiftr(fifo_occupancy(&s1->fifo) - HALF_FIR_LEN_2 * 2, sign);
+  int already_done = fifo_occupancy(&s->fifo) - s->preload;
+  if ((len -= already_done) <= 0)
+    return false;
+  src += shiftl(already_done, sign);     /* Skip source already consumed. */
+
+  dest = fifo_reserve(&s->fifo, len);
+  if (stage_num < 0) for (; i < len; ++src)  /* Two outputs per input sample. */
+    dest[i++] = double_fir0(src), dest[i++] = double_fir1(src);
+  else {                                     /* One output per two inputs. */
+    bool should_be_fast = p->stage_inc;
+    if (!s->x_fade_len && stage_num == p->switch_stage_num) {
+      p->switch_stage_num = 0;
+      if (s->is_fast != should_be_fast) {    /* Start a filter cross-fade. */
+        s->x_fade_len = 1 << FADE_LEN_BITS, s->is_fast = should_be_fast, ++p->xfade;
+        lsx_debug("xfade level %i, inc?=%i", stage_num, p->stage_inc);
+      }
+    }
+    if (s->x_fade_len) {     /* vol1 values rise and vol2 values fall below. */
+      float const * vol1 = fade_coefs + (s->x_fade_len << 1);
+      float const * vol2 = fade_coefs + (((1 << FADE_LEN_BITS) - s->x_fade_len) << 1);
+      int n = min(len, s->x_fade_len);
+      /*lsx_debug("xfade level %i, inc?=%i len=%i n=%i", stage_num, p->stage_inc, s->x_fade_len, n);*/
+      if (should_be_fast)
+        for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
+          dest[i++] = *vol1 * fast_half_fir(src) + *vol2 * half_fir(src);
+      else for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
+        dest[i++] = *vol2 * fast_half_fir(src) + *vol1 * half_fir(src);
+      s->x_fade_len -= n;
+      p->xfade -= !s->x_fade_len;            /* Fade finished: one fewer. */
+    }
+    if (stage_num < min_stage_num)           /* Remaining samples, unfaded: */
+      for (; i < len; dest[i++] = fast_half_fir(src), src += 2);
+    else for (; i < len; dest[i++] = half_fir(src), src += 2);
+  }
+  if (p->flushing > 0)
+    stage_preload(s);                        /* Append zeros while flushing. */
+  return true;
+}
+
+static int vr_process(rate_t * p, int olen0)
+{ /* Make up to olen0 samples in output_fifo; returns how many were made. */
+  assert(p->num_stages > 0);
+  if (p->default_io_ratio)                   /* Ratio was never set: use it. */
+    vr_set_io_ratio(p, p->default_io_ratio, 0);
+  {
+    float * output = fifo_reserve(&p->output_fifo, olen0);
+    int j, odone0 = 0, min_stage_num = p->current.stage_num;
+    int occupancy0, max_stage_num = min_stage_num;
+    if (p->fade_len) {                       /* Both streams are live. */
+      min_stage_num = min(min_stage_num, p->fadeout.stage_num);
+      max_stage_num = max(max_stage_num, p->fadeout.stage_num);
+    }
+
+    for (j = min(min_stage_num, 0); j <= max_stage_num; ++j)
+      if (j && !do_input_stage(p, j, j < 0? -1 : 1, min_stage_num))
+        break;
+    if (p->flushing > 0)
+      p->flushing = -1;
+
+    occupancy0 = shiftl(max(0,stage_occupancy(&p->stages[max_stage_num])), max_stage_num);
+    p->current.len = shiftr(occupancy0, p->current.stage_num);
+    p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
+    if (p->fade_len) {
+      p->fadeout.len = shiftr(occupancy0, p->fadeout.stage_num);
+      p->fadeout.input = stage_read_p(&p->stages[p->fadeout.stage_num]);
+    }
+
+    while (odone0 < olen0) {
+      int odone, odone2, olen = olen0 - odone0, stage_dif = 0, shift;
+      float buf[64 << 1];                    /* Work area: 128 floats. */
+
+      olen = min(olen, (int)(AL(buf) >> 1));
+      if (p->slew_len)
+        olen = min(olen, p->slew_len);
+      else if (p->new_io_ratio) {            /* Slew done: set exact ratio. */
+        set_step(&p->current, p->new_io_ratio);
+        set_step(&p->fadeout, p->new_io_ratio);
+        p->fadeout.step_step.all = p->current.step_step.all = 0;
+        p->new_io_ratio = 0;
+      }
+      if (!p->flushing && !p->fade_len && !p->xfade) { /* May change stage? */
+        if (p->current.is_d) {
+          if (INT(p->current.step) && FRAC(p->current.step))
+            stage_dif = 1, ++max_stage_num;
+          else if (!INT(p->current.step) && FRAC(p->current.step) < (1u << 31))
+            stage_dif = -1, --min_stage_num;
+        } else if (INT(p->current.step) > 1 && FRAC(p->current.step))
+          stage_dif = 1, ++max_stage_num;
+      }
+      if (stage_dif) {
+        int n = p->current.stage_num + stage_dif;
+        if (n >= p->num_stages)              /* No such stage: stay put. */
+          --max_stage_num;
+        else {
+          p->stage_inc = stage_dif > 0;
+          p->fadeout = p->current;           /* Old stream becomes fadeout. */
+          p->current.stage_num += stage_dif;
+          if (!p->stage_inc)
+          p->switch_stage_num = p->current.stage_num;
+          if ((p->current.stage_num < 0 && stage_dif < 0) ||
+              (p->current.stage_num > 0 && stage_dif > 0)) {
+            stage_t * s = &p->stages[p->current.stage_num];
+            fifo_clear(&s->fifo);
+            stage_preload(s);
+            s->is_fast = false;
+            do_input_stage(p, p->current.stage_num, stage_dif, p->current.stage_num);
+          }
+          if (p->current.stage_num > 0 && stage_dif < 0) {
+            int idone = INT(p->current.at);
+            stage_t * s = &p->stages[p->current.stage_num];
+            fifo_trim_to(&s->fifo, 2 * HALF_FIR_LEN_2 + idone + (POLY_FIR_LEN_D >> 1));
+            do_input_stage(p, p->current.stage_num, 1, p->current.stage_num);
+          }
+          enter_new_stage(p, occupancy0);
+          shift = -stage_dif;
+#define lshift(x,by) (x)=(by)>0?(x)<<(by):(x)>>-(by)
+          lshift(p->current.at.all, shift);
+          shift += p->fadeout.is_d - p->current.is_d;
+          lshift(p->current.step.all, shift);
+          lshift(p->current.step_step.all, shift);
+          p->fade_len = AL(fade_coefs) - 1;
+          lsx_debug("switch from stage %i to %i, x2 from %i to %i", p->fadeout.stage_num, p->current.stage_num, p->fadeout.is_d, p->current.is_d);
+        }
+      }
+
+      if (p->fade_len) {                     /* Mix current with fadeout: */
+        float const * vol1 = fade_coefs + p->fade_len;
+        float const * vol2 = fade_coefs + (iAL(fade_coefs) - 1 - p->fade_len);
+        int olen2 = (olen = min(olen, p->fade_len >> 1)) << 1;
+
+        /* x2 is more fine-grained so may fail to produce a pair of samples
+         * where x1 would not (the x1 second sample is a zero so is always
+         * available).  So do x2 first, then feed odone to the second one. */
+        memset(buf, 0, sizeof(*buf) * (size_t)olen2);
+        if (p->current.is_d && p->fadeout.is_d) {
+          odone  = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
+          odone2 = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, odone);
+        } else if (p->current.is_d) {
+          odone  = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
+          odone2 = poly_fir_fade_u(&p->fadeout, vol2, 2, buf, odone);
+        } else {
+          assert(p->fadeout.is_d);
+          odone  = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, olen2);
+          odone2 = poly_fir_fade_u(&p->current, vol1,-2, buf, odone);
+        }
+        assert(odone == odone2);
+        (void)odone2;
+        p->fade_len -= odone;
+        if (!p->fade_len) {                  /* Fade over: drop fadeout. */
+          if (p->stage_inc)
+            p->switch_stage_num = min_stage_num++;
+          else
+            --max_stage_num;
+        }
+        half_iir(&p->halfer, &output[odone0], buf, odone >>= 1);
+      }
+      else if (p->current.is_d) {            /* Make pairs, then halve. */
+        odone = poly_fir_d(&p->current, buf, olen << 1) >> 1;
+        half_iir(&p->halfer, &output[odone0], buf, odone);
+      }
+      else {                                 /* Written straight to output. */
+        odone = poly_fir_u(&p->current, &output[odone0], olen);
+        if (p->num_stages0)
+          half_phase(&p->halfer, &output[odone0], odone);
+      }
+      odone0 += odone;
+      if (p->slew_len)
+        p->slew_len -= odone;
+      if (odone != olen)
+        break; /* Need more input. */
+    } {                                      /* Drop input now consumed: */
+      int from = max(0, max_stage_num), to = min(0, min_stage_num);
+      int i, idone = shiftr(INT(p->current.at), from - p->current.stage_num);
+      INT(p->current.at) -= shiftl(idone, from - p->current.stage_num);
+      if (p->fade_len)
+        INT(p->fadeout.at) -= shiftl(idone, from - p->fadeout.stage_num);
+      for (i = from; i >= to; --i, idone <<= 1)
+        fifo_read(&p->stages[i].fifo, idone, NULL);
+    }
+    fifo_trim_by(&p->output_fifo, olen0 - odone0); /* Unused reservation. */
+    return odone0;
+  }
+}
+
+static float * vr_input(rate_t * p, float const * input, size_t n)
+{              /* Passes n and input to fifo_write on stage 0's FIFO. */
+  return fifo_write(&p->stages[0].fifo, (int)n, input);
+}
+
+static float const * vr_output(rate_t * p, float * output, size_t * n)
+{       /* Read up to *n samples from output_fifo; *n becomes the count read. */
+  fifo_t * fifo = &p->output_fifo;
+  if (1 || !p->num_stages0)          /* Always taken; `else' is disabled. */
+    return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output);
+  else { /* Ignore this complication for now. */
+    int const IIR_DELAY = 2;
+    float * ptr = fifo_read_ptr(fifo);
+    int olen = min((int)*n, max(0, fifo_occupancy(fifo) - IIR_DELAY));
+    *n = (size_t)olen;
+    if (output)
+      memcpy(output, ptr + IIR_DELAY, *n * sizeof(*output));
+    fifo_read(fifo, olen, NULL);
+    return ptr + IIR_DELAY;
+  }
+}
+
+static void vr_flush(rate_t * p)
+{                    /* Only the first call has any effect. */
+  if (!p->flushing) {
+    stage_preload(&p->stages[0]);    /* Append `preload' zeros to stage 0. */
+    ++p->flushing;                   /* vr_process later sets this to -1. */
+  }
+}
+
+static void vr_close(rate_t * p)
+{                /* Free every FIFO and the stage array owned by this channel. */
+  stage_t * const first = p->stages - 1;   /* &stages[-1], as vr_init got it. */
+  int n = 0;
+  fifo_delete(&p->output_fifo);
+  while (n <= p->num_stages) {               /* I.e. stages[-1..num_stages-1]. */
+    fifo_delete(&first[n].fifo);
+    ++n;
+  }
+  free(first);
+}
+
+static double vr_delay(rate_t * p)
+{                /* Placeholder (TODO): a fixed value, not a computed delay. */
+  (void)p;                                 /* The argument is not used yet. */
+  return 100;
+}
+
+static void vr_sizes(size_t * shared, size_t * channel)
+{                     /* Reports the sizes of the two state areas, in bytes. */
+  *shared = 0;
+  *channel = sizeof(rate_t);
+}
+
+static char const * vr_create(void * channel, void * shared,double max_io_ratio,
+    void * q_spec, void * r_spec, double scale)
+{      /* Initialise one channel's state; always returns 0 (no error text). */
+  int octaves = 0;               /* Halvings of max_io_ratio to reach <= 1. */
+  double ratio;
+  (void)shared, (void)q_spec, (void)r_spec;               /* Not used here. */
+  for (ratio = max_io_ratio; ratio > 1; ratio *= .5) ++octaves;
+  vr_init(channel, max_io_ratio, octaves, scale);
+  return 0;
+}
+
+static char const * vr_id(void)
+{                           /* Constant, human-readable name of this engine. */
+  return "single-precision variable-rate";
+}
+
+typedef void (* fn_t)(void);   /* Generic function-pointer type for the table. */
+fn_t _soxr_vr32_cb[] = { /* NOTE(review): order presumably fixed by the user. */
+  (fn_t)vr_input,
+  (fn_t)vr_process,
+  (fn_t)vr_output,
+  (fn_t)vr_flush,
+  (fn_t)vr_close,
+  (fn_t)vr_delay,
+  (fn_t)vr_sizes,
+  (fn_t)vr_create,
+  (fn_t)vr_set_io_ratio,
+  (fn_t)vr_id,
+};
diff --git a/tests/1-delay-clear.c b/tests/1-delay-clear.c
new file mode 100644
index 0000000..ba4d47c
--- /dev/null
+++ b/tests/1-delay-clear.c
@@ -0,0 +1,64 @@
+/* SoX Resampler Library      Copyright (c) 2007-15 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Test 1: exercises soxr_delay and soxr_clear */
+
+#ifdef NDEBUG /* N.B. assert used with active statements so enable always. */
+#undef NDEBUG /* Must undef above assert.h or other that might include it. */
+#endif
+
+#include <soxr.h>
+#include "../examples/examples-common.h"
+
+#define ranqd1(x) ((x) = (int32_t)(1664525u * (unsigned)(x) + 1013904223u)) /* int32_t x; unsigned maths wraps, so no signed-overflow UB */
+#define franqd1(x) (float)(ranqd1(x) * (1. / (65536. * 32768.))) /* [-1,1) */
+
+#define irate 9600
+#define orate 4410
+
+int main(int argc, char const * arg[])
+{
+  soxr_error_t error;
+  int32_t ran = 0;
+  int j;
+  /* One resampler is created here and used for both passes of the loop. */
+  soxr_t soxr = soxr_create(irate, orate, 1, &error, NULL, NULL, NULL);
+  assert(!error);
+
+  for (j=0; j<2; ++j) {
+    float ibuf[irate], out[orate+2], obuf[orate+2], * ibuf1 = ibuf;
+    size_t ilen = AL(ibuf)-1, olen = AL(obuf), i, odone = 0, odone0, odone1=0;
+    soxr_quality_spec_t  q_spec = soxr_quality_spec(SOXR_HQ, 0);
+    /* Fill the input buffer with pseudo-random samples. */
+    for (i=0; i<irate; ibuf[i++] = franqd1(ran));
+    /* Reference output: convert the same input with a single one-shot call. */
+    error = soxr_oneshot(irate, orate, 1, ibuf, ilen, NULL,
+        out, AL(out), &odone0, NULL, &q_spec, NULL);
+    assert(!error);
+    assert(odone0==orate);
+    /* Streamed conversion: input is given on the first call only (ilen is then 0). */
+    for (i=0; ilen || odone1; ++i) {
+      double out_samples = (double)orate / irate * (double)ilen;
+      double delayed_samples = soxr_delay(soxr);
+      unsigned max_out_samples = (unsigned)(out_samples + delayed_samples + .5);
+      assert(delayed_samples >= 0);
+      fprintf(stderr, "%5u %5u %5u\n",
+          (unsigned)ilen, max_out_samples, (unsigned)odone);
+      assert(max_out_samples+odone==odone0);
+      error = soxr_process(soxr, ibuf1, ilen, NULL, obuf+odone, olen, &odone1);
+      assert(!error);
+      odone += odone1;
+      ibuf1 = NULL, ilen = 0;
+      olen = min(100, AL(obuf)-odone);
+    }
+    assert(odone==odone0);
+    /* The streamed output must equal the one-shot output sample for sample. */
+    for (i=0; i<odone && out[i]==obuf[i]; ++i);
+    assert(i==odone);
+    /* Clear the resampler before the next pass. */
+    soxr_clear(soxr);
+  }
+  soxr_delete(soxr);
+
+  return 0 * argc * !arg; /* Always 0; mentions argc and arg so neither is unused. */
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..333c0bd
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,52 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+add_definitions (${PROJECT_C_FLAGS})
+link_libraries (${PROJECT_NAME})
+# Build one executable per .c file in this directory, named after the file.
+file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.c)
+foreach (fe ${SOURCES})
+  get_filename_component (f ${fe} NAME_WE)
+  add_executable (${f} ${fe})
+endforeach ()
+
+enable_testing ()
+# Test-vector parameters; base_rate is twice sweep_to_freq (44100).
+set (sweep_to_freq 22050)
+set (leader 1)
+set (len 16)
+math (EXPR base_rate "${sweep_to_freq} + ${sweep_to_freq}")
+
+macro (add_vector r) # Adds a rule that runs vector-gen to make ref-<r>.s32, and prepends it to ${vectors}.
+  set (output ${CMAKE_CURRENT_BINARY_DIR}/ref-${r}.s32)
+  add_custom_command (OUTPUT ${output} DEPENDS vector-gen ${CMAKE_CURRENT_LIST_FILE}
+    COMMAND vector-gen ${r} ${leader} ${len} ${sweep_to_freq} 1 ${output})
+  set (vectors ${output} ${vectors})
+endmacro ()
+
+macro (add_cmp_test from to bits) # Registers test <bits>-bit-perfect-<from>-<to>, which runs cmp-test.cmake in script mode.
+  set (name ${bits}-bit-perfect-${from}-${to})
+  add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN} -DEXAMPLES_BIN=${EXAMPLES_BIN} -Dleader=${leader} -Dto=${to}
+    -Dfrom=${from} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
+  add_vector (${from}) # Both rates need a reference vector.
+  add_vector (${to})
+endmacro ()
+
+unset (test_bits) # List of bit depths to test: 20 and/or 24, per the options below.
+if (WITH_SINGLE_PRECISION)
+  set (test_bits 20)
+endif ()
+if (WITH_DOUBLE_PRECISION)
+  set (test_bits ${test_bits} 24)
+endif ()
+# For each bit depth, test both directions between base_rate and each rate r.
+foreach (b ${test_bits})
+  foreach (r 96000 65537)
+    add_cmp_test (${base_rate} ${r} ${b})
+    add_cmp_test (${r} ${base_rate} ${b})
+  endforeach ()
+endforeach ()
+# `ALL' makes the reference vectors part of the default build.
+add_custom_target (test-vectors ALL DEPENDS ${vectors})
+
+add_test (1-delay-clear ${BIN}1-delay-clear)
diff --git a/tests/README b/tests/README
new file mode 100644
index 0000000..44460d6
--- /dev/null
+++ b/tests/README
@@ -0,0 +1 @@
+A few tests on the pass-band performance; not a comprehensive test suite.
diff --git a/tests/bandwidth-test b/tests/bandwidth-test
new file mode 100755
index 0000000..47c2303
--- /dev/null
+++ b/tests/bandwidth-test
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests varying bandwidth.
+# Writes one spectrogram (bw1-*.png, bw2-*.png) per setting; uses sox and
+# the 3-options-input-fn example from the current directory.
+
+tool=./3-options-input-fn
+
+spec="spectrogram -z120 -Z-20 -wd -ho"
+ext=f32; e=0
+rate1=48000
+rate2=44100
+
+for n in 1 2; do
+# Two passes: 48000 to 44100, then the reverse (rates are swapped below).
+rate1n=`expr $rate1 / 2`
+
+#sox -r $rate1 -n 0.$ext synth 1s sq pad .03 .03  gain -1
+sox -r $rate1 -n 0.$ext synth 8 sin 0:$rate1n gain -1
+# bw1: vary the pass value only (79 to 99 in steps of 5).
+for pass in `seq 79 5 99`; do
+	f=bw1-$rate2-p`printf %02u $pass`
+	$tool $rate1 $rate2 1 $e $e 4 0 $pass < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:100"
+done
+# bw2: the same pass values, each with stop = 200 - pass.
+for pass in `seq 79 5 99`; do
+	f=bw2-$rate2-p`printf %02u $pass`
+	stop=`expr 200 - $pass`
+	$tool $rate1 $rate2 1 $e $e 4 0 $pass $stop < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:$stop"
+done
+# Swap the rates for the second pass.
+r=$rate1; rate1=$rate2; rate2=$r
+
+done
+
+rm 0.$ext
diff --git a/tests/cmp-test.cmake b/tests/cmp-test.cmake
new file mode 100644
index 0000000..8db76c5
--- /dev/null
+++ b/tests/cmp-test.cmake
@@ -0,0 +1,30 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+if (${bits} STREQUAL 24) # Quality argument for the resampler: 45 for the 24-bit test, otherwise 44.
+  set (quality 45)
+else ()
+  set (quality 44)
+endif ()
+
+set (output ${from}-${to}-${quality}.s32)
+# Step 1: resample ref-${from}.s32 (as stdin) into ${output} (from stdout).
+execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${from} ${to} 1 2 2 ${quality} a
+  INPUT_FILE ref-${from}.s32
+  OUTPUT_FILE ${output}
+  ERROR_VARIABLE test_error
+  RESULT_VARIABLE test_result)
+
+if (test_result)
+  message (FATAL_ERROR "Resampling failure: ${test_error}")
+endif ()
+# Step 2: compare the result with the reference vector for the target rate.
+execute_process(COMMAND ${BIN}vector-cmp ref-${to}.s32 ${output} ${to} ${leader} ${len} ${bits} 98
+  OUTPUT_VARIABLE test_output
+  RESULT_VARIABLE test_result)
+# vector-cmp's standard output is reported whether it passed or failed.
+if (test_result)
+  message (FATAL_ERROR ${test_output})
+else ()
+  message (STATUS ${test_output})
+endif ()
diff --git a/tests/eg-test b/tests/eg-test
new file mode 100755
index 0000000..58d085c
--- /dev/null
+++ b/tests/eg-test
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Exercises each example programme.
+
+
+
+len=8
+#vg="valgrind --leak-check=full --show-reachable=yes"
+# (Un-comment the line above to prefix the example commands with valgrind.)
+
+
+# Exercise example 1:
+$vg ./1-single-block
+
+
+
+# Check that examples 2-4 can convert 96k<->44k1 and that results are same for each:
+ir=96000
+or=44100
+for i in 1 2; do
+  prev=""
+  sox -r $ir -n 0.f32 synth $len sin 0+`expr $ir / 2`
+  for f in `find . -type f -executable -name "[2-4]*"`; do
+    $vg $f $ir $or < 0.f32 > $f.f32
+    test x$prev != x && cmp $f.f32 $prev  # compare with the previous example's output
+    prev=$f.f32
+  done
+  or=96000  # second pass converts in the opposite direction
+  ir=44100
+done
+rm *.f32
+
+
+
+# Exercise VR making sure that varied internal stage reconfigurations occur:
+variations=(slow-sweep fast-changing)
+signals=(sine-wave saw-tooth-wave)
+for n in 0 1 2 3; do
+  signal=${signals[`expr $n % 2 || true`]}
+  variation=${variations[`expr $n / 2 || true`]}
+  $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hwd -o v$n.png -X 50 -c "variation:$variation signal:$signal"
+  vg=""  # $vg (if set) applies to the first variation only
+done
diff --git a/tests/io-test b/tests/io-test
new file mode 100755
index 0000000..a291c78
--- /dev/null
+++ b/tests/io-test
@@ -0,0 +1,59 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests IO
+
+
+
+ir=65537
+or=44100
+len=16
+f=1/32768
+g=32768:0
+tool=./3-options-input-fn
+
+types=(f32 f64 s32 s16)  # sample-type names, indexed by the numeric type arguments
+
+zs=(180 180 180 180 180 120 120 120 120)  # spectrogram -z value for each do_one call, indexed by $n
+# do_one ITYPE OTYPE Q: convert the $c-channel ITYPE file and plot io$c$n.png; increments $n.
+do_one() {
+  $tool $ir $or $c $1 $2 $3 < $c.${types[$1]} |
+  sox -t ${types[`expr $2 % 4`]} -r $or -c $c - -n spectrogram -X50 -hwk -z${zs[$n]} -o io$c$n.png -c "io-test i:${types[$1]} o:${types[`expr $2 % 4`]} ($2) q:$3"
+  n=`expr $n + 1`
+}
+
+j=3; test z$1 != z && j=$1  # highest channel count to test: 3, or $1 if given
+
+for c in `seq 1 $j`; do
+  for n in `seq 0 3`; do
+    sox -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
+  done
+
+  n=0
+  do_one 1 2 5
+  do_one 2 0 5
+  for m in `seq 0 3`; do do_one $m $m 5; done
+  do_one 3 2 3
+  do_one 0 3 3
+  do_one 0 11 3
+
+  f="$f sin $g"  # one more `sin' argument is appended for each extra channel
+  g=0+32768
+done
+
+rm ?.[sf][0-9][0-9]
+
+
+
+# Check conversion between differing I/O types, but no rate-change:
+
+for i in 1 2 3; do
+  prev=""
+  sox -n -c $i 0.f32 synth $len gain -.1
+  $tool 1 1 $i 0 2 < 0.f32 | $tool 1 1 $i 2 0 > 1.f32
+  cmp [01].f32
+done
+rm *.f32
diff --git a/tests/large-ratio-test b/tests/large-ratio-test
new file mode 100755
index 0000000..64f1789
--- /dev/null
+++ b/tests/large-ratio-test
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests interpolating then decimating by the same, large ratio.
+
+tool=../examples/3-options-input-fn
+q=6
+ratio=2e4
+srate=8000
+nrate=`expr $srate / 2`
+
+rm -f lr.png
+# Generate the test signal with vector-gen.
+../tests/vector-gen $srate 0 8 $nrate .9375 1.s32
+# Convert 1 -> $ratio, then pipe straight back through $ratio -> 1.
+$tool 1 $ratio 1 2 1 $q < 1.s32 | $tool $ratio 1 1 1 2 $q > 2.s32
+# Plot the original (1.s32) and the round-tripped signal (2.s32) together.
+sox -M -r $srate -c1 1.s32 -r $srate -c1 2.s32 -n spectrogram -hwd -Z-10 -z180 -o lr.png -c "large-ratio-test q:$q ratio:$ratio"
+
+rm [12].s32
diff --git a/tests/phase-test b/tests/phase-test
new file mode 100755
index 0000000..4c491d8
--- /dev/null
+++ b/tests/phase-test
@@ -0,0 +1,38 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests varying phase-response.
+
+tool=./3-options-input-fn
+spec="spectrogram -z160 -Z-20 -X 2000 -wd -ho"
+ext=f32; e=0
+rate1=48000
+rate2=44100
+
+for n in 1 2; do
+	sox -r $rate1 -n 0.$ext synth 1s sq pad .03 .03  gain -1
+	# (0.$ext is the input for every conversion in this pass.)
+	# Test the following combinations:
+	names=(linear-phase intermediate-phase maximum-phase minimum-phase)
+	filters=(standard-filter steep-filter)
+
+	for q in `seq 0 7`; do
+		f=ph-$rate2-q$q
+		name=${names[`expr $q % 4 || true`]}
+		filter=${filters[`expr $q / 4 || true`]}
+		$tool $rate1 $rate2 1 $e $e $q'6' < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test $filter $name"
+	done
+
+	# Test specific phase-response percentages:
+	for q in `seq 0 20 100`; do
+		f=ph-$rate2-p`printf %03u $q`
+		$tool $rate1 $rate2 1 $e $e 46 0 0 0 $q < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test phase:${q}%"
+	done
+	# Swap the rates for the second pass.
+	r=$rate1; rate1=$rate2; rate2=$r
+done
+
+rm 0.$ext
diff --git a/tests/q-test b/tests/q-test
new file mode 100755
index 0000000..7a0f0a2
--- /dev/null
+++ b/tests/q-test
@@ -0,0 +1,72 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests conversion qualities 0..7 & variable-rate.
+
+
+
+tool=./3-options-input-fn
+ext=f64; e=1
+c=1
+q1=0; q2=7
+rates=48000
+zs=(50 87 87 87 111 135 159 180 95)
+# zz Q: prints the spectrogram options for quality Q (index 8: the variable-rate run).
+zz() {
+	echo "spectrogram -z${zs[$1]} -Z-30 -wd -ho"
+}
+
+for rate0 in $rates; do
+
+rate1=$rate0
+rate2=44100
+
+for n in 1 2; do
+
+rate1n=`expr $rate1 / 2`
+
+
+
+# Convert sweep, for spectrogram:
+
+sox -r $rate1 -n -c $c 0.$ext synth 8 sin 0:$rate1n gain -1
+
+for q in `seq $q1 $q2`; do
+	f=qa-$rate1-$rate2-$q
+	$tool $rate1 $rate2 $c $e $e $q  0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f
+done
+q=8
+f=qa-$rate1-$rate2-v
+$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f
+
+
+
+# Convert impulse, for spectrogram:
+# (Un-comment the next line to skip this section: a here-document up to the lone `:'.)
+#: << :
+sox -r $rate1 -n 0.$ext synth 1s sq pad .03 .03  gain -1
+
+for q in `seq $q1 $q2`; do
+	f=qb-$rate1-$rate2-$q
+	$tool $rate1 $rate2 1 $e $e $q  0 < 0.$ext | sox -c1 -r$rate2 -t $ext - $f.wav
+done
+q=8
+f=qb-$rate1-$rate2-v
+$tool $rate1 $rate2 1 $e $e 4 20 < 0.$ext | sox -c1 -r$rate2 -t $ext - $f.wav
+
+# Combine impulse responses into multi-channel file (for inspection in Audacity):
+sox -M qb-$rate1-$rate2-?.wav q$rate1-$rate2.wav
+
+rm qb-$rate1-$rate2-?.wav
+:
+# Second pass converts in the opposite direction.
+rate1=44100
+rate2=$rate0
+
+done
+done
+
+rm 0.$ext
diff --git a/tests/scripts b/tests/scripts
new file mode 100755
index 0000000..f245919
--- /dev/null
+++ b/tests/scripts
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+# Runs each test script in turn; `set -e' stops at the first one that fails.
+../../tests/bandwidth-test
+../../tests/eg-test
+../../tests/io-test
+../../tests/large-ratio-test
+../../tests/phase-test
+../../tests/q-test
+../../tests/time-test
diff --git a/tests/time-test b/tests/time-test
new file mode 100755
index 0000000..e8904b3
--- /dev/null
+++ b/tests/time-test
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests rate conversion time for qualities 0..7 & variable-rate.
+
+tool=./3-options-input-fn
+ext=f32; e=0
+c=2
+q1=0; q2=7
+rates="48000 77773 96000"
+
+for rate0 in $rates; do
+	rate1=$rate0
+	rate2=44100
+	for n in 1 2; do
+		rate1n=`expr $rate1 / 2`
+		sox -r $rate1 -n -c $c 0.$ext synth 5: sin 0:$rate1n gain -1
+		# Time each quality from $q1 to $q2; the output is discarded.
+		for q in `seq $q1 $q2`; do
+			echo $rate1 '-->' $rate2 c=$c q=$q
+			time $tool $rate1 $rate2 $c $e $e $q < 0.$ext > /dev/null;
+		done
+		# Time the run reported as q=v.
+		echo $rate1 '-->' $rate2 c=$c q=v
+		time $tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext > /dev/null
+		# Second pass converts in the opposite direction.
+		rate1=44100
+		rate2=$rate0
+	done
+done
+
+rm 0.$ext
diff --git a/tests/vector-cmp.c b/tests/vector-cmp.c
new file mode 100644
index 0000000..6edd2d5
--- /dev/null
+++ b/tests/vector-cmp.c
@@ -0,0 +1,71 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Utility used to help test the library; not for general consumption.
+ *
+ * Compare two swept-sine files.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "../src/rint.h"
+
+/* Usage: vector-cmp file1 file2 rate leader-len len expect-bits expect-bw
+ *
+ * Reads both files as raw int32_t samples and records, for each bit position,
+ * the first sample at which the absolute difference has that bit set.  Returns
+ * 0 only if the reported bit count and bandwidth reach the expected values.
+ * N.B. `bit' holds the argument count on entry and is then reused as a counter. */
+int main(int bit, char const * arg[])
+{
+  FILE    * f1, * f2;
+  double  rate, leader_len, len, expect_bits, expect_bw;
+  int32_t s1, s2;
+  long count = 0;
+  static long thresh[32];
+  double bw, prev = 0;
+
+  if (bit < 8) { /* arg[1] to arg[7] are all read below. */
+    printf("usage: vector-cmp file1 file2 rate leader-len len bits bw\n");
+    exit(1);
+  }
+  f1          = fopen(arg[1], "rb");
+  f2          = fopen(arg[2], "rb");
+  rate        = atof (arg[3]); /* Rate for this vector */
+  leader_len  = atof (arg[4]); /* Leader length in seconds */
+  len         = atof (arg[5]); /* Sweep length (excl. leader_len) */
+  expect_bits = atof (arg[6]);
+  expect_bw   = atof (arg[7]);
+  if (!f1 || !f2) { /* Otherwise fread would be handed a null stream. */
+    printf("cannot open input file\n");
+    exit(1);
+  }
+
+  for (; fread(&s1, sizeof(s1), 1, f1) == 1 &&
+         fread(&s2, sizeof(s2), 1, f2) == 1; ++count) {
+    /* |s1 - s2| in unsigned arithmetic: cannot overflow, unlike abs(s1 - s2). */
+    unsigned long diff = s1 > s2?
+        (unsigned long)s1 - (unsigned long)s2 : (unsigned long)s2 - (unsigned long)s1;
+    for (bit = 0; diff && bit < 32; bit++, diff >>= 1)
+      if ((diff & 1) && !thresh[bit])
+        thresh[bit] = count + 1;
+  }
+
+  if (count != (long)((leader_len + len) * rate + .5)) {
+    printf("incorrect file length\n");
+    exit(1);
+  }
+
+  for (bit = 0; bit < 32; ++bit) {
+    bw = ((double)thresh[bit] - 1) / rate - leader_len;
+    if (bit && bw >= 0 && (bw - prev) * 100 / len < .08) {
+      --bit;
+      break;
+    }
+    prev = bw;
+  }
+  bit = 32 - bit;
+  bw = bw * 100 / len;
+  printf("Bit perfect to %i bits, from DC to %.2f%% nyquist.\n", bit, bw);
+  return !(bit >= expect_bits && bw >= expect_bw);
+}
diff --git a/tests/vector-gen.c b/tests/vector-gen.c
new file mode 100644
index 0000000..06d4bac
--- /dev/null
+++ b/tests/vector-gen.c
@@ -0,0 +1,68 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Utility used to help test the library; not for general consumption.
+ *
+ * Generate a swept sine to a file, with faded `lead-in' section.  */
+
+#define QUAD 0
+
+#if QUAD
+  #include <quadmath.h>
+#endif
+
+#include "../examples/examples-common.h"
+
+#if QUAD
+  #define modf modfq
+  #define cos cosq
+  #define sin sinq
+  #undef M_PI
+  #define M_PI M_PIq
+  #define real __float128
+  #define atof(x) strtoflt128(x, 0)
+#else
+  #define real double
+  #include "rint.h"
+#endif
+
+/* Usage: vector-gen rate lead-in-len len sweep-to-freq multiplier file
+ *
+ * Writes the samples to `file'.  Exits non-zero if an argument is missing, the
+ * file cannot be opened or completely written, or rate * lead-in-len is not a
+ * whole number.  N.B. `i' holds the argument count on entry and is then reused
+ * as the sample index. */
+int main(int i, char const * argv[])
+{
+  real rate, lead_in_len, len, sweep_to_freq, multiplier;
+  real f1, f2, n1, n2, m, dummy;
+  FILE * file;
+
+  if (i < 7) /* argv[1] to argv[6] are all read below. */
+    exit(1);
+  rate          = atof(argv[1]); /* Rate for this vector */
+  lead_in_len   = atof(argv[2]); /* Lead-in length in seconds */
+  len           = atof(argv[3]); /* Sweep length (excl. lead_in_len) */
+  sweep_to_freq = atof(argv[4]); /* Sweep from DC to this freq. */
+  multiplier    = atof(argv[5]); /* For headroom */
+  f1 = -sweep_to_freq / len * lead_in_len, f2 = sweep_to_freq;
+  n1 = rate * -lead_in_len, n2 = rate * len;
+  m = (f2 - f1) / (n2 - n1) / 2;
+  file = fopen(argv[6], "wb");
+  i = (int)n1;
+  if (!file || i != n1)
+    exit(1);
+  for (; i < (int)(n2 + .5); ++i) {
+    double d1 = multiplier * sin(2 * M_PI * modf(i * m * i / rate, &dummy));
+    double d = i < 0? d1 * (1 - cos(M_PI * (i + n1) / n1)) * .5 : d1;
+#if QUAD
+    size_t actual = fwrite(&d, sizeof(d), 1, file);
+#else
+    int32_t out = rint32(d * (32768. * 65536 - 1));
+    size_t actual = fwrite(&out, sizeof(out), 1, file);
+#endif
+    if (actual != 1)
+      return 1;
+  }
+  return fclose(file) != 0; /* Buffered samples are flushed here; report a failure. */
+}
-- 
2.30.2