From: Jochen Sprickerhof <git@jochen.sprickerhof.de>
Date: Mon, 15 Aug 2016 15:07:38 +0000 (+0200)
Subject: Imported Upstream version 1.8.0+dfsg
X-Git-Tag: archive/raspbian/1.14.0+dfsg-2+rpi1^2~10^2~12
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=b33c6dbcef571244fc9bcbf8c1ac12d4dc791689;p=pcl.git

Imported Upstream version 1.8.0+dfsg
---

diff --git a/cuda/common/include/pcl/cuda/cutil.h b/cuda/common/include/pcl/cuda/cutil.h
deleted file mode 100644
index 8f2ad57d..00000000
--- a/cuda/common/include/pcl/cuda/cutil.h
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
- /*
-* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
-*
-* Please refer to the NVIDIA end user license agreement (EULA) associated
-* with this source code for terms and conditions that govern your use of
-* this software. Any use, reproduction, disclosure, or distribution of
-* this software and related documentation outside the terms of the EULA
-* is strictly prohibited.
-*
-*/
-
-
-/* CUda UTility Library */
-
-#ifndef _CUTIL_H_
-#define _CUTIL_H_
-
-#ifdef _WIN32
-#   pragma warning( disable : 4996 ) // disable deprecated warning 
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    // helper typedefs for building DLL
-#ifdef _WIN32
-#  ifdef BUILD_DLL
-#    define DLL_MAPPING  __declspec(dllexport)
-#  else
-#    define DLL_MAPPING  __declspec(dllimport)
-#  endif
-#else 
-#  define DLL_MAPPING 
-#endif
-
-#ifdef _WIN32
-    #define CUTIL_API __stdcall
-#else
-    #define CUTIL_API
-#endif
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! CUT bool type
-    ////////////////////////////////////////////////////////////////////////////
-    enum CUTBoolean 
-    {
-        CUTFalse = 0,
-        CUTTrue = 1
-    };
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Deallocate memory allocated within Cutil
-    //! @param  pointer to memory 
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    void CUTIL_API
-  	cutFree( void* ptr);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Helper for bank conflict checking (should only be used with the
-    //! CUT_BANK_CHECKER macro)
-    //! @param tidx  thread id in x dimension of block
-    //! @param tidy  thread id in y dimension of block
-    //! @param tidz  thread id in z dimension of block
-    //! @param bdimx block size in x dimension
-    //! @param bdimy block size in y dimension
-    //! @param bdimz block size in z dimension
-    //! @param file  name of the source file where the access takes place
-    //! @param line  line in the source file where the access takes place
-    //! @param aname name of the array which is accessed
-    //! @param index index into the array
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    void CUTIL_API
-    cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz,
-                        unsigned int bdimx, unsigned int bdimy, 
-                        unsigned int bdimz, const char* file, const int line,
-                        const char* aname, const int index);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Find the path for a filename
-    //! @return the path if succeeded, otherwise 0
-    //! @param filename        name of the file
-    //! @param executablePath  optional absolute path of the executable
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    char* CUTIL_API
-    cutFindFilePath(const char* filename, const char* executablePath);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Read file \filename containing single precision floating point data
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param filename name of the source file
-    //! @param data  uninitialized pointer, returned initialized and pointing to
-    //!        the data read
-    //! @param len  number of data elements in data, -1 on error
-    //! @note If a NULL pointer is passed to this function and it is
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutReadFilef( const char* filename, float** data, unsigned int* len, 
-                  bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Read file \filename containing double precision floating point data
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param filename name of the source file
-    //! @param data  uninitialized pointer, returned initialized and pointing to
-    //!        the data read
-    //! @param len  number of data elements in data, -1 on error
-    //! @note If a NULL pointer is passed to this function and it is
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutReadFiled( const char* filename, double** data, unsigned int* len, 
-                  bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Read file \filename containing integer data
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param filename name of the source file
-    //! @param data  uninitialized pointer, returned initialized and pointing to
-    //!        the data read
-    //! @param len  number of data elements in data, -1 on error
-    //! @note If a NULL pointer is passed to this function and it is
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Read file \filename containing unsigned integer data
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param filename name of the source file
-    //! @param data  uninitialized pointer, returned initialized and pointing to
-    //!        the data read
-    //! @param len  number of data elements in data, -1 on error
-    //! @note If a NULL pointer is passed to this function and it is 
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutReadFileui( const char* filename, unsigned int** data, 
-                   unsigned int* len, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Read file \filename containing char / byte data
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param filename name of the source file
-    //! @param data  uninitialized pointer, returned initialized and pointing to
-    //!        the data read
-    //! @param len  number of data elements in data, -1 on error
-    //! @note If a NULL pointer is passed to this function and it is 
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutReadFileb( const char* filename, char** data, unsigned int* len, 
-                  bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Read file \filename containing unsigned char / byte data
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param filename name of the source file
-    //! @param data  uninitialized pointer, returned initialized and pointing to
-    //!        the data read
-    //! @param len  number of data elements in data, -1 on error
-    //! @note If a NULL pointer is passed to this function and it is
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutReadFileub( const char* filename, unsigned char** data, 
-                   unsigned int* len, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Write a data file \filename containing single precision floating point 
-    //! data
-    //! @return CUTTrue if writing the file succeeded, otherwise false
-    //! @param filename name of the file to write
-    //! @param data  pointer to data to write
-    //! @param len  number of data elements in data, -1 on error
-    //! @param epsilon  epsilon for comparison
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutWriteFilef( const char* filename, const float* data, unsigned int len,
-                   const float epsilon, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Write a data file \filename containing double precision floating point 
-    //! data
-    //! @return CUTTrue if writing the file succeeded, otherwise false
-    //! @param filename name of the file to write
-    //! @param data  pointer to data to write
-    //! @param len  number of data elements in data, -1 on error
-    //! @param epsilon  epsilon for comparison
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutWriteFiled( const char* filename, const float* data, unsigned int len,
-                   const double epsilon, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Write a data file \filename containing integer data
-    //! @return CUTTrue if writing the file succeeded, otherwise false
-    //! @param filename name of the file to write
-    //! @param data  pointer to data to write
-    //! @param len  number of data elements in data, -1 on error
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutWriteFilei( const char* filename, const int* data, unsigned int len,
-                   bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Write a data file \filename containing unsigned integer data
-    //! @return CUTTrue if writing the file succeeded, otherwise false
-    //! @param filename name of the file to write
-    //! @param data  pointer to data to write
-    //! @param len  number of data elements in data, -1 on error
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutWriteFileui( const char* filename,const unsigned int* data, 
-                    unsigned int len, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Write a data file \filename containing char / byte data
-    //! @return CUTTrue if writing the file succeeded, otherwise false
-    //! @param filename name of the file to write
-    //! @param data  pointer to data to write
-    //! @param len  number of data elements in data, -1 on error
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutWriteFileb( const char* filename, const char* data, unsigned int len, 
-                   bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Write a data file \filename containing unsigned char / byte data
-    //! @return CUTTrue if writing the file succeeded, otherwise false
-    //! @param filename name of the file to write
-    //! @param data  pointer to data to write
-    //! @param len  number of data elements in data, -1 on error
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutWriteFileub( const char* filename,const unsigned char* data,
-                    unsigned int len, bool verbose = false);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Load PGM image file (with unsigned char as data element type)
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    //! @note If a NULL pointer is passed to this function and it is 
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutLoadPGMub( const char* file, unsigned char** data,
-                  unsigned int *w,unsigned int *h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Load PPM image file (with unsigned char as data element type)
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutLoadPPMub( const char* file, unsigned char** data, 
-                  unsigned int *w,unsigned int *h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Load PPM image file (with unsigned char as data element type), padding 
-    //! 4th component
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutLoadPPM4ub( const char* file, unsigned char** data, 
-                   unsigned int *w,unsigned int *h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Load PGM image file (with unsigned int as data element type)
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    //! @note If a NULL pointer is passed to this function and it is 
-    //!       initialized within Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-        cutLoadPGMi( const char* file, unsigned int** data, 
-                     unsigned int* w, unsigned int* h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Load PGM image file (with unsigned short as data element type)
-    //! @return CUTTrue if reading the file succeeded, otherwise false
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    //! @note If a NULL pointer is passed to this function and it is 
-    //!       initialized  withing Cutil then cutFree() has to be used to
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-        cutLoadPGMs( const char* file, unsigned short** data, 
-                     unsigned int* w, unsigned int* h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Load PGM image file (with float as data element type)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    //! @note If a NULL pointer is passed to this function and it is 
-    //!       initialized withing Cutil then cutFree() has to be used to 
-    //!       deallocate the memory
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-        cutLoadPGMf( const char* file, float** data,
-                     unsigned int* w, unsigned int* h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Save PGM image file (with unsigned char as data element type)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-        cutSavePGMub( const char* file, unsigned char* data, 
-                      unsigned int w, unsigned int h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Save PPM image file (with unsigned char as data element type)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutSavePPMub( const char* file, unsigned char *data, 
-                unsigned int w, unsigned int h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Save PPM image file (with unsigned char as data element type, padded to 
-    //! 4 bytes)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutSavePPM4ub( const char* file, unsigned char *data, 
-                   unsigned int w, unsigned int h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Save PGM image file (with unsigned int as data element type)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutSavePGMi( const char* file, unsigned int* data,
-                 unsigned int w, unsigned int h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Save PGM image file (with unsigned short as data element type)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutSavePGMs( const char* file, unsigned short* data,
-                 unsigned int w, unsigned int h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Save PGM image file (with float as data element type)
-    //! @param file  name of the image file
-    //! @param data  handle to the data read
-    //! @param w     width of the image
-    //! @param h     height of the image
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutSavePGMf( const char* file, float* data,
-                 unsigned int w, unsigned int h);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Command line arguments: General notes
-    // * All command line arguments begin with '--' followed by the token; 
-    //   token and value are seperated by '='; example --samples=50
-    // * Arrays have the form --model=[one.obj,two.obj,three.obj] 
-    //   (without whitespaces)
-    ////////////////////////////////////////////////////////////////////////////
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Check if command line argument \a flag-name is given
-    //! @return CUTTrue if command line argument \a flag_name has been given, 
-    //!         otherwise 0
-    //! @param argc  argc as passed to main()
-    //! @param argv  argv as passed to main()
-    //! @param flag_name  name of command line flag
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutCheckCmdLineFlag( const int argc, const char** argv, 
-                         const char* flag_name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Get the value of a command line argument of type int
-    //! @return CUTTrue if command line argument \a arg_name has been given and
-    //!         is of the requested type, otherwise CUTFalse
-    //! @param argc  argc as passed to main()
-    //! @param argv  argv as passed to main()
-    //! @param arg_name  name of the command line argument
-    //! @param val  value of the command line argument
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutGetCmdLineArgumenti( const int argc, const char** argv, 
-                            const char* arg_name, int* val);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Get the value of a command line argument of type float
-    //! @return CUTTrue if command line argument \a arg_name has been given and
-    //!         is of the requested type, otherwise CUTFalse
-    //! @param argc  argc as passed to main()
-    //! @param argv  argv as passed to main()
-    //! @param arg_name  name of the command line argument
-    //! @param val  value of the command line argument
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutGetCmdLineArgumentf( const int argc, const char** argv, 
-                            const char* arg_name, float* val);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Get the value of a command line argument of type string
-    //! @return CUTTrue if command line argument \a arg_name has been given and
-    //!         is of the requested type, otherwise CUTFalse
-    //! @param argc  argc as passed to main()
-    //! @param argv  argv as passed to main()
-    //! @param arg_name  name of the command line argument
-    //! @param val  value of the command line argument
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutGetCmdLineArgumentstr( const int argc, const char** argv, 
-                              const char* arg_name, char** val);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Get the value of a command line argument list those element are strings
-    //! @return CUTTrue if command line argument \a arg_name has been given and
-    //!         is of the requested type, otherwise CUTFalse
-    //! @param argc  argc as passed to main()
-    //! @param argv  argv as passed to main()
-    //! @param arg_name  name of the command line argument
-    //! @param val  command line argument list
-    //! @param len  length of the list / number of elements
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutGetCmdLineArgumentListstr( const int argc, const char** argv, 
-                                  const char* arg_name, char** val, 
-                                  unsigned int* len);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Extended assert
-    //! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
-    //! @param val  condition to test
-    //! @param file  __FILE__ macro
-    //! @param line  __LINE__ macro
-    //! @note This function should be used via the CONDITION(val) macro
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutCheckCondition( int val, const char* file, const int line);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Compare two float arrays
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutComparef( const float* reference, const float* data,
-                 const unsigned int len);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Compare two integer arrays
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutComparei( const int* reference, const int* data, 
-                 const unsigned int len ); 
-
-    ////////////////////////////////////////////////////////////////////////////////
-    //! Compare two unsigned integer arrays, with epsilon and threshold
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    //! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
-    ////////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutCompareuit( const unsigned int* reference, const unsigned int* data,
-                const unsigned int len, const float epsilon, const float threshold );
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Compare two unsigned char arrays
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutCompareub( const unsigned char* reference, const unsigned char* data,
-                  const unsigned int len ); 
-
-    ////////////////////////////////////////////////////////////////////////////////
-    //! Compare two integers with a tolernance for # of byte errors
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    //! @param epsilon    epsilon to use for the comparison
-    //! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
-    ////////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutCompareubt( const unsigned char* reference, const unsigned char* data,
-                 const unsigned int len, const float epsilon, const float threshold );
-
-    ////////////////////////////////////////////////////////////////////////////////
-    //! Compare two integer arrays witha n epsilon tolerance for equality
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    //! @param epsilon    epsilon to use for the comparison
-    ////////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutCompareube( const unsigned char* reference, const unsigned char* data,
-                 const unsigned int len, const float epsilon );
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Compare two float arrays with an epsilon tolerance for equality
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    //! @param epsilon    epsilon to use for the comparison
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutComparefe( const float* reference, const float* data,
-                  const unsigned int len, const float epsilon );
-
-    ////////////////////////////////////////////////////////////////////////////////
-    //! Compare two float arrays with an epsilon tolerance for equality and a 
-    //!     threshold for # pixel errors
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    //! @param epsilon    epsilon to use for the comparison
-    ////////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-    cutComparefet( const float* reference, const float* data,
-                 const unsigned int len, const float epsilon, const float threshold );
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Compare two float arrays using L2-norm with an epsilon tolerance for 
-    //! equality
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param reference  handle to the reference data / gold image
-    //! @param data       handle to the computed data
-    //! @param len        number of elements in reference and data
-    //! @param epsilon    epsilon to use for the comparison
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutCompareL2fe( const float* reference, const float* data,
-                    const unsigned int len, const float epsilon );
-
-	////////////////////////////////////////////////////////////////////////////////
-    //! Compare two PPM image files with an epsilon tolerance for equality
-    //! @return  CUTTrue if \a reference and \a data are identical, 
-    //!          otherwise CUTFalse
-    //! @param src_file   filename for the image to be compared
-    //! @param data       filename for the reference data / gold image
-    //! @param epsilon    epsilon to use for the comparison
-    //! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
-    //! $param verboseErrors output details of image mismatch to std::err
-    ////////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API
-	cutComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold, bool verboseErrors = false );
-
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Timer functionality
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Create a new timer
-    //! @return CUTTrue if a time has been created, otherwise false
-    //! @param  name of the new timer, 0 if the creation failed
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutCreateTimer( unsigned int* name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Delete a timer
-    //! @return CUTTrue if a time has been deleted, otherwise false
-    //! @param  name of the timer to delete
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutDeleteTimer( unsigned int name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Start the time with name \a name
-    //! @param name  name of the timer to start
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutStartTimer( const unsigned int name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Stop the time with name \a name. Does not reset.
-    //! @param name  name of the timer to stop
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutStopTimer( const unsigned int name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Resets the timer's counter.
-    //! @param name  name of the timer to reset.
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    CUTBoolean CUTIL_API 
-    cutResetTimer( const unsigned int name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Returns total execution time in milliseconds for the timer over all 
-    //! runs since the last reset or timer creation.
-    //! @param name  name of the timer to return the time of
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    float CUTIL_API 
-    cutGetTimerValue( const unsigned int name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Return the average time in milliseconds for timer execution as the 
-    //! total  time for the timer dividied by the number of completed (stopped)
-    //! runs the timer has made.
-    //! Excludes the current running time if the timer is currently running.
-    //! @param name  name of the timer to return the time of
-    ////////////////////////////////////////////////////////////////////////////
-    DLL_MAPPING
-    float CUTIL_API 
-    cutGetAverageTimerValue( const unsigned int name);
-
-    ////////////////////////////////////////////////////////////////////////////
-    //! Macros
-
-#if CUDART_VERSION >= 4000
-#define CUT_DEVICE_SYNCHRONIZE( )   cudaDeviceSynchronize();
-#else
-#define CUT_DEVICE_SYNCHRONIZE( )   cudaThreadSynchronize();
-#endif
-
-#if CUDART_VERSION >= 4000
-#define CUT_DEVICE_RESET( )   cudaDeviceReset();
-#else
-#define CUT_DEVICE_RESET( )   cudaThreadExit();
-#endif
-
-// This is for the CUTIL bank checker
-#ifdef _DEBUG
-    #if __DEVICE_EMULATION__
-        // Interface for bank conflict checker
-    #define CUT_BANK_CHECKER( array, index)                                      \
-        (cutCheckBankAccess( threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x,  \
-        blockDim.y, blockDim.z,                                                  \
-        __FILE__, __LINE__, #array, index ),                                     \
-        array[index])
-    #else
-    #define CUT_BANK_CHECKER( array, index)  array[index]
-    #endif
-#else
-    #define CUT_BANK_CHECKER( array, index)  array[index]
-#endif
-
-#  define CU_SAFE_CALL_NO_SYNC( call ) {                                     \
-    CUresult err = call;                                                     \
-    if( CUDA_SUCCESS != err) {                                               \
-        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
-                err, __FILE__, __LINE__ );                                   \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-#  define CU_SAFE_CALL( call )       CU_SAFE_CALL_NO_SYNC(call);
-
-#  define CU_SAFE_CTX_SYNC( ) {                                              \
-    CUresult err = cuCtxSynchronize();                                       \
-    if( CUDA_SUCCESS != err) {                                               \
-        fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n",   \
-                err, __FILE__, __LINE__ );                                   \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
-    cudaError err = call;                                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);                                            \
-
-#  define CUDA_SAFE_THREAD_SYNC( ) {                                         \
-    cudaError err = CUT_DEVICE_SYNCHRONIZE();                                 \
-    if ( cudaSuccess != err) {                                               \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
-                __FILE__, __LINE__, cudaGetErrorString( err) );              \
-    } }
-
-#  define CUFFT_SAFE_CALL( call) {                                           \
-    cufftResult err = call;                                                  \
-    if( CUFFT_SUCCESS != err) {                                              \
-        fprintf(stderr, "CUFFT error in file '%s' in line %i.\n",            \
-                __FILE__, __LINE__);                                         \
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-#  define CUT_SAFE_CALL( call)                                               \
-    if( CUTTrue != call) {                                                   \
-        fprintf(stderr, "Cut error in file '%s' in line %i.\n",              \
-                __FILE__, __LINE__);                                         \
-        exit(EXIT_FAILURE);                                                  \
-    } 
-
-    //! Check for CUDA error
-#ifdef _DEBUG
-#  define CUT_CHECK_ERROR(errorMessage) {                                    \
-    cudaError_t err = cudaGetLastError();                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    err = CUT_DEVICE_SYNCHRONIZE();                                           \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    }
-#else
-#  define CUT_CHECK_ERROR(errorMessage) {                                    \
-    cudaError_t err = cudaGetLastError();                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    }
-#endif
-
-    //! Check for malloc error
-#  define CUT_SAFE_MALLOC( mallocCall ) {                                    \
-    if( !(mallocCall)) {                                                     \
-        fprintf(stderr, "Host malloc failure in file '%s' in line %i\n",     \
-                __FILE__, __LINE__);                                         \
-        exit(EXIT_FAILURE);                                                  \
-    } } while(0);
-
-    //! Check if conditon is true (flexible assert)
-#  define CUT_CONDITION( val)                                                \
-    if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) {           \
-        exit(EXIT_FAILURE);                                                  \
-    }
-
-#if __DEVICE_EMULATION__
-
-#  define CUT_DEVICE_INIT(ARGC, ARGV)
-
-#else
-
-#  define CUT_DEVICE_INIT(ARGC, ARGV) {                                      \
-    int deviceCount;                                                         \
-    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));                \
-    if (deviceCount == 0) {                                                  \
-        fprintf(stderr, "cutil error: no devices supporting CUDA.\n");       \
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    int dev = 0;                                                             \
-    cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);      \
-	if (dev < 0) dev = 0;                                                    \
-    if (dev > deviceCount-1) dev = deviceCount - 1;                          \
-    cudaDeviceProp deviceProp;                                               \
-    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev));       \
-    if (deviceProp.major < 1) {                                              \
-        fprintf(stderr, "cutil error: device does not support CUDA.\n");     \
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
-        fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);       \
-    CUDA_SAFE_CALL(cudaSetDevice(dev));                                      \
-}
-
-
-    //! Check for CUDA context lost
-#  define CUDA_CHECK_CTX_LOST(errorMessage) {                                \
-    cudaError_t err = cudaGetLastError();                                    \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    err = CUT_DEVICE_SYNCHRONIZE();                                           \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-//! Check for CUDA context lost
-#  define CU_CHECK_CTX_LOST(errorMessage) {                                  \
-    cudaError_t err = cudaGetLastError();                                    \
-    if( CUDA_ERROR_INVALID_CONTEXT != err) {                                 \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    err = CUT_DEVICE_SYNCHRONIZE();                                           \
-    if( cudaSuccess != err) {                                                \
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
-                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
-        exit(EXIT_FAILURE);                                                  \
-    } }
-
-
-#endif
-
-#  define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) {                        \
-    cuDevice = 0;                                                            \
-    int deviceCount = 0;                                                     \
-    CUresult err = cuInit(0);                                                \
-    if (CUDA_SUCCESS == err)                                                 \
-        CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));                \
-    if (deviceCount == 0) {                                                  \
-        fprintf(stderr, "cutil error: no devices supporting CUDA\n");        \
-        exit(EXIT_FAILURE);                                                  \
-    }                                                                        \
-    int dev = 0;                                                             \
-    cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);      \
-	if (dev < 0) dev = 0;                                                    \
-    if (dev > deviceCount-1) dev = deviceCount - 1;                          \
-    CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev));                       \
-    char name[100];                                                          \
-    cuDeviceGetName(name, 100, cuDevice);                                    \
-    if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
-        fprintf(stderr, "Using device %d: %s\n", dev, name);                  \
-}
-
-#define CUT_EXIT(argc, argv)                                                 \
-    if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {        \
-        printf("\nPress ENTER to exit...\n");                                \
-        fflush( stdout);                                                     \
-        fflush( stderr);                                                     \
-        getchar();                                                           \
-    }                                                                        \
-    exit(EXIT_SUCCESS);
-
-
-#ifdef __cplusplus
-}
-#endif  // #ifdef _DEBUG (else branch)
-
-#endif  // #ifndef _CUTIL_H_
diff --git a/cuda/common/include/pcl/cuda/cutil_inline.h b/cuda/common/include/pcl/cuda/cutil_inline.h
deleted file mode 100644
index 71885ee8..00000000
--- a/cuda/common/include/pcl/cuda/cutil_inline.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
-#ifndef _CUTIL_INLINE_H_
-#define _CUTIL_INLINE_H_
-
-#include <cuda.h>
-#include <pcl/cuda/cutil.h>
-#include <cuda_runtime_api.h>
-
-#include <pcl/cuda/cutil_inline_bankchecker.h>
-#include <pcl/cuda/cutil_inline_runtime.h>
-#include <pcl/cuda/cutil_inline_drvapi.h>
-
-inline void print_NVCC_min_spec(const char *sSDKsample, const char *sNVCCReq, const char *sDriverReq)
-{
-    printf("CUDA %d.%02d Toolkit built this project.\n", CUDART_VERSION/1000, (CUDART_VERSION%100));
-    printf("  [ %s ] requirements:\n", sSDKsample);
-    printf(" -> CUDA %s Toolkit\n"  , sNVCCReq);
-    printf(" -> %s NVIDIA Display Driver.\n", sDriverReq);
-}
-
-#define ALIGN_OFFSET(offset, alignment) offset = (offset + (alignment-1)) & ~((alignment-1))
-
-
-#endif // _CUTIL_INLINE_H_
diff --git a/cuda/common/include/pcl/cuda/cutil_inline_bankchecker.h b/cuda/common/include/pcl/cuda/cutil_inline_bankchecker.h
deleted file mode 100644
index d313d8e0..00000000
--- a/cuda/common/include/pcl/cuda/cutil_inline_bankchecker.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
- #ifndef _CUTIL_INLINE_BANKCHECKER_H_
-#define _CUTIL_INLINE_BANKCHECKER_H_
-
-#ifdef _DEBUG
-   #if __DEVICE_EMULATION__
-      #define cutilBankChecker(array, idx) (__cutilBankChecker (threadIdx.x, threadIdx.y, threadIdx.z, \
-                                                               blockDim.x, blockDim.y, blockDim.z, \
-                                                               #array, idx, __FILE__, __LINE__), \
-                                                               array[idx])
-
-   #else
-      #define cutilBankChecker(array, idx) array[idx] 
-   #endif
-#else
-      #define cutilBankChecker(array, idx) array[idx]
-#endif
-
-    // Interface for bank conflict checker
-inline void __cutilBankChecker(unsigned int tidx, unsigned int tidy, unsigned int tidz,
-                            unsigned int bdimx, unsigned int bdimy, unsigned int bdimz,
-                            char *aname, int index, char *file, int line) 
-{
-    cutCheckBankAccess( tidx, tidy, tidz, bdimx, bdimy, bdimz, file, line, aname, index);
-}
-
-#endif // _CUTIL_INLINE_BANKCHECKER_H_
diff --git a/cuda/common/include/pcl/cuda/cutil_inline_drvapi.h b/cuda/common/include/pcl/cuda/cutil_inline_drvapi.h
deleted file mode 100644
index 50261782..00000000
--- a/cuda/common/include/pcl/cuda/cutil_inline_drvapi.h
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
-#ifndef _CUTIL_INLINE_FUNCTIONS_DRVAPI_H_
-#define _CUTIL_INLINE_FUNCTIONS_DRVAPI_H_
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-
-// We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
-// The advantage is the developers gets to use the inline function so they can debug
-#define cutilDrvSafeCallNoSync(err)     __cuSafeCallNoSync  (err, __FILE__, __LINE__)
-#define cutilDrvSafeCall(err)           __cuSafeCall        (err, __FILE__, __LINE__)
-#define cutilDrvCtxSync()               __cuCtxSync         (__FILE__, __LINE__)
-#define cutilDrvCheckMsg(msg)           __cuCheckMsg        (msg, __FILE__, __LINE__)
-#define cutilDrvAlignOffset(offset, alignment)  ( offset = (offset + (alignment-1)) & ~((alignment-1)) )
-
-// These are the inline versions for all of the CUTIL functions
-inline void __cuSafeCallNoSync( CUresult err, const char *file, const int line )
-{
-    if( CUDA_SUCCESS != err) {
-        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
-                err, file, line );
-        exit(-1);
-    }
-}
-inline void __cuSafeCall( CUresult err, const char *file, const int line )
-{
-    __cuSafeCallNoSync( err, file, line );
-}
-
-inline void __cuCtxSync(const char *file, const int line )
-{
-    CUresult err = cuCtxSynchronize();
-    if( CUDA_SUCCESS != err ) {
-        fprintf(stderr, "cuCtxSynchronize() API error = %04d in file <%s>, line %i.\n",
-                err, file, line );
-        exit(-1);
-    }
-}
-
-#define MIN(a,b) ((a < b) ? a : b)
-#define MAX(a,b) ((a > b) ? a : b)
-
-// Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2CoresDrvApi(int major, int minor)
-{
-	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-	typedef struct {
-		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-		int Cores;
-	} sSMtoCores;
-
-        sSMtoCores nGpuArchCoresPerSM[] =
-        { { 0x10,  8 },
-          { 0x11,  8 },
-          { 0x12,  8 },
-          { 0x13,  8 },
-          { 0x20, 32 },
-          { 0x21, 48 },
-          {   -1, -1 }
-        };
-
-	int index = 0;
-	while (nGpuArchCoresPerSM[index].SM != -1) {
-		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
-			return nGpuArchCoresPerSM[index].Cores;
-		}
-		index++;
-	}
-	printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
-	return -1;
-}
-// end of GPU Architecture definitions
-
-// This function returns the best GPU based on performance
-inline int cutilDrvGetMaxGflopsDeviceId()
-{
-    CUdevice current_device = 0, max_perf_device = 0;
-    int device_count     = 0, sm_per_multiproc = 0;
-    int max_compute_perf = 0, best_SM_arch     = 0;
-    int major = 0, minor = 0, multiProcessorCount, clockRate;
-
-    cuInit(0);
-    cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
-
-	// Find the best major SM Architecture GPU device
-	while ( current_device < device_count ) {
-		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
-		if (major > 0 && major < 9999) {
-			best_SM_arch = MAX(best_SM_arch, major);
-		}
-		current_device++;
-	}
-
-    // Find the best CUDA capable GPU device
-	current_device = 0;
-	while( current_device < device_count ) {
-		cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount, 
-                                                            CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 
-                                                            current_device ) );
-        cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate, 
-                                                            CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 
-                                                            current_device ) );
-		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
-
-		if (major == 9999 && minor == 9999) {
-		    sm_per_multiproc = 1;
-		} else {
-		    sm_per_multiproc = _ConvertSMVer2CoresDrvApi(major, minor);
-		}
-
-		int compute_perf  = multiProcessorCount * sm_per_multiproc * clockRate;
-		if( compute_perf  > max_compute_perf ) {
-            // If we find GPU with SM major > 2, search only these
-			if ( best_SM_arch > 2 ) {
-				// If our device==dest_SM_arch, choose this, or else pass
-				if (major == best_SM_arch) {	
-                    max_compute_perf  = compute_perf;
-                    max_perf_device   = current_device;
-				}
-			} else {
-				max_compute_perf  = compute_perf;
-				max_perf_device   = current_device;
-			}
-		}
-		++current_device;
-	}
-	return max_perf_device;
-}
-
-// This function returns the best Graphics GPU based on performance
-inline int cutilDrvGetMaxGflopsGraphicsDeviceId()
-{
-    CUdevice current_device = 0, max_perf_device = 0;
-    int device_count     = 0, sm_per_multiproc = 0;
-    int max_compute_perf = 0, best_SM_arch     = 0;
-    int major = 0, minor = 0, multiProcessorCount, clockRate;
-	int bTCC = 0;
-	char deviceName[256];
-
-    cuInit(0);
-    cutilDrvSafeCallNoSync(cuDeviceGetCount(&device_count));
-
-	// Find the best major SM Architecture GPU device that are graphics devices
-	while ( current_device < device_count ) {
-		cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, current_device) );
-		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
-
-#if CUDA_VERSION >= 3020
-		cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device ) );
-#else
-		// Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
-		if (deviceName[0] == 'T') bTCC = 1;
-#endif
-		if (!bTCC) {
-			if (major > 0 && major < 9999) {
-				best_SM_arch = MAX(best_SM_arch, major);
-			}
-		}
-		current_device++;
-	}
-
-    // Find the best CUDA capable GPU device
-	current_device = 0;
-	while( current_device < device_count ) {
-		cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &multiProcessorCount, 
-                                                            CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 
-                                                            current_device ) );
-        cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &clockRate, 
-                                                            CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 
-                                                            current_device ) );
-		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, current_device ) );
-
-#if CUDA_VERSION >= 3020
-		cutilDrvSafeCallNoSync( cuDeviceGetAttribute( &bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device ) );
-#else
-		// Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
-		if (deviceName[0] == 'T') bTCC = 1;
-#endif
-
-		if (major == 9999 && minor == 9999) {
-		    sm_per_multiproc = 1;
-		} else {
-		    sm_per_multiproc = _ConvertSMVer2CoresDrvApi(major, minor);
-		}
-
-		// If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor
-		if (!bTCC) // Is this GPU running the TCC driver?  If so we pass on this
-		{
-			int compute_perf  = multiProcessorCount * sm_per_multiproc * clockRate;
-			if( compute_perf  > max_compute_perf ) {
-				// If we find GPU with SM major > 2, search only these
-				if ( best_SM_arch > 2 ) {
-					// If our device = dest_SM_arch, then we pick this one
-					if (major == best_SM_arch) {	
-                        max_compute_perf  = compute_perf;
-                        max_perf_device   = current_device;
-					}
-				} else {
-					max_compute_perf  = compute_perf;
-					max_perf_device   = current_device;
-				}
-			}
-		}
-		++current_device;
-	}
-	return max_perf_device;
-}
-
-inline void __cuCheckMsg( const char * msg, const char *file, const int line )
-{
-    CUresult err = cuCtxSynchronize();
-    if( CUDA_SUCCESS != err) {
-		fprintf(stderr, "cutilDrvCheckMsg -> %s", msg);
-        fprintf(stderr, "cutilDrvCheckMsg -> cuCtxSynchronize API error = %04d in file <%s>, line %i.\n",
-                err, file, line );
-        exit(-1);
-    }
-}
-
-
-#if __DEVICE_EMULATION__
-    inline int cutilDeviceInitDrv(int ARGC, char **ARGV) { } 
-#else
-    inline int cutilDeviceInitDrv(int ARGC, char ** ARGV) 
-    {
-        int cuDevice = 0;
-        int deviceCount = 0;
-        CUresult err = cuInit(0);
-        if (CUDA_SUCCESS == err)
-            cutilDrvSafeCallNoSync(cuDeviceGetCount(&deviceCount));
-        if (deviceCount == 0) {
-            fprintf(stderr, "CUTIL DeviceInitDrv error: no devices supporting CUDA\n");
-            exit(-1);
-        }
-        int dev = 0;
-        cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
-        if (dev < 0) dev = 0;
-        if (dev > deviceCount-1) {
-			fprintf(stderr, "\n");
-			fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
-            fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
-			fprintf(stderr, "\n");
-            return -dev;
-        }
-        cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, dev));
-        char name[100];
-        cuDeviceGetName(name, 100, cuDevice);
-        if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) {
-           printf("> Using CUDA Device [%d]: %s\n", dev, name);
-       	}
-        return dev;
-    }
-#endif
-
-    // General initialization call to pick the best CUDA Device
-#if __DEVICE_EMULATION__
-    inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
-#else
-    inline CUdevice cutilChooseCudaDeviceDrv(int argc, char **argv, int *p_devID)
-    {
-        CUdevice cuDevice;
-        int devID = 0;
-        // If the command-line has a device number specified, use it
-        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
-            devID = cutilDeviceInitDrv(argc, argv);
-            if (devID < 0) {
-                printf("exiting...\n");
-                exit(0);
-            }
-        } else {
-            // Otherwise pick the device with highest Gflops/s
-            char name[100];
-            devID = cutilDrvGetMaxGflopsDeviceId();
-            cutilDrvSafeCallNoSync(cuDeviceGet(&cuDevice, devID));
-            cuDeviceGetName(name, 100, cuDevice);
-            printf("> Using CUDA Device [%d]: %s\n", devID, name);
-        }
-        cuDeviceGet(&cuDevice, devID);
-        if (p_devID) *p_devID = devID;
-        return cuDevice;
-    }
-#endif
-
-
-//! Check for CUDA context lost
-inline void cutilDrvCudaCheckCtxLost(const char *errorMessage, const char *file, const int line ) 
-{
-    CUresult err = cuCtxSynchronize();
-    if( CUDA_ERROR_INVALID_CONTEXT != err) {
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
-                errorMessage, file, line );
-        exit(-1);
-    }
-    err = cuCtxSynchronize();
-    if( CUDA_SUCCESS != err) {
-        fprintf(stderr, "Cuda error: %s in file '%s' in line %i\n",
-                errorMessage, file, line );
-        exit(-1);
-    } 
-}
-
-#ifndef STRCASECMP
-#ifdef _WIN32
-#define STRCASECMP  _stricmp
-#else
-#define STRCASECMP  strcasecmp
-#endif
-#endif
-
-#ifndef STRNCASECMP
-#ifdef _WIN32
-#define STRNCASECMP _strnicmp
-#else
-#define STRNCASECMP strncasecmp
-#endif
-#endif
-
-inline void __cutilDrvQAFinish(int argc, char **argv, bool bStatus)
-{
-    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
-
-    bool bFlag = false;
-    for (int i=1; i < argc; i++) {
-        if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
-            bFlag |= true;
-        }
-    }
-
-    if (bFlag) {
-        printf("&&&& %s %s", sStatus[bStatus], argv[0]);
-        for (int i=1; i < argc; i++) printf(" %s", argv[i]);
-    } else {
-        printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
-    }
-}
-
-// General check for CUDA GPU SM Capabilities for a specific device #
-inline bool cutilDrvCudaDevCapabilities(int major_version, int minor_version, int deviceNum, int argc, char** argv)
-{
-    int major, minor, dev;
-    char device_name[256];
-
-#ifdef __DEVICE_EMULATION__
-    printf("> Compute Device Emulation Mode \n");
-#endif
-
-    cutilDrvSafeCallNoSync( cuDeviceGet(&dev, deviceNum) );
-    cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, dev));
-    cutilDrvSafeCallNoSync( cuDeviceGetName(device_name, 256, dev) ); 
-
-    if((major > major_version) ||
-	   (major == major_version && minor >= minor_version))
-    {
-        printf("> Device %d: < %s >, Compute SM %d.%d detected\n", dev, device_name, major, minor);
-        return true;
-    }
-    else
-    {
-        printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
-        __cutilDrvQAFinish(argc, argv, true);
-        return false;
-    }
-}
-
-// General check for CUDA GPU SM Capabilities
-inline bool cutilDrvCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
-{
-	return cutilDrvCudaDevCapabilities(major_version, minor_version, 0, argc, argv);
-}
-
-
-#endif // _CUTIL_INLINE_FUNCTIONS_DRVAPI_H_
diff --git a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h b/cuda/common/include/pcl/cuda/cutil_inline_runtime.h
deleted file mode 100644
index c0a607be..00000000
--- a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
-#ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
-#define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
-
-#ifdef _WIN32
-#ifdef _DEBUG // Do this only in debug mode...
-#  define WINDOWS_LEAN_AND_MEAN
-#  include <windows.h>
-#  include <stdlib.h>
-#  undef min
-#  undef max
-#endif
-#endif
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include <cufft.h>
-
-// We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
-// The advantage is the developers gets to use the inline function so they can debug
-#define cutilSafeCallNoSync(err)     __cudaSafeCallNoSync(err, __FILE__, __LINE__)
-#define cutilSafeCall(err)           __cudaSafeCall      (err, __FILE__, __LINE__)
-#define cutilSafeThreadSync()        __cudaSafeThreadSync(__FILE__, __LINE__)
-#define cufftSafeCall(err)           __cufftSafeCall     (err, __FILE__, __LINE__)
-#define cutilCheckError(err)         __cutilCheckError   (err, __FILE__, __LINE__)
-#define cutilCheckMsg(msg)           __cutilGetLastError (msg, __FILE__, __LINE__)
-#define cutilCheckMsgAndSync(msg)    __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
-#define cutilSafeMalloc(mallocCall)  __cutilSafeMalloc   ((mallocCall), __FILE__, __LINE__)
-#define cutilCondition(val)          __cutilCondition    (val, __FILE__, __LINE__)
-#define cutilExit(argc, argv)        __cutilExit         (argc, argv)
-
-inline cudaError cutilDeviceSynchronize()
-{
-#if CUDART_VERSION >= 4000
-	return cudaDeviceSynchronize();
-#else
-	return cudaThreadSynchronize();
-#endif
-}
-
-inline cudaError cutilDeviceReset()
-{
-#if CUDART_VERSION >= 4000
-	return cudaDeviceReset();
-#else
-	return cudaThreadExit();
-#endif
-}
-
-inline void __cutilCondition(int val, char *file, int line) 
-{
-    if( CUTFalse == cutCheckCondition( val, file, line ) ) {
-        exit(EXIT_FAILURE);
-    }
-}
-
-inline void __cutilExit(int argc, char **argv)
-{     
-    if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
-        printf("\nPress ENTER to exit...\n");
-        fflush( stdout);
-        fflush( stderr);
-        getchar();
-    }
-    exit(EXIT_SUCCESS);
-}
-
-#define MIN(a,b) ((a < b) ? a : b)
-#define MAX(a,b) ((a > b) ? a : b)
-
-// Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2Cores(int major, int minor)
-{
-	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-	typedef struct {
-		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-		int Cores;
-	} sSMtoCores;
-
-	sSMtoCores nGpuArchCoresPerSM[] = 
-	{ { 0x10,  8 },
-	  { 0x11,  8 },
-	  { 0x12,  8 },
-	  { 0x13,  8 },
-	  { 0x20, 32 },
-	  { 0x21, 48 },
-	  {   -1, -1 } 
-	};
-
-	int index = 0;
-	while (nGpuArchCoresPerSM[index].SM != -1) {
-		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
-			return nGpuArchCoresPerSM[index].Cores;
-		}
-		index++;
-	}
-	printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
-	return -1;
-}
-// end of GPU Architecture definitions
-
-// This function returns the best GPU (with maximum GFLOPS)
-inline int cutGetMaxGflopsDeviceId()
-{
-	int current_device   = 0, sm_per_multiproc = 0;
-	int max_compute_perf = 0, max_perf_device  = 0;
-	int device_count     = 0, best_SM_arch     = 0;
-	cudaDeviceProp deviceProp;
-
-	cudaGetDeviceCount( &device_count );
-	// Find the best major SM Architecture GPU device
-	while ( current_device < device_count ) {
-		cudaGetDeviceProperties( &deviceProp, current_device );
-		if (deviceProp.major > 0 && deviceProp.major < 9999) {
-			best_SM_arch = MAX(best_SM_arch, deviceProp.major);
-		}
-		current_device++;
-	}
-
-    // Find the best CUDA capable GPU device
-	current_device = 0;
-	while( current_device < device_count ) {
-		cudaGetDeviceProperties( &deviceProp, current_device );
-		if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
-		    sm_per_multiproc = 1;
-		} else {
-			sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
-		}
-
-		int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
-		if( compute_perf  > max_compute_perf ) {
-            // If we find GPU with SM major > 2, search only these
-			if ( best_SM_arch > 2 ) {
-				// If our device==dest_SM_arch, choose this, or else pass
-				if (deviceProp.major == best_SM_arch) {	
-					max_compute_perf  = compute_perf;
-					max_perf_device   = current_device;
-				}
-			} else {
-				max_compute_perf  = compute_perf;
-				max_perf_device   = current_device;
-			}
-		}
-		++current_device;
-	}
-	return max_perf_device;
-}
-
-// This function returns the best GPU (with maximum GFLOPS)
-inline int cutGetMaxGflopsGraphicsDeviceId()
-{
-	int current_device   = 0, sm_per_multiproc = 0;
-	int max_compute_perf = 0, max_perf_device  = 0;
-	int device_count     = 0, best_SM_arch     = 0;
-	int bTCC = 0;
-	cudaDeviceProp deviceProp;
-
-	cudaGetDeviceCount( &device_count );
-	// Find the best major SM Architecture GPU device that is graphics capable
-	while ( current_device < device_count ) {
-		cudaGetDeviceProperties( &deviceProp, current_device );
-
-#if CUDA_VERSION >= 3020
-		if (deviceProp.tccDriver) bTCC = 1;
-#else
-		// Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
-		if (deviceProp.name[0] == 'T') bTCC = 1;
-#endif
-
-		if (!bTCC) {
-			if (deviceProp.major > 0 && deviceProp.major < 9999) {
-				best_SM_arch = MAX(best_SM_arch, deviceProp.major);
-			}
-		}
-		current_device++;
-	}
-
-    // Find the best CUDA capable GPU device
-	current_device = 0;
-	while( current_device < device_count ) {
-		cudaGetDeviceProperties( &deviceProp, current_device );
-		if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
-		    sm_per_multiproc = 1;
-		} else {
-			sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
-		}
-
-#if CUDA_VERSION >= 3020
-		if (deviceProp.tccDriver) bTCC = 1;
-#else
-		// Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
-		if (deviceProp.name[0] == 'T') bTCC = 1;
-#endif
-
-		if (!bTCC) // Is this GPU running the TCC driver?  If so we pass on this
-		{
-			int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
-			if( compute_perf  > max_compute_perf ) {
-				// If we find GPU with SM major > 2, search only these
-				if ( best_SM_arch > 2 ) {
-					// If our device==dest_SM_arch, choose this, or else pass
-					if (deviceProp.major == best_SM_arch) {	
-						max_compute_perf  = compute_perf;
-						max_perf_device   = current_device;
-					}
-				} else {
-					max_compute_perf  = compute_perf;
-					max_perf_device   = current_device;
-				}
-			}
-		}
-		++current_device;
-	}
-	return max_perf_device;
-}
-
-// Give a little more for Windows : the console window often disapears before we can read the message
-#ifdef _WIN32
-# if 1//ndef UNICODE
-#  ifdef _DEBUG // Do this only in debug mode...
-	inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
-	{
-		size_t fmt2_sz	= 2048;
-		char *fmt2		= (char*)malloc(fmt2_sz);
-		va_list  vlist;
-		va_start(vlist, fmt);
-		while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't anough room
-		{
-			fmt2_sz *= 2;
-			if(fmt2) free(fmt2);
-			fmt2 = (char*)malloc(fmt2_sz);
-		}
-		OutputDebugStringA(fmt2);
-		fprintf(file, fmt2);
-		free(fmt2);
-	}
-#	define FPRINTF(a) VSPrintf a
-#  else //debug
-#	define FPRINTF(a) fprintf a
-// For other than Win32
-#  endif //debug
-# else //unicode
-// Unicode case... let's give-up for now and keep basic printf
-#	define FPRINTF(a) fprintf a
-# endif //unicode
-#else //win32
-#	define FPRINTF(a) fprintf a
-#endif //win32
-
-// NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
-// when the user double clicks on the error line in the Output pane. Like any compile error.
-
-inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
-{
-    if( cudaSuccess != err) {
-        FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
-                file, line, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-}
-
-inline void __cudaSafeCall( cudaError err, const char *file, const int line )
-{
-    if( cudaSuccess != err) {
-		FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
-                file, line, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-}
-
-inline void __cudaSafeThreadSync( const char *file, const int line )
-{
-    cudaError err = cutilDeviceSynchronize();
-    if ( cudaSuccess != err) {
-        FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
-                file, line, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-}
-
-inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
-{
-    if( CUFFT_SUCCESS != err) {
-        FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error.\n",
-                file, line));
-        exit(-1);
-    }
-}
-
-inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
-{
-    if( CUTTrue != err) {
-        FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
-                file, line));
-        exit(-1);
-    }
-}
-
-inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
-{
-    cudaError_t err = cudaGetLastError();
-    if( cudaSuccess != err) {
-        FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
-                file, line, errorMessage, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-}
-
-inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
-{
-    cudaError_t err = cudaGetLastError();
-    if( cudaSuccess != err) {
-        FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
-                file, line, errorMessage, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-
-	err = cutilDeviceSynchronize();
-    if( cudaSuccess != err) {
-		FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
-                file, line, errorMessage, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-}
-
-inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
-{
-    if( !(pointer)) {
-        FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
-                file, line));
-        exit(-1);
-    }
-}
-
-#if __DEVICE_EMULATION__
-    inline int cutilDeviceInit(int ARGC, char **ARGV) { }
-    inline int cutilChooseCudaDevice(int ARGC, char **ARGV) { }
-#else
-    inline int cutilDeviceInit(int ARGC, char **ARGV)
-    {
-        int deviceCount;
-        cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
-        if (deviceCount == 0) {
-            FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
-            exit(-1);
-        }
-        int dev = 0;
-        cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
-        if (dev < 0) 
-            dev = 0;
-        if (dev > deviceCount-1) {
-			fprintf(stderr, "\n");
-			fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
-            fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
-			fprintf(stderr, "\n");
-            return -dev;
-        }  
-        cudaDeviceProp deviceProp;
-        cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
-        if (deviceProp.major < 1) {
-            FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
-            exit(-1);                                                  \
-        }
-        printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
-        cutilSafeCall(cudaSetDevice(dev));
-
-        return dev;
-    }
-
-    // General initialization call to pick the best CUDA Device
-    inline int cutilChooseCudaDevice(int argc, char **argv)
-    {
-        cudaDeviceProp deviceProp;
-        int devID = 0;
-        // If the command-line has a device number specified, use it
-        if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
-            devID = cutilDeviceInit(argc, argv);
-            if (devID < 0) {
-               printf("exiting...\n");
-               cutilExit(argc, argv);
-               exit(0);
-            }
-        } else {
-            // Otherwise pick the device with highest Gflops/s
-            devID = cutGetMaxGflopsDeviceId();
-            cutilSafeCallNoSync( cudaSetDevice( devID ) );
-            cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
-            printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
-        }
-        return devID;
-    }
-#endif
-
-
-//! Check for CUDA context lost
-inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line ) 
-{
-    cudaError_t err = cudaGetLastError();
-    if( cudaSuccess != err) {
-        FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
-        file, line, errorMessage, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-    err = cutilDeviceSynchronize();
-    if( cudaSuccess != err) {
-        FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
-        file, line, errorMessage, cudaGetErrorString( err) ));
-        exit(-1);
-    }
-}
-
-#ifndef STRCASECMP
-#ifdef _WIN32
-#define STRCASECMP  _stricmp
-#else
-#define STRCASECMP  strcasecmp
-#endif
-#endif
-
-#ifndef STRNCASECMP
-#ifdef _WIN32
-#define STRNCASECMP _strnicmp
-#else
-#define STRNCASECMP strncasecmp
-#endif
-#endif
-
-inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
-{
-    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
-
-    bool bFlag = false;
-    for (int i=1; i < argc; i++) {
-        if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
-            bFlag |= true;
-        }
-    }
-
-    if (bFlag) {
-        printf("&&&& %s %s", sStatus[bStatus], argv[0]);
-        for (int i=1; i < argc; i++) printf(" %s", argv[i]);
-    } else {
-        printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
-    }
-}
-
-// General check for CUDA GPU SM Capabilities
-inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
-{
-    cudaDeviceProp deviceProp;
-    deviceProp.major = 0;
-    deviceProp.minor = 0;
-    int dev;
-
-#ifdef __DEVICE_EMULATION__
-    printf("> Compute Device Emulation Mode \n");
-#endif
-
-    cutilSafeCall( cudaGetDevice(&dev) );
-    cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
-
-    if((deviceProp.major > major_version) ||
-	   (deviceProp.major == major_version && deviceProp.minor >= minor_version))
-    {
-        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
-        return true;
-    }
-    else
-    {
-        printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
-        __cutilQAFinish(argc, argv, true);
-        return false;
-    }
-}
-
-#endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_
diff --git a/cuda/common/include/pcl/cuda/cutil_math.h b/cuda/common/include/pcl/cuda/cutil_math.h
deleted file mode 100644
index 746f4d68..00000000
--- a/cuda/common/include/pcl/cuda/cutil_math.h
+++ /dev/null
@@ -1,1328 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/*
-    This file implements common mathematical operations on vector types
-    (float3, float4 etc.) since these are not provided as standard by CUDA.
-
-    The syntax is modelled on the Cg standard library.
-
-    This is part of the CUTIL library and is not supported by NVIDIA.
-
-    Thanks to Linh Hah for additions and fixes.
-*/
-
-#ifndef CUTIL_MATH_H
-#define CUTIL_MATH_H
-
-#include "cuda_runtime.h"
-
-typedef unsigned int uint;
-typedef unsigned short ushort;
-
-#ifndef __CUDACC__
-#include <math.h>
-
-////////////////////////////////////////////////////////////////////////////////
-// host implementations of CUDA functions
-////////////////////////////////////////////////////////////////////////////////
-
-inline float fminf(float a, float b)
-{
-  return a < b ? a : b;
-}
-
-inline float fmaxf(float a, float b)
-{
-  return a > b ? a : b;
-}
-
-inline int max(int a, int b)
-{
-  return a > b ? a : b;
-}
-
-inline int min(int a, int b)
-{
-  return a < b ? a : b;
-}
-
-inline float rsqrtf(float x)
-{
-    return 1.0f / sqrtf(x);
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// constructors
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 make_float2(float s)
-{
-    return make_float2(s, s);
-}
-inline __host__ __device__ float2 make_float2(float3 a)
-{
-    return make_float2(a.x, a.y);
-}
-inline __host__ __device__ float2 make_float2(int2 a)
-{
-    return make_float2(float(a.x), float(a.y));
-}
-inline __host__ __device__ float2 make_float2(uint2 a)
-{
-    return make_float2(float(a.x), float(a.y));
-}
-
-inline __host__ __device__ int2 make_int2(int s)
-{
-    return make_int2(s, s);
-}
-inline __host__ __device__ int2 make_int2(int3 a)
-{
-    return make_int2(a.x, a.y);
-}
-inline __host__ __device__ int2 make_int2(uint2 a)
-{
-    return make_int2(int(a.x), int(a.y));
-}
-inline __host__ __device__ int2 make_int2(float2 a)
-{
-    return make_int2(int(a.x), int(a.y));
-}
-
-inline __host__ __device__ uint2 make_uint2(uint s)
-{
-    return make_uint2(s, s);
-}
-inline __host__ __device__ uint2 make_uint2(uint3 a)
-{
-    return make_uint2(a.x, a.y);
-}
-inline __host__ __device__ uint2 make_uint2(int2 a)
-{
-    return make_uint2(uint(a.x), uint(a.y));
-}
-
-inline __host__ __device__ float3 make_float3(float s)
-{
-    return make_float3(s, s, s);
-}
-inline __host__ __device__ float3 make_float3(float2 a)
-{
-    return make_float3(a.x, a.y, 0.0f);
-}
-inline __host__ __device__ float3 make_float3(float2 a, float s)
-{
-    return make_float3(a.x, a.y, s);
-}
-inline __host__ __device__ float3 make_float3(float4 a)
-{
-    return make_float3(a.x, a.y, a.z);
-}
-inline __host__ __device__ float3 make_float3(int3 a)
-{
-    return make_float3(float(a.x), float(a.y), float(a.z));
-}
-inline __host__ __device__ float3 make_float3(uint3 a)
-{
-    return make_float3(float(a.x), float(a.y), float(a.z));
-}
-
-inline __host__ __device__ int3 make_int3(int s)
-{
-    return make_int3(s, s, s);
-}
-inline __host__ __device__ int3 make_int3(int2 a)
-{
-    return make_int3(a.x, a.y, 0);
-}
-inline __host__ __device__ int3 make_int3(int2 a, int s)
-{
-    return make_int3(a.x, a.y, s);
-}
-inline __host__ __device__ int3 make_int3(uint3 a)
-{
-    return make_int3(int(a.x), int(a.y), int(a.z));
-}
-inline __host__ __device__ int3 make_int3(float3 a)
-{
-    return make_int3(int(a.x), int(a.y), int(a.z));
-}
-
-inline __host__ __device__ uint3 make_uint3(uint s)
-{
-    return make_uint3(s, s, s);
-}
-inline __host__ __device__ uint3 make_uint3(uint2 a)
-{
-    return make_uint3(a.x, a.y, 0);
-}
-inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
-{
-    return make_uint3(a.x, a.y, s);
-}
-inline __host__ __device__ uint3 make_uint3(uint4 a)
-{
-    return make_uint3(a.x, a.y, a.z);
-}
-inline __host__ __device__ uint3 make_uint3(int3 a)
-{
-    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
-}
-
-inline __host__ __device__ float4 make_float4(float s)
-{
-    return make_float4(s, s, s, s);
-}
-inline __host__ __device__ float4 make_float4(float3 a)
-{
-    return make_float4(a.x, a.y, a.z, 0.0f);
-}
-inline __host__ __device__ float4 make_float4(float3 a, float w)
-{
-    return make_float4(a.x, a.y, a.z, w);
-}
-inline __host__ __device__ float4 make_float4(int4 a)
-{
-    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
-}
-inline __host__ __device__ float4 make_float4(uint4 a)
-{
-    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
-}
-
-inline __host__ __device__ int4 make_int4(int s)
-{
-    return make_int4(s, s, s, s);
-}
-inline __host__ __device__ int4 make_int4(int3 a)
-{
-    return make_int4(a.x, a.y, a.z, 0);
-}
-inline __host__ __device__ int4 make_int4(int3 a, int w)
-{
-    return make_int4(a.x, a.y, a.z, w);
-}
-inline __host__ __device__ int4 make_int4(uint4 a)
-{
-    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
-}
-inline __host__ __device__ int4 make_int4(float4 a)
-{
-    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
-}
-
-
-inline __host__ __device__ uint4 make_uint4(uint s)
-{
-    return make_uint4(s, s, s, s);
-}
-inline __host__ __device__ uint4 make_uint4(uint3 a)
-{
-    return make_uint4(a.x, a.y, a.z, 0);
-}
-inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
-{
-    return make_uint4(a.x, a.y, a.z, w);
-}
-inline __host__ __device__ uint4 make_uint4(int4 a)
-{
-    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// negate
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator-(float2 &a)
-{
-    return make_float2(-a.x, -a.y);
-}
-inline __host__ __device__ int2 operator-(int2 &a)
-{
-    return make_int2(-a.x, -a.y);
-}
-inline __host__ __device__ float3 operator-(float3 &a)
-{
-    return make_float3(-a.x, -a.y, -a.z);
-}
-inline __host__ __device__ int3 operator-(int3 &a)
-{
-    return make_int3(-a.x, -a.y, -a.z);
-}
-inline __host__ __device__ float4 operator-(float4 &a)
-{
-    return make_float4(-a.x, -a.y, -a.z, -a.w);
-}
-inline __host__ __device__ int4 operator-(int4 &a)
-{
-    return make_int4(-a.x, -a.y, -a.z, -a.w);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// addition
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator+(float2 a, float2 b)
-{
-    return make_float2(a.x + b.x, a.y + b.y);
-}
-inline __host__ __device__ void operator+=(float2 &a, float2 b)
-{
-    a.x += b.x; a.y += b.y;
-}
-inline __host__ __device__ float2 operator+(float2 a, float b)
-{
-    return make_float2(a.x + b, a.y + b);
-}
-inline __host__ __device__ float2 operator+(float b, float2 a)
-{
-    return make_float2(a.x + b, a.y + b);
-}
-inline __host__ __device__ void operator+=(float2 &a, float b)
-{
-    a.x += b; a.y += b;
-}
-
-inline __host__ __device__ int2 operator+(int2 a, int2 b)
-{
-    return make_int2(a.x + b.x, a.y + b.y);
-}
-inline __host__ __device__ void operator+=(int2 &a, int2 b)
-{
-    a.x += b.x; a.y += b.y;
-}
-inline __host__ __device__ int2 operator+(int2 a, int b)
-{
-    return make_int2(a.x + b, a.y + b);
-}
-inline __host__ __device__ int2 operator+(int b, int2 a)
-{
-    return make_int2(a.x + b, a.y + b);
-}
-inline __host__ __device__ void operator+=(int2 &a, int b)
-{
-    a.x += b; a.y += b;
-}
-
-inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
-{
-    return make_uint2(a.x + b.x, a.y + b.y);
-}
-inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
-{
-    a.x += b.x; a.y += b.y;
-}
-inline __host__ __device__ uint2 operator+(uint2 a, uint b)
-{
-    return make_uint2(a.x + b, a.y + b);
-}
-inline __host__ __device__ uint2 operator+(uint b, uint2 a)
-{
-    return make_uint2(a.x + b, a.y + b);
-}
-inline __host__ __device__ void operator+=(uint2 &a, uint b)
-{
-    a.x += b; a.y += b;
-}
-
-
-inline __host__ __device__ float3 operator+(float3 a, float3 b)
-{
-    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline __host__ __device__ void operator+=(float3 &a, float3 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z;
-}
-inline __host__ __device__ float3 operator+(float3 a, float b)
-{
-    return make_float3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ void operator+=(float3 &a, float b)
-{
-    a.x += b; a.y += b; a.z += b;
-}
-
-inline __host__ __device__ int3 operator+(int3 a, int3 b)
-{
-    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline __host__ __device__ void operator+=(int3 &a, int3 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z;
-}
-inline __host__ __device__ int3 operator+(int3 a, int b)
-{
-    return make_int3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ void operator+=(int3 &a, int b)
-{
-    a.x += b; a.y += b; a.z += b;
-}
-
-inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
-{
-    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z;
-}
-inline __host__ __device__ uint3 operator+(uint3 a, uint b)
-{
-    return make_uint3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ void operator+=(uint3 &a, uint b)
-{
-    a.x += b; a.y += b; a.z += b;
-}
-
-inline __host__ __device__ int3 operator+(int b, int3 a)
-{
-    return make_int3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ uint3 operator+(uint b, uint3 a)
-{
-    return make_uint3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ float3 operator+(float b, float3 a)
-{
-    return make_float3(a.x + b, a.y + b, a.z + b);
-}
-
-inline __host__ __device__ float4 operator+(float4 a, float4 b)
-{
-    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ void operator+=(float4 &a, float4 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-inline __host__ __device__ float4 operator+(float4 a, float b)
-{
-    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
-}
-inline __host__ __device__ float4 operator+(float b, float4 a)
-{
-    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
-}
-inline __host__ __device__ void operator+=(float4 &a, float b)
-{
-    a.x += b; a.y += b; a.z += b; a.w += b;
-}
-
-inline __host__ __device__ int4 operator+(int4 a, int4 b)
-{
-    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ void operator+=(int4 &a, int4 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-inline __host__ __device__ int4 operator+(int4 a, int b)
-{
-    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ int4 operator+(int b, int4 a)
-{
-    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ void operator+=(int4 &a, int b)
-{
-    a.x += b; a.y += b; a.z += b; a.w += b;
-}
-
-inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
-{
-    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-inline __host__ __device__ uint4 operator+(uint4 a, uint b)
-{
-    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ uint4 operator+(uint b, uint4 a)
-{
-    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ void operator+=(uint4 &a, uint b)
-{
-    a.x += b; a.y += b; a.z += b; a.w += b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// subtract
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator-(float2 a, float2 b)
-{
-    return make_float2(a.x - b.x, a.y - b.y);
-}
-inline __host__ __device__ void operator-=(float2 &a, float2 b)
-{
-    a.x -= b.x; a.y -= b.y;
-}
-inline __host__ __device__ float2 operator-(float2 a, float b)
-{
-    return make_float2(a.x - b, a.y - b);
-}
-inline __host__ __device__ float2 operator-(float b, float2 a)
-{
-    return make_float2(b - a.x, b - a.y);
-}
-inline __host__ __device__ void operator-=(float2 &a, float b)
-{
-    a.x -= b; a.y -= b;
-}
-
-inline __host__ __device__ int2 operator-(int2 a, int2 b)
-{
-    return make_int2(a.x - b.x, a.y - b.y);
-}
-inline __host__ __device__ void operator-=(int2 &a, int2 b)
-{
-    a.x -= b.x; a.y -= b.y;
-}
-inline __host__ __device__ int2 operator-(int2 a, int b)
-{
-    return make_int2(a.x - b, a.y - b);
-}
-inline __host__ __device__ int2 operator-(int b, int2 a)
-{
-    return make_int2(b - a.x, b - a.y);
-}
-inline __host__ __device__ void operator-=(int2 &a, int b)
-{
-    a.x -= b; a.y -= b;
-}
-
-inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
-{
-    return make_uint2(a.x - b.x, a.y - b.y);
-}
-inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
-{
-    a.x -= b.x; a.y -= b.y;
-}
-inline __host__ __device__ uint2 operator-(uint2 a, uint b)
-{
-    return make_uint2(a.x - b, a.y - b);
-}
-inline __host__ __device__ uint2 operator-(uint b, uint2 a)
-{
-    return make_uint2(b - a.x, b - a.y);
-}
-inline __host__ __device__ void operator-=(uint2 &a, uint b)
-{
-    a.x -= b; a.y -= b;
-}
-
-inline __host__ __device__ float3 operator-(float3 a, float3 b)
-{
-    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline __host__ __device__ void operator-=(float3 &a, float3 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z;
-}
-inline __host__ __device__ float3 operator-(float3 a, float b)
-{
-    return make_float3(a.x - b, a.y - b, a.z - b);
-}
-inline __host__ __device__ float3 operator-(float b, float3 a)
-{
-    return make_float3(b - a.x, b - a.y, b - a.z);
-}
-inline __host__ __device__ void operator-=(float3 &a, float b)
-{
-    a.x -= b; a.y -= b; a.z -= b;
-}
-
-inline __host__ __device__ int3 operator-(int3 a, int3 b)
-{
-    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline __host__ __device__ void operator-=(int3 &a, int3 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z;
-}
-inline __host__ __device__ int3 operator-(int3 a, int b)
-{
-    return make_int3(a.x - b, a.y - b, a.z - b);
-}
-inline __host__ __device__ int3 operator-(int b, int3 a)
-{
-    return make_int3(b - a.x, b - a.y, b - a.z);
-}
-inline __host__ __device__ void operator-=(int3 &a, int b)
-{
-    a.x -= b; a.y -= b; a.z -= b;
-}
-
-inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
-{
-    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z;
-}
-inline __host__ __device__ uint3 operator-(uint3 a, uint b)
-{
-    return make_uint3(a.x - b, a.y - b, a.z - b);
-}
-inline __host__ __device__ uint3 operator-(uint b, uint3 a)
-{
-    return make_uint3(b - a.x, b - a.y, b - a.z);
-}
-inline __host__ __device__ void operator-=(uint3 &a, uint b)
-{
-    a.x -= b; a.y -= b; a.z -= b;
-}
-
-inline __host__ __device__ float4 operator-(float4 a, float4 b)
-{
-    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ void operator-=(float4 &a, float4 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
-}
-inline __host__ __device__ float4 operator-(float4 a, float b)
-{
-    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
-}
-inline __host__ __device__ void operator-=(float4 &a, float b)
-{
-    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
-}
-
-inline __host__ __device__ int4 operator-(int4 a, int4 b)
-{
-    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ void operator-=(int4 &a, int4 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
-}
-inline __host__ __device__ int4 operator-(int4 a, int b)
-{
-    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
-}
-inline __host__ __device__ int4 operator-(int b, int4 a)
-{
-    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
-}
-inline __host__ __device__ void operator-=(int4 &a, int b)
-{
-    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
-}
-
-inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
-{
-    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
-}
-inline __host__ __device__ uint4 operator-(uint4 a, uint b)
-{
-    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
-}
-inline __host__ __device__ uint4 operator-(uint b, uint4 a)
-{
-    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
-}
-inline __host__ __device__ void operator-=(uint4 &a, uint b)
-{
-    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// multiply
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator*(float2 a, float2 b)
-{
-    return make_float2(a.x * b.x, a.y * b.y);
-}
-inline __host__ __device__ void operator*=(float2 &a, float2 b)
-{
-    a.x *= b.x; a.y *= b.y;
-}
-inline __host__ __device__ float2 operator*(float2 a, float b)
-{
-    return make_float2(a.x * b, a.y * b);
-}
-inline __host__ __device__ float2 operator*(float b, float2 a)
-{
-    return make_float2(b * a.x, b * a.y);
-}
-inline __host__ __device__ void operator*=(float2 &a, float b)
-{
-    a.x *= b; a.y *= b;
-}
-
-inline __host__ __device__ int2 operator*(int2 a, int2 b)
-{
-    return make_int2(a.x * b.x, a.y * b.y);
-}
-inline __host__ __device__ void operator*=(int2 &a, int2 b)
-{
-    a.x *= b.x; a.y *= b.y;
-}
-inline __host__ __device__ int2 operator*(int2 a, int b)
-{
-    return make_int2(a.x * b, a.y * b);
-}
-inline __host__ __device__ int2 operator*(int b, int2 a)
-{
-    return make_int2(b * a.x, b * a.y);
-}
-inline __host__ __device__ void operator*=(int2 &a, int b)
-{
-    a.x *= b; a.y *= b;
-}
-
-inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
-{
-    return make_uint2(a.x * b.x, a.y * b.y);
-}
-inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
-{
-    a.x *= b.x; a.y *= b.y;
-}
-inline __host__ __device__ uint2 operator*(uint2 a, uint b)
-{
-    return make_uint2(a.x * b, a.y * b);
-}
-inline __host__ __device__ uint2 operator*(uint b, uint2 a)
-{
-    return make_uint2(b * a.x, b * a.y);
-}
-inline __host__ __device__ void operator*=(uint2 &a, uint b)
-{
-    a.x *= b; a.y *= b;
-}
-
-inline __host__ __device__ float3 operator*(float3 a, float3 b)
-{
-    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline __host__ __device__ void operator*=(float3 &a, float3 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z;
-}
-inline __host__ __device__ float3 operator*(float3 a, float b)
-{
-    return make_float3(a.x * b, a.y * b, a.z * b);
-}
-inline __host__ __device__ float3 operator*(float b, float3 a)
-{
-    return make_float3(b * a.x, b * a.y, b * a.z);
-}
-inline __host__ __device__ void operator*=(float3 &a, float b)
-{
-    a.x *= b; a.y *= b; a.z *= b;
-}
-
-inline __host__ __device__ int3 operator*(int3 a, int3 b)
-{
-    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline __host__ __device__ void operator*=(int3 &a, int3 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z;
-}
-inline __host__ __device__ int3 operator*(int3 a, int b)
-{
-    return make_int3(a.x * b, a.y * b, a.z * b);
-}
-inline __host__ __device__ int3 operator*(int b, int3 a)
-{
-    return make_int3(b * a.x, b * a.y, b * a.z);
-}
-inline __host__ __device__ void operator*=(int3 &a, int b)
-{
-    a.x *= b; a.y *= b; a.z *= b;
-}
-
-inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
-{
-    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z;
-}
-inline __host__ __device__ uint3 operator*(uint3 a, uint b)
-{
-    return make_uint3(a.x * b, a.y * b, a.z * b);
-}
-inline __host__ __device__ uint3 operator*(uint b, uint3 a)
-{
-    return make_uint3(b * a.x, b * a.y, b * a.z);
-}
-inline __host__ __device__ void operator*=(uint3 &a, uint b)
-{
-    a.x *= b; a.y *= b; a.z *= b;
-}
-
-inline __host__ __device__ float4 operator*(float4 a, float4 b)
-{
-    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
-}
-inline __host__ __device__ void operator*=(float4 &a, float4 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
-}
-inline __host__ __device__ float4 operator*(float4 a, float b)
-{
-    return make_float4(a.x * b, a.y * b, a.z * b,  a.w * b);
-}
-inline __host__ __device__ float4 operator*(float b, float4 a)
-{
-    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
-}
-inline __host__ __device__ void operator*=(float4 &a, float b)
-{
-    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
-}
-
-inline __host__ __device__ int4 operator*(int4 a, int4 b)
-{
-    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
-}
-inline __host__ __device__ void operator*=(int4 &a, int4 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
-}
-inline __host__ __device__ int4 operator*(int4 a, int b)
-{
-    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
-}
-inline __host__ __device__ int4 operator*(int b, int4 a)
-{
-    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
-}
-inline __host__ __device__ void operator*=(int4 &a, int b)
-{
-    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
-}
-
-inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
-{
-    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
-}
-inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
-}
-inline __host__ __device__ uint4 operator*(uint4 a, uint b)
-{
-    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
-}
-inline __host__ __device__ uint4 operator*(uint b, uint4 a)
-{
-    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
-}
-inline __host__ __device__ void operator*=(uint4 &a, uint b)
-{
-    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// divide
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator/(float2 a, float2 b)
-{
-    return make_float2(a.x / b.x, a.y / b.y);
-}
-inline __host__ __device__ void operator/=(float2 &a, float2 b)
-{
-    a.x /= b.x; a.y /= b.y;
-}
-inline __host__ __device__ float2 operator/(float2 a, float b)
-{
-    return make_float2(a.x / b, a.y / b);
-}
-inline __host__ __device__ void operator/=(float2 &a, float b)
-{
-    a.x /= b; a.y /= b;
-}
-inline __host__ __device__ float2 operator/(float b, float2 a)
-{
-    return make_float2(b / a.x, b / a.y);
-}
-
-inline __host__ __device__ float3 operator/(float3 a, float3 b)
-{
-    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-}
-inline __host__ __device__ void operator/=(float3 &a, float3 b)
-{
-    a.x /= b.x; a.y /= b.y; a.z /= b.z;
-}
-inline __host__ __device__ float3 operator/(float3 a, float b)
-{
-    return make_float3(a.x / b, a.y / b, a.z / b);
-}
-inline __host__ __device__ void operator/=(float3 &a, float b)
-{
-    a.x /= b; a.y /= b; a.z /= b;
-}
-inline __host__ __device__ float3 operator/(float b, float3 a)
-{
-    return make_float3(b / a.x, b / a.y, b / a.z);
-}
-
-inline __host__ __device__ float4 operator/(float4 a, float4 b)
-{
-    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z,  a.w / b.w);
-}
-inline __host__ __device__ void operator/=(float4 &a, float4 b)
-{
-    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
-}
-inline __host__ __device__ float4 operator/(float4 a, float b)
-{
-    return make_float4(a.x / b, a.y / b, a.z / b,  a.w / b);
-}
-inline __host__ __device__ void operator/=(float4 &a, float b)
-{
-    a.x /= b; a.y /= b; a.z /= b; a.w /= b;
-}
-inline __host__ __device__ float4 operator/(float b, float4 a){
-    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// min
-////////////////////////////////////////////////////////////////////////////////
-
-inline  __host__ __device__ float2 fminf(float2 a, float2 b)
-{
-	return make_float2(fminf(a.x,b.x), fminf(a.y,b.y));
-}
-inline __host__ __device__ float3 fminf(float3 a, float3 b)
-{
-	return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
-}
-inline  __host__ __device__ float4 fminf(float4 a, float4 b)
-{
-	return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
-}
-
-inline __host__ __device__ int2 min(int2 a, int2 b)
-{
-    return make_int2(min(a.x,b.x), min(a.y,b.y));
-}
-inline __host__ __device__ int3 min(int3 a, int3 b)
-{
-    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
-}
-inline __host__ __device__ int4 min(int4 a, int4 b)
-{
-    return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
-}
-
-inline __host__ __device__ uint2 min(uint2 a, uint2 b)
-{
-    return make_uint2(min(a.x,b.x), min(a.y,b.y));
-}
-inline __host__ __device__ uint3 min(uint3 a, uint3 b)
-{
-    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
-}
-inline __host__ __device__ uint4 min(uint4 a, uint4 b)
-{
-    return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// max
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
-{
-	return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y));
-}
-inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
-{
-	return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
-}
-inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
-{
-	return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
-}
-
-inline __host__ __device__ int2 max(int2 a, int2 b)
-{
-    return make_int2(max(a.x,b.x), max(a.y,b.y));
-}
-inline __host__ __device__ int3 max(int3 a, int3 b)
-{
-    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
-}
-inline __host__ __device__ int4 max(int4 a, int4 b)
-{
-    return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
-}
-
-inline __host__ __device__ uint2 max(uint2 a, uint2 b)
-{
-    return make_uint2(max(a.x,b.x), max(a.y,b.y));
-}
-inline __host__ __device__ uint3 max(uint3 a, uint3 b)
-{
-    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
-}
-inline __host__ __device__ uint4 max(uint4 a, uint4 b)
-{
-    return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// lerp
-// - linear interpolation between a and b, based on value t in [0, 1] range
-////////////////////////////////////////////////////////////////////////////////
-
-inline __device__ __host__ float lerp(float a, float b, float t)
-{
-    return a + t*(b-a);
-}
-inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
-{
-    return a + t*(b-a);
-}
-inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
-{
-    return a + t*(b-a);
-}
-inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
-{
-    return a + t*(b-a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// clamp
-// - clamp the value v to be in the range [a, b]
-////////////////////////////////////////////////////////////////////////////////
-
-inline __device__ __host__ float clamp(float f, float a, float b)
-{
-    return fmaxf(a, fminf(f, b));
-}
-inline __device__ __host__ int clamp(int f, int a, int b)
-{
-    return max(a, min(f, b));
-}
-inline __device__ __host__ uint clamp(uint f, uint a, uint b)
-{
-    return max(a, min(f, b));
-}
-
-inline __device__ __host__ float2 clamp(float2 v, float a, float b)
-{
-    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
-}
-inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
-{
-    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
-}
-inline __device__ __host__ float3 clamp(float3 v, float a, float b)
-{
-    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
-{
-    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-inline __device__ __host__ float4 clamp(float4 v, float a, float b)
-{
-    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
-}
-inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
-{
-    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
-}
-
-inline __device__ __host__ int2 clamp(int2 v, int a, int b)
-{
-    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
-}
-inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
-{
-    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
-}
-inline __device__ __host__ int3 clamp(int3 v, int a, int b)
-{
-    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
-{
-    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-inline __device__ __host__ int4 clamp(int4 v, int a, int b)
-{
-    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
-}
-inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
-{
-    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
-}
-
-inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
-{
-    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
-}
-inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
-{
-    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
-}
-inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
-{
-    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
-{
-    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
-{
-    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
-}
-inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
-{
-    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// dot product
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float dot(float2 a, float2 b)
-{ 
-    return a.x * b.x + a.y * b.y;
-}
-inline __host__ __device__ float dot(float3 a, float3 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-inline __host__ __device__ float dot(float4 a, float4 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-inline __host__ __device__ int dot(int2 a, int2 b)
-{ 
-    return a.x * b.x + a.y * b.y;
-}
-inline __host__ __device__ int dot(int3 a, int3 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-inline __host__ __device__ int dot(int4 a, int4 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-inline __host__ __device__ uint dot(uint2 a, uint2 b)
-{ 
-    return a.x * b.x + a.y * b.y;
-}
-inline __host__ __device__ uint dot(uint3 a, uint3 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-inline __host__ __device__ uint dot(uint4 a, uint4 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// length
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float length(float2 v)
-{
-    return sqrtf(dot(v, v));
-}
-inline __host__ __device__ float length(float3 v)
-{
-    return sqrtf(dot(v, v));
-}
-inline __host__ __device__ float length(float4 v)
-{
-    return sqrtf(dot(v, v));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// normalize
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 normalize(float2 v)
-{
-    float invLen = rsqrtf(dot(v, v));
-    return v * invLen;
-}
-inline __host__ __device__ float3 normalize(float3 v)
-{
-    float invLen = rsqrtf(dot(v, v));
-    return v * invLen;
-}
-inline __host__ __device__ float4 normalize(float4 v)
-{
-    float invLen = rsqrtf(dot(v, v));
-    return v * invLen;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// floor
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 floorf(float2 v)
-{
-    return make_float2(floorf(v.x), floorf(v.y));
-}
-inline __host__ __device__ float3 floorf(float3 v)
-{
-    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
-}
-inline __host__ __device__ float4 floorf(float4 v)
-{
-    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// frac - returns the fractional portion of a scalar or each vector component
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float fracf(float v)
-{
-    return v - floorf(v);
-}
-inline __host__ __device__ float2 fracf(float2 v)
-{
-    return make_float2(fracf(v.x), fracf(v.y));
-}
-inline __host__ __device__ float3 fracf(float3 v)
-{
-    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
-}
-inline __host__ __device__ float4 fracf(float4 v)
-{
-    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// fmod
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 fmodf(float2 a, float2 b)
-{
-    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
-}
-inline __host__ __device__ float3 fmodf(float3 a, float3 b)
-{
-    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
-}
-inline __host__ __device__ float4 fmodf(float4 a, float4 b)
-{
-    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// absolute value
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 fabs(float2 v)
-{
-	return make_float2(fabs(v.x), fabs(v.y));
-}
-inline __host__ __device__ float3 fabs(float3 v)
-{
-	return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
-}
-inline __host__ __device__ float4 fabs(float4 v)
-{
-	return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
-}
-
-inline __host__ __device__ int2 abs(int2 v)
-{
-	return make_int2(abs(v.x), abs(v.y));
-}
-inline __host__ __device__ int3 abs(int3 v)
-{
-	return make_int3(abs(v.x), abs(v.y), abs(v.z));
-}
-inline __host__ __device__ int4 abs(int4 v)
-{
-	return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// reflect
-// - returns reflection of incident ray I around surface normal N
-// - N should be normalized, reflected vector's length is equal to length of I
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float3 reflect(float3 i, float3 n)
-{
-	return i - 2.0f * n * dot(n,i);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// cross product
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float3 cross(float3 a, float3 b)
-{ 
-    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// smoothstep
-// - returns 0 if x < a
-// - returns 1 if x > b
-// - otherwise returns smooth interpolation between 0 and 1 based on x
-////////////////////////////////////////////////////////////////////////////////
-
-inline __device__ __host__ float smoothstep(float a, float b, float x)
-{
-	float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(3.0f - (2.0f*y)));
-}
-inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
-{
-	float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
-}
-inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
-{
-	float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
-}
-inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
-{
-	float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
-}
-
-#endif
diff --git a/gpu/utils/include/pcl/gpu/utils/device/cutil_math.h b/gpu/utils/include/pcl/gpu/utils/device/cutil_math.h
deleted file mode 100644
index 746f4d68..00000000
--- a/gpu/utils/include/pcl/gpu/utils/device/cutil_math.h
+++ /dev/null
@@ -1,1328 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/*
-    This file implements common mathematical operations on vector types
-    (float3, float4 etc.) since these are not provided as standard by CUDA.
-
-    The syntax is modelled on the Cg standard library.
-
-    This is part of the CUTIL library and is not supported by NVIDIA.
-
-    Thanks to Linh Hah for additions and fixes.
-*/
-
-#ifndef CUTIL_MATH_H
-#define CUTIL_MATH_H
-
-#include "cuda_runtime.h"
-
-typedef unsigned int uint;
-typedef unsigned short ushort;
-
-#ifndef __CUDACC__
-#include <math.h>
-
-////////////////////////////////////////////////////////////////////////////////
-// host implementations of CUDA functions
-////////////////////////////////////////////////////////////////////////////////
-
-inline float fminf(float a, float b)
-{
-  return a < b ? a : b;
-}
-
-inline float fmaxf(float a, float b)
-{
-  return a > b ? a : b;
-}
-
-inline int max(int a, int b)
-{
-  return a > b ? a : b;
-}
-
-inline int min(int a, int b)
-{
-  return a < b ? a : b;
-}
-
-inline float rsqrtf(float x)
-{
-    return 1.0f / sqrtf(x);
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// constructors
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 make_float2(float s)
-{
-    return make_float2(s, s);
-}
-inline __host__ __device__ float2 make_float2(float3 a)
-{
-    return make_float2(a.x, a.y);
-}
-inline __host__ __device__ float2 make_float2(int2 a)
-{
-    return make_float2(float(a.x), float(a.y));
-}
-inline __host__ __device__ float2 make_float2(uint2 a)
-{
-    return make_float2(float(a.x), float(a.y));
-}
-
-inline __host__ __device__ int2 make_int2(int s)
-{
-    return make_int2(s, s);
-}
-inline __host__ __device__ int2 make_int2(int3 a)
-{
-    return make_int2(a.x, a.y);
-}
-inline __host__ __device__ int2 make_int2(uint2 a)
-{
-    return make_int2(int(a.x), int(a.y));
-}
-inline __host__ __device__ int2 make_int2(float2 a)
-{
-    return make_int2(int(a.x), int(a.y));
-}
-
-inline __host__ __device__ uint2 make_uint2(uint s)
-{
-    return make_uint2(s, s);
-}
-inline __host__ __device__ uint2 make_uint2(uint3 a)
-{
-    return make_uint2(a.x, a.y);
-}
-inline __host__ __device__ uint2 make_uint2(int2 a)
-{
-    return make_uint2(uint(a.x), uint(a.y));
-}
-
-inline __host__ __device__ float3 make_float3(float s)
-{
-    return make_float3(s, s, s);
-}
-inline __host__ __device__ float3 make_float3(float2 a)
-{
-    return make_float3(a.x, a.y, 0.0f);
-}
-inline __host__ __device__ float3 make_float3(float2 a, float s)
-{
-    return make_float3(a.x, a.y, s);
-}
-inline __host__ __device__ float3 make_float3(float4 a)
-{
-    return make_float3(a.x, a.y, a.z);
-}
-inline __host__ __device__ float3 make_float3(int3 a)
-{
-    return make_float3(float(a.x), float(a.y), float(a.z));
-}
-inline __host__ __device__ float3 make_float3(uint3 a)
-{
-    return make_float3(float(a.x), float(a.y), float(a.z));
-}
-
-inline __host__ __device__ int3 make_int3(int s)
-{
-    return make_int3(s, s, s);
-}
-inline __host__ __device__ int3 make_int3(int2 a)
-{
-    return make_int3(a.x, a.y, 0);
-}
-inline __host__ __device__ int3 make_int3(int2 a, int s)
-{
-    return make_int3(a.x, a.y, s);
-}
-inline __host__ __device__ int3 make_int3(uint3 a)
-{
-    return make_int3(int(a.x), int(a.y), int(a.z));
-}
-inline __host__ __device__ int3 make_int3(float3 a)
-{
-    return make_int3(int(a.x), int(a.y), int(a.z));
-}
-
-inline __host__ __device__ uint3 make_uint3(uint s)
-{
-    return make_uint3(s, s, s);
-}
-inline __host__ __device__ uint3 make_uint3(uint2 a)
-{
-    return make_uint3(a.x, a.y, 0);
-}
-inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
-{
-    return make_uint3(a.x, a.y, s);
-}
-inline __host__ __device__ uint3 make_uint3(uint4 a)
-{
-    return make_uint3(a.x, a.y, a.z);
-}
-inline __host__ __device__ uint3 make_uint3(int3 a)
-{
-    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
-}
-
-inline __host__ __device__ float4 make_float4(float s)
-{
-    return make_float4(s, s, s, s);
-}
-inline __host__ __device__ float4 make_float4(float3 a)
-{
-    return make_float4(a.x, a.y, a.z, 0.0f);
-}
-inline __host__ __device__ float4 make_float4(float3 a, float w)
-{
-    return make_float4(a.x, a.y, a.z, w);
-}
-inline __host__ __device__ float4 make_float4(int4 a)
-{
-    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
-}
-inline __host__ __device__ float4 make_float4(uint4 a)
-{
-    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
-}
-
-inline __host__ __device__ int4 make_int4(int s)
-{
-    return make_int4(s, s, s, s);
-}
-inline __host__ __device__ int4 make_int4(int3 a)
-{
-    return make_int4(a.x, a.y, a.z, 0);
-}
-inline __host__ __device__ int4 make_int4(int3 a, int w)
-{
-    return make_int4(a.x, a.y, a.z, w);
-}
-inline __host__ __device__ int4 make_int4(uint4 a)
-{
-    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
-}
-inline __host__ __device__ int4 make_int4(float4 a)
-{
-    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
-}
-
-
-inline __host__ __device__ uint4 make_uint4(uint s)
-{
-    return make_uint4(s, s, s, s);
-}
-inline __host__ __device__ uint4 make_uint4(uint3 a)
-{
-    return make_uint4(a.x, a.y, a.z, 0);
-}
-inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
-{
-    return make_uint4(a.x, a.y, a.z, w);
-}
-inline __host__ __device__ uint4 make_uint4(int4 a)
-{
-    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// negate
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator-(float2 &a)
-{
-    return make_float2(-a.x, -a.y);
-}
-inline __host__ __device__ int2 operator-(int2 &a)
-{
-    return make_int2(-a.x, -a.y);
-}
-inline __host__ __device__ float3 operator-(float3 &a)
-{
-    return make_float3(-a.x, -a.y, -a.z);
-}
-inline __host__ __device__ int3 operator-(int3 &a)
-{
-    return make_int3(-a.x, -a.y, -a.z);
-}
-inline __host__ __device__ float4 operator-(float4 &a)
-{
-    return make_float4(-a.x, -a.y, -a.z, -a.w);
-}
-inline __host__ __device__ int4 operator-(int4 &a)
-{
-    return make_int4(-a.x, -a.y, -a.z, -a.w);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// addition
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator+(float2 a, float2 b)
-{
-    return make_float2(a.x + b.x, a.y + b.y);
-}
-inline __host__ __device__ void operator+=(float2 &a, float2 b)
-{
-    a.x += b.x; a.y += b.y;
-}
-inline __host__ __device__ float2 operator+(float2 a, float b)
-{
-    return make_float2(a.x + b, a.y + b);
-}
-inline __host__ __device__ float2 operator+(float b, float2 a)
-{
-    return make_float2(a.x + b, a.y + b);
-}
-inline __host__ __device__ void operator+=(float2 &a, float b)
-{
-    a.x += b; a.y += b;
-}
-
-inline __host__ __device__ int2 operator+(int2 a, int2 b)
-{
-    return make_int2(a.x + b.x, a.y + b.y);
-}
-inline __host__ __device__ void operator+=(int2 &a, int2 b)
-{
-    a.x += b.x; a.y += b.y;
-}
-inline __host__ __device__ int2 operator+(int2 a, int b)
-{
-    return make_int2(a.x + b, a.y + b);
-}
-inline __host__ __device__ int2 operator+(int b, int2 a)
-{
-    return make_int2(a.x + b, a.y + b);
-}
-inline __host__ __device__ void operator+=(int2 &a, int b)
-{
-    a.x += b; a.y += b;
-}
-
-inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
-{
-    return make_uint2(a.x + b.x, a.y + b.y);
-}
-inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
-{
-    a.x += b.x; a.y += b.y;
-}
-inline __host__ __device__ uint2 operator+(uint2 a, uint b)
-{
-    return make_uint2(a.x + b, a.y + b);
-}
-inline __host__ __device__ uint2 operator+(uint b, uint2 a)
-{
-    return make_uint2(a.x + b, a.y + b);
-}
-inline __host__ __device__ void operator+=(uint2 &a, uint b)
-{
-    a.x += b; a.y += b;
-}
-
-
-inline __host__ __device__ float3 operator+(float3 a, float3 b)
-{
-    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline __host__ __device__ void operator+=(float3 &a, float3 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z;
-}
-inline __host__ __device__ float3 operator+(float3 a, float b)
-{
-    return make_float3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ void operator+=(float3 &a, float b)
-{
-    a.x += b; a.y += b; a.z += b;
-}
-
-inline __host__ __device__ int3 operator+(int3 a, int3 b)
-{
-    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline __host__ __device__ void operator+=(int3 &a, int3 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z;
-}
-inline __host__ __device__ int3 operator+(int3 a, int b)
-{
-    return make_int3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ void operator+=(int3 &a, int b)
-{
-    a.x += b; a.y += b; a.z += b;
-}
-
-inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
-{
-    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z;
-}
-inline __host__ __device__ uint3 operator+(uint3 a, uint b)
-{
-    return make_uint3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ void operator+=(uint3 &a, uint b)
-{
-    a.x += b; a.y += b; a.z += b;
-}
-
-inline __host__ __device__ int3 operator+(int b, int3 a)
-{
-    return make_int3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ uint3 operator+(uint b, uint3 a)
-{
-    return make_uint3(a.x + b, a.y + b, a.z + b);
-}
-inline __host__ __device__ float3 operator+(float b, float3 a)
-{
-    return make_float3(a.x + b, a.y + b, a.z + b);
-}
-
-inline __host__ __device__ float4 operator+(float4 a, float4 b)
-{
-    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ void operator+=(float4 &a, float4 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-inline __host__ __device__ float4 operator+(float4 a, float b)
-{
-    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
-}
-inline __host__ __device__ float4 operator+(float b, float4 a)
-{
-    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
-}
-inline __host__ __device__ void operator+=(float4 &a, float b)
-{
-    a.x += b; a.y += b; a.z += b; a.w += b;
-}
-
-inline __host__ __device__ int4 operator+(int4 a, int4 b)
-{
-    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ void operator+=(int4 &a, int4 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-inline __host__ __device__ int4 operator+(int4 a, int b)
-{
-    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ int4 operator+(int b, int4 a)
-{
-    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ void operator+=(int4 &a, int b)
-{
-    a.x += b; a.y += b; a.z += b; a.w += b;
-}
-
-inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
-{
-    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
-}
-inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
-{
-    a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
-}
-inline __host__ __device__ uint4 operator+(uint4 a, uint b)
-{
-    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ uint4 operator+(uint b, uint4 a)
-{
-    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
-}
-inline __host__ __device__ void operator+=(uint4 &a, uint b)
-{
-    a.x += b; a.y += b; a.z += b; a.w += b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// subtract
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator-(float2 a, float2 b)
-{
-    return make_float2(a.x - b.x, a.y - b.y);
-}
-inline __host__ __device__ void operator-=(float2 &a, float2 b)
-{
-    a.x -= b.x; a.y -= b.y;
-}
-inline __host__ __device__ float2 operator-(float2 a, float b)
-{
-    return make_float2(a.x - b, a.y - b);
-}
-inline __host__ __device__ float2 operator-(float b, float2 a)
-{
-    return make_float2(b - a.x, b - a.y);
-}
-inline __host__ __device__ void operator-=(float2 &a, float b)
-{
-    a.x -= b; a.y -= b;
-}
-
-inline __host__ __device__ int2 operator-(int2 a, int2 b)
-{
-    return make_int2(a.x - b.x, a.y - b.y);
-}
-inline __host__ __device__ void operator-=(int2 &a, int2 b)
-{
-    a.x -= b.x; a.y -= b.y;
-}
-inline __host__ __device__ int2 operator-(int2 a, int b)
-{
-    return make_int2(a.x - b, a.y - b);
-}
-inline __host__ __device__ int2 operator-(int b, int2 a)
-{
-    return make_int2(b - a.x, b - a.y);
-}
-inline __host__ __device__ void operator-=(int2 &a, int b)
-{
-    a.x -= b; a.y -= b;
-}
-
-inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
-{
-    return make_uint2(a.x - b.x, a.y - b.y);
-}
-inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
-{
-    a.x -= b.x; a.y -= b.y;
-}
-inline __host__ __device__ uint2 operator-(uint2 a, uint b)
-{
-    return make_uint2(a.x - b, a.y - b);
-}
-inline __host__ __device__ uint2 operator-(uint b, uint2 a)
-{
-    return make_uint2(b - a.x, b - a.y);
-}
-inline __host__ __device__ void operator-=(uint2 &a, uint b)
-{
-    a.x -= b; a.y -= b;
-}
-
-inline __host__ __device__ float3 operator-(float3 a, float3 b)
-{
-    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline __host__ __device__ void operator-=(float3 &a, float3 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z;
-}
-inline __host__ __device__ float3 operator-(float3 a, float b)
-{
-    return make_float3(a.x - b, a.y - b, a.z - b);
-}
-inline __host__ __device__ float3 operator-(float b, float3 a)
-{
-    return make_float3(b - a.x, b - a.y, b - a.z);
-}
-inline __host__ __device__ void operator-=(float3 &a, float b)
-{
-    a.x -= b; a.y -= b; a.z -= b;
-}
-
-inline __host__ __device__ int3 operator-(int3 a, int3 b)
-{
-    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline __host__ __device__ void operator-=(int3 &a, int3 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z;
-}
-inline __host__ __device__ int3 operator-(int3 a, int b)
-{
-    return make_int3(a.x - b, a.y - b, a.z - b);
-}
-inline __host__ __device__ int3 operator-(int b, int3 a)
-{
-    return make_int3(b - a.x, b - a.y, b - a.z);
-}
-inline __host__ __device__ void operator-=(int3 &a, int b)
-{
-    a.x -= b; a.y -= b; a.z -= b;
-}
-
-inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
-{
-    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z;
-}
-inline __host__ __device__ uint3 operator-(uint3 a, uint b)
-{
-    return make_uint3(a.x - b, a.y - b, a.z - b);
-}
-inline __host__ __device__ uint3 operator-(uint b, uint3 a)
-{
-    return make_uint3(b - a.x, b - a.y, b - a.z);
-}
-inline __host__ __device__ void operator-=(uint3 &a, uint b)
-{
-    a.x -= b; a.y -= b; a.z -= b;
-}
-
-inline __host__ __device__ float4 operator-(float4 a, float4 b)
-{
-    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ void operator-=(float4 &a, float4 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
-}
-inline __host__ __device__ float4 operator-(float4 a, float b)
-{
-    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
-}
-inline __host__ __device__ void operator-=(float4 &a, float b)
-{
-    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
-}
-
-inline __host__ __device__ int4 operator-(int4 a, int4 b)
-{
-    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ void operator-=(int4 &a, int4 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
-}
-inline __host__ __device__ int4 operator-(int4 a, int b)
-{
-    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
-}
-inline __host__ __device__ int4 operator-(int b, int4 a)
-{
-    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
-}
-inline __host__ __device__ void operator-=(int4 &a, int b)
-{
-    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
-}
-
-inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
-{
-    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
-}
-inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
-{
-    a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
-}
-inline __host__ __device__ uint4 operator-(uint4 a, uint b)
-{
-    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
-}
-inline __host__ __device__ uint4 operator-(uint b, uint4 a)
-{
-    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
-}
-inline __host__ __device__ void operator-=(uint4 &a, uint b)
-{
-    a.x -= b; a.y -= b; a.z -= b; a.w -= b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// multiply
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator*(float2 a, float2 b)
-{
-    return make_float2(a.x * b.x, a.y * b.y);
-}
-inline __host__ __device__ void operator*=(float2 &a, float2 b)
-{
-    a.x *= b.x; a.y *= b.y;
-}
-inline __host__ __device__ float2 operator*(float2 a, float b)
-{
-    return make_float2(a.x * b, a.y * b);
-}
-inline __host__ __device__ float2 operator*(float b, float2 a)
-{
-    return make_float2(b * a.x, b * a.y);
-}
-inline __host__ __device__ void operator*=(float2 &a, float b)
-{
-    a.x *= b; a.y *= b;
-}
-
-inline __host__ __device__ int2 operator*(int2 a, int2 b)
-{
-    return make_int2(a.x * b.x, a.y * b.y);
-}
-inline __host__ __device__ void operator*=(int2 &a, int2 b)
-{
-    a.x *= b.x; a.y *= b.y;
-}
-inline __host__ __device__ int2 operator*(int2 a, int b)
-{
-    return make_int2(a.x * b, a.y * b);
-}
-inline __host__ __device__ int2 operator*(int b, int2 a)
-{
-    return make_int2(b * a.x, b * a.y);
-}
-inline __host__ __device__ void operator*=(int2 &a, int b)
-{
-    a.x *= b; a.y *= b;
-}
-
-inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
-{
-    return make_uint2(a.x * b.x, a.y * b.y);
-}
-inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
-{
-    a.x *= b.x; a.y *= b.y;
-}
-inline __host__ __device__ uint2 operator*(uint2 a, uint b)
-{
-    return make_uint2(a.x * b, a.y * b);
-}
-inline __host__ __device__ uint2 operator*(uint b, uint2 a)
-{
-    return make_uint2(b * a.x, b * a.y);
-}
-inline __host__ __device__ void operator*=(uint2 &a, uint b)
-{
-    a.x *= b; a.y *= b;
-}
-
-inline __host__ __device__ float3 operator*(float3 a, float3 b)
-{
-    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline __host__ __device__ void operator*=(float3 &a, float3 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z;
-}
-inline __host__ __device__ float3 operator*(float3 a, float b)
-{
-    return make_float3(a.x * b, a.y * b, a.z * b);
-}
-inline __host__ __device__ float3 operator*(float b, float3 a)
-{
-    return make_float3(b * a.x, b * a.y, b * a.z);
-}
-inline __host__ __device__ void operator*=(float3 &a, float b)
-{
-    a.x *= b; a.y *= b; a.z *= b;
-}
-
-inline __host__ __device__ int3 operator*(int3 a, int3 b)
-{
-    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline __host__ __device__ void operator*=(int3 &a, int3 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z;
-}
-inline __host__ __device__ int3 operator*(int3 a, int b)
-{
-    return make_int3(a.x * b, a.y * b, a.z * b);
-}
-inline __host__ __device__ int3 operator*(int b, int3 a)
-{
-    return make_int3(b * a.x, b * a.y, b * a.z);
-}
-inline __host__ __device__ void operator*=(int3 &a, int b)
-{
-    a.x *= b; a.y *= b; a.z *= b;
-}
-
-inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
-{
-    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
-}
-inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z;
-}
-inline __host__ __device__ uint3 operator*(uint3 a, uint b)
-{
-    return make_uint3(a.x * b, a.y * b, a.z * b);
-}
-inline __host__ __device__ uint3 operator*(uint b, uint3 a)
-{
-    return make_uint3(b * a.x, b * a.y, b * a.z);
-}
-inline __host__ __device__ void operator*=(uint3 &a, uint b)
-{
-    a.x *= b; a.y *= b; a.z *= b;
-}
-
-inline __host__ __device__ float4 operator*(float4 a, float4 b)
-{
-    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
-}
-inline __host__ __device__ void operator*=(float4 &a, float4 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
-}
-inline __host__ __device__ float4 operator*(float4 a, float b)
-{
-    return make_float4(a.x * b, a.y * b, a.z * b,  a.w * b);
-}
-inline __host__ __device__ float4 operator*(float b, float4 a)
-{
-    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
-}
-inline __host__ __device__ void operator*=(float4 &a, float b)
-{
-    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
-}
-
-inline __host__ __device__ int4 operator*(int4 a, int4 b)
-{
-    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
-}
-inline __host__ __device__ void operator*=(int4 &a, int4 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
-}
-inline __host__ __device__ int4 operator*(int4 a, int b)
-{
-    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
-}
-inline __host__ __device__ int4 operator*(int b, int4 a)
-{
-    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
-}
-inline __host__ __device__ void operator*=(int4 &a, int b)
-{
-    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
-}
-
-inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
-{
-    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
-}
-inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
-{
-    a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
-}
-inline __host__ __device__ uint4 operator*(uint4 a, uint b)
-{
-    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
-}
-inline __host__ __device__ uint4 operator*(uint b, uint4 a)
-{
-    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
-}
-inline __host__ __device__ void operator*=(uint4 &a, uint b)
-{
-    a.x *= b; a.y *= b; a.z *= b; a.w *= b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// divide
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 operator/(float2 a, float2 b)
-{
-    return make_float2(a.x / b.x, a.y / b.y);
-}
-inline __host__ __device__ void operator/=(float2 &a, float2 b)
-{
-    a.x /= b.x; a.y /= b.y;
-}
-inline __host__ __device__ float2 operator/(float2 a, float b)
-{
-    return make_float2(a.x / b, a.y / b);
-}
-inline __host__ __device__ void operator/=(float2 &a, float b)
-{
-    a.x /= b; a.y /= b;
-}
-inline __host__ __device__ float2 operator/(float b, float2 a)
-{
-    return make_float2(b / a.x, b / a.y);
-}
-
-inline __host__ __device__ float3 operator/(float3 a, float3 b)
-{
-    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-}
-inline __host__ __device__ void operator/=(float3 &a, float3 b)
-{
-    a.x /= b.x; a.y /= b.y; a.z /= b.z;
-}
-inline __host__ __device__ float3 operator/(float3 a, float b)
-{
-    return make_float3(a.x / b, a.y / b, a.z / b);
-}
-inline __host__ __device__ void operator/=(float3 &a, float b)
-{
-    a.x /= b; a.y /= b; a.z /= b;
-}
-inline __host__ __device__ float3 operator/(float b, float3 a)
-{
-    return make_float3(b / a.x, b / a.y, b / a.z);
-}
-
-inline __host__ __device__ float4 operator/(float4 a, float4 b)
-{
-    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z,  a.w / b.w);
-}
-inline __host__ __device__ void operator/=(float4 &a, float4 b)
-{
-    a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
-}
-inline __host__ __device__ float4 operator/(float4 a, float b)
-{
-    return make_float4(a.x / b, a.y / b, a.z / b,  a.w / b);
-}
-inline __host__ __device__ void operator/=(float4 &a, float b)
-{
-    a.x /= b; a.y /= b; a.z /= b; a.w /= b;
-}
-inline __host__ __device__ float4 operator/(float b, float4 a){
-    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// min
-////////////////////////////////////////////////////////////////////////////////
-
-inline  __host__ __device__ float2 fminf(float2 a, float2 b)
-{
-	return make_float2(fminf(a.x,b.x), fminf(a.y,b.y));
-}
-inline __host__ __device__ float3 fminf(float3 a, float3 b)
-{
-	return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z));
-}
-inline  __host__ __device__ float4 fminf(float4 a, float4 b)
-{
-	return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w));
-}
-
-inline __host__ __device__ int2 min(int2 a, int2 b)
-{
-    return make_int2(min(a.x,b.x), min(a.y,b.y));
-}
-inline __host__ __device__ int3 min(int3 a, int3 b)
-{
-    return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
-}
-inline __host__ __device__ int4 min(int4 a, int4 b)
-{
-    return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
-}
-
-inline __host__ __device__ uint2 min(uint2 a, uint2 b)
-{
-    return make_uint2(min(a.x,b.x), min(a.y,b.y));
-}
-inline __host__ __device__ uint3 min(uint3 a, uint3 b)
-{
-    return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z));
-}
-inline __host__ __device__ uint4 min(uint4 a, uint4 b)
-{
-    return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// max
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 fmaxf(float2 a, float2 b)
-{
-	return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y));
-}
-inline __host__ __device__ float3 fmaxf(float3 a, float3 b)
-{
-	return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z));
-}
-inline __host__ __device__ float4 fmaxf(float4 a, float4 b)
-{
-	return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w));
-}
-
-inline __host__ __device__ int2 max(int2 a, int2 b)
-{
-    return make_int2(max(a.x,b.x), max(a.y,b.y));
-}
-inline __host__ __device__ int3 max(int3 a, int3 b)
-{
-    return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
-}
-inline __host__ __device__ int4 max(int4 a, int4 b)
-{
-    return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
-}
-
-inline __host__ __device__ uint2 max(uint2 a, uint2 b)
-{
-    return make_uint2(max(a.x,b.x), max(a.y,b.y));
-}
-inline __host__ __device__ uint3 max(uint3 a, uint3 b)
-{
-    return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z));
-}
-inline __host__ __device__ uint4 max(uint4 a, uint4 b)
-{
-    return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// lerp
-// - linear interpolation between a and b, based on value t in [0, 1] range
-////////////////////////////////////////////////////////////////////////////////
-
-inline __device__ __host__ float lerp(float a, float b, float t)
-{
-    return a + t*(b-a);
-}
-inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
-{
-    return a + t*(b-a);
-}
-inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
-{
-    return a + t*(b-a);
-}
-inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
-{
-    return a + t*(b-a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// clamp
-// - clamp the value v to be in the range [a, b]
-////////////////////////////////////////////////////////////////////////////////
-
-inline __device__ __host__ float clamp(float f, float a, float b)
-{
-    return fmaxf(a, fminf(f, b));
-}
-inline __device__ __host__ int clamp(int f, int a, int b)
-{
-    return max(a, min(f, b));
-}
-inline __device__ __host__ uint clamp(uint f, uint a, uint b)
-{
-    return max(a, min(f, b));
-}
-
-inline __device__ __host__ float2 clamp(float2 v, float a, float b)
-{
-    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
-}
-inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
-{
-    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
-}
-inline __device__ __host__ float3 clamp(float3 v, float a, float b)
-{
-    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
-{
-    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-inline __device__ __host__ float4 clamp(float4 v, float a, float b)
-{
-    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
-}
-inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b)
-{
-    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
-}
-
-inline __device__ __host__ int2 clamp(int2 v, int a, int b)
-{
-    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
-}
-inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b)
-{
-    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
-}
-inline __device__ __host__ int3 clamp(int3 v, int a, int b)
-{
-    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b)
-{
-    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-inline __device__ __host__ int4 clamp(int4 v, int a, int b)
-{
-    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
-}
-inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b)
-{
-    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
-}
-
-inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b)
-{
-    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
-}
-inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b)
-{
-    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
-}
-inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b)
-{
-    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b)
-{
-    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b)
-{
-    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
-}
-inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b)
-{
-    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// dot product
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float dot(float2 a, float2 b)
-{ 
-    return a.x * b.x + a.y * b.y;
-}
-inline __host__ __device__ float dot(float3 a, float3 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-inline __host__ __device__ float dot(float4 a, float4 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-inline __host__ __device__ int dot(int2 a, int2 b)
-{ 
-    return a.x * b.x + a.y * b.y;
-}
-inline __host__ __device__ int dot(int3 a, int3 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-inline __host__ __device__ int dot(int4 a, int4 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-inline __host__ __device__ uint dot(uint2 a, uint2 b)
-{ 
-    return a.x * b.x + a.y * b.y;
-}
-inline __host__ __device__ uint dot(uint3 a, uint3 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-inline __host__ __device__ uint dot(uint4 a, uint4 b)
-{ 
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// length
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float length(float2 v)
-{
-    return sqrtf(dot(v, v));
-}
-inline __host__ __device__ float length(float3 v)
-{
-    return sqrtf(dot(v, v));
-}
-inline __host__ __device__ float length(float4 v)
-{
-    return sqrtf(dot(v, v));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// normalize
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 normalize(float2 v)
-{
-    float invLen = rsqrtf(dot(v, v));
-    return v * invLen;
-}
-inline __host__ __device__ float3 normalize(float3 v)
-{
-    float invLen = rsqrtf(dot(v, v));
-    return v * invLen;
-}
-inline __host__ __device__ float4 normalize(float4 v)
-{
-    float invLen = rsqrtf(dot(v, v));
-    return v * invLen;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// floor
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 floorf(float2 v)
-{
-    return make_float2(floorf(v.x), floorf(v.y));
-}
-inline __host__ __device__ float3 floorf(float3 v)
-{
-    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
-}
-inline __host__ __device__ float4 floorf(float4 v)
-{
-    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// frac - returns the fractional portion of a scalar or each vector component
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float fracf(float v)
-{
-    return v - floorf(v);
-}
-inline __host__ __device__ float2 fracf(float2 v)
-{
-    return make_float2(fracf(v.x), fracf(v.y));
-}
-inline __host__ __device__ float3 fracf(float3 v)
-{
-    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
-}
-inline __host__ __device__ float4 fracf(float4 v)
-{
-    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// fmod
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 fmodf(float2 a, float2 b)
-{
-    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
-}
-inline __host__ __device__ float3 fmodf(float3 a, float3 b)
-{
-    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
-}
-inline __host__ __device__ float4 fmodf(float4 a, float4 b)
-{
-    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// absolute value
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float2 fabs(float2 v)
-{
-	return make_float2(fabs(v.x), fabs(v.y));
-}
-inline __host__ __device__ float3 fabs(float3 v)
-{
-	return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
-}
-inline __host__ __device__ float4 fabs(float4 v)
-{
-	return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
-}
-
-inline __host__ __device__ int2 abs(int2 v)
-{
-	return make_int2(abs(v.x), abs(v.y));
-}
-inline __host__ __device__ int3 abs(int3 v)
-{
-	return make_int3(abs(v.x), abs(v.y), abs(v.z));
-}
-inline __host__ __device__ int4 abs(int4 v)
-{
-	return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// reflect
-// - returns reflection of incident ray I around surface normal N
-// - N should be normalized, reflected vector's length is equal to length of I
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float3 reflect(float3 i, float3 n)
-{
-	return i - 2.0f * n * dot(n,i);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// cross product
-////////////////////////////////////////////////////////////////////////////////
-
-inline __host__ __device__ float3 cross(float3 a, float3 b)
-{ 
-    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// smoothstep
-// - returns 0 if x < a
-// - returns 1 if x > b
-// - otherwise returns smooth interpolation between 0 and 1 based on x
-////////////////////////////////////////////////////////////////////////////////
-
-inline __device__ __host__ float smoothstep(float a, float b, float x)
-{
-	float y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(3.0f - (2.0f*y)));
-}
-inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x)
-{
-	float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y)));
-}
-inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x)
-{
-	float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y)));
-}
-inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x)
-{
-	float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f);
-	return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y)));
-}
-
-#endif
diff --git a/registration/include/pcl/registration/ia_fpcs.h b/registration/include/pcl/registration/ia_fpcs.h
deleted file mode 100644
index eeb47782..00000000
--- a/registration/include/pcl/registration/ia_fpcs.h
+++ /dev/null
@@ -1,571 +0,0 @@
-/*
- * Software License Agreement (BSD License)
- *
- *  Point Cloud Library (PCL) - www.pointclouds.org
- *  Copyright (c) 2014-, Open Perception, Inc.
- *  Copyright (C) 2008 Ben Gurion University of the Negev, Beer Sheva, Israel.
- *
- *  All rights reserved
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met
- *
- *   * The use for research only (no for any commercial application).
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer in the documentation and/or other materials provided
- *     with the distribution.
- *   * Neither the name of the copyright holder(s) nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
-  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *  POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef PCL_REGISTRATION_IA_FPCS_H_
-#define PCL_REGISTRATION_IA_FPCS_H_
-
-#include <pcl/common/common.h>
-#include <pcl/registration/registration.h>
-#include <pcl/registration/matching_candidate.h>
-
-namespace pcl
-{
-  /** \brief Compute the mean point density of a given point cloud.
-    * \param[in] cloud pointer to the input point cloud
-    * \param[in] max_dist maximum distance of a point to be considered as a neighbor
-    * \param[in] nr_threads number of threads to use (default = 1, only used if OpenMP flag is set)
-    * \return the mean point density of a given point cloud
-    */
-  template <typename PointT> inline float
-  getMeanPointDensity (const typename pcl::PointCloud<PointT>::ConstPtr &cloud, float max_dist, int nr_threads = 1);
-
-  /** \brief Compute the mean point density of a given point cloud.
-    * \param[in] cloud pointer to the input point cloud
-    * \param[in] indices the vector of point indices to use from \a cloud
-    * \param[in] max_dist maximum distance of a point to be considered as a neighbor
-    * \param[in] nr_threads number of threads to use (default = 1, only used if OpenMP flag is set)
-    * \return the mean point density of a given point cloud
-    */
-  template <typename PointT> inline float
-  getMeanPointDensity (const typename pcl::PointCloud<PointT>::ConstPtr &cloud, const std::vector <int> &indices,
-    float max_dist, int nr_threads = 1);
-  
-  
-  namespace registration
-  {
-    /** \brief FPCSInitialAlignment computes corresponding four point congruent sets as described in:
-    * "4-points congruent sets for robust pairwise surface registration", Dror Aiger, Niloy Mitra, Daniel Cohen-Or.
-    * ACM Transactions on Graphics, vol. 27(3), 2008
-    * \author P.W.Theiler
-    * \ingroup registration
-    */
-    template <typename PointSource, typename PointTarget, typename NormalT = pcl::Normal, typename Scalar = float>
-    class FPCSInitialAlignment : public Registration <PointSource, PointTarget, Scalar>
-    {
-    public:
-      /** \cond */
-      typedef boost::shared_ptr <FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar> > Ptr;
-      typedef boost::shared_ptr <const FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar> > ConstPtr;
-
-      typedef pcl::search::KdTree<PointSource> KdTreeReciprocal;
-      typedef typename KdTreeReciprocal::Ptr KdTreeReciprocalPtr;
-
-      typedef pcl::PointCloud <PointTarget> PointCloudTarget;
-      typedef pcl::PointCloud <PointSource> PointCloudSource;
-      typedef typename PointCloudSource::Ptr PointCloudSourcePtr;
-      typedef typename PointCloudSource::iterator PointCloudSourceIterator;      
-
-      typedef pcl::PointCloud <NormalT> Normals;
-      typedef typename Normals::ConstPtr NormalsConstPtr;
-
-      typedef pcl::registration::MatchingCandidate MatchingCandidate;
-      typedef pcl::registration::MatchingCandidates MatchingCandidates;
-      /** \endcond */
-
-
-      /** \brief Constructor.
-        * Resets the maximum number of iterations to 0 thus forcing an internal computation if not set by the user.
-        * Sets the number of RANSAC iterations to 1000 and the standard transformation estimation to TransformationEstimation3Point.
-        */
-      FPCSInitialAlignment ();
-
-      /** \brief Destructor. */
-      virtual ~FPCSInitialAlignment ()
-      {};
-
-
-      /** \brief Provide a pointer to the vector of target indices.
-        * \param[in] target_indices a pointer to the target indices
-        */
-      inline void
-      setTargetIndices (const IndicesPtr &target_indices)
-      {
-        target_indices_ = target_indices;
-      };
-
-      /** \return a pointer to the vector of target indices. */
-      inline IndicesPtr
-      getTargetIndices () const
-      {
-        return (target_indices_);
-      };
-
-
-      /** \brief Provide a pointer to the normals of the source point cloud.
-        * \param[in] source_normals pointer to the normals of the source pointer cloud.
-        */
-      inline void
-      setSourceNormals (const NormalsConstPtr &source_normals)
-      {
-        source_normals_ = source_normals;
-      };
-
-      /** \return the normals of the source point cloud. */
-      inline NormalsConstPtr
-      getSourceNormals () const
-      {
-        return (source_normals_);
-      };
-
-
-      /** \brief Provide a pointer to the normals of the target point cloud.
-        * \param[in] target_normals point to the normals of the target point cloud.
-        */
-      inline void
-      setTargetNormals (const NormalsConstPtr &target_normals)
-      {
-        target_normals_ = target_normals;
-      };
-
-      /** \return the normals of the target point cloud. */
-      inline NormalsConstPtr
-      getTargetNormals () const
-      {
-        return (target_normals_);
-      };
-
-
-      /** \brief Set the number of used threads if OpenMP is activated.
-        * \param[in] nr_threads the number of used threads
-        */
-      inline void
-      setNumberOfThreads (int nr_threads)
-      {
-        nr_threads_ = nr_threads;
-      };
-
-      /** \return the number of threads used if OpenMP is activated. */
-      inline int
-      getNumberOfThreads () const
-      {
-        return (nr_threads_);
-      };
-
-
-      /** \brief Set the constant factor delta which weights the internally calculated parameters.
-        * \param[in] delta the weight factor delta
-        * \param[in] normalize flag if delta should be normalized according to point cloud density
-        */
-      inline void
-      setDelta (float delta, bool normalize = false)
-      {
-        delta_ = delta;
-        normalize_delta_ = normalize;
-      };
-
-      /** \return the constant factor delta which weights the internally calculated parameters. */
-      inline float
-      getDelta () const
-      {
-        return (delta_);
-      };
-
-
-      /** \brief Set the approximate overlap between source and target.
-        * \param[in] approx_overlap the estimated overlap
-        */
-      inline void
-      setApproxOverlap (float approx_overlap)
-      {
-        approx_overlap_ = approx_overlap;
-      };
-
-      /** \return the approximated overlap between source and target. */
-      inline float
-      getApproxOverlap () const
-      {
-        return (approx_overlap_);
-      };
-
-
-      /** \brief Set the scoring threshold used for early finishing the method.
-        * \param[in] score_threshold early terminating score criteria
-        */
-      inline void
-      setScoreThreshold (float score_threshold)
-      {
-        score_threshold_ = score_threshold;
-      };
-
-      /** \return the scoring threshold used for early finishing the method. */
-      inline float
-      getScoreThreshold () const
-      {
-        return (score_threshold_);
-      };
-
-
-      /** \brief Set the number of source samples to use during alignment.
-        * \param[in] nr_samples the number of source samples
-        */
-      inline void
-      setNumberOfSamples (int nr_samples)
-      {
-        nr_samples_ = nr_samples;
-      };
-
-      /** \return the number of source samples to use during alignment. */
-      inline int
-      getNumberOfSamples () const
-      {
-        return (nr_samples_);
-      };
-
-
-      /** \brief Set the maximum normal difference between valid point correspondences in degree.
-        * \param[in] max_norm_diff the maximum difference in degree
-        */
-      inline void
-      setMaxNormalDifference (float max_norm_diff)
-      {
-        max_norm_diff_ = max_norm_diff;
-      };
-
-      /** \return the maximum normal difference between valid point correspondences in degree. */
-      inline float
-      getMaxNormalDifference () const
-      {
-        return (max_norm_diff_);
-      };
-
-
-      /** \brief Set the maximum computation time in seconds.
-        * \param[in] max_runtime the maximum runtime of the method in seconds
-        */
-      inline void
-      setMaxComputationTime (int max_runtime)
-      {
-        max_runtime_ = max_runtime;
-      };
-
-      /** \return the maximum computation time in seconds. */
-      inline int
-      getMaxComputationTime () const
-      {
-        return (max_runtime_);
-      };
-
-
-      /** \return the fitness score of the best scored four-point match. */
-      inline float
-      getFitnessScore () const
-      {
-        return (fitness_score_);
-      };
-
-    protected:
-
-      using PCLBase <PointSource>::deinitCompute;
-      using PCLBase <PointSource>::input_;
-      using PCLBase <PointSource>::indices_;
-
-      using Registration <PointSource, PointTarget, Scalar>::reg_name_;
-      using Registration <PointSource, PointTarget, Scalar>::target_;
-      using Registration <PointSource, PointTarget, Scalar>::tree_;
-      using Registration <PointSource, PointTarget, Scalar>::correspondences_;
-      using Registration <PointSource, PointTarget, Scalar>::target_cloud_updated_;
-      using Registration <PointSource, PointTarget, Scalar>::final_transformation_;
-      using Registration <PointSource, PointTarget, Scalar>::max_iterations_;
-      using Registration <PointSource, PointTarget, Scalar>::ransac_iterations_;
-      using Registration <PointSource, PointTarget, Scalar>::transformation_estimation_;
-      using Registration <PointSource, PointTarget, Scalar>::converged_;
-
-
-      /** \brief Rigid transformation computation method.
-        * \param output the transformed input point cloud dataset using the rigid transformation found
-        * \param guess The computed transforamtion
-        */
-      virtual void
-      computeTransformation (PointCloudSource &output, const Eigen::Matrix4f& guess);
-
-
-      /** \brief Internal computation initialization. */
-      virtual bool
-      initCompute ();
-
-      /** \brief Select an approximately coplanar set of four points from the source cloud.
-        * \param[out] base_indices selected source cloud indices, further used as base (B)
-        * \param[out] ratio the two diagonal intersection ratios (r1,r2) of the base points
-        * \return
-        * * < 0 no coplanar four point sets with large enough sampling distance was found
-        * * = 0 a set of four congruent points was selected
-        */
-      int
-      selectBase (std::vector <int> &base_indices, float (&ratio)[2]);
-
-      /** \brief Select randomly a triplet of points with large point-to-point distances. The minimum point
-        * sampling distance is calculated based on the estimated point cloud overlap during initialization.
-        *
-        * \param[out] base_indices indices of base B
-        * \return
-        * * < 0 no triangle with large enough base lines could be selected
-        * * = 0 base triangle succesully selected
-        */
-      int
-      selectBaseTriangle (std::vector <int> &base_indices);
-
-      /** \brief Setup the base (four coplanar points) by ordering the points and computing intersection
-        * ratios and segment to segment distances of base diagonal.
-        *
-        * \param[in,out] base_indices indices of base B (will be reordered)
-        * \param[out] ratio diagonal intersection ratios of base points
-        */
-      void
-      setupBase (std::vector <int> &base_indices, float (&ratio)[2]);
-
-      /** \brief Calculate intersection ratios and segment to segment distances of base diagonals.
-        * \param[in] base_indices indices of base B
-        * \param[out] ratio diagonal intersection ratios of base points
-        * \return quality value of diagonal intersection
-        */
-      float
-      segmentToSegmentDist (const std::vector <int> &base_indices, float (&ratio)[2]);
-
-      /** \brief Search for corresponding point pairs given the distance between two base points.
-        *
-        * \param[in] idx1 first index of current base segment (in source cloud)
-        * \param[in] idx2 second index of current base segment (in source cloud)
-        * \param[out] pairs resulting point pairs with point-to-point distance close to ref_dist
-        * \return
-        * * < 0 no corresponding point pair was found
-        * * = 0 at least one point pair candidate was found
-        */
-      virtual int
-      bruteForceCorrespondences (int idx1, int idx2, pcl::Correspondences &pairs);
-
-      /** \brief Determine base matches by combining the point pair candidate and search for coinciding
-        * intersection points using the diagonal segment ratios of base B. The coincidation threshold is
-        * calculated during initialization (coincidation_limit_).
-        *
-        * \param[in] base_indices indices of base B
-        * \param[out] matches vector of candidate matches w.r.t the base B
-        * \param[in] pairs_a point pairs corresponding to points of 1st diagonal of base B
-        * \param[in] pairs_b point pairs corresponding to points of 2nd diagonal of base B
-        * \param[in] ratio diagonal intersection ratios of base points
-        * \return
-        * * < 0 no base match could be found
-        * * = 0 at least one base match was found
-        */
-      virtual int
-      determineBaseMatches (
-        const std::vector <int> &base_indices,
-        std::vector <std::vector <int> > &matches,
-        const pcl::Correspondences &pairs_a,
-        const pcl::Correspondences &pairs_b,
-        const float (&ratio)[2]);
-
-      /** \brief Check if outer rectangle distance of matched points fit with the base rectangle.
-        *
-        * \param[in] match_indices indices of match M
-        * \param[in] ds edge lengths of base B
-        * \return
-        * * < 0 at least one edge of the match M has no corresponding one in the base B
-        * * = 0 edges of match M fits to the ones of base B
-        */
-      int
-      checkBaseMatch (const std::vector <int> &match_indices, const float (&ds)[4]);
-
-      /** \brief Method to handle current candidate matches. Here we validate and evaluate the matches w.r.t the
-        * base and store the best fitting match (together with its score and estimated transformation).
-        * \note For forwards compatibility the results are stored in 'vectors of size 1'.
-        *
-        * \param[in] base_indices indices of base B
-        * \param[in,out] matches vector of candidate matches w.r.t the base B. The candidate matches are 
-        * reordered during this step.
-        * \param[out] candidates vector which contains the candidates matches M
-        */
-      virtual void
-      handleMatches (
-        const std::vector <int> &base_indices,
-        std::vector <std::vector <int> > &matches,
-        MatchingCandidates &candidates);
-
-      /** \brief Sets the correspondences between the base B and the match M by using the distance of each point
-        * to the centroid of the rectangle.
-        *
-        * \param[in] base_indices indices of base B
-        * \param[in] match_indices indices of match M
-        * \param[out] correspondences resulting correspondences
-        */
-      virtual void
-      linkMatchWithBase (
-        const std::vector <int> &base_indices,
-        std::vector <int> &match_indices,
-        pcl::Correspondences &correspondences);
-
-      /** \brief Validate the matching by computing the transformation between the source and target based on the
-        * four matched points and by comparing the mean square error (MSE) to a threshold. The MSE limit was
-        * calculated during initialization (max_mse_).
-        *
-        * \param[in] base_indices indices of base B
-        * \param[in] match_indices indices of match M
-        * \param[in] correspondences corresondences between source and target
-        * \param[out] transformation resulting transformation matrix
-        * \return
-        * * < 0 MSE bigger than max_mse_
-        * * = 0 MSE smaller than max_mse_
-        */
-      virtual int
-      validateMatch (
-        const std::vector <int> &base_indices,
-        const std::vector <int> &match_indices,
-        const pcl::Correspondences &correspondences,
-        Eigen::Matrix4f &transformation);
-
-      /** \brief Validate the transformation by calculating the number of inliers after transforming the source cloud.
-        * The resulting fitness score is later used as the decision criteria of the best fitting match.
-        *
-        * \param[out] transformation updated orientation matrix using all inliers
-        * \param[out] fitness_score current best fitness_score
-        * \note fitness score is only updated if the score of the current transformation exceeds the input one.
-        * \return
-        * * < 0 if previous result is better than the current one (score remains)
-        * * = 0 current result is better than the previous one (score updated)
-        */
-      virtual int
-      validateTransformation (Eigen::Matrix4f &transformation, float &fitness_score);
-
-      /** \brief Final computation of best match out of vector of best matches. To avoid cross thread dependencies
-        *  during parallel running, a best match for each try was calculated.
-        * \note For forwards compatibility the candidates are stored in vectors of 'vectors of size 1'.
-        * \param[in] candidates vector of candidate matches
-        */
-      virtual void
-      finalCompute (const std::vector <MatchingCandidates > &candidates);
-
-
-      /** \brief Normals of source point cloud. */
-      NormalsConstPtr source_normals_;
-
-      /** \brief Normals of target point cloud. */
-      NormalsConstPtr target_normals_;
-
-
-      /** \brief Number of threads for parallelization (standard = 1).
-        * \note Only used if run compiled with OpenMP.
-        */
-      int nr_threads_;
-
-      /** \brief Estimated overlap between source and target (standard = 0.5). */
-      float approx_overlap_;
-
-      /** \brief Delta value of 4pcs algorithm (standard = 1.0).
-        * It can be used as:
-        * * absolute value (normalization = false), value should represent the point accuracy to ensure finding neighbors between source <-> target
-        * * relative value (normalization = true), to adjust the internally calculated point accuracy (= point density)
-        */
-      float delta_;
-
-      /** \brief Score threshold to stop calculation with success.
-        * If not set by the user it is equal to the approximated overlap
-        */
-      float score_threshold_;
-
-      /** \brief The number of points to uniformly sample the source point cloud. (standard = 0 => full cloud). */
-      int nr_samples_;
-
-      /** \brief Maximum normal difference of corresponding point pairs in degrees (standard = 90). */
-      float max_norm_diff_;
-
-      /** \brief Maximum allowed computation time in seconds (standard = 0 => ~unlimited). */
-      int max_runtime_;
-
-
-      /** \brief Resulting fitness score of the best match. */
-      float fitness_score_;
-      
-
-      /** \brief Estimated diamter of the target point cloud. */
-      float diameter_;
-
-      /** \brief Estimated squared metric overlap between source and target.
-        * \note Internally calculated using the estimated overlap and the extent of the source cloud.
-        * It is used to derive the minimum sampling distance of the base points as well as to calculated
-        * the number of trys to reliable find a correct mach.
-        */
-      float max_base_diameter_sqr_;
-
-      /** \brief Use normals flag. */
-      bool use_normals_;
-
-      /** \brief Normalize delta flag. */
-      bool normalize_delta_;
-
-
-      /** \brief A pointer to the vector of source point indices to use after sampling. */
-      pcl::IndicesPtr source_indices_;
-
-      /** \brief A pointer to the vector of target point indices to use after sampling. */
-      pcl::IndicesPtr target_indices_;
-
-      /** \brief Maximal difference between corresponding point pairs in source and target.
-        * \note Internally calculated using an estimation of the point density.
-        */
-      float max_pair_diff_;
-
-      /** \brief Maximal difference between the length of the base edges and valid match edges.
-        * \note Internally calculated using an estimation of the point density.
-        */
-      float max_edge_diff_;
-
-      /** \brief Maximal distance between coinciding intersection points to find valid matches.
-        * \note Internally calculated using an estimation of the point density.
-        */
-      float coincidation_limit_;
-
-      /** \brief Maximal mean squared errors of a transformation calculated from a candidate match.
-        * \note Internally calculated using an estimation of the point density.
-        */
-      float max_mse_;
-
-      /** \brief Maximal squared point distance between source and target points to count as inlier.
-        * \note Internally calculated using an estimation of the point density.
-        */
-      float max_inlier_dist_sqr_;
-
-
-      /** \brief Definition of a small error. */
-      const float small_error_;
-
-    };
-  }; // namespace registration  
-}; // namespace pcl 
-
-#include <pcl/registration/impl/ia_fpcs.hpp>
-
-#endif // PCL_REGISTRATION_IA_FPCS_H_
diff --git a/registration/include/pcl/registration/impl/ia_fpcs.hpp b/registration/include/pcl/registration/impl/ia_fpcs.hpp
deleted file mode 100644
index 585cf9e8..00000000
--- a/registration/include/pcl/registration/impl/ia_fpcs.hpp
+++ /dev/null
@@ -1,917 +0,0 @@
-/*
- * Software License Agreement (BSD License)
- *
- *  Point Cloud Library (PCL) - www.pointclouds.org
- *  Copyright (c) 2014-, Open Perception, Inc.
- *  Copyright (C) 2008 Ben Gurion University of the Negev, Beer Sheva, Israel.
- *
- *  All rights reserved
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met
- *
- *   * The use for research only (no for any commercial application).
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer in the documentation and/or other materials provided
- *     with the distribution.
- *   * Neither the name of the copyright holder(s) nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
-  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *  POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef PCL_REGISTRATION_IMPL_IA_FPCS_H_
-#define PCL_REGISTRATION_IMPL_IA_FPCS_H_
-
-#include <pcl/registration/ia_fpcs.h>
-#include <pcl/common/time.h>
-#include <pcl/common/distances.h>
-#include <pcl/sample_consensus/sac_model_plane.h>
-#include <pcl/registration/transformation_estimation_3point.h>
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointT> inline float
-pcl::getMeanPointDensity (const typename pcl::PointCloud<PointT>::ConstPtr &cloud, float max_dist, int nr_threads)
-{
-  const float max_dist_sqr = max_dist * max_dist;
-  const std::size_t s = cloud.size ();
-
-  pcl::search::KdTree <PointT> tree;
-  tree.setInputCloud (cloud);
-
-  float mean_dist = 0.f;
-  int num = 0;
-  std::vector <int> ids (2);
-  std::vector <float> dists_sqr (2);
-
-#ifdef _OPENMP
-#pragma omp parallel for \
-  reduction (+:mean_dist, num) \
-  private (ids, dists_sqr) shared (tree, cloud) \
-  default (none)num_threads (nr_threads)
-#endif
-
-  for (int i = 0; i < 1000; i++)
-  {
-    tree.nearestKSearch (cloud->points[rand () % s], 2, ids, dists_sqr);
-    if (dists_sqr[1] < max_dist_sqr)
-    {
-      mean_dist += std::sqrt (dists_sqr[1]);
-      num++;
-    }
-  }
-
-  return (mean_dist / num);
-};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointT> inline float
-pcl::getMeanPointDensity (const typename pcl::PointCloud<PointT>::ConstPtr &cloud, const std::vector <int> &indices,
-  float max_dist, int nr_threads)
-{
-  const float max_dist_sqr = max_dist * max_dist;
-  const std::size_t s = indices.size ();
-
-  pcl::search::KdTree <PointT> tree;
-  tree.setInputCloud (cloud);
-
-  float mean_dist = 0.f;
-  int num = 0;
-  std::vector <int> ids (2);
-  std::vector <float> dists_sqr (2);
-
-#ifdef _OPENMP
-#pragma omp parallel for \
-  reduction (+:mean_dist, num) \
-  private (ids, dists_sqr) shared (tree, cloud, indices)    \
-  default (none)num_threads (nr_threads)
-#endif
-
-  for (int i = 0; i < 1000; i++)
-  {
-    tree.nearestKSearch (cloud->points[indices[rand () % s]], 2, ids, dists_sqr);
-    if (dists_sqr[1] < max_dist_sqr)
-    {
-      mean_dist += std::sqrt (dists_sqr[1]);
-      num++;
-    }
-  }
-
-  return (mean_dist / num);
-};
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar>
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::FPCSInitialAlignment () :
-  source_normals_ (),
-  target_normals_ (),
-  nr_threads_ (1),
-  approx_overlap_ (0.5f),
-  delta_ (1.f),
-  score_threshold_ (FLT_MAX),
-  nr_samples_ (0),
-  max_norm_diff_ (90.f),
-  max_runtime_ (0),
-  fitness_score_ (FLT_MAX),
-  diameter_ (),
-  max_base_diameter_sqr_ (),
-  use_normals_ (false),
-  normalize_delta_ (true),
-  max_pair_diff_ (),
-  max_edge_diff_ (),
-  coincidation_limit_ (),
-  max_mse_ (),
-  max_inlier_dist_sqr_ (),
-  small_error_ (0.00001f)
-{
-  reg_name_ = "pcl::registration::FPCSInitialAlignment";
-  max_iterations_ = 0;
-  ransac_iterations_ = 1000;
-  transformation_estimation_.reset (new pcl::registration::TransformationEstimation3Point <PointSource, PointTarget>);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> void
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::computeTransformation (
-  PointCloudSource &output,
-  const Eigen::Matrix4f &guess)
-{
-  if (!initCompute ())
-    return;
-
-  final_transformation_ = guess;
-  bool abort = false;
-  std::vector <MatchingCandidates> all_candidates (max_iterations_);
-  pcl::StopWatch timer;
-
-  #ifdef _OPENMP
-  #pragma omp parallel num_threads (nr_threads_)
-  #endif
-  {
-    #ifdef _OPENMP
-    std::srand (static_cast <unsigned int> (std::time (NULL)) ^ omp_get_thread_num ());    
-    #pragma omp for schedule (dynamic)
-    #endif
-    for (int i = 0; i < max_iterations_; i++)
-    {
-
-      #ifdef _OPENMP
-      #pragma omp flush (abort)
-      #endif
-
-      MatchingCandidates candidates (1);
-      std::vector <int> base_indices (4);
-      float ratio[2];
-      all_candidates[i] = candidates;
-
-      if (!abort)
-      {
-        // select four coplanar point base
-        if (selectBase (base_indices, ratio) == 0)
-        {
-          // calculate candidate pair correspondences using diagonal lenghts of base
-          pcl::Correspondences pairs_a, pairs_b;
-          if (bruteForceCorrespondences (base_indices[0], base_indices[1], pairs_a) == 0 &&
-            bruteForceCorrespondences (base_indices[2], base_indices[3], pairs_b) == 0)
-          {
-            // determine candidate matches by combining pair correspondences based on segment distances
-            std::vector <std::vector <int> > matches;
-            if (determineBaseMatches (base_indices, matches, pairs_a, pairs_b, ratio) == 0)
-            {
-              // check and evaluate candidate matches and store them
-              handleMatches (base_indices, matches, candidates);
-              if (candidates.size () != 0)
-                all_candidates[i] = candidates;
-            }
-          }
-        }
-
-        // check terminate early (time or fitness_score threshold reached)
-        abort = (candidates.size () > 0 ? candidates[0].fitness_score < score_threshold_ : abort);
-        abort = (abort ? abort : timer.getTimeSeconds () > max_runtime_);
-
-
-        #ifdef _OPENMP
-        #pragma omp flush (abort)
-        #endif
-      }
-    }
-  }
-  
-
-  // determine best match over all trys
-  finalCompute (all_candidates);
-
-  // apply the final transformation
-  pcl::transformPointCloud (*input_, output, final_transformation_);
-
-  deinitCompute ();
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> bool
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::initCompute ()
-{
-  std::srand (static_cast <unsigned int> (std::time (NULL)));
-
-  // basic pcl initialization
-  if (!pcl::PCLBase <PointSource>::initCompute ())
-    return (false);
-
-  // check if source and target are given
-  if (!input_ || !target_)
-  {
-    PCL_ERROR ("[%s::initCompute] Source or target dataset not given!\n", reg_name_.c_str ());
-    return (false);
-  }
-
-  if (!target_indices_ || target_indices_->size () == 0)
-  {
-    target_indices_.reset (new std::vector <int> (static_cast <int> (target_->size ())));
-    int index = 0;
-    for (std::vector <int>::iterator it = target_indices_->begin (), it_e = target_indices_->end (); it != it_e; it++)
-      *it = index++;
-    target_cloud_updated_ = true;
-  }
-
-  // if a sample size for the point clouds is given; prefarably no sampling of target cloud
-  if (nr_samples_ != 0)
-  {
-    const int ss = static_cast <int> (indices_->size ());
-    const int sample_fraction_src = std::max (1, static_cast <int> (ss / nr_samples_));
-
-    source_indices_ = pcl::IndicesPtr (new std::vector <int>);
-    for (int i = 0; i < ss; i++)
-    if (rand () % sample_fraction_src == 0)
-      source_indices_->push_back ((*indices_) [i]);
-  }
-  else
-    source_indices_ = indices_;
-
-  // check usage of normals
-  if (source_normals_ && target_normals_  && source_normals_->size () == input_->size () && target_normals_->size () == target_->size ())
-    use_normals_ = true;
-
-  // set up tree structures
-  if (target_cloud_updated_)
-  {
-    tree_->setInputCloud (target_, target_indices_);
-    target_cloud_updated_ = false;
-  }
-
-  // set predefined variables
-  const int min_iterations = 4;
-  const float diameter_fraction = 0.3f;
-
-  // get diameter of input cloud (distance between farthest points)
-  Eigen::Vector4f pt_min, pt_max;
-  pcl::getMinMax3D (*target_, *target_indices_, pt_min, pt_max);
-  diameter_ = (pt_max - pt_min).norm ();
-
-  // derive the limits for the random base selection
-  float max_base_diameter = diameter_* approx_overlap_ * 2.f;
-  max_base_diameter_sqr_ = max_base_diameter * max_base_diameter;
-
-  // normalize the delta
-  if (normalize_delta_)
-  {
-    float mean_dist = getMeanPointDensity <PointTarget> (target_, *target_indices_, 0.05f * diameter_, nr_threads_);
-    delta_ *= mean_dist;
-  }
-
-  // heuristic determination of number of trials to have high probabilty of finding a good solution
-  if (max_iterations_ == 0)
-  {
-    float first_est = std::log (small_error_) / std::log (1.0 - std::pow ((double) approx_overlap_, (double) min_iterations));
-    max_iterations_ = static_cast <int> (first_est / (diameter_fraction * approx_overlap_ * 2.f));
-  }
-
-  // set further parameter
-  if (score_threshold_ == FLT_MAX)
-    score_threshold_ = 1.f - approx_overlap_;
-
-  if (max_iterations_ < 4)
-    max_iterations_ = 4;
-
-  if (max_runtime_ < 1)
-    max_runtime_ = INT_MAX;
-
-  // calculate internal parameters based on the the estimated point density
-  max_pair_diff_ = delta_ * 2.f;
-  max_edge_diff_ = delta_ * 4.f;
-  coincidation_limit_ = delta_ * 2.f; // EDITED: originally std::sqrt (delta_ * 2.f)
-  max_mse_ = powf (delta_* 2.f, 2.f);
-  max_inlier_dist_sqr_ = powf (delta_ * 2.f, 2.f);
-
-  // reset fitness_score
-  fitness_score_ = FLT_MAX;
-
-  return (true);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::selectBase (
-  std::vector <int> &base_indices,
-  float (&ratio)[2])
-{
-  const float too_close_sqr = max_base_diameter_sqr_*0.01;
-
-  Eigen::VectorXf coefficients (4);
-  pcl::SampleConsensusModelPlane <PointTarget> plane (target_);
-  plane.setIndices (target_indices_);
-  Eigen::Vector4f centre_pt;
-  float nearest_to_plane = FLT_MAX;
-
-  // repeat base search until valid quadruple was found or ransac_iterations_ number of trys were unsuccessfull
-  for (int i = 0; i < ransac_iterations_; i++)
-  {
-    // random select an appropriate point triple
-    if (selectBaseTriangle (base_indices) < 0)
-      continue;
-
-    std::vector <int> base_triple (base_indices.begin (), base_indices.end () - 1);
-    plane.computeModelCoefficients (base_triple, coefficients);
-    pcl::compute3DCentroid (*target_, base_triple, centre_pt);
-
-    // loop over all points in source cloud to find most suitable fourth point
-    const PointTarget *pt1 = &(target_->points[base_indices[0]]);
-    const PointTarget *pt2 = &(target_->points[base_indices[1]]);
-    const PointTarget *pt3 = &(target_->points[base_indices[2]]);
-
-    for (std::vector <int>::iterator it = target_indices_->begin (), it_e = target_indices_->end (); it != it_e; it++)
-    {
-      const PointTarget *pt4 = &(target_->points[*it]);
-
-      float d1 = pcl::squaredEuclideanDistance (*pt4, *pt1);
-      float d2 = pcl::squaredEuclideanDistance (*pt4, *pt2);
-      float d3 = pcl::squaredEuclideanDistance (*pt4, *pt3);
-      float d4 = (pt4->getVector3fMap ()  - centre_pt.head (3)).squaredNorm ();
-
-      // check distance between points w.r.t minimum sampling distance; EDITED -> 4th point now also limited by max base line
-      if (d1 < too_close_sqr || d2 < too_close_sqr || d3 < too_close_sqr || d4 < too_close_sqr ||
-        d1 > max_base_diameter_sqr_ || d2 > max_base_diameter_sqr_ || d3 > max_base_diameter_sqr_)
-        continue;
-
-      // check distance to plane to get point closest to plane
-      float dist_to_plane = pcl::pointToPlaneDistance (*pt4, coefficients);
-      if (dist_to_plane < nearest_to_plane)
-      {
-        base_indices[3] = *it;
-        nearest_to_plane = dist_to_plane;
-      }
-    }
-
-    // check if at least one point fullfilled the conditions
-    if (nearest_to_plane != FLT_MAX)
-    {
-      // order points to build largest quadrangle and calcuate intersection ratios of diagonals
-      setupBase (base_indices, ratio);
-      return (0);
-    }
-  }
-
-  // return unsuccessfull if no quadruple was selected
-  return (-1);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::selectBaseTriangle (std::vector <int> &base_indices)
-{
-  int nr_points = static_cast <int> (target_indices_->size ());
-  float best_t = 0.f;
-
-  // choose random first point
-  base_indices[0] = (*target_indices_)[rand () % nr_points];
-  int *index1 = &base_indices[0];
-
-  // random search for 2 other points (as far away as overlap allows)
-  for (int i = 0; i < ransac_iterations_; i++)
-  {
-    int *index2 = &(*target_indices_)[rand () % nr_points];
-    int *index3 = &(*target_indices_)[rand () % nr_points];
-
-    Eigen::Vector3f u = target_->points[*index2].getVector3fMap () - target_->points[*index1].getVector3fMap ();
-    Eigen::Vector3f v = target_->points[*index3].getVector3fMap () - target_->points[*index1].getVector3fMap ();
-    float t = u.cross (v).squaredNorm (); // triangle area (0.5 * sqrt(t)) should be maximal
-
-    // check for most suitable point triple
-    if (t > best_t && u.squaredNorm () < max_base_diameter_sqr_ && v.squaredNorm () < max_base_diameter_sqr_)
-    {
-      best_t = t;
-      base_indices[1] = *index2;
-      base_indices[2] = *index3;
-    }
-  }
-
-  // return if a triplet could be selected
-  return (best_t == 0.f ? -1 : 0);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> void
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::setupBase (
-  std::vector <int> &base_indices,
-  float (&ratio)[2])
-{
-  float best_t = FLT_MAX;
-  const std::vector <int> copy (base_indices.begin (), base_indices.end ());
-  std::vector <int> temp (base_indices.begin (), base_indices.end ());
-
-  // loop over all combinations of base points
-  for (std::vector <int>::const_iterator i = copy.begin (), i_e = copy.end (); i != i_e; i++)
-  for (std::vector <int>::const_iterator j = copy.begin (), j_e = copy.end (); j != j_e; j++)
-  {
-    if (i == j)
-      continue;
-
-    for (std::vector <int>::const_iterator k = copy.begin (), k_e = copy.end (); k != k_e; k++)
-    {
-      if (k == j || k == i)
-        continue;
-
-      std::vector <int>::const_iterator l = copy.begin ();
-      while (l == i || l == j || l == k)
-        l++;
-
-      temp[0] = *i;
-      temp[1] = *j;
-      temp[2] = *k;
-      temp[3] = *l;
-
-      // calculate diagonal intersection ratios and check for suitable segment to segment distances
-      float ratio_temp[2];
-      float t = segmentToSegmentDist (temp, ratio_temp);
-      if (t < best_t)
-      {
-        best_t = t;
-        ratio[0] = ratio_temp[0];
-        ratio[1] = ratio_temp[1];
-        base_indices = temp;
-      }
-    }
-  }
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> float
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::segmentToSegmentDist (
-  const std::vector <int> &base_indices,
-  float (&ratio)[2])
-{
-  // get point vectors
-  Eigen::Vector3f u = target_->points[base_indices[1]].getVector3fMap () - target_->points[base_indices[0]].getVector3fMap ();
-  Eigen::Vector3f v = target_->points[base_indices[3]].getVector3fMap () - target_->points[base_indices[2]].getVector3fMap ();
-  Eigen::Vector3f w = target_->points[base_indices[0]].getVector3fMap () - target_->points[base_indices[2]].getVector3fMap ();
-
-  // calculate segment distances
-  float a = u.dot (u);
-  float b = u.dot (v);
-  float c = v.dot (v);
-  float d = u.dot (w);
-  float e = v.dot (w);
-  float D = a * c - b * b;
-  float sN = 0.f, sD = D;
-  float tN = 0.f, tD = D;
-
-  // check segments
-  if (D < small_error_)
-  {
-    sN = 0.f;
-    sD = 1.f;
-    tN = e;
-    tD = c;
-  }
-  else
-  {
-    sN = (b * e - c * d);
-    tN = (a * e - b * d);
-
-    if (sN < 0.f)
-    {
-      sN = 0.f;
-      tN = e;
-      tD = c;
-    }
-    else if (sN > sD)
-    {
-      sN = sD;
-      tN = e + b;
-      tD = c;
-    }
-  }
-
-  if (tN < 0.f)
-  {
-    tN = 0.f;
-
-    if (-d < 0.f)
-      sN = 0.f;
-
-    else if (-d > a)
-      sN = sD;
-
-    else
-    {
-      sN = -d;
-      sD = a;
-    }
-  }
-
-  else if (tN > tD)
-  {
-    tN = tD;
-
-    if ((-d + b) < 0.f)
-      sN = 0.f;
-
-    else if ((-d + b) > a)
-      sN = sD;
-
-    else
-    {
-      sN = (-d + b);
-      sD = a;
-    }
-  }
-
-  // set intersection ratios
-  ratio[0] = (std::abs (sN) < small_error_) ? 0.f : sN / sD;
-  ratio[1] = (std::abs (tN) < small_error_) ? 0.f : tN / tD;
-
-  Eigen::Vector3f x = w + (ratio[0] * u) - (ratio[1] * v);
-  return (x.norm ());
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::bruteForceCorrespondences (
-  int idx1,
-  int idx2,
-  pcl::Correspondences &pairs)
-{
-  const float max_norm_diff = 0.5f * max_norm_diff_ * M_PI / 180.f;
-
-  // calculate reference segment distance and normal angle
-  float ref_dist = pcl::euclideanDistance (target_->points[idx1], target_->points[idx2]);
-  float ref_norm_angle = (use_normals_ ? (target_normals_->points[idx1].getNormalVector3fMap () -
-                                          target_normals_->points[idx2].getNormalVector3fMap ()).norm () : 0.f);
-
-  // loop over all pairs of points in source point cloud
-  std::vector <int>::iterator it_out = source_indices_->begin (), it_out_e = source_indices_->end () - 1;
-  std::vector <int>::iterator it_in, it_in_e = source_indices_->end ();
-  for ( ; it_out != it_out_e; it_out++)
-  {
-    it_in = it_out + 1;
-    const PointSource *pt1 = &(*input_)[*it_out];
-    for ( ; it_in != it_in_e; it_in++)
-    {
-      const PointSource *pt2 = &(*input_)[*it_in];
-
-      // check point distance compared to reference dist (from base)
-      float dist = pcl::euclideanDistance (*pt1, *pt2);
-      if (std::abs(dist - ref_dist) < max_pair_diff_)
-      {
-        // add here normal evaluation if normals are given
-        if (use_normals_)
-        {
-          const NormalT *pt1_n = &(source_normals_->points[*it_out]);
-          const NormalT *pt2_n = &(source_normals_->points[*it_in]);
-
-          float norm_angle_1 = (pt1_n->getNormalVector3fMap () - pt2_n->getNormalVector3fMap ()).norm ();
-          float norm_angle_2 = (pt1_n->getNormalVector3fMap () + pt2_n->getNormalVector3fMap ()).norm ();
-
-          float norm_diff = std::min <float> (std::abs (norm_angle_1 - ref_norm_angle), std::abs (norm_angle_2 - ref_norm_angle));
-          if (norm_diff > max_norm_diff)
-            continue;
-        }
-
-        pairs.push_back (pcl::Correspondence (*it_in, *it_out, dist));
-        pairs.push_back (pcl::Correspondence (*it_out, *it_in, dist));
-      }
-    }
-  }
-
-  // return success if at least one correspondence was found
-  return (pairs.size () == 0 ? -1 : 0);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::determineBaseMatches (
-  const std::vector <int> &base_indices,
-  std::vector <std::vector <int> > &matches,
-  const pcl::Correspondences &pairs_a,
-  const pcl::Correspondences &pairs_b,
-  const float (&ratio)[2])
-{
-  // calculate edge lengths of base
-  float dist_base[4];
-  dist_base[0] = pcl::euclideanDistance (target_->points[base_indices[0]], target_->points[base_indices[2]]);
-  dist_base[1] = pcl::euclideanDistance (target_->points[base_indices[0]], target_->points[base_indices[3]]);
-  dist_base[2] = pcl::euclideanDistance (target_->points[base_indices[1]], target_->points[base_indices[2]]);
-  dist_base[3] = pcl::euclideanDistance (target_->points[base_indices[1]], target_->points[base_indices[3]]);
-
-  // loop over first point pair correspondences and store intermediate points 'e' in new point cloud
-  PointCloudSourcePtr cloud_e (new PointCloudSource);
-  cloud_e->resize (pairs_a.size () * 2);
-  PointCloudSourceIterator it_pt = cloud_e->begin ();
-  for (pcl::Correspondences::const_iterator it_pair = pairs_a.begin (), it_pair_e = pairs_a.end () ; it_pair != it_pair_e; it_pair++)
-  {
-    const PointSource *pt1 = &(input_->points[it_pair->index_match]);
-    const PointSource *pt2 = &(input_->points[it_pair->index_query]);
-
-    // calculate intermediate points using both ratios from base (r1,r2)
-    for (int i = 0; i < 2; i++, it_pt++)
-    {
-      it_pt->x = pt1->x + ratio[i] * (pt2->x - pt1->x);
-      it_pt->y = pt1->y + ratio[i] * (pt2->y - pt1->y);
-      it_pt->z = pt1->z + ratio[i] * (pt2->z - pt1->z);
-    }
-  }
-
-  // initialize new kd tree of intermediate points from first point pair correspondences
-  KdTreeReciprocalPtr tree_e (new KdTreeReciprocal);
-  tree_e->setInputCloud (cloud_e);
-
-  std::vector <int> ids;
-  std::vector <float> dists_sqr;
-
-  // loop over second point pair correspondences
-  for (pcl::Correspondences::const_iterator it_pair = pairs_b.begin (), it_pair_e = pairs_b.end () ; it_pair != it_pair_e; it_pair++)
-  {
-    const PointTarget *pt1 = &(input_->points[it_pair->index_match]);
-    const PointTarget *pt2 = &(input_->points[it_pair->index_query]);
-
-    // calculate intermediate points using both ratios from base (r1,r2)
-    for (int i = 0; i < 2; i++)
-    {
-      PointTarget pt_e;
-      pt_e.x = pt1->x + ratio[i] * (pt2->x - pt1->x);
-      pt_e.y = pt1->y + ratio[i] * (pt2->y - pt1->y);
-      pt_e.z = pt1->z + ratio[i] * (pt2->z - pt1->z);
-
-      // search for corresponding intermediate points
-      tree_e->radiusSearch (pt_e, coincidation_limit_, ids, dists_sqr);
-      for (std::vector <int>::iterator it = ids.begin (), it_e = ids.end (); it != it_e; it++)
-      {
-        std::vector <int> match_indices (4);
-
-        match_indices[0] = pairs_a[static_cast <int> (std::floor ((float)(*it/2.f)))].index_match;
-        match_indices[1] = pairs_a[static_cast <int> (std::floor ((float)(*it/2.f)))].index_query;
-        match_indices[2] = it_pair->index_match;
-        match_indices[3] = it_pair->index_query;
-
-        // EDITED: added coarse check of match based on edge length (due to rigid-body )
-        if (checkBaseMatch (match_indices, dist_base) < 0)
-          continue;
-
-        matches.push_back (match_indices);
-      }
-    }
-  }
-
-  // return unsuccessfull if no match was found
-  return (matches.size () > 0 ? 0 : -1);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::checkBaseMatch (
-  const std::vector <int> &match_indices,
-  const float (&dist_ref)[4])
-{
-  float d0 = pcl::euclideanDistance (input_->points[match_indices[0]], input_->points[match_indices[2]]);
-  float d1 = pcl::euclideanDistance (input_->points[match_indices[0]], input_->points[match_indices[3]]);
-  float d2 = pcl::euclideanDistance (input_->points[match_indices[1]], input_->points[match_indices[2]]);
-  float d3 = pcl::euclideanDistance (input_->points[match_indices[1]], input_->points[match_indices[3]]);
-
-  // check edge distances of match w.r.t the base
-  return (std::abs (d0 - dist_ref[0]) < max_edge_diff_ && std::abs (d1 - dist_ref[1]) < max_edge_diff_ &&
-          std::abs (d2 - dist_ref[2]) < max_edge_diff_ && std::abs (d3 - dist_ref[3]) < max_edge_diff_) ? 0 : -1;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> void
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::handleMatches (
-  const std::vector <int> &base_indices,
-  std::vector <std::vector <int> > &matches,
-  MatchingCandidates &candidates)
-{
-  candidates.resize (1);
-  float fitness_score = FLT_MAX;
-
-  // loop over all Candidate matches
-  for (std::vector <std::vector <int> >::iterator match_indices = matches.begin (), it_e = matches.end (); match_indices != it_e; match_indices++)
-  {
-    Eigen::Matrix4f transformation_temp;
-    pcl::Correspondences correspondences_temp;
-
-    // determine corresondences between base and match according to their distance to centroid
-    linkMatchWithBase (base_indices, *match_indices, correspondences_temp);
-
-    // check match based on residuals of the corresponding points after
-    if (validateMatch (base_indices, *match_indices, correspondences_temp, transformation_temp) < 0)
-      continue;
-
-    // check resulting  using a sub sample of the source point cloud and compare to previous matches
-    if (validateTransformation (transformation_temp, fitness_score) < 0)
-      continue;
-
-    // store best match as well as associated fitness_score and transformation
-    candidates[0].fitness_score = fitness_score;
-    candidates [0].transformation = transformation_temp;
-    correspondences_temp.erase (correspondences_temp.end () - 1);
-    candidates[0].correspondences = correspondences_temp;
-  }
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> void
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::linkMatchWithBase (
-  const std::vector <int> &base_indices,
-  std::vector <int> &match_indices,
-  pcl::Correspondences &correspondences)
-{
-  // calculate centroid of base and target
-  Eigen::Vector4f centre_base, centre_match;
-  pcl::compute3DCentroid (*target_, base_indices, centre_base);
-  pcl::compute3DCentroid (*input_, match_indices, centre_match);
-
-  PointTarget centre_pt_base;
-  centre_pt_base.x = centre_base[0];
-  centre_pt_base.y = centre_base[1];
-  centre_pt_base.z = centre_base[2];
-
-  PointSource centre_pt_match;
-  centre_pt_match.x = centre_match[0];
-  centre_pt_match.y = centre_match[1];
-  centre_pt_match.z = centre_match[2];
-
-  // find corresponding points according to their distance to the centroid
-  std::vector <int> copy = match_indices;
-
-  std::vector <int>::const_iterator it_base = base_indices.begin (), it_base_e = base_indices.end ();
-  std::vector <int>::iterator it_match, it_match_e = copy.end ();
-  std::vector <int>::iterator it_match_orig = match_indices.begin ();
-  for (; it_base != it_base_e; it_base++, it_match_orig++)
-  {
-    float dist_sqr_1 = pcl::squaredEuclideanDistance (target_->points[*it_base], centre_pt_base);
-    float best_diff_sqr = FLT_MAX;
-    int best_index;
-
-    for (it_match = copy.begin (); it_match != it_match_e; it_match++)
-    {
-      // calculate difference of distances to centre point
-      float dist_sqr_2 = pcl::squaredEuclideanDistance (input_->points[*it_match], centre_pt_match);
-      float diff_sqr = std::abs(dist_sqr_1 - dist_sqr_2);
-
-      if (diff_sqr < best_diff_sqr)
-      {
-        best_diff_sqr = diff_sqr;
-        best_index = *it_match;
-      }
-    }
-
-    // assign new correspondence and update indices of matched targets
-    correspondences.push_back (pcl::Correspondence (best_index, *it_base, best_diff_sqr));
-    *it_match_orig = best_index;
-  }
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::validateMatch (
-  const std::vector <int> &base_indices,
-  const std::vector <int> &match_indices,
-  const pcl::Correspondences &correspondences,
-  Eigen::Matrix4f &transformation)
-{
-  // only use triplet of points to simlify process (possible due to planar case)
-  pcl::Correspondences correspondences_temp = correspondences;
-  correspondences_temp.erase (correspondences_temp.end () - 1);
-
-  // estimate transformation between correspondence set
-  transformation_estimation_->estimateRigidTransformation (*input_, *target_, correspondences_temp, transformation);
-
-  // transform base points
-  PointCloudSource match_transformed;
-  pcl::transformPointCloud (*input_, match_indices, match_transformed, transformation);
-
-  // calculate residuals of transformation and check against maximum threshold
-  std::size_t nr_points = correspondences_temp.size ();
-  float mse = 0.f;
-  for (std::size_t i = 0; i < nr_points; i++)
-    mse += pcl::squaredEuclideanDistance (match_transformed.points [i], target_->points [base_indices[i]]);
-
-  mse /= nr_points;
-  return (mse < max_mse_ ? 0 : -1);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> int
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::validateTransformation (
-  Eigen::Matrix4f &transformation,
-  float &fitness_score)
-{
-  // transform source point cloud
-  PointCloudSource source_transformed;
-  pcl::transformPointCloud (*input_, *source_indices_, source_transformed, transformation);
-
-  std::size_t nr_points = source_transformed.size ();
-  std::size_t terminate_value = fitness_score > 1 ? 0 : static_cast <std::size_t> ((1.f - fitness_score) * nr_points);
-
-  float inlier_score_temp = 0;
-  std::vector <int> ids;
-  std::vector <float> dists_sqr;
-  PointCloudSourceIterator it = source_transformed.begin ();
-
-  for (std::size_t i = 0; i < nr_points; it++, i++)
-  {
-    // search for nearest point using kd tree search
-    tree_->nearestKSearch (*it, 1, ids, dists_sqr);
-    inlier_score_temp += (dists_sqr[0] < max_inlier_dist_sqr_ ? 1 : 0);
-
-    // early terminating
-    if (nr_points - i + inlier_score_temp < terminate_value)
-      break;
-  }
-
-  // check current costs and return unsuccessfull if larger than previous ones
-  inlier_score_temp /= static_cast <float> (nr_points);
-  float fitness_score_temp = 1.f - inlier_score_temp;
-
-  if (fitness_score_temp > fitness_score)
-    return (-1);
-
-  fitness_score = fitness_score_temp;
-  return (0);
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////
-template <typename PointSource, typename PointTarget, typename NormalT, typename Scalar> void
-pcl::registration::FPCSInitialAlignment <PointSource, PointTarget, NormalT, Scalar>::finalCompute (
-  const std::vector <MatchingCandidates > &candidates)
-{
-  // get best fitness_score over all trys
-  int nr_candidates = static_cast <int> (candidates.size ());
-  int best_index = -1;
-  float best_score = FLT_MAX;
-  for (int i = 0; i < nr_candidates; i++)
-  {
-    const float &fitness_score = candidates [i][0].fitness_score;
-    if (fitness_score < best_score)
-    {
-      best_score = fitness_score;
-      best_index = i;
-    }
-  }
-
-  // check if a valid candidate was available
-  if (!(best_index < 0))
-  {
-    fitness_score_ = candidates [best_index][0].fitness_score;
-    final_transformation_ = candidates [best_index][0].transformation;
-    *correspondences_ = candidates [best_index][0].correspondences;
-
-    // here we define convergence if resulting fitness_score is below 1-threshold
-    converged_ = fitness_score_ < score_threshold_;
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////
-
-#endif // PCL_REGISTRATION_IMPL_IA_4PCS_H_
diff --git a/registration/include/pcl/registration/matching_candidate.h b/registration/include/pcl/registration/matching_candidate.h
deleted file mode 100644
index 51c822eb..00000000
--- a/registration/include/pcl/registration/matching_candidate.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Software License Agreement (BSD License)
- *
- *  Point Cloud Library (PCL) - www.pointclouds.org
- *  Copyright (c) 2014-, Open Perception, Inc.
- *  Copyright (C) 2008 Ben Gurion University of the Negev, Beer Sheva, Israel.
- *
- *  All rights reserved
- *
- *  Redistribution and use in source and binary forms, with or without
- *  modification, are permitted provided that the following conditions are met
- *
- *   * The use for research only (no for any commercial application).
- *   * Redistributions of source code must retain the above copyright
- *     notice, this list of conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above
- *     copyright notice, this list of conditions and the following
- *     disclaimer in the documentation and/or other materials provided
- *     with the distribution.
- *   * Neither the name of the copyright holder(s) nor the names of its
- *     contributors may be used to endorse or promote products derived
- *     from this software without specific prior written permission.
- *
-  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *  POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef PCL_REGISTRATION_MATCHING_CANDIDATE_H_
-#define PCL_REGISTRATION_MATCHING_CANDIDATE_H_
-
-#include <pcl/registration/registration.h>
-#include <pcl/common/common.h>
-
-namespace pcl
-{
-  namespace registration
-  {
-    /** \brief Container for matching candidate consisting of
-    *
-    * * fitness score value as a result of the matching algorithm
-    * * correspondences between source and target data set
-    * * transformation matrix calculated based on the correspondences
-    *
-    */
-    struct MatchingCandidate
-    {
-      /** \brief Constructor. */
-      MatchingCandidate () :
-        fitness_score (FLT_MAX),
-        correspondences (),
-        transformation (Eigen::Matrix4f::Identity ())
-      {};
-
-      /** \brief Value constructor. */
-      MatchingCandidate (float s, const pcl::Correspondences &c, const Eigen::Matrix4f &m) :
-        fitness_score (s),
-        correspondences (c),
-        transformation (m)
-      {};
-
-      /** \brief Destructor. */
-      ~MatchingCandidate ()
-      {};
-
-
-      /** \brief Fitness score of current candidate resulting from matching algorithm. */
-      float fitness_score;
-
-      /** \brief Correspondences between source <-> target. */
-      pcl::Correspondences correspondences;
-
-      /** \brief Corresponding transformation matrix retrieved using \a corrs. */
-      Eigen::Matrix4f transformation;
-
-      EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-    };
-
-    typedef std::vector<MatchingCandidate, Eigen::aligned_allocator<MatchingCandidate> > MatchingCandidates;
-
-    /** \brief Sorting of candidates based on fitness score value. */
-    struct by_score
-    {
-      /** \brief Operator used to sort candidates based on fitness score. */
-      bool operator () (MatchingCandidate const &left, MatchingCandidate const &right)
-      {
-        return (left.fitness_score < right.fitness_score);
-      }
-    };
-
-  };  // namespace registration
-}; // namespace pcl
-
-
-#endif // PCL_REGISTRATION_MATCHING_CANDIDATE_H_