From 4cc2d40642d0d90eb62a544e9347629df681ec74 Mon Sep 17 00:00:00 2001
From: Doina Chiroiu <inach@google.com>
Date: Thu, 13 Aug 2020 10:22:33 +0000
Subject: [PATCH 01/29] Simple version of sandbox


From 80dd8733755bdb2bde2452e929868c69407a0350 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:20:56 +0000
Subject: [PATCH 02/29] Initial version sandbox

---
 oss-internship-2020/pffft/CMakeLists.txt      |   91 +
 oss-internship-2020/pffft/Makefile            |   15 +
 oss-internship-2020/pffft/README.txt          |  416 +++
 oss-internship-2020/pffft/fftpack.c           | 3112 +++++++++++++++++
 oss-internship-2020/pffft/fftpack.h           |  799 +++++
 oss-internship-2020/pffft/myNotes.txt         |  101 +
 oss-internship-2020/pffft/pffft.c             | 1881 ++++++++++
 oss-internship-2020/pffft/pffft.h             |  177 +
 oss-internship-2020/pffft/test_pffft.c        |  419 +++
 .../pffft/test_pffft_sandboxed.cc             |  150 +
 10 files changed, 7161 insertions(+)
 create mode 100644 oss-internship-2020/pffft/CMakeLists.txt
 create mode 100644 oss-internship-2020/pffft/Makefile
 create mode 100644 oss-internship-2020/pffft/README.txt
 create mode 100644 oss-internship-2020/pffft/fftpack.c
 create mode 100644 oss-internship-2020/pffft/fftpack.h
 create mode 100644 oss-internship-2020/pffft/myNotes.txt
 create mode 100644 oss-internship-2020/pffft/pffft.c
 create mode 100644 oss-internship-2020/pffft/pffft.h
 create mode 100644 oss-internship-2020/pffft/test_pffft.c
 create mode 100644 oss-internship-2020/pffft/test_pffft_sandboxed.cc

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
new file mode 100644
index 0000000..c0d62cb
--- /dev/null
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -0,0 +1,91 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(pffft CXX C)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+add_library(pffft STATIC 
+    pffft.c 
+    pffft.h 
+    fftpack.c 
+    fftpack.h
+)
+
+add_executable(pffft_main
+    test_pffft.c
+)
+
+target_link_libraries(pffft_main PRIVATE 
+    pffft
+)
+
+set(MATH_LIBS "")
+include(CheckLibraryExists)
+check_library_exists(m sin "" LIBM)
+if(LIBM)
+    list(APPEND MATH_LIBS "m")
+endif()
+
+target_link_libraries(pffft PUBLIC ${MATH_LIBS})
+
+
+# Adding dependencies
+set(SAPI_ROOT "/usr/local/google/home/inach/sandboxed-api" CACHE PATH "Path to the Sandboxed API source tree")
+# Then configure:
+#   mkdir -p build && cd build
+#   cmake .. -G Ninja -DSAPI_ROOT=$HOME/sapi_root
+
+set(SAPI_ENABLE_EXAMPLES OFF CACHE BOOL "")
+set(SAPI_ENABLE_TESTS OFF CACHE BOOL "")
+add_subdirectory("${SAPI_ROOT}"
+                 "${CMAKE_BINARY_DIR}/sandboxed-api-build"
+                 # Omit this to have the full Sandboxed API in IDE
+                 EXCLUDE_FROM_ALL)
+
+add_sapi_library(pffft_sapi
+    FUNCTIONS pffft_new_setup
+              pffft_destroy_setup
+              pffft_transform
+              pffft_transform_ordered
+              pffft_zreorder
+              pffft_zconvolve_accumulate
+              pffft_aligned_malloc
+              pffft_aligned_free
+              pffft_simd_size
+              cffti
+              cfftf
+              cfftb 
+              rffti 
+              rfftf 
+              rfftb 
+              cosqi 
+              cosqf 
+              cosqb 
+              costi 
+              cost 
+              sinqi 
+              sinqb 
+              sinqf 
+              sinti 
+              sint
+
+    INPUTS pffft.h fftpack.h
+    LIBRARY pffft
+    LIBRARY_NAME pffft
+
+    NAMESPACE ""
+)
+
+target_include_directories(pffft_sapi INTERFACE
+    "${PROJECT_BINARY_DIR}"
+)
+
+add_executable(pffft_sandboxed
+    test_pffft_sandboxed.cc
+)
+
+target_link_libraries(pffft_sandboxed PRIVATE  
+    pffft_sapi
+    sapi::sapi
+)
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/Makefile b/oss-internship-2020/pffft/Makefile
new file mode 100644
index 0000000..326fd90
--- /dev/null
+++ b/oss-internship-2020/pffft/Makefile
@@ -0,0 +1,15 @@
+CXXFLAGS ?= -std=c++17
+
+pffft_main: test_pffft.o libpffft.a
+	$(CXX) -o $@ $(CXXFLAGS) $(LDFLAGS) $^
+
+libpffft.a: pffft.o fftpack.o
+	ar rcs $@ $^
+
+pffft.c: pffft.h
+
+fftpack.c: fftpack.h
+
+.PHONY: clean
+clean: 
+	rm -f *.o *.a pffft_main
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
new file mode 100644
index 0000000..ee20b42
--- /dev/null
+++ b/oss-internship-2020/pffft/README.txt
@@ -0,0 +1,416 @@
+PFFFT: a pretty fast FFT.
+
+TL;DR
+--
+
+PFFFT does 1D Fast Fourier Transforms, of single precision real and
+complex vectors. It tries do it fast, it tries to be correct, and it
+tries to be small. Computations do take advantage of SSE1 instructions
+on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
+license is BSD-like.
+
+
+Why does it exist:
+--
+
+I was in search of a good performing FFT library , preferably very
+small and with a very liberal license.
+
+When one says "fft library", FFTW ("Fastest Fourier Transform in the
+West") is probably the first name that comes to mind -- I guess that
+99% of open-source projects that need a FFT do use FFTW, and are happy
+with it. However, it is quite a large library , which does everything
+fft related (2d transforms, 3d transforms, other transformations such
+as discrete cosine , or fast hartley). And it is licensed under the
+GNU GPL , which means that it cannot be used in non open-source
+products.
+
+An alternative to FFTW that is really small, is the venerable FFTPACK
+v4, which is available on NETLIB. A more recent version (v5) exists,
+but it is larger as it deals with multi-dimensional transforms. This
+is a library that is written in FORTRAN 77, a language that is now
+considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
+by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
+its age, benchmarks show it that it still a very good performing FFT
+library, see for example the 1d single precision benchmarks here:
+http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
+competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
+Apple vDSP. The reason for that is that those libraries do take
+advantage of the SSE SIMD instructions available on Intel CPUs,
+available since the days of the Pentium III. These instructions deal
+with small vectors of 4 floats at a time, instead of a single float
+for a traditionnal FPU, so when using these instructions one may expect
+a 4-fold performance improvement.
+
+The idea was to take this fortran fftpack v4 code, translate to C,
+modify it to deal with those SSE instructions, and check that the
+final performance is not completely ridiculous when compared to other
+SIMD FFT libraries. Translation to C was performed with f2c (
+http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
+order to remove the thousands of gotos that were introduced by
+f2c. You will find the fftpack.h and fftpack.c sources in the
+repository, this a complete translation of
+http://www.netlib.org/fftpack/ , with the discrete cosine transform
+and the test program. There is no license information in the netlib
+repository, but it was confirmed to me by the fftpack v5 curators that
+the same terms do apply to fftpack v4:
+http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
+"BSD-like" license, it is compatible with proprietary projects.
+
+Adapting fftpack to deal with the SIMD 4-element vectors instead of
+scalar single precision numbers was more complex than I originally
+thought, especially with the real transforms, and I ended up writing
+more code than I planned..
+
+
+The code:
+--
+
+Only two files, in good old C, pffft.c and pffft.h . The API is very
+very simple, just make sure that you read the comments in pffft.h.
+
+
+Comparison with other FFTs:
+--
+
+The idea was not to break speed records, but to get a decently fast
+fft that is at least 50% as fast as the fastest FFT -- especially on
+slowest computers . I'm more focused on getting the best performance
+on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
+on getting top performance on today fastest cpus.
+
+It can be used in a real-time context as the fft functions do not
+perform any memory allocation -- that is why they accept a 'work'
+array in their arguments.
+
+It is also a bit focused on performing 1D convolutions, that is why it
+provides "unordered" FFTs , and a fourier domain convolution
+operation.
+
+
+Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9, cortex-A15, A8X)
+--
+
+The benchmark shows the performance of various fft implementations measured in 
+MFlops, with the number of floating point operations being defined as 5Nlog2(N)
+for a length N complex fft, and 2.5*Nlog2(N) for a real fft. 
+See http://www.fftw.org/speed/method.html for an explanation of these formulas.
+
+MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
+
+Built with:
+
+ gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f 
+
+| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+|       64  |     2816   |     8596   |     7329   |     8187   | |     2887   |    14898   |    14668   |    11108   |
+|       96  |     3298   |      n/a   |     8378   |     7727   | |     3953   |      n/a   |    15680   |    10878   |
+|      128  |     3507   |    11575   |     9266   |    10108   | |     4233   |    17598   |    16427   |    12000   |
+|      160  |     3391   |      n/a   |     9838   |    10711   | |     4220   |      n/a   |    16653   |    11187   |
+|      192  |     3919   |      n/a   |     9868   |    10956   | |     4297   |      n/a   |    15770   |    12540   |
+|      256  |     4283   |    13179   |    10694   |    13128   | |     4545   |    19550   |    16350   |    13822   |
+|      384  |     3136   |      n/a   |    10810   |    12061   | |     3600   |      n/a   |    16103   |    13240   |
+|      480  |     3477   |      n/a   |    10632   |    12074   | |     3536   |      n/a   |    11630   |    12522   |
+|      512  |     3783   |    15141   |    11267   |    13838   | |     3649   |    20002   |    16560   |    13580   |
+|      640  |     3639   |      n/a   |    11164   |    13946   | |     3695   |      n/a   |    15416   |    13890   |
+|      768  |     3800   |      n/a   |    11245   |    13495   | |     3590   |      n/a   |    15802   |    14552   |
+|      800  |     3440   |      n/a   |    10499   |    13301   | |     3659   |      n/a   |    12056   |    13268   |
+|     1024  |     3924   |    15605   |    11450   |    15339   | |     3769   |    20963   |    13941   |    15467   |
+|     2048  |     4518   |    16195   |    11551   |    15532   | |     4258   |    20413   |    13723   |    15042   |
+|     2400  |     4294   |      n/a   |    10685   |    13078   | |     4093   |      n/a   |    12777   |    13119   |
+|     4096  |     4750   |    16596   |    11672   |    15817   | |     4157   |    19662   |    14316   |    14336   |
+|     8192  |     3820   |    16227   |    11084   |    12555   | |     3691   |    18132   |    12102   |    13813   |
+|     9216  |     3864   |      n/a   |    10254   |    12870   | |     3586   |      n/a   |    12119   |    13994   |
+|    16384  |     3822   |    15123   |    10454   |    12822   | |     3613   |    16874   |    12370   |    13881   |
+|    32768  |     4175   |    14512   |    10662   |    11095   | |     3881   |    14702   |    11619   |    11524   |
+|   262144  |     3317   |    11429   |     6269   |     9517   | |     2810   |    11729   |     7757   |    10179   |
+|  1048576  |     2913   |    10551   |     4730   |     5867   | |     2661   |     7881   |     3520   |     5350   |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+
+
+Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
+
+| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+|           64     |      3840    |      7680    |      8777    | |      4389    |     20480    |     11171    |
+|           96     |      4214    |      9633    |      8429    | |      4816    |     22477    |     11238    |
+|          128     |      3584    |     10240    |     10240    | |      5120    |     23893    |     11947    |
+|          192     |      4854    |     11095    |     12945    | |      4854    |     22191    |     14121    |
+|          256     |      4096    |     11703    |     16384    | |      5120    |     23406    |     13653    |
+|          384     |      4395    |     14651    |     12558    | |      4884    |     19535    |     14651    |
+|          512     |      5760    |     13166    |     15360    | |      4608    |     23040    |     15360    |
+|          768     |      4907    |     14020    |     16357    | |      4461    |     19628    |     14020    |
+|         1024     |      5120    |     14629    |     14629    | |      5120    |     20480    |     15754    |
+|         2048     |      5632    |     14080    |     18773    | |      4693    |     12516    |     16091    |
+|         4096     |      5120    |     13653    |     17554    | |      4726    |      7680    |     14456    |
+|         8192     |      4160    |      7396    |     13312    | |      4437    |     14791    |     13312    |
+|         9216     |      4210    |      6124    |     13473    | |      4491    |      7282    |     14970    |
+|        16384     |      3976    |     11010    |     14313    | |      4210    |     11450    |     13631    |
+|        32768     |      4260    |     10224    |     10954    | |      4260    |      6816    |     11797    |
+|       262144     |      3736    |      6896    |      9961    | |      2359    |      8965    |      9437    |
+|      1048576     |      2796    |      4534    |      6453    | |      1864    |      3078    |      5592    |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+
+
+
+MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
+
+Built with:
+
+ gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
+
+| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+|      64   |      745   |     2145   |     1706   |     2028   | |      961   |     3356   |     3313   |     2300   |
+|      96   |      877   |      n/a   |     1976   |     1978   | |     1059   |      n/a   |     3333   |     2233   |
+|     128   |      951   |     2808   |     2213   |     2279   | |     1202   |     3803   |     3739   |     2494   |
+|     192   |     1002   |      n/a   |     2456   |     2429   | |     1186   |      n/a   |     3701   |     2508   |
+|     256   |     1065   |     3205   |     2641   |     2793   | |     1302   |     4013   |     3912   |     2663   |
+|     384   |      845   |      n/a   |     2759   |     2499   | |      948   |      n/a   |     3729   |     2504   |
+|     512   |      900   |     3476   |     2956   |     2759   | |      974   |     4057   |     3954   |     2645   |
+|     768   |      910   |      n/a   |     2912   |     2737   | |      975   |      n/a   |     3837   |     2614   |
+|    1024   |      936   |     3583   |     3107   |     3009   | |     1006   |     4124   |     3821   |     2697   |
+|    2048   |     1057   |     3585   |     3091   |     2837   | |     1089   |     3889   |     3701   |     2513   |
+|    4096   |     1083   |     3524   |     3092   |     2733   | |     1039   |     3617   |     3462   |     2364   |
+|    8192   |      874   |     3252   |     2967   |     2363   | |      911   |     3106   |     2789   |     2302   |
+|    9216   |      898   |      n/a   |     2420   |     2290   | |      865   |      n/a   |     2676   |     2204   |
+|   16384   |      903   |     2892   |     2506   |     2421   | |      899   |     3026   |     2797   |     2289   |
+|   32768   |      965   |     2837   |     2550   |     2358   | |      920   |     2922   |     2763   |     2240   |
+|  262144   |      738   |     2422   |     1589   |     1708   | |      610   |     2038   |     1436   |     1091   |
+| 1048576   |      528   |     1207   |      845   |      880   | |      606   |     1020   |      669   |     1036   |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+
+
+
+Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |     1920   |     3614   |     5120   | |     2194   |     7680   |     6467   |
+|       96  |     1873   |     3549   |     5187   | |     2107   |     8429   |     5863   |
+|      128  |     2240   |     3773   |     5514   | |     2560   |     7964   |     6827   |
+|      192  |     1765   |     4569   |     7767   | |     2284   |     9137   |     7061   |
+|      256  |     2048   |     5461   |     7447   | |     2731   |     9638   |     7802   |
+|      384  |     1998   |     5861   |     6762   | |     2313   |     9253   |     7644   |
+|      512  |     2095   |     6144   |     7680   | |     2194   |    10240   |     7089   |
+|      768  |     2230   |     5773   |     7549   | |     2045   |    10331   |     7010   |
+|     1024  |     2133   |     6400   |     8533   | |     2133   |    10779   |     7877   |
+|     2048  |     2011   |     7040   |     8665   | |     1942   |    10240   |     7768   |
+|     4096  |     2194   |     6827   |     8777   | |     1755   |     9452   |     6827   |
+|     8192  |     1849   |     6656   |     6656   | |     1752   |     7831   |     6827   |
+|     9216  |     1871   |     5858   |     6416   | |     1643   |     6909   |     6266   |
+|    16384  |     1883   |     6223   |     6506   | |     1664   |     7340   |     6982   |
+|    32768  |     1826   |     6390   |     6667   | |     1631   |     7481   |     6971   |
+|   262144  |     1546   |     4075   |     5977   | |     1299   |     3415   |     3551   |
+|  1048576  |     1104   |     2071   |     1730   | |     1104   |     1149   |     1834   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+
+
+Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+|           64     |       452    |      1041    |      1336    | |       549    |      2318    |      1781    |
+|           96     |       444    |      1297    |      1297    | |       503    |      2408    |      1686    |
+|          128     |       527    |      1525    |      1707    | |       543    |      2655    |      1886    |
+|          192     |       498    |      1653    |      1849    | |       539    |      2678    |      1942    |
+|          256     |       585    |      1862    |      2156    | |       594    |      2777    |      2244    |
+|          384     |       499    |      1870    |      1998    | |       511    |      2586    |      1890    |
+|          512     |       562    |      2095    |      2194    | |       542    |      2973    |      2194    |
+|          768     |       545    |      2045    |      2133    | |       545    |      2365    |      2133    |
+|         1024     |       595    |      2133    |      2438    | |       569    |      2695    |      2179    |
+|         2048     |       587    |      2125    |      2347    | |       521    |      2230    |      1707    |
+|         4096     |       495    |      1890    |      1834    | |       492    |      1876    |      1672    |
+|         8192     |       469    |      1548    |      1729    | |       438    |      1740    |      1664    |
+|         9216     |       468    |      1663    |      1663    | |       446    |      1585    |      1531    |
+|        16384     |       453    |      1608    |      1767    | |       398    |      1476    |      1664    |
+|        32768     |       456    |      1420    |      1503    | |       387    |      1388    |      1345    |
+|       262144     |       309    |       385    |       726    | |       262    |       415    |       840    |
+|      1048576     |       280    |       351    |       739    | |       261    |       313    |       797    |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+
+
+
+Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
+
+Built with:
+cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+(visual c++ is definitively not very good with SSE intrinsics...)
+
+| N (input length) | real FFTPack |  real PFFFT  | | cplx FFTPack |  cplx PFFFT  |
+|------------------+--------------+--------------| |--------------+--------------|
+|           64     |       173    |      1009    | |       174    |      1159    |
+|           96     |       169    |      1029    | |       188    |      1201    |
+|          128     |       195    |      1242    | |       191    |      1275    |
+|          192     |       178    |      1312    | |       184    |      1276    |
+|          256     |       196    |      1591    | |       186    |      1281    |
+|          384     |       172    |      1409    | |       181    |      1281    |
+|          512     |       187    |      1640    | |       181    |      1313    |
+|          768     |       171    |      1614    | |       176    |      1258    |
+|         1024     |       186    |      1812    | |       178    |      1223    |
+|         2048     |       190    |      1707    | |       186    |      1099    |
+|         4096     |       182    |      1446    | |       177    |       975    |
+|         8192     |       175    |      1345    | |       169    |      1034    |
+|         9216     |       165    |      1271    | |       168    |      1023    |
+|        16384     |       166    |      1396    | |       165    |       949    |
+|        32768     |       172    |      1311    | |       161    |       881    |
+|       262144     |       136    |       632    | |       134    |       629    |
+|      1048576     |       134    |       698    | |       127    |       623    |
+|------------------+--------------+--------------| |--------------+--------------|
+
+
+
+Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
+
+Built with:
+gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |      549   |      452   |      731   | |      512   |      602   |      640   |
+|       96  |      421   |      272   |      702   | |      496   |      571   |      602   |
+|      128  |      498   |      512   |      815   | |      597   |      618   |      652   |
+|      160  |      521   |      536   |      815   | |      586   |      669   |      625   |
+|      192  |      539   |      571   |      883   | |      485   |      597   |      626   |
+|      256  |      640   |      539   |      975   | |      569   |      611   |      671   |
+|      384  |      499   |      610   |      879   | |      499   |      602   |      637   |
+|      480  |      518   |      507   |      877   | |      496   |      661   |      616   |
+|      512  |      524   |      591   |     1002   | |      549   |      678   |      668   |
+|      640  |      542   |      612   |      955   | |      568   |      663   |      645   |
+|      768  |      557   |      613   |      981   | |      491   |      663   |      598   |
+|      800  |      514   |      353   |      882   | |      514   |      360   |      574   |
+|     1024  |      640   |      640   |     1067   | |      492   |      683   |      602   |
+|     2048  |      587   |      640   |      908   | |      486   |      640   |      552   |
+|     2400  |      479   |      368   |      777   | |      422   |      376   |      518   |
+|     4096  |      511   |      614   |      853   | |      426   |      640   |      534   |
+|     8192  |      415   |      584   |      708   | |      386   |      622   |      516   |
+|     9216  |      419   |      571   |      687   | |      364   |      586   |      506   |
+|    16384  |      426   |      577   |      716   | |      398   |      606   |      530   |
+|    32768  |      417   |      572   |      673   | |      399   |      572   |      468   |
+|   262144  |      219   |      380   |      293   | |      255   |      431   |      343   |
+|  1048576  |      202   |      274   |      237   | |      265   |      282   |      355   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+Same platform as above, but this time pffft and fftpack are built with clang 3.2:
+
+clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |      427   |      452   |      853   | |      427   |      602   |     1024   |
+|       96  |      351   |      276   |      843   | |      337   |      571   |      963   |
+|      128  |      373   |      512   |      996   | |      390   |      618   |     1054   |
+|      160  |      426   |      536   |      987   | |      375   |      669   |      914   |
+|      192  |      404   |      571   |     1079   | |      388   |      588   |     1079   |
+|      256  |      465   |      539   |     1205   | |      445   |      602   |     1170   |
+|      384  |      366   |      610   |     1099   | |      343   |      594   |     1099   |
+|      480  |      356   |      507   |     1140   | |      335   |      651   |      931   |
+|      512  |      411   |      591   |     1213   | |      384   |      649   |     1124   |
+|      640  |      398   |      612   |     1193   | |      373   |      654   |      901   |
+|      768  |      409   |      613   |     1227   | |      383   |      663   |     1044   |
+|      800  |      411   |      348   |     1073   | |      353   |      358   |      809   |
+|     1024  |      427   |      640   |     1280   | |      413   |      692   |     1004   |
+|     2048  |      414   |      626   |     1126   | |      371   |      640   |      853   |
+|     2400  |      399   |      373   |      898   | |      319   |      368   |      653   |
+|     4096  |      404   |      602   |     1059   | |      357   |      633   |      778   |
+|     8192  |      332   |      584   |      792   | |      308   |      616   |      716   |
+|     9216  |      322   |      561   |      783   | |      299   |      586   |      687   |
+|    16384  |      344   |      568   |      778   | |      314   |      617   |      745   |
+|    32768  |      342   |      564   |      737   | |      314   |      552   |      629   |
+|   262144  |      201   |      383   |      313   | |      227   |      435   |      413   |
+|  1048576  |      187   |      262   |      251   | |      228   |      281   |      409   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
+(the fftpack performance numbers are better with gcc), while clang is
+the best with neon intrinsics (see how pffft perf has improved with
+clang 3.2).
+
+
+NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
+
+Built with:
+gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
+
+| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
+|-----------+------------+------------| |------------+------------|
+|       64  |     1735   |     3308   | |     1994   |     3744   |
+|       96  |     1596   |     3448   | |     1987   |     3572   |
+|      128  |     1807   |     4076   | |     2255   |     3960   |
+|      160  |     1769   |     4083   | |     2071   |     3845   |
+|      192  |     1990   |     4233   | |     2017   |     3939   |
+|      256  |     2191   |     4882   | |     2254   |     4346   |
+|      384  |     1878   |     4492   | |     2073   |     4012   |
+|      480  |     1748   |     4398   | |     1923   |     3951   |
+|      512  |     2030   |     5064   | |     2267   |     4195   |
+|      640  |     1918   |     4756   | |     2094   |     4184   |
+|      768  |     2099   |     4907   | |     2048   |     4297   |
+|      800  |     1822   |     4555   | |     1880   |     4063   |
+|     1024  |     2232   |     5355   | |     2187   |     4420   |
+|     2048  |     2176   |     4983   | |     2027   |     3602   |
+|     2400  |     1741   |     4256   | |     1710   |     3344   |
+|     4096  |     1816   |     3914   | |     1851   |     3349   |
+|     8192  |     1716   |     3481   | |     1700   |     3255   |
+|     9216  |     1735   |     3589   | |     1653   |     3094   |
+|    16384  |     1567   |     3483   | |     1637   |     3244   |
+|    32768  |     1624   |     3240   | |     1655   |     3156   |
+|   262144  |     1012   |     1898   | |      983   |     1503   |
+|  1048576  |      876   |     1154   | |      868   |     1341   |
+|-----------+------------+------------| |------------+------------|
+
+The performance on the tegra K1 is pretty impressive. I'm not
+including the FFTW numbers as they as slightly below the scalar
+fftpack numbers, so something must be wrong (however it seems to be
+correctly configured and is using neon simd instructions).
+
+When using clang 3.4 the pffft version is even a bit faster, reaching
+5.7 GFlops for real ffts of size 1024.
+
+
+iPad Air 2 with iOS9, xcode 8.0, arm64. The cpu is an Apple A8X, supposedly running at 1.5GHz.
+
+| input len |real FFTPack|  real vDSP | real PFFFT | |cplx FFTPack|  cplx vDSP | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |     2517   |     7995   |     6086   | |     2725   |    13006   |     8495   |
+|       96  |     2442   |      n/a   |     6691   | |     2256   |      n/a   |     7991   |
+|      128  |     2664   |    10186   |     7877   | |     2575   |    15115   |     9115   |
+|      160  |     2638   |      n/a   |     8283   | |     2682   |      n/a   |     8806   |
+|      192  |     2903   |      n/a   |     9083   | |     2634   |      n/a   |     8980   |
+|      256  |     3184   |    11452   |    10039   | |     3026   |    15410   |    10199   |
+|      384  |     2665   |      n/a   |    10100   | |     2275   |      n/a   |     9247   |
+|      480  |     2546   |      n/a   |     9863   | |     2341   |      n/a   |     8892   |
+|      512  |     2832   |    12197   |    10989   | |     2547   |    16768   |    10154   |
+|      640  |     2755   |      n/a   |    10461   | |     2569   |      n/a   |     9666   |
+|      768  |     2998   |      n/a   |    11355   | |     2585   |      n/a   |     9813   |
+|      800  |     2516   |      n/a   |    10332   | |     2433   |      n/a   |     9164   |
+|     1024  |     3109   |    12965   |    12114   | |     2869   |    16448   |    10519   |
+|     2048  |     3027   |    12996   |    12023   | |     2648   |    17304   |    10307   |
+|     2400  |     2515   |      n/a   |    10372   | |     2355   |      n/a   |     8443   |
+|     4096  |     3204   |    13603   |    12359   | |     2814   |    16570   |     9780   |
+|     8192  |     2759   |    13422   |    10824   | |     2153   |    15652   |     7884   |
+|     9216  |     2700   |      n/a   |     9938   | |     2241   |      n/a   |     7900   |
+|    16384  |     2280   |    13057   |     7976   | |      593   |     4272   |     2534   |
+|    32768  |      768   |     4269   |     2882   | |      606   |     4405   |     2604   |
+|   262144  |      724   |     3527   |     2630   | |      534   |     2418   |     2157   |
+|  1048576  |      674   |     1467   |     2135   | |      530   |     1621   |     2055   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+I double-checked to make sure I did not make a mistake in the time
+measurements, as the numbers are much higher than what I initially
+expected. They are in fact higher than the number I get on the 2.8GHz
+Xeon of my 2008 mac pro.. (except for FFT lengths >= 32768 where
+having a big cache is useful). A good surprise is also that the perf
+is not too far from apple's vDSP (at least for the real FFT).
+
diff --git a/oss-internship-2020/pffft/fftpack.c b/oss-internship-2020/pffft/fftpack.c
new file mode 100644
index 0000000..b6375a8
--- /dev/null
+++ b/oss-internship-2020/pffft/fftpack.c
@@ -0,0 +1,3112 @@
+/*
+  compile with cc -DTESTING_FFTPACK fftpack.c in order to build the
+  test application.
+
+  This is an f2c translation of the full fftpack sources as found on
+  http://www.netlib.org/fftpack/ The translated code has been
+  slightlty edited to remove the ugliest artefacts of the translation
+  (a hundred of wild GOTOs were wiped during that operation).
+
+  The original fftpack file was written by Paul N. Swarztrauber
+  (Version 4, 1985), in fortran 77.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+   ChangeLog:
+   2011/10/02: this is my first release of this file.
+*/
+
+#include "fftpack.h"
+#include <math.h>
+
+typedef fftpack_real real;
+typedef fftpack_int  integer;
+
+typedef struct f77complex {    
+  real r, i;
+} f77complex;   
+
+#ifdef TESTING_FFTPACK
+static real c_abs(f77complex *c) { return sqrt(c->r*c->r + c->i*c->i); }
+static double dmax(double a, double b) { return a < b ? b : a; }
+#endif
+
+/* translated by f2c (version 20061008), and slightly edited */
+
+static void passfb(integer *nac, integer ido, integer ip, integer l1, integer idl1, 
+                   real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa, real fsign)
+{
+  /* System generated locals */
+  integer ch_offset, cc_offset,
+    c1_offset, c2_offset, ch2_offset;
+
+  /* Local variables */
+  integer i, j, k, l, jc, lc, ik, idj, idl, inc, idp;
+  real wai, war;
+  integer ipp2, idij, idlj, idot, ipph;
+
+
+#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1]
+#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1]
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  c1_offset = 1 + ido * (1 + l1);
+  c1 -= c1_offset;
+  cc_offset = 1 + ido * (1 + ip);
+  cc -= cc_offset;
+  ch2_offset = 1 + idl1;
+  ch2 -= ch2_offset;
+  c2_offset = 1 + idl1;
+  c2 -= c2_offset;
+  --wa;
+
+  /* Function Body */
+  idot = ido / 2;
+  ipp2 = ip + 2;
+  ipph = (ip + 1) / 2;
+  idp = ip * ido;
+
+  if (ido >= l1) {
+    for (j = 2; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      for (k = 1; k <= l1; ++k) {
+        for (i = 1; i <= ido; ++i) {
+          ch_ref(i, k, j) = cc_ref(i, j, k) + cc_ref(i, jc, k);
+          ch_ref(i, k, jc) = cc_ref(i, j, k) - cc_ref(i, jc, k);
+        }
+      }
+    }
+    for (k = 1; k <= l1; ++k) {
+      for (i = 1; i <= ido; ++i) {
+        ch_ref(i, k, 1) = cc_ref(i, 1, k);
+      }
+    }
+  } else {
+    for (j = 2; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      for (i = 1; i <= ido; ++i) {
+        for (k = 1; k <= l1; ++k) {
+          ch_ref(i, k, j) = cc_ref(i, j, k) + cc_ref(i, jc, k);
+          ch_ref(i, k, jc) = cc_ref(i, j, k) - cc_ref(i, jc, k);
+        }
+      }
+    }
+    for (i = 1; i <= ido; ++i) {
+      for (k = 1; k <= l1; ++k) {
+        ch_ref(i, k, 1) = cc_ref(i, 1, k);
+      }
+    }
+  }
+  idl = 2 - ido;
+  inc = 0;
+  for (l = 2; l <= ipph; ++l) {
+    lc = ipp2 - l;
+    idl += ido;
+    for (ik = 1; ik <= idl1; ++ik) {
+      c2_ref(ik, l) = ch2_ref(ik, 1) + wa[idl - 1] * ch2_ref(ik, 2);
+      c2_ref(ik, lc) = fsign*wa[idl] * ch2_ref(ik, ip);
+    }
+    idlj = idl;
+    inc += ido;
+    for (j = 3; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      idlj += inc;
+      if (idlj > idp) {
+        idlj -= idp;
+      }
+      war = wa[idlj - 1];
+      wai = wa[idlj];
+      for (ik = 1; ik <= idl1; ++ik) {
+        c2_ref(ik, l) = c2_ref(ik, l) + war * ch2_ref(ik, j);
+        c2_ref(ik, lc) = c2_ref(ik, lc) + fsign*wai * ch2_ref(ik, jc);
+      }
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    for (ik = 1; ik <= idl1; ++ik) {
+      ch2_ref(ik, 1) = ch2_ref(ik, 1) + ch2_ref(ik, j);
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    jc = ipp2 - j;
+    for (ik = 2; ik <= idl1; ik += 2) {
+      ch2_ref(ik - 1, j) = c2_ref(ik - 1, j) - c2_ref(ik, jc);
+      ch2_ref(ik - 1, jc) = c2_ref(ik - 1, j) + c2_ref(ik, jc);
+      ch2_ref(ik, j) = c2_ref(ik, j) + c2_ref(ik - 1, jc);
+      ch2_ref(ik, jc) = c2_ref(ik, j) - c2_ref(ik - 1, jc);
+    }
+  }
+  *nac = 1;
+  if (ido == 2) {
+    return;
+  }
+  *nac = 0;
+  for (ik = 1; ik <= idl1; ++ik) {
+    c2_ref(ik, 1) = ch2_ref(ik, 1);
+  }
+  for (j = 2; j <= ip; ++j) {
+    for (k = 1; k <= l1; ++k) {
+      c1_ref(1, k, j) = ch_ref(1, k, j);
+      c1_ref(2, k, j) = ch_ref(2, k, j);
+    }
+  }
+  if (idot <= l1) {
+    idij = 0;
+    for (j = 2; j <= ip; ++j) {
+      idij += 2;
+      for (i = 4; i <= ido; i += 2) {
+        idij += 2;
+        for (k = 1; k <= l1; ++k) {
+          c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) - fsign*wa[idij] * ch_ref(i, k, j);
+          c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + fsign*wa[idij] * ch_ref(i - 1, k, j);
+        }
+      }
+    }
+    return;
+  }
+  idj = 2 - ido;
+  for (j = 2; j <= ip; ++j) {
+    idj += ido;
+    for (k = 1; k <= l1; ++k) {
+      idij = idj;
+      for (i = 4; i <= ido; i += 2) {
+        idij += 2;
+        c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) - fsign*wa[idij] * ch_ref(i, k, j);
+        c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + fsign*wa[idij] * ch_ref(i - 1, k, j);
+      }
+    }
+  }
+} /* passb */
+
+#undef ch2_ref
+#undef ch_ref
+#undef cc_ref
+#undef c2_ref
+#undef c1_ref
+
+
+static void passb2(integer ido, integer l1, const real *cc, real *ch, const real *wa1)
+{
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ti2, tr2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 3;
+  cc -= cc_offset;
+  --wa1;
+
+  /* Function Body */
+  if (ido <= 2) {
+    for (k = 1; k <= l1; ++k) {
+      ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(1, 2, k);
+      ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(1, 2, k);
+      ch_ref(2, k, 1) = cc_ref(2, 1, k) + cc_ref(2, 2, k);
+      ch_ref(2, k, 2) = cc_ref(2, 1, k) - cc_ref(2, 2, k);
+    }
+    return;
+  }
+  for (k = 1; k <= l1; ++k) {
+    for (i = 2; i <= ido; i += 2) {
+      ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 2, k);
+      tr2 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 2, k);
+      ch_ref(i, k, 1) = cc_ref(i, 1, k) + cc_ref(i, 2, k);
+      ti2 = cc_ref(i, 1, k) - cc_ref(i, 2, k);
+      ch_ref(i, k, 2) = wa1[i - 1] * ti2 + wa1[i] * tr2;
+      ch_ref(i - 1, k, 2) = wa1[i - 1] * tr2 - wa1[i] * ti2;
+    }
+  }
+} /* passb2 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void passb3(integer ido, integer l1, const real *cc, real *ch, const real *wa1, const real *wa2)
+{
+  static const real taur = -.5f;
+  static const real taui = .866025403784439f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + (ido << 2);
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+
+  /* Function Body */
+  if (ido == 2) {
+    for (k = 1; k <= l1; ++k) {
+      tr2 = cc_ref(1, 2, k) + cc_ref(1, 3, k);
+      cr2 = cc_ref(1, 1, k) + taur * tr2;
+      ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2;
+      ti2 = cc_ref(2, 2, k) + cc_ref(2, 3, k);
+      ci2 = cc_ref(2, 1, k) + taur * ti2;
+      ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2;
+      cr3 = taui * (cc_ref(1, 2, k) - cc_ref(1, 3, k));
+      ci3 = taui * (cc_ref(2, 2, k) - cc_ref(2, 3, k));
+      ch_ref(1, k, 2) = cr2 - ci3;
+      ch_ref(1, k, 3) = cr2 + ci3;
+      ch_ref(2, k, 2) = ci2 + cr3;
+      ch_ref(2, k, 3) = ci2 - cr3;
+    }
+  } else {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 2; i <= ido; i += 2) {
+        tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 3, k);
+        cr2 = cc_ref(i - 1, 1, k) + taur * tr2;
+        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2;
+        ti2 = cc_ref(i, 2, k) + cc_ref(i, 3, k);
+        ci2 = cc_ref(i, 1, k) + taur * ti2;
+        ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2;
+        cr3 = taui * (cc_ref(i - 1, 2, k) - cc_ref(i - 1, 3, k));
+        ci3 = taui * (cc_ref(i, 2, k) - cc_ref(i, 3, k));
+        dr2 = cr2 - ci3;
+        dr3 = cr2 + ci3;
+        di2 = ci2 + cr3;
+        di3 = ci2 - cr3;
+        ch_ref(i, k, 2) = wa1[i - 1] * di2 + wa1[i] * dr2;
+        ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 - wa1[i] * di2;
+        ch_ref(i, k, 3) = wa2[i - 1] * di3 + wa2[i] * dr3;
+        ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 - wa2[i] * di3;
+      }
+    }
+  }
+} /* passb3 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void passb4(integer ido, integer l1, const real *cc, real *ch, 
+                   const real *wa1, const real *wa2, const real *wa3)
+{
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 5;
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+
+  /* Function Body */
+  if (ido == 2) {
+    for (k = 1; k <= l1; ++k) {
+      ti1 = cc_ref(2, 1, k) - cc_ref(2, 3, k);
+      ti2 = cc_ref(2, 1, k) + cc_ref(2, 3, k);
+      tr4 = cc_ref(2, 4, k) - cc_ref(2, 2, k);
+      ti3 = cc_ref(2, 2, k) + cc_ref(2, 4, k);
+      tr1 = cc_ref(1, 1, k) - cc_ref(1, 3, k);
+      tr2 = cc_ref(1, 1, k) + cc_ref(1, 3, k);
+      ti4 = cc_ref(1, 2, k) - cc_ref(1, 4, k);
+      tr3 = cc_ref(1, 2, k) + cc_ref(1, 4, k);
+      ch_ref(1, k, 1) = tr2 + tr3;
+      ch_ref(1, k, 3) = tr2 - tr3;
+      ch_ref(2, k, 1) = ti2 + ti3;
+      ch_ref(2, k, 3) = ti2 - ti3;
+      ch_ref(1, k, 2) = tr1 + tr4;
+      ch_ref(1, k, 4) = tr1 - tr4;
+      ch_ref(2, k, 2) = ti1 + ti4;
+      ch_ref(2, k, 4) = ti1 - ti4;
+    }
+  } else {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 2; i <= ido; i += 2) {
+        ti1 = cc_ref(i, 1, k) - cc_ref(i, 3, k);
+        ti2 = cc_ref(i, 1, k) + cc_ref(i, 3, k);
+        ti3 = cc_ref(i, 2, k) + cc_ref(i, 4, k);
+        tr4 = cc_ref(i, 4, k) - cc_ref(i, 2, k);
+        tr1 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 3, k);
+        tr2 = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 3, k);
+        ti4 = cc_ref(i - 1, 2, k) - cc_ref(i - 1, 4, k);
+        tr3 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 4, k);
+        ch_ref(i - 1, k, 1) = tr2 + tr3;
+        cr3 = tr2 - tr3;
+        ch_ref(i, k, 1) = ti2 + ti3;
+        ci3 = ti2 - ti3;
+        cr2 = tr1 + tr4;
+        cr4 = tr1 - tr4;
+        ci2 = ti1 + ti4;
+        ci4 = ti1 - ti4;
+        ch_ref(i - 1, k, 2) = wa1[i - 1] * cr2 - wa1[i] * ci2;
+        ch_ref(i, k, 2) = wa1[i - 1] * ci2 + wa1[i] * cr2;
+        ch_ref(i - 1, k, 3) = wa2[i - 1] * cr3 - wa2[i] * ci3;
+        ch_ref(i, k, 3) = wa2[i - 1] * ci3 + wa2[i] * cr3;
+        ch_ref(i - 1, k, 4) = wa3[i - 1] * cr4 - wa3[i] * ci4;
+        ch_ref(i, k, 4) = wa3[i - 1] * ci4 + wa3[i] * cr4;
+      }
+    }
+  }
+} /* passb4 */
+
+#undef ch_ref
+#undef cc_ref
+
+/* passf5 and passb5 merged */
+static void passfb5(integer ido, integer l1, const real *cc, real *ch, 
+                    const real *wa1, const real *wa2, const real *wa3, const real *wa4, real fsign)
+{
+  const real tr11 = .309016994374947f;
+  const real ti11 = .951056516295154f*fsign;
+  const real tr12 = -.809016994374947f;
+  const real ti12 = .587785252292473f*fsign;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 6;
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+  --wa4;
+
+  /* Function Body */
+  if (ido == 2) {
+    for (k = 1; k <= l1; ++k) {
+      ti5 = cc_ref(2, 2, k) - cc_ref(2, 5, k);
+      ti2 = cc_ref(2, 2, k) + cc_ref(2, 5, k);
+      ti4 = cc_ref(2, 3, k) - cc_ref(2, 4, k);
+      ti3 = cc_ref(2, 3, k) + cc_ref(2, 4, k);
+      tr5 = cc_ref(1, 2, k) - cc_ref(1, 5, k);
+      tr2 = cc_ref(1, 2, k) + cc_ref(1, 5, k);
+      tr4 = cc_ref(1, 3, k) - cc_ref(1, 4, k);
+      tr3 = cc_ref(1, 3, k) + cc_ref(1, 4, k);
+      ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2 + tr3;
+      ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2 + ti3;
+      cr2 = cc_ref(1, 1, k) + tr11 * tr2 + tr12 * tr3;
+      ci2 = cc_ref(2, 1, k) + tr11 * ti2 + tr12 * ti3;
+      cr3 = cc_ref(1, 1, k) + tr12 * tr2 + tr11 * tr3;
+      ci3 = cc_ref(2, 1, k) + tr12 * ti2 + tr11 * ti3;
+      cr5 = ti11 * tr5 + ti12 * tr4;
+      ci5 = ti11 * ti5 + ti12 * ti4;
+      cr4 = ti12 * tr5 - ti11 * tr4;
+      ci4 = ti12 * ti5 - ti11 * ti4;
+      ch_ref(1, k, 2) = cr2 - ci5;
+      ch_ref(1, k, 5) = cr2 + ci5;
+      ch_ref(2, k, 2) = ci2 + cr5;
+      ch_ref(2, k, 3) = ci3 + cr4;
+      ch_ref(1, k, 3) = cr3 - ci4;
+      ch_ref(1, k, 4) = cr3 + ci4;
+      ch_ref(2, k, 4) = ci3 - cr4;
+      ch_ref(2, k, 5) = ci2 - cr5;
+    }
+  } else {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 2; i <= ido; i += 2) {
+        ti5 = cc_ref(i, 2, k) - cc_ref(i, 5, k);
+        ti2 = cc_ref(i, 2, k) + cc_ref(i, 5, k);
+        ti4 = cc_ref(i, 3, k) - cc_ref(i, 4, k);
+        ti3 = cc_ref(i, 3, k) + cc_ref(i, 4, k);
+        tr5 = cc_ref(i - 1, 2, k) - cc_ref(i - 1, 5, k);
+        tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 5, k);
+        tr4 = cc_ref(i - 1, 3, k) - cc_ref(i - 1, 4, k);
+        tr3 = cc_ref(i - 1, 3, k) + cc_ref(i - 1, 4, k);
+        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2 + tr3;
+        ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2 + ti3;
+        cr2 = cc_ref(i - 1, 1, k) + tr11 * tr2 + tr12 * tr3;
+        ci2 = cc_ref(i, 1, k) + tr11 * ti2 + tr12 * ti3;
+        cr3 = cc_ref(i - 1, 1, k) + tr12 * tr2 + tr11 * tr3;
+        ci3 = cc_ref(i, 1, k) + tr12 * ti2 + tr11 * ti3;
+        cr5 = ti11 * tr5 + ti12 * tr4;
+        ci5 = ti11 * ti5 + ti12 * ti4;
+        cr4 = ti12 * tr5 - ti11 * tr4;
+        ci4 = ti12 * ti5 - ti11 * ti4;
+        dr3 = cr3 - ci4;
+        dr4 = cr3 + ci4;
+        di3 = ci3 + cr4;
+        di4 = ci3 - cr4;
+        dr5 = cr2 + ci5;
+        dr2 = cr2 - ci5;
+        di5 = ci2 - cr5;
+        di2 = ci2 + cr5;
+        ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 - fsign*wa1[i] * di2;
+        ch_ref(i, k, 2) = wa1[i - 1] * di2 + fsign*wa1[i] * dr2;
+        ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 - fsign*wa2[i] * di3;
+        ch_ref(i, k, 3) = wa2[i - 1] * di3 + fsign*wa2[i] * dr3;
+        ch_ref(i - 1, k, 4) = wa3[i - 1] * dr4 - fsign*wa3[i] * di4;
+        ch_ref(i, k, 4) = wa3[i - 1] * di4 + fsign*wa3[i] * dr4;
+        ch_ref(i - 1, k, 5) = wa4[i - 1] * dr5 - fsign*wa4[i] * di5;
+        ch_ref(i, k, 5) = wa4[i - 1] * di5 + fsign*wa4[i] * dr5;
+      }
+    }
+  }
+} /* passb5 */
+
+#undef ch_ref
+#undef cc_ref
+
+static void passf2(integer ido, integer l1, const real *cc, real *ch, const real *wa1)
+{
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ti2, tr2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 3;
+  cc -= cc_offset;
+  --wa1;
+
+  /* Function Body */
+  if (ido == 2) {
+    for (k = 1; k <= l1; ++k) {
+      ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(1, 2, k);
+      ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(1, 2, k);
+      ch_ref(2, k, 1) = cc_ref(2, 1, k) + cc_ref(2, 2, k);
+      ch_ref(2, k, 2) = cc_ref(2, 1, k) - cc_ref(2, 2, k);
+    }
+  } else {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 2; i <= ido; i += 2) {
+        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 2,
+                                                           k);
+        tr2 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 2, k);
+        ch_ref(i, k, 1) = cc_ref(i, 1, k) + cc_ref(i, 2, k);
+        ti2 = cc_ref(i, 1, k) - cc_ref(i, 2, k);
+        ch_ref(i, k, 2) = wa1[i - 1] * ti2 - wa1[i] * tr2;
+        ch_ref(i - 1, k, 2) = wa1[i - 1] * tr2 + wa1[i] * ti2;
+      }
+    }
+  }
+} /* passf2 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void passf3(integer ido, integer l1, const real *cc, real *ch, 
+                   const real *wa1, const real *wa2)
+{
+  static const real taur = -.5f;
+  static const real taui = -.866025403784439f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + (ido << 2);
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+
+  /* Function Body */
+  if (ido == 2) {
+    for (k = 1; k <= l1; ++k) {
+      tr2 = cc_ref(1, 2, k) + cc_ref(1, 3, k);
+      cr2 = cc_ref(1, 1, k) + taur * tr2;
+      ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2;
+      ti2 = cc_ref(2, 2, k) + cc_ref(2, 3, k);
+      ci2 = cc_ref(2, 1, k) + taur * ti2;
+      ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2;
+      cr3 = taui * (cc_ref(1, 2, k) - cc_ref(1, 3, k));
+      ci3 = taui * (cc_ref(2, 2, k) - cc_ref(2, 3, k));
+      ch_ref(1, k, 2) = cr2 - ci3;
+      ch_ref(1, k, 3) = cr2 + ci3;
+      ch_ref(2, k, 2) = ci2 + cr3;
+      ch_ref(2, k, 3) = ci2 - cr3;
+    }
+  } else {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 2; i <= ido; i += 2) {
+        tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 3, k);
+        cr2 = cc_ref(i - 1, 1, k) + taur * tr2;
+        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2;
+        ti2 = cc_ref(i, 2, k) + cc_ref(i, 3, k);
+        ci2 = cc_ref(i, 1, k) + taur * ti2;
+        ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2;
+        cr3 = taui * (cc_ref(i - 1, 2, k) - cc_ref(i - 1, 3, k));
+        ci3 = taui * (cc_ref(i, 2, k) - cc_ref(i, 3, k));
+        dr2 = cr2 - ci3;
+        dr3 = cr2 + ci3;
+        di2 = ci2 + cr3;
+        di3 = ci2 - cr3;
+        ch_ref(i, k, 2) = wa1[i - 1] * di2 - wa1[i] * dr2;
+        ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 + wa1[i] * di2;
+        ch_ref(i, k, 3) = wa2[i - 1] * di3 - wa2[i] * dr3;
+        ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 + wa2[i] * di3;
+      }
+    }
+  }
+} /* passf3 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void passf4(integer ido, integer l1, const real *cc, real *ch, 
+                   const real *wa1, const real *wa2, const real *wa3)
+{
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k;
+  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 5;
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+
+  /* Function Body */
+  if (ido == 2) {
+    for (k = 1; k <= l1; ++k) {
+      ti1 = cc_ref(2, 1, k) - cc_ref(2, 3, k);
+      ti2 = cc_ref(2, 1, k) + cc_ref(2, 3, k);
+      tr4 = cc_ref(2, 2, k) - cc_ref(2, 4, k);
+      ti3 = cc_ref(2, 2, k) + cc_ref(2, 4, k);
+      tr1 = cc_ref(1, 1, k) - cc_ref(1, 3, k);
+      tr2 = cc_ref(1, 1, k) + cc_ref(1, 3, k);
+      ti4 = cc_ref(1, 4, k) - cc_ref(1, 2, k);
+      tr3 = cc_ref(1, 2, k) + cc_ref(1, 4, k);
+      ch_ref(1, k, 1) = tr2 + tr3;
+      ch_ref(1, k, 3) = tr2 - tr3;
+      ch_ref(2, k, 1) = ti2 + ti3;
+      ch_ref(2, k, 3) = ti2 - ti3;
+      ch_ref(1, k, 2) = tr1 + tr4;
+      ch_ref(1, k, 4) = tr1 - tr4;
+      ch_ref(2, k, 2) = ti1 + ti4;
+      ch_ref(2, k, 4) = ti1 - ti4;
+    }
+  } else {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 2; i <= ido; i += 2) {
+        ti1 = cc_ref(i, 1, k) - cc_ref(i, 3, k);
+        ti2 = cc_ref(i, 1, k) + cc_ref(i, 3, k);
+        ti3 = cc_ref(i, 2, k) + cc_ref(i, 4, k);
+        tr4 = cc_ref(i, 2, k) - cc_ref(i, 4, k);
+        tr1 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 3, k);
+        tr2 = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 3, k);
+        ti4 = cc_ref(i - 1, 4, k) - cc_ref(i - 1, 2, k);
+        tr3 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 4, k);
+        ch_ref(i - 1, k, 1) = tr2 + tr3;
+        cr3 = tr2 - tr3;
+        ch_ref(i, k, 1) = ti2 + ti3;
+        ci3 = ti2 - ti3;
+        cr2 = tr1 + tr4;
+        cr4 = tr1 - tr4;
+        ci2 = ti1 + ti4;
+        ci4 = ti1 - ti4;
+        ch_ref(i - 1, k, 2) = wa1[i - 1] * cr2 + wa1[i] * ci2;
+        ch_ref(i, k, 2) = wa1[i - 1] * ci2 - wa1[i] * cr2;
+        ch_ref(i - 1, k, 3) = wa2[i - 1] * cr3 + wa2[i] * ci3;
+        ch_ref(i, k, 3) = wa2[i - 1] * ci3 - wa2[i] * cr3;
+        ch_ref(i - 1, k, 4) = wa3[i - 1] * cr4 + wa3[i] * ci4;
+        ch_ref(i, k, 4) = wa3[i - 1] * ci4 - wa3[i] * cr4;
+      }
+    }
+  }
+} /* passf4 */
+
+#undef ch_ref
+#undef cc_ref
+
+static void radb2(integer ido, integer l1, const real *cc, real *ch, const real *wa1)
+{
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ti2, tr2;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 3;
+  cc -= cc_offset;
+  --wa1;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(ido, 2, k);
+    ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(ido, 2, k);
+  }
+  if (ido < 2) return;
+  else if (ido != 2) {
+    idp2 = ido + 2;
+    for (k = 1; k <= l1; ++k) {
+      for (i = 3; i <= ido; i += 2) {
+        ic = idp2 - i;
+        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(ic - 1, 2, 
+                                                           k);
+        tr2 = cc_ref(i - 1, 1, k) - cc_ref(ic - 1, 2, k);
+        ch_ref(i, k, 1) = cc_ref(i, 1, k) - cc_ref(ic, 2, k);
+        ti2 = cc_ref(i, 1, k) + cc_ref(ic, 2, k);
+        ch_ref(i - 1, k, 2) = wa1[i - 2] * tr2 - wa1[i - 1] * ti2;
+        ch_ref(i, k, 2) = wa1[i - 2] * ti2 + wa1[i - 1] * tr2;
+      }
+    }
+    if (ido % 2 == 1) return;
+  }
+  for (k = 1; k <= l1; ++k) {
+    ch_ref(ido, k, 1) = cc_ref(ido, 1, k) + cc_ref(ido, 1, k);
+    ch_ref(ido, k, 2) = -(cc_ref(1, 2, k) + cc_ref(1, 2, k));
+  }
+} /* radb2 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radb3(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1, const real *wa2)
+{
+  /* Initialized data */
+
+  static const real taur = -.5f;
+  static const real taui = .866025403784439f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + (ido << 2);
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    tr2 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k);
+    cr2 = cc_ref(1, 1, k) + taur * tr2;
+    ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2;
+    ci3 = taui * (cc_ref(1, 3, k) + cc_ref(1, 3, k));
+    ch_ref(1, k, 2) = cr2 - ci3;
+    ch_ref(1, k, 3) = cr2 + ci3;
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2;
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      tr2 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k);
+      cr2 = cc_ref(i - 1, 1, k) + taur * tr2;
+      ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2;
+      ti2 = cc_ref(i, 3, k) - cc_ref(ic, 2, k);
+      ci2 = cc_ref(i, 1, k) + taur * ti2;
+      ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2;
+      cr3 = taui * (cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k));
+      ci3 = taui * (cc_ref(i, 3, k) + cc_ref(ic, 2, k));
+      dr2 = cr2 - ci3;
+      dr3 = cr2 + ci3;
+      di2 = ci2 + cr3;
+      di3 = ci2 - cr3;
+      ch_ref(i - 1, k, 2) = wa1[i - 2] * dr2 - wa1[i - 1] * di2;
+      ch_ref(i, k, 2) = wa1[i - 2] * di2 + wa1[i - 1] * dr2;
+      ch_ref(i - 1, k, 3) = wa2[i - 2] * dr3 - wa2[i - 1] * di3;
+      ch_ref(i, k, 3) = wa2[i - 2] * di3 + wa2[i - 1] * dr3;
+    }
+  }
+} /* radb3 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radb4(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1, const real *wa2, const real *wa3)
+{
+  /* Initialized data */
+
+  static const real sqrt2 = 1.414213562373095f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 5;
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    tr1 = cc_ref(1, 1, k) - cc_ref(ido, 4, k);
+    tr2 = cc_ref(1, 1, k) + cc_ref(ido, 4, k);
+    tr3 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k);
+    tr4 = cc_ref(1, 3, k) + cc_ref(1, 3, k);
+    ch_ref(1, k, 1) = tr2 + tr3;
+    ch_ref(1, k, 2) = tr1 - tr4;
+    ch_ref(1, k, 3) = tr2 - tr3;
+    ch_ref(1, k, 4) = tr1 + tr4;
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    idp2 = ido + 2;
+    for (k = 1; k <= l1; ++k) {
+      for (i = 3; i <= ido; i += 2) {
+        ic = idp2 - i;
+        ti1 = cc_ref(i, 1, k) + cc_ref(ic, 4, k);
+        ti2 = cc_ref(i, 1, k) - cc_ref(ic, 4, k);
+        ti3 = cc_ref(i, 3, k) - cc_ref(ic, 2, k);
+        tr4 = cc_ref(i, 3, k) + cc_ref(ic, 2, k);
+        tr1 = cc_ref(i - 1, 1, k) - cc_ref(ic - 1, 4, k);
+        tr2 = cc_ref(i - 1, 1, k) + cc_ref(ic - 1, 4, k);
+        ti4 = cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k);
+        tr3 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k);
+        ch_ref(i - 1, k, 1) = tr2 + tr3;
+        cr3 = tr2 - tr3;
+        ch_ref(i, k, 1) = ti2 + ti3;
+        ci3 = ti2 - ti3;
+        cr2 = tr1 - tr4;
+        cr4 = tr1 + tr4;
+        ci2 = ti1 + ti4;
+        ci4 = ti1 - ti4;
+        ch_ref(i - 1, k, 2) = wa1[i - 2] * cr2 - wa1[i - 1] * ci2;
+        ch_ref(i, k, 2) = wa1[i - 2] * ci2 + wa1[i - 1] * cr2;
+        ch_ref(i - 1, k, 3) = wa2[i - 2] * cr3 - wa2[i - 1] * ci3;
+        ch_ref(i, k, 3) = wa2[i - 2] * ci3 + wa2[i - 1] * cr3;
+        ch_ref(i - 1, k, 4) = wa3[i - 2] * cr4 - wa3[i - 1] * ci4;
+        ch_ref(i, k, 4) = wa3[i - 2] * ci4 + wa3[i - 1] * cr4;
+      }
+    }
+    if (ido % 2 == 1) return;
+  }
+  for (k = 1; k <= l1; ++k) {
+    ti1 = cc_ref(1, 2, k) + cc_ref(1, 4, k);
+    ti2 = cc_ref(1, 4, k) - cc_ref(1, 2, k);
+    tr1 = cc_ref(ido, 1, k) - cc_ref(ido, 3, k);
+    tr2 = cc_ref(ido, 1, k) + cc_ref(ido, 3, k);
+    ch_ref(ido, k, 1) = tr2 + tr2;
+    ch_ref(ido, k, 2) = sqrt2 * (tr1 - ti1);
+    ch_ref(ido, k, 3) = ti2 + ti2;
+    ch_ref(ido, k, 4) = -sqrt2 * (tr1 + ti1);
+  }
+} /* radb4 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radb5(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1, const real *wa2, const real *wa3, const real *wa4)
+{
+  /* Initialized data */
+
+  static const real tr11 = .309016994374947f;
+  static const real ti11 = .951056516295154f;
+  static const real tr12 = -.809016994374947f;
+  static const real ti12 = .587785252292473f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 6;
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+  --wa4;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    ti5 = cc_ref(1, 3, k) + cc_ref(1, 3, k);
+    ti4 = cc_ref(1, 5, k) + cc_ref(1, 5, k);
+    tr2 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k);
+    tr3 = cc_ref(ido, 4, k) + cc_ref(ido, 4, k);
+    ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2 + tr3;
+    cr2 = cc_ref(1, 1, k) + tr11 * tr2 + tr12 * tr3;
+    cr3 = cc_ref(1, 1, k) + tr12 * tr2 + tr11 * tr3;
+    ci5 = ti11 * ti5 + ti12 * ti4;
+    ci4 = ti12 * ti5 - ti11 * ti4;
+    ch_ref(1, k, 2) = cr2 - ci5;
+    ch_ref(1, k, 3) = cr3 - ci4;
+    ch_ref(1, k, 4) = cr3 + ci4;
+    ch_ref(1, k, 5) = cr2 + ci5;
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2;
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      ti5 = cc_ref(i, 3, k) + cc_ref(ic, 2, k);
+      ti2 = cc_ref(i, 3, k) - cc_ref(ic, 2, k);
+      ti4 = cc_ref(i, 5, k) + cc_ref(ic, 4, k);
+      ti3 = cc_ref(i, 5, k) - cc_ref(ic, 4, k);
+      tr5 = cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k);
+      tr2 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k);
+      tr4 = cc_ref(i - 1, 5, k) - cc_ref(ic - 1, 4, k);
+      tr3 = cc_ref(i - 1, 5, k) + cc_ref(ic - 1, 4, k);
+      ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2 + tr3;
+      ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2 + ti3;
+      cr2 = cc_ref(i - 1, 1, k) + tr11 * tr2 + tr12 * tr3;
+      ci2 = cc_ref(i, 1, k) + tr11 * ti2 + tr12 * ti3;
+      cr3 = cc_ref(i - 1, 1, k) + tr12 * tr2 + tr11 * tr3;
+      ci3 = cc_ref(i, 1, k) + tr12 * ti2 + tr11 * ti3;
+      cr5 = ti11 * tr5 + ti12 * tr4;
+      ci5 = ti11 * ti5 + ti12 * ti4;
+      cr4 = ti12 * tr5 - ti11 * tr4;
+      ci4 = ti12 * ti5 - ti11 * ti4;
+      dr3 = cr3 - ci4;
+      dr4 = cr3 + ci4;
+      di3 = ci3 + cr4;
+      di4 = ci3 - cr4;
+      dr5 = cr2 + ci5;
+      dr2 = cr2 - ci5;
+      di5 = ci2 - cr5;
+      di2 = ci2 + cr5;
+      ch_ref(i - 1, k, 2) = wa1[i - 2] * dr2 - wa1[i - 1] * di2;
+      ch_ref(i, k, 2) = wa1[i - 2] * di2 + wa1[i - 1] * dr2;
+      ch_ref(i - 1, k, 3) = wa2[i - 2] * dr3 - wa2[i - 1] * di3;
+      ch_ref(i, k, 3) = wa2[i - 2] * di3 + wa2[i - 1] * dr3;
+      ch_ref(i - 1, k, 4) = wa3[i - 2] * dr4 - wa3[i - 1] * di4;
+      ch_ref(i, k, 4) = wa3[i - 2] * di4 + wa3[i - 1] * dr4;
+      ch_ref(i - 1, k, 5) = wa4[i - 2] * dr5 - wa4[i - 1] * di5;
+      ch_ref(i, k, 5) = wa4[i - 2] * di5 + wa4[i - 1] * dr5;
+    }
+  }
+} /* radb5 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radbg(integer ido, integer ip, integer l1, integer idl1, 
+                  const real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa)
+{
+  /* System generated locals */
+  integer ch_offset, cc_offset,
+    c1_offset, c2_offset, ch2_offset;
+
+  /* Local variables */
+  integer i, j, k, l, j2, ic, jc, lc, ik, is;
+  real dc2, ai1, ai2, ar1, ar2, ds2;
+  integer nbd;
+  real dcp, arg, dsp, ar1h, ar2h;
+  integer idp2, ipp2, idij, ipph;
+
+
+#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1]
+#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1]
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  c1_offset = 1 + ido * (1 + l1);
+  c1 -= c1_offset;
+  cc_offset = 1 + ido * (1 + ip);
+  cc -= cc_offset;
+  ch2_offset = 1 + idl1;
+  ch2 -= ch2_offset;
+  c2_offset = 1 + idl1;
+  c2 -= c2_offset;
+  --wa;
+
+  /* Function Body */
+  arg = (2*M_PI) / (real) (ip);
+  dcp = cos(arg);
+  dsp = sin(arg);
+  idp2 = ido + 2;
+  nbd = (ido - 1) / 2;
+  ipp2 = ip + 2;
+  ipph = (ip + 1) / 2;
+  if (ido >= l1) {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 1; i <= ido; ++i) {
+        ch_ref(i, k, 1) = cc_ref(i, 1, k);
+      }
+    }
+  } else {
+    for (i = 1; i <= ido; ++i) {
+      for (k = 1; k <= l1; ++k) {
+        ch_ref(i, k, 1) = cc_ref(i, 1, k);
+      }
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    jc = ipp2 - j;
+    j2 = j + j;
+    for (k = 1; k <= l1; ++k) {
+      ch_ref(1, k, j) = cc_ref(ido, j2 - 2, k) + cc_ref(ido, j2 - 2, k);
+      ch_ref(1, k, jc) = cc_ref(1, j2 - 1, k) + cc_ref(1, j2 - 1, k);
+    }
+  }
+  if (ido != 1) {
+    if (nbd >= l1) {
+      for (j = 2; j <= ipph; ++j) {
+        jc = ipp2 - j;
+        for (k = 1; k <= l1; ++k) {
+          for (i = 3; i <= ido; i += 2) {
+            ic = idp2 - i;
+            ch_ref(i - 1, k, j) = cc_ref(i - 1, (j << 1) - 1, k) + cc_ref(ic - 1, (j << 1) - 2, k);
+            ch_ref(i - 1, k, jc) = cc_ref(i - 1, (j << 1) - 1, k) - cc_ref(ic - 1, (j << 1) - 2, k);
+            ch_ref(i, k, j) = cc_ref(i, (j << 1) - 1, k) - cc_ref(ic, (j << 1) - 2, k);
+            ch_ref(i, k, jc) = cc_ref(i, (j << 1) - 1, k) + cc_ref(ic, (j << 1) - 2, k);
+          }
+        }
+      }
+    } else {
+      for (j = 2; j <= ipph; ++j) {
+        jc = ipp2 - j;
+        for (i = 3; i <= ido; i += 2) {
+          ic = idp2 - i;
+          for (k = 1; k <= l1; ++k) {
+            ch_ref(i - 1, k, j) = cc_ref(i - 1, (j << 1) - 1, k) + cc_ref(ic - 1, (j << 1) - 2, k);
+            ch_ref(i - 1, k, jc) = cc_ref(i - 1, (j << 1) - 1, k) - cc_ref(ic - 1, (j << 1) - 2, k);
+            ch_ref(i, k, j) = cc_ref(i, (j << 1) - 1, k) - cc_ref(ic, (j << 1) - 2, k);
+            ch_ref(i, k, jc) = cc_ref(i, (j << 1) - 1, k) + cc_ref(ic, (j << 1) - 2, k);
+          }
+        }
+      }
+    }
+  }
+  ar1 = 1.f;
+  ai1 = 0.f;
+  for (l = 2; l <= ipph; ++l) {
+    lc = ipp2 - l;
+    ar1h = dcp * ar1 - dsp * ai1;
+    ai1 = dcp * ai1 + dsp * ar1;
+    ar1 = ar1h;
+    for (ik = 1; ik <= idl1; ++ik) {
+      c2_ref(ik, l) = ch2_ref(ik, 1) + ar1 * ch2_ref(ik, 2);
+      c2_ref(ik, lc) = ai1 * ch2_ref(ik, ip);
+    }
+    dc2 = ar1;
+    ds2 = ai1;
+    ar2 = ar1;
+    ai2 = ai1;
+    for (j = 3; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      ar2h = dc2 * ar2 - ds2 * ai2;
+      ai2 = dc2 * ai2 + ds2 * ar2;
+      ar2 = ar2h;
+      for (ik = 1; ik <= idl1; ++ik) {
+        c2_ref(ik, l) = c2_ref(ik, l) + ar2 * ch2_ref(ik, j);
+        c2_ref(ik, lc) = c2_ref(ik, lc) + ai2 * ch2_ref(ik, jc);
+      }
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    for (ik = 1; ik <= idl1; ++ik) {
+      ch2_ref(ik, 1) = ch2_ref(ik, 1) + ch2_ref(ik, j);
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    jc = ipp2 - j;
+    for (k = 1; k <= l1; ++k) {
+      ch_ref(1, k, j) = c1_ref(1, k, j) - c1_ref(1, k, jc);
+      ch_ref(1, k, jc) = c1_ref(1, k, j) + c1_ref(1, k, jc);
+    }
+  }
+  if (ido != 1) {
+    if (nbd >= l1) {
+      for (j = 2; j <= ipph; ++j) {
+        jc = ipp2 - j;
+        for (k = 1; k <= l1; ++k) {
+          for (i = 3; i <= ido; i += 2) {
+            ch_ref(i - 1, k, j) = c1_ref(i - 1, k, j) - c1_ref(i, k, jc);
+            ch_ref(i - 1, k, jc) = c1_ref(i - 1, k, j) + c1_ref(i, k, jc);
+            ch_ref(i, k, j) = c1_ref(i, k, j) + c1_ref(i - 1, k, jc);
+            ch_ref(i, k, jc) = c1_ref(i, k, j) - c1_ref(i - 1, k, jc);
+          }
+        }
+      }
+    } else {
+      for (j = 2; j <= ipph; ++j) {
+        jc = ipp2 - j;
+        for (i = 3; i <= ido; i += 2) {
+          for (k = 1; k <= l1; ++k) {
+            ch_ref(i - 1, k, j) = c1_ref(i - 1, k, j) - c1_ref(i, k, jc);
+            ch_ref(i - 1, k, jc) = c1_ref(i - 1, k, j) + c1_ref(i, k, jc);
+            ch_ref(i, k, j) = c1_ref(i, k, j) + c1_ref(i - 1, k, jc);
+            ch_ref(i, k, jc) = c1_ref(i, k, j) - c1_ref(i - 1, k, jc);
+          }
+        }
+      }
+    }
+  }
+  if (ido == 1) {
+    return;
+  }
+  for (ik = 1; ik <= idl1; ++ik) {
+    c2_ref(ik, 1) = ch2_ref(ik, 1);
+  }
+  for (j = 2; j <= ip; ++j) {
+    for (k = 1; k <= l1; ++k) {
+      c1_ref(1, k, j) = ch_ref(1, k, j);
+    }
+  }
+  if (nbd <= l1) {
+    is = -(ido);
+    for (j = 2; j <= ip; ++j) {
+      is += ido;
+      idij = is;
+      for (i = 3; i <= ido; i += 2) {
+        idij += 2;
+        for (k = 1; k <= l1; ++k) {
+          c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) 
+            - wa[idij] * ch_ref(i, k, j);
+          c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + wa[idij] * ch_ref(i - 1, k, j);
+        }
+      }
+    }
+  } else {
+    is = -(ido);
+    for (j = 2; j <= ip; ++j) {
+      is += ido;
+      for (k = 1; k <= l1; ++k) {
+        idij = is;
+        for (i = 3; i <= ido; i += 2) {
+          idij += 2;
+          c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) 
+            - wa[idij] * ch_ref(i, k, j);
+          c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + wa[idij] * ch_ref(i - 1, k, j);
+        }
+      }
+    }
+  }
+} /* radbg */
+
+#undef ch2_ref
+#undef ch_ref
+#undef cc_ref
+#undef c2_ref
+#undef c1_ref
+
+
+static void radf2(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1)
+{
+  /* System generated locals */
+  integer ch_offset, cc_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ti2, tr2;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*2 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * 3;
+  ch -= ch_offset;
+  cc_offset = 1 + ido * (1 + l1);
+  cc -= cc_offset;
+  --wa1;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    ch_ref(1, 1, k) = cc_ref(1, k, 1) + cc_ref(1, k, 2);
+    ch_ref(ido, 2, k) = cc_ref(1, k, 1) - cc_ref(1, k, 2);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    idp2 = ido + 2;
+    for (k = 1; k <= l1; ++k) {
+      for (i = 3; i <= ido; i += 2) {
+        ic = idp2 - i;
+        tr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * 
+          cc_ref(i, k, 2);
+        ti2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
+                                                                 i - 1, k, 2);
+        ch_ref(i, 1, k) = cc_ref(i, k, 1) + ti2;
+        ch_ref(ic, 2, k) = ti2 - cc_ref(i, k, 1);
+        ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + tr2;
+        ch_ref(ic - 1, 2, k) = cc_ref(i - 1, k, 1) - tr2;
+      }
+    }
+    if (ido % 2 == 1) {
+      return;
+    }
+  }
+  for (k = 1; k <= l1; ++k) {
+    ch_ref(1, 2, k) = -cc_ref(ido, k, 2);
+    ch_ref(ido, 1, k) = cc_ref(ido, k, 1);
+  }
+} /* radf2 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radf3(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1, const real *wa2)
+{
+  static const real taur = -.5f;
+  static const real taui = .866025403784439f;
+
+  /* System generated locals */
+  integer ch_offset, cc_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*3 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + (ido << 2);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * (1 + l1);
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    cr2 = cc_ref(1, k, 2) + cc_ref(1, k, 3);
+    ch_ref(1, 1, k) = cc_ref(1, k, 1) + cr2;
+    ch_ref(1, 3, k) = taui * (cc_ref(1, k, 3) - cc_ref(1, k, 2));
+    ch_ref(ido, 2, k) = cc_ref(1, k, 1) + taur * cr2;
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2;
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * 
+        cc_ref(i, k, 2);
+      di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
+                                                               i - 1, k, 2);
+      dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * 
+        cc_ref(i, k, 3);
+      di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(
+                                                               i - 1, k, 3);
+      cr2 = dr2 + dr3;
+      ci2 = di2 + di3;
+      ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + cr2;
+      ch_ref(i, 1, k) = cc_ref(i, k, 1) + ci2;
+      tr2 = cc_ref(i - 1, k, 1) + taur * cr2;
+      ti2 = cc_ref(i, k, 1) + taur * ci2;
+      tr3 = taui * (di2 - di3);
+      ti3 = taui * (dr3 - dr2);
+      ch_ref(i - 1, 3, k) = tr2 + tr3;
+      ch_ref(ic - 1, 2, k) = tr2 - tr3;
+      ch_ref(i, 3, k) = ti2 + ti3;
+      ch_ref(ic, 2, k) = ti3 - ti2;
+    }
+  }
+} /* radf3 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radf4(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1, const real *wa2, const real *wa3)
+{
+  /* Initialized data */
+
+  static const real hsqt2 = .7071067811865475f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*4 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * 5;
+  ch -= ch_offset;
+  cc_offset = 1 + ido * (1 + l1);
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    tr1 = cc_ref(1, k, 2) + cc_ref(1, k, 4);
+    tr2 = cc_ref(1, k, 1) + cc_ref(1, k, 3);
+    ch_ref(1, 1, k) = tr1 + tr2;
+    ch_ref(ido, 4, k) = tr2 - tr1;
+    ch_ref(ido, 2, k) = cc_ref(1, k, 1) - cc_ref(1, k, 3);
+    ch_ref(1, 3, k) = cc_ref(1, k, 4) - cc_ref(1, k, 2);
+  }  
+  if (ido < 2) return;
+  if (ido != 2) {
+    idp2 = ido + 2;
+    for (k = 1; k <= l1; ++k) {
+      for (i = 3; i <= ido; i += 2) {
+        ic = idp2 - i;
+        cr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * 
+          cc_ref(i, k, 2);
+        ci2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
+                                                                 i - 1, k, 2);
+        cr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * 
+          cc_ref(i, k, 3);
+        ci3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(
+                                                                 i - 1, k, 3);
+        cr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * 
+          cc_ref(i, k, 4);
+        ci4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(
+                                                                 i - 1, k, 4);
+        tr1 = cr2 + cr4;
+        tr4 = cr4 - cr2;
+        ti1 = ci2 + ci4;
+        ti4 = ci2 - ci4;
+        ti2 = cc_ref(i, k, 1) + ci3;
+        ti3 = cc_ref(i, k, 1) - ci3;
+        tr2 = cc_ref(i - 1, k, 1) + cr3;
+        tr3 = cc_ref(i - 1, k, 1) - cr3;
+        ch_ref(i - 1, 1, k) = tr1 + tr2;
+        ch_ref(ic - 1, 4, k) = tr2 - tr1;
+        ch_ref(i, 1, k) = ti1 + ti2;
+        ch_ref(ic, 4, k) = ti1 - ti2;
+        ch_ref(i - 1, 3, k) = ti4 + tr3;
+        ch_ref(ic - 1, 2, k) = tr3 - ti4;
+        ch_ref(i, 3, k) = tr4 + ti3;
+        ch_ref(ic, 2, k) = tr4 - ti3;
+      }
+    }
+    if (ido % 2 == 1) {
+      return;
+    }
+  }
+  for (k = 1; k <= l1; ++k) {
+    ti1 = -hsqt2 * (cc_ref(ido, k, 2) + cc_ref(ido, k, 4));
+    tr1 = hsqt2 * (cc_ref(ido, k, 2) - cc_ref(ido, k, 4));
+    ch_ref(ido, 1, k) = tr1 + cc_ref(ido, k, 1);
+    ch_ref(ido, 3, k) = cc_ref(ido, k, 1) - tr1;
+    ch_ref(1, 2, k) = ti1 - cc_ref(ido, k, 3);
+    ch_ref(1, 4, k) = ti1 + cc_ref(ido, k, 3);
+  }
+} /* radf4 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radf5(integer ido, integer l1, const real *cc, real *ch, 
+                  const real *wa1, const real *wa2, const real *wa3, const real *wa4)
+{
+  /* Initialized data */
+
+  static const real tr11 = .309016994374947f;
+  static const real ti11 = .951056516295154f;
+  static const real tr12 = -.809016994374947f;
+  static const real ti12 = .587785252292473f;
+
+  /* System generated locals */
+  integer cc_offset, ch_offset;
+
+  /* Local variables */
+  integer i, k, ic;
+  real ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
+    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+  integer idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * 6;
+  ch -= ch_offset;
+  cc_offset = 1 + ido * (1 + l1);
+  cc -= cc_offset;
+  --wa1;
+  --wa2;
+  --wa3;
+  --wa4;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    cr2 = cc_ref(1, k, 5) + cc_ref(1, k, 2);
+    ci5 = cc_ref(1, k, 5) - cc_ref(1, k, 2);
+    cr3 = cc_ref(1, k, 4) + cc_ref(1, k, 3);
+    ci4 = cc_ref(1, k, 4) - cc_ref(1, k, 3);
+    ch_ref(1, 1, k) = cc_ref(1, k, 1) + cr2 + cr3;
+    ch_ref(ido, 2, k) = cc_ref(1, k, 1) + tr11 * cr2 + tr12 * cr3;
+    ch_ref(1, 3, k) = ti11 * ci5 + ti12 * ci4;
+    ch_ref(ido, 4, k) = cc_ref(1, k, 1) + tr12 * cr2 + tr11 * cr3;
+    ch_ref(1, 5, k) = ti12 * ci5 - ti11 * ci4;
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2;
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * cc_ref(i, k, 2);
+      di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(i - 1, k, 2);
+      dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * cc_ref(i, k, 3);
+      di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(i - 1, k, 3);
+      dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * cc_ref(i, k, 4);
+      di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(i - 1, k, 4);
+      dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * cc_ref(i, k, 5);
+      di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(i - 1, k, 5);
+      cr2 = dr2 + dr5;
+      ci5 = dr5 - dr2;
+      cr5 = di2 - di5;
+      ci2 = di2 + di5;
+      cr3 = dr3 + dr4;
+      ci4 = dr4 - dr3;
+      cr4 = di3 - di4;
+      ci3 = di3 + di4;
+      ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + cr2 + cr3;
+      ch_ref(i, 1, k) = cc_ref(i, k, 1) + ci2 + ci3;
+      tr2 = cc_ref(i - 1, k, 1) + tr11 * cr2 + tr12 * cr3;
+      ti2 = cc_ref(i, k, 1) + tr11 * ci2 + tr12 * ci3;
+      tr3 = cc_ref(i - 1, k, 1) + tr12 * cr2 + tr11 * cr3;
+      ti3 = cc_ref(i, k, 1) + tr12 * ci2 + tr11 * ci3;
+      tr5 = ti11 * cr5 + ti12 * cr4;
+      ti5 = ti11 * ci5 + ti12 * ci4;
+      tr4 = ti12 * cr5 - ti11 * cr4;
+      ti4 = ti12 * ci5 - ti11 * ci4;
+      ch_ref(i - 1, 3, k) = tr2 + tr5;
+      ch_ref(ic - 1, 2, k) = tr2 - tr5;
+      ch_ref(i, 3, k) = ti2 + ti5;
+      ch_ref(ic, 2, k) = ti5 - ti2;
+      ch_ref(i - 1, 5, k) = tr3 + tr4;
+      ch_ref(ic - 1, 4, k) = tr3 - tr4;
+      ch_ref(i, 5, k) = ti3 + ti4;
+      ch_ref(ic, 4, k) = ti4 - ti3;
+    }
+  }
+} /* radf5 */
+
+#undef ch_ref
+#undef cc_ref
+
+
+static void radfg(integer ido, integer ip, integer l1, integer idl1, 
+                  real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa)
+{
+  /* System generated locals */
+  integer ch_offset, cc_offset,
+    c1_offset, c2_offset, ch2_offset;
+
+  /* Local variables */
+  integer i, j, k, l, j2, ic, jc, lc, ik, is;
+  real dc2, ai1, ai2, ar1, ar2, ds2;
+  integer nbd;
+  real dcp, arg, dsp, ar1h, ar2h;
+  integer idp2, ipp2, idij, ipph;
+
+
+#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1]
+#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1]
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  c1_offset = 1 + ido * (1 + l1);
+  c1 -= c1_offset;
+  cc_offset = 1 + ido * (1 + ip);
+  cc -= cc_offset;
+  ch2_offset = 1 + idl1;
+  ch2 -= ch2_offset;
+  c2_offset = 1 + idl1;
+  c2 -= c2_offset;
+  --wa;
+
+  /* Function Body */
+  arg = (2*M_PI) / (real) (ip);
+  dcp = cos(arg);
+  dsp = sin(arg);
+  ipph = (ip + 1) / 2;
+  ipp2 = ip + 2;
+  idp2 = ido + 2;
+  nbd = (ido - 1) / 2;
+  if (ido == 1) {
+    for (ik = 1; ik <= idl1; ++ik) {
+      c2_ref(ik, 1) = ch2_ref(ik, 1);
+    }
+  } else {
+    for (ik = 1; ik <= idl1; ++ik) {
+      ch2_ref(ik, 1) = c2_ref(ik, 1);
+    }
+    for (j = 2; j <= ip; ++j) {
+      for (k = 1; k <= l1; ++k) {
+        ch_ref(1, k, j) = c1_ref(1, k, j);
+      }
+    }
+    if (nbd <= l1) {
+      is = -(ido);
+      for (j = 2; j <= ip; ++j) {
+        is += ido;
+        idij = is;
+        for (i = 3; i <= ido; i += 2) {
+          idij += 2;
+          for (k = 1; k <= l1; ++k) {
+            ch_ref(i - 1, k, j) = wa[idij - 1] * c1_ref(i - 1, k, j)
+              + wa[idij] * c1_ref(i, k, j);
+            ch_ref(i, k, j) = wa[idij - 1] * c1_ref(i, k, j) - wa[
+                                                                  idij] * c1_ref(i - 1, k, j);
+          }
+        }
+      }
+    } else {
+      is = -(ido);
+      for (j = 2; j <= ip; ++j) {
+        is += ido;
+        for (k = 1; k <= l1; ++k) {
+          idij = is;
+          for (i = 3; i <= ido; i += 2) {
+            idij += 2;
+            ch_ref(i - 1, k, j) = wa[idij - 1] * c1_ref(i - 1, k, j) 
+              + wa[idij] * c1_ref(i, k, j);
+            ch_ref(i, k, j) = wa[idij - 1] * c1_ref(i, k, j) - wa[
+                                                                  idij] * c1_ref(i - 1, k, j);
+          }
+        }
+      }
+    }
+    if (nbd >= l1) {
+      for (j = 2; j <= ipph; ++j) {
+        jc = ipp2 - j;
+        for (k = 1; k <= l1; ++k) {
+          for (i = 3; i <= ido; i += 2) {
+            c1_ref(i - 1, k, j) = ch_ref(i - 1, k, j) + ch_ref(i - 
+                                                               1, k, jc);
+            c1_ref(i - 1, k, jc) = ch_ref(i, k, j) - ch_ref(i, k, 
+                                                            jc);
+            c1_ref(i, k, j) = ch_ref(i, k, j) + ch_ref(i, k, jc);
+            c1_ref(i, k, jc) = ch_ref(i - 1, k, jc) - ch_ref(i - 1, 
+                                                             k, j);
+          }
+        }
+      }
+    } else {
+      for (j = 2; j <= ipph; ++j) {
+        jc = ipp2 - j;
+        for (i = 3; i <= ido; i += 2) {
+          for (k = 1; k <= l1; ++k) {
+            c1_ref(i - 1, k, j) = ch_ref(i - 1, k, j) + ch_ref(i - 
+                                                               1, k, jc);
+            c1_ref(i - 1, k, jc) = ch_ref(i, k, j) - ch_ref(i, k, 
+                                                            jc);
+            c1_ref(i, k, j) = ch_ref(i, k, j) + ch_ref(i, k, jc);
+            c1_ref(i, k, jc) = ch_ref(i - 1, k, jc) - ch_ref(i - 1, 
+                                                             k, j);
+          }
+        }
+      }
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    jc = ipp2 - j;
+    for (k = 1; k <= l1; ++k) {
+      c1_ref(1, k, j) = ch_ref(1, k, j) + ch_ref(1, k, jc);
+      c1_ref(1, k, jc) = ch_ref(1, k, jc) - ch_ref(1, k, j);
+    }
+  }
+
+  ar1 = 1.f;
+  ai1 = 0.f;
+  for (l = 2; l <= ipph; ++l) {
+    lc = ipp2 - l;
+    ar1h = dcp * ar1 - dsp * ai1;
+    ai1 = dcp * ai1 + dsp * ar1;
+    ar1 = ar1h;
+    for (ik = 1; ik <= idl1; ++ik) {
+      ch2_ref(ik, l) = c2_ref(ik, 1) + ar1 * c2_ref(ik, 2);
+      ch2_ref(ik, lc) = ai1 * c2_ref(ik, ip);
+    }
+    dc2 = ar1;
+    ds2 = ai1;
+    ar2 = ar1;
+    ai2 = ai1;
+    for (j = 3; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      ar2h = dc2 * ar2 - ds2 * ai2;
+      ai2 = dc2 * ai2 + ds2 * ar2;
+      ar2 = ar2h;
+      for (ik = 1; ik <= idl1; ++ik) {
+        ch2_ref(ik, l) = ch2_ref(ik, l) + ar2 * c2_ref(ik, j);
+        ch2_ref(ik, lc) = ch2_ref(ik, lc) + ai2 * c2_ref(ik, jc);
+      }
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    for (ik = 1; ik <= idl1; ++ik) {
+      ch2_ref(ik, 1) = ch2_ref(ik, 1) + c2_ref(ik, j);
+    }
+  }
+
+  if (ido >= l1) {
+    for (k = 1; k <= l1; ++k) {
+      for (i = 1; i <= ido; ++i) {
+        cc_ref(i, 1, k) = ch_ref(i, k, 1);
+      }
+    }
+  } else {
+    for (i = 1; i <= ido; ++i) {
+      for (k = 1; k <= l1; ++k) {
+        cc_ref(i, 1, k) = ch_ref(i, k, 1);
+      }
+    }
+  }
+  for (j = 2; j <= ipph; ++j) {
+    jc = ipp2 - j;
+    j2 = j + j;
+    for (k = 1; k <= l1; ++k) {
+      cc_ref(ido, j2 - 2, k) = ch_ref(1, k, j);
+      cc_ref(1, j2 - 1, k) = ch_ref(1, k, jc);
+    }
+  }
+  if (ido == 1) {
+    return;
+  }
+  if (nbd >= l1) {
+    for (j = 2; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      j2 = j + j;
+      for (k = 1; k <= l1; ++k) {
+        for (i = 3; i <= ido; i += 2) {
+          ic = idp2 - i;
+          cc_ref(i - 1, j2 - 1, k) = ch_ref(i - 1, k, j) + ch_ref(
+                                                                  i - 1, k, jc);
+          cc_ref(ic - 1, j2 - 2, k) = ch_ref(i - 1, k, j) - ch_ref(
+                                                                   i - 1, k, jc);
+          cc_ref(i, j2 - 1, k) = ch_ref(i, k, j) + ch_ref(i, k, 
+                                                          jc);
+          cc_ref(ic, j2 - 2, k) = ch_ref(i, k, jc) - ch_ref(i, k, j)
+            ;
+        }
+      }
+    }
+  } else {
+    for (j = 2; j <= ipph; ++j) {
+      jc = ipp2 - j;
+      j2 = j + j;
+      for (i = 3; i <= ido; i += 2) {
+        ic = idp2 - i;
+        for (k = 1; k <= l1; ++k) {
+          cc_ref(i - 1, j2 - 1, k) = ch_ref(i - 1, k, j) + ch_ref(
+                                                                  i - 1, k, jc);
+          cc_ref(ic - 1, j2 - 2, k) = ch_ref(i - 1, k, j) - ch_ref(
+                                                                   i - 1, k, jc);
+          cc_ref(i, j2 - 1, k) = ch_ref(i, k, j) + ch_ref(i, k, 
+                                                          jc);
+          cc_ref(ic, j2 - 2, k) = ch_ref(i, k, jc) - ch_ref(i, k, j)
+            ;
+        }
+      }
+    }
+  }
+} /* radfg */
+
+#undef ch2_ref
+#undef ch_ref
+#undef cc_ref
+#undef c2_ref
+#undef c1_ref
+
+
+static void cfftb1(integer n, real *c, real *ch, const real *wa, integer *ifac)
+{
+  integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, nac, ido, 
+    idl1, idot;
+
+  /* Function Body */
+  nf = ifac[1];
+  na = 0;
+  l1 = 1;
+  iw = 0;
+  for (k1 = 1; k1 <= nf; ++k1) {
+    ip = ifac[k1 + 1];
+    l2 = ip * l1;
+    ido = n / l2;
+    idot = ido + ido;
+    idl1 = idot * l1;
+    switch (ip) {
+      case 4:
+        ix2 = iw + idot;
+        ix3 = ix2 + idot;
+        passb4(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]);
+        na = 1 - na;
+        break;
+      case 2:
+        passb2(idot, l1, na?ch:c, na?c:ch, &wa[iw]);
+        na = 1 - na;
+        break;
+      case 3:
+        ix2 = iw + idot;
+        passb3(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]);
+        na = 1 - na;
+        break;
+      case 5:
+        ix2 = iw + idot;
+        ix3 = ix2 + idot;
+        ix4 = ix3 + idot;
+        passfb5(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], +1);
+        na = 1 - na;
+        break;
+      default:
+        if (na == 0) {
+          passfb(&nac, idot, ip, l1, idl1, c, c, c, ch, ch, &wa[iw], +1);
+        } else {
+          passfb(&nac, idot, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw], +1);
+        }
+        if (nac != 0) {
+          na = 1 - na;
+        }
+        break;
+    }
+    l1 = l2;
+    iw += (ip - 1) * idot;
+  }
+  if (na == 0) {
+    return;
+  }
+  for (i = 0; i < 2*n; ++i) {
+    c[i] = ch[i];
+  }
+} /* cfftb1 */
+
+void cfftb(integer n, real *c, real *wsave)
+{
+  integer iw1, iw2;
+
+  /* Parameter adjustments */
+  --wsave;
+  --c;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  iw1 = 2*n + 1;
+  iw2 = iw1 + 2*n;
+  cfftb1(n, &c[1], &wsave[1], &wsave[iw1], (int*)&wsave[iw2]);
+} /* cfftb */
+
+static void cfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac)
+{
+  /* Local variables */
+  integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, nac, ido, 
+    idl1, idot;
+
+  /* Function Body */
+  nf = ifac[1];
+  na = 0;
+  l1 = 1;
+  iw = 0;
+  for (k1 = 1; k1 <= nf; ++k1) {
+    ip = ifac[k1 + 1];
+    l2 = ip * l1;
+    ido = n / l2;
+    idot = ido + ido;
+    idl1 = idot * l1;
+    switch (ip) {
+      case 4:
+        ix2 = iw + idot;
+        ix3 = ix2 + idot;
+        passf4(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]);
+        na = 1 - na;
+        break;
+      case 2:
+        passf2(idot, l1, na?ch:c, na?c:ch, &wa[iw]);
+        na = 1 - na;
+        break;
+      case 3:
+        ix2 = iw + idot;
+        passf3(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]);
+        na = 1 - na;
+        break;
+      case 5:
+        ix2 = iw + idot;
+        ix3 = ix2 + idot;
+        ix4 = ix3 + idot;
+        passfb5(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], -1);
+        na = 1 - na;
+        break;
+      default:
+        if (na == 0) {
+          passfb(&nac, idot, ip, l1, idl1, c, c, c, ch, ch, &wa[iw], -1);
+        } else {
+          passfb(&nac, idot, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw], -1);
+        }
+        if (nac != 0) {
+          na = 1 - na;
+        }
+        break;
+    }
+    l1 = l2;
+    iw += (ip - 1)*idot;
+  }
+  if (na == 0) {
+    return;
+  }
+  for (i = 0; i < 2*n; ++i) {
+    c[i] = ch[i];
+  }
+} /* cfftf1 */
+
+void cfftf(integer n, real *c, real *wsave)
+{
+  integer iw1, iw2;
+
+  /* Parameter adjustments */
+  --wsave;
+  --c;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  iw1 = 2*n + 1;
+  iw2 = iw1 + 2*n;
+  cfftf1(n, &c[1], &wsave[1], &wsave[iw1], (int*)&wsave[iw2]);
+} /* cfftf */
+
+static int decompose(integer n, integer *ifac, integer ntryh[4]) {  
+  integer ntry=0, nl = n, nf = 0, nq, nr, i, j = 0;
+  do {
+    if (j < 4) {
+      ntry = ntryh[j];
+    } else {
+      ntry += 2;
+    }
+    ++j;
+  L104:
+    nq = nl / ntry;
+    nr = nl - ntry * nq;
+    if (nr != 0) continue;
+    ++nf;
+    ifac[nf + 2] = ntry;
+    nl = nq;
+    if (ntry == 2 && nf != 1) {
+      for (i = 2; i <= nf; ++i) {
+        integer ib = nf - i + 2;
+        ifac[ib + 2] = ifac[ib + 1];
+      }
+      ifac[3] = 2;
+    }
+    if (nl != 1) {
+      goto L104;
+    }
+  } while (nl != 1);
+  ifac[1] = n;
+  ifac[2] = nf;  
+  return nf;
+}
+
+static void cffti1(integer n, real *wa, integer *ifac)
+{
+  static integer ntryh[4] = { 3,4,2,5 };
+
+  /* Local variables */
+  integer i, j, i1, k1, l1, l2;
+  real fi;
+  integer ld, ii, nf, ip;
+  real arg;
+  integer ido, ipm;
+  real argh;
+  integer idot;
+  real argld;
+
+  /* Parameter adjustments */
+  --ifac;
+  --wa;
+
+  nf = decompose(n, ifac, ntryh);
+
+  argh = (2*M_PI) / (real) (n);
+  i = 2;
+  l1 = 1;
+  for (k1 = 1; k1 <= nf; ++k1) {
+    ip = ifac[k1 + 2];
+    ld = 0;
+    l2 = l1 * ip;
+    ido = n / l2;
+    idot = ido + ido + 2;
+    ipm = ip - 1;
+    for (j = 1; j <= ipm; ++j) {
+      i1 = i;
+      wa[i - 1] = 1.f;
+      wa[i] = 0.f;
+      ld += l1;
+      fi = 0.f;
+      argld = (real) ld * argh;
+      for (ii = 4; ii <= idot; ii += 2) {
+        i += 2;
+        fi += 1.f;
+        arg = fi * argld;
+        wa[i - 1] = cos(arg);
+        wa[i] = sin(arg);
+      }
+      if (ip > 5) {
+        wa[i1 - 1] = wa[i - 1];
+        wa[i1] = wa[i];
+      };
+    }
+    l1 = l2;
+  }
+} /* cffti1 */
+
+void cffti(integer n, real *wsave)
+{
+  integer iw1, iw2;
+  /* Parameter adjustments */
+  --wsave;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  iw1 = 2*n + 1;
+  iw2 = iw1 + 2*n;
+  cffti1(n, &wsave[iw1], (int*)&wsave[iw2]);
+  return;
+} /* cffti */
+
+static void rfftb1(integer n, real *c, real *ch, const real *wa, integer *ifac)
+{
+  /* Local variables */
+  integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
+
+  /* Function Body */
+  nf = ifac[1];
+  na = 0;
+  l1 = 1;
+  iw = 0;
+  for (k1 = 1; k1 <= nf; ++k1) {
+    ip = ifac[k1 + 1];
+    l2 = ip * l1;
+    ido = n / l2;
+    idl1 = ido * l1;
+    switch (ip) {
+      case 4:
+        ix2 = iw + ido;
+        ix3 = ix2 + ido;
+        radb4(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]);
+        na = 1 - na;
+        break;
+      case 2:
+        radb2(ido, l1, na?ch:c, na?c:ch, &wa[iw]);
+        na = 1 - na;
+        break;
+      case 3:
+        ix2 = iw + ido;
+        radb3(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]);
+        na = 1 - na;
+        break;
+      case 5:
+        ix2 = iw + ido;
+        ix3 = ix2 + ido;
+        ix4 = ix3 + ido;
+        radb5(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
+        na = 1 - na;
+        break;
+      default:
+        if (na == 0) {
+          radbg(ido, ip, l1, idl1, c, c, c, ch, ch, &wa[iw]);
+        } else {
+          radbg(ido, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw]);
+        }
+        if (ido == 1) {
+          na = 1 - na;
+        }
+        break;
+    }
+    l1 = l2;
+    iw += (ip - 1) * ido;
+  }
+  if (na == 0) {
+    return;
+  }
+  for (i = 0; i < n; ++i) {
+    c[i] = ch[i];
+  }
+} /* rfftb1 */
+
+static void rfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac)
+{
+  /* Local variables */
+  integer i, k1, l1, l2, na, kh, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
+
+  /* Function Body */
+  nf = ifac[1];
+  na = 1;
+  l2 = n;
+  iw = n-1;
+  for (k1 = 1; k1 <= nf; ++k1) {
+    kh = nf - k1;
+    ip = ifac[kh + 2];
+    l1 = l2 / ip;
+    ido = n / l2;
+    idl1 = ido * l1;
+    iw -= (ip - 1) * ido;
+    na = 1 - na;
+    switch (ip) {
+      case 4:
+        ix2 = iw + ido;
+        ix3 = ix2 + ido;
+        radf4(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3]);
+        break;
+      case 2:
+        radf2(ido, l1, na ? ch : c, na ? c : ch, &wa[iw]);
+        break;
+      case 3:        
+        ix2 = iw + ido;
+        radf3(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2]);
+        break;
+      case 5:
+        ix2 = iw + ido;
+        ix3 = ix2 + ido;
+        ix4 = ix3 + ido;
+        radf5(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
+        break;
+      default:
+        if (ido == 1) {
+          na = 1 - na;
+        }
+        if (na == 0) {
+          radfg(ido, ip, l1, idl1, c, c, c, ch, ch, &wa[iw]);
+          na = 1;
+        } else {
+          radfg(ido, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw]);
+          na = 0;
+        }
+        break;
+    }
+    l2 = l1;
+  }
+  if (na == 1) {
+    return;
+  }
+  for (i = 0; i < n; ++i) {
+    c[i] = ch[i];
+  }
+}
+
+void rfftb(integer n, real *r, real *wsave)
+{
+
+  /* Parameter adjustments */
+  --wsave;
+  --r;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  rfftb1(n, &r[1], &wsave[1], &wsave[n + 1], (int*)&wsave[(n << 1) + 1]);
+} /* rfftb */
+
+static void rffti1(integer n, real *wa, integer *ifac)
+{
+  static integer ntryh[4] = { 4,2,3,5 };
+
+  /* Local variables */
+  integer i, j, k1, l1, l2;
+  real fi;
+  integer ld, ii, nf, ip, is;
+  real arg;
+  integer ido, ipm;
+  integer nfm1;
+  real argh;
+  real argld;
+
+  /* Parameter adjustments */
+  --ifac;
+  --wa;
+
+  nf = decompose(n, ifac, ntryh);
+
+  argh = (2*M_PI) / (real) (n);
+  is = 0;
+  nfm1 = nf - 1;
+  l1 = 1;
+  if (nfm1 == 0) {
+    return;
+  }
+  for (k1 = 1; k1 <= nfm1; ++k1) {
+    ip = ifac[k1 + 2];
+    ld = 0;
+    l2 = l1 * ip;
+    ido = n / l2;
+    ipm = ip - 1;
+    for (j = 1; j <= ipm; ++j) {
+      ld += l1;
+      i = is;
+      argld = (real) ld * argh;
+      fi = 0.f;
+      for (ii = 3; ii <= ido; ii += 2) {
+        i += 2;
+        fi += 1.f;
+        arg = fi * argld;
+        wa[i - 1] = cos(arg);
+        wa[i] = sin(arg);
+      }
+      is += ido;
+    }
+    l1 = l2;
+  }
+} /* rffti1 */
+
+void rfftf(integer n, real *r, real *wsave)
+{
+
+  /* Parameter adjustments */
+  --wsave;
+  --r;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  rfftf1(n, &r[1], &wsave[1], &wsave[n + 1], (int*)&wsave[(n << 1) + 1]);
+} /* rfftf */
+
+void rffti(integer n, real *wsave)
+{
+  /* Parameter adjustments */
+  --wsave;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  rffti1(n, &wsave[n + 1], (int*)&wsave[(n << 1) + 1]);
+  return;
+} /* rffti */
+
+static void cosqb1(integer n, real *x, real *w, real *xh)
+{
+  /* Local variables */
+  integer i, k, kc, np2, ns2;
+  real xim1;
+  integer modn;
+
+  /* Parameter adjustments */
+  --xh;
+  --w;
+  --x;
+
+  /* Function Body */
+  ns2 = (n + 1) / 2;
+  np2 = n + 2;
+  for (i = 3; i <= n; i += 2) {
+    xim1 = x[i - 1] + x[i];
+    x[i] -= x[i - 1];
+    x[i - 1] = xim1;
+  }
+  x[1] += x[1];
+  modn = n % 2;
+  if (modn == 0) {
+    x[n] += x[n];
+  }
+  rfftb(n, &x[1], &xh[1]);
+  for (k = 2; k <= ns2; ++k) {
+    kc = np2 - k;
+    xh[k] = w[k - 1] * x[kc] + w[kc - 1] * x[k];
+    xh[kc] = w[k - 1] * x[k] - w[kc - 1] * x[kc];
+  }
+  if (modn == 0) {
+    x[ns2 + 1] = w[ns2] * (x[ns2 + 1] + x[ns2 + 1]);
+  }
+  for (k = 2; k <= ns2; ++k) {
+    kc = np2 - k;
+    x[k] = xh[k] + xh[kc];
+    x[kc] = xh[k] - xh[kc];
+  }
+  x[1] += x[1];
+} /* cosqb1 */
+
+void cosqb(integer n, real *x, real *wsave)
+{
+  static const real tsqrt2 = 2.82842712474619f;
+
+  /* Local variables */
+  real x1;
+
+  /* Parameter adjustments */
+  --wsave;
+  --x;
+
+  if (n < 2) {
+    x[1] *= 4.f;
+  } else if (n == 2) {
+    x1 = (x[1] + x[2]) * 4.f;
+    x[2] = tsqrt2 * (x[1] - x[2]);
+    x[1] = x1;
+  } else {
+    cosqb1(n, &x[1], &wsave[1], &wsave[n + 1]);
+  }
+} /* cosqb */
+
+static void cosqf1(integer n, real *x, real *w, real *xh)
+{
+  /* Local variables */
+  integer i, k, kc, np2, ns2;
+  real xim1;
+  integer modn;
+
+  /* Parameter adjustments */
+  --xh;
+  --w;
+  --x;
+
+  /* Function Body */
+  ns2 = (n + 1) / 2;
+  np2 = n + 2;
+  for (k = 2; k <= ns2; ++k) {
+    kc = np2 - k;
+    xh[k] = x[k] + x[kc];
+    xh[kc] = x[k] - x[kc];
+  }
+  modn = n % 2;
+  if (modn == 0) {
+    xh[ns2 + 1] = x[ns2 + 1] + x[ns2 + 1];
+  }
+  for (k = 2; k <= ns2; ++k) {
+    kc = np2 - k;
+    x[k] = w[k - 1] * xh[kc] + w[kc - 1] * xh[k];
+    x[kc] = w[k - 1] * xh[k] - w[kc - 1] * xh[kc];
+  }
+  if (modn == 0) {
+    x[ns2 + 1] = w[ns2] * xh[ns2 + 1];
+  }
+  rfftf(n, &x[1], &xh[1]);
+  for (i = 3; i <= n; i += 2) {
+    xim1 = x[i - 1] - x[i];
+    x[i] = x[i - 1] + x[i];
+    x[i - 1] = xim1;
+  }
+} /* cosqf1 */
+
+void cosqf(integer n, real *x, real *wsave)
+{
+  static const real sqrt2 = 1.4142135623731f;
+
+  /* Local variables */
+  real tsqx;
+
+  /* Parameter adjustments */
+  --wsave;
+  --x;
+
+  if (n == 2) {
+    tsqx = sqrt2 * x[2];
+    x[2] = x[1] - tsqx;
+    x[1] += tsqx;
+  } else if (n > 2) {
+    cosqf1(n, &x[1], &wsave[1], &wsave[n + 1]);
+  }
+} /* cosqf */
+
+void cosqi(integer n, real *wsave)
+{
+  /* Local variables */
+  integer k;
+  real fk, dt;
+
+  /* Parameter adjustments */
+  --wsave;
+
+  dt = M_PI/2 / (real) (n);
+  fk = 0.f;
+  for (k = 1; k <= n; ++k) {
+    fk += 1.f;
+    wsave[k] = cos(fk * dt);
+  }
+  rffti(n, &wsave[n + 1]);
+} /* cosqi */
+
+void cost(integer n, real *x, real *wsave)
+{
+  /* Local variables */
+  integer i, k;
+  real c1, t1, t2;
+  integer kc;
+  real xi;
+  integer nm1, np1;
+  real x1h;
+  integer ns2;
+  real tx2, x1p3, xim2;
+  integer modn;
+
+  /* Parameter adjustments */
+  --wsave;
+  --x;
+
+  /* Function Body */
+  nm1 = n - 1;
+  np1 = n + 1;
+  ns2 = n / 2;
+  if (n < 2) {
+  } else if (n == 2) {
+    x1h = x[1] + x[2];
+    x[2] = x[1] - x[2];
+    x[1] = x1h;
+  } else if (n == 3) {
+    x1p3 = x[1] + x[3];
+    tx2 = x[2] + x[2];
+    x[2] = x[1] - x[3];
+    x[1] = x1p3 + tx2;
+    x[3] = x1p3 - tx2;
+  } else {
+    c1 = x[1] - x[n];
+    x[1] += x[n];
+    for (k = 2; k <= ns2; ++k) {
+      kc = np1 - k;
+      t1 = x[k] + x[kc];
+      t2 = x[k] - x[kc];
+      c1 += wsave[kc] * t2;
+      t2 = wsave[k] * t2;
+      x[k] = t1 - t2;
+      x[kc] = t1 + t2;
+    }
+    modn = n % 2;
+    if (modn != 0) {
+      x[ns2 + 1] += x[ns2 + 1];
+    }
+    rfftf(nm1, &x[1], &wsave[n + 1]);
+    xim2 = x[2];
+    x[2] = c1;
+    for (i = 4; i <= n; i += 2) {
+      xi = x[i];
+      x[i] = x[i - 2] - x[i - 1];
+      x[i - 1] = xim2;
+      xim2 = xi;
+    }
+    if (modn != 0) {
+      x[n] = xim2;
+    }
+  }
+} /* cost */
+
+void costi(integer n, real *wsave)
+{
+  /* Initialized data */
+
+  /* Local variables */
+  integer k, kc;
+  real fk, dt;
+  integer nm1, np1, ns2;
+
+  /* Parameter adjustments */
+  --wsave;
+
+  /* Function Body */
+  if (n <= 3) {
+    return;
+  }
+  nm1 = n - 1;
+  np1 = n + 1;
+  ns2 = n / 2;
+  dt = M_PI / (real) nm1;
+  fk = 0.f;
+  for (k = 2; k <= ns2; ++k) {
+    kc = np1 - k;
+    fk += 1.f;
+    wsave[k] = sin(fk * dt) * 2.f;
+    wsave[kc] = cos(fk * dt) * 2.f;
+  }
+  rffti(nm1, &wsave[n + 1]);
+} /* costi */
+
+void sinqb(integer n, real *x, real *wsave)
+{
+  /* Local variables */
+  integer k, kc, ns2;
+  real xhold;
+
+  /* Parameter adjustments */
+  --wsave;
+  --x;
+
+  /* Function Body */
+  if (n <= 1) {
+    x[1] *= 4.f;
+    return;
+  }
+  ns2 = n / 2;
+  for (k = 2; k <= n; k += 2) {
+    x[k] = -x[k];
+  }
+  cosqb(n, &x[1], &wsave[1]);
+  for (k = 1; k <= ns2; ++k) {
+    kc = n - k;
+    xhold = x[k];
+    x[k] = x[kc + 1];
+    x[kc + 1] = xhold;
+  }
+} /* sinqb */
+
+void sinqf(integer n, real *x, real *wsave)
+{
+  /* Local variables */
+  integer k, kc, ns2;
+  real xhold;
+
+  /* Parameter adjustments */
+  --wsave;
+  --x;
+
+  /* Function Body */
+  if (n == 1) {
+    return;
+  }
+  ns2 = n / 2;
+  for (k = 1; k <= ns2; ++k) {
+    kc = n - k;
+    xhold = x[k];
+    x[k] = x[kc + 1];
+    x[kc + 1] = xhold;
+  }
+  cosqf(n, &x[1], &wsave[1]);
+  for (k = 2; k <= n; k += 2) {
+    x[k] = -x[k];
+  }
+} /* sinqf */
+
+void sinqi(integer n, real *wsave)
+{
+
+  /* Parameter adjustments */
+  --wsave;
+
+  /* Function Body */
+  cosqi(n, &wsave[1]);
+} /* sinqi */
+
+static void sint1(integer n, real *war, real *was, real *xh, real *
+                  x, integer *ifac)
+{
+  /* Initialized data */
+
+  static const real sqrt3 = 1.73205080756888f;
+
+  /* Local variables */
+  integer i, k;
+  real t1, t2;
+  integer kc, np1, ns2, modn;
+  real xhold;
+
+  /* Parameter adjustments */
+  --ifac;
+  --x;
+  --xh;
+  --was;
+  --war;
+
+  /* Function Body */
+  for (i = 1; i <= n; ++i) {
+    xh[i] = war[i];
+    war[i] = x[i];
+  }
+  
+  if (n < 2) {
+    xh[1] += xh[1];
+  } else if (n == 2) {
+    xhold = sqrt3 * (xh[1] + xh[2]);
+    xh[2] = sqrt3 * (xh[1] - xh[2]);
+    xh[1] = xhold;
+  } else {
+    np1 = n + 1;
+    ns2 = n / 2;
+    x[1] = 0.f;
+    for (k = 1; k <= ns2; ++k) {
+      kc = np1 - k;
+      t1 = xh[k] - xh[kc];
+      t2 = was[k] * (xh[k] + xh[kc]);
+      x[k + 1] = t1 + t2;
+      x[kc + 1] = t2 - t1;
+    }
+    modn = n % 2;
+    if (modn != 0) {
+      x[ns2 + 2] = xh[ns2 + 1] * 4.f;
+    }
+    rfftf1(np1, &x[1], &xh[1], &war[1], &ifac[1]);
+    xh[1] = x[1] * .5f;
+    for (i = 3; i <= n; i += 2) {
+      xh[i - 1] = -x[i];
+      xh[i] = xh[i - 2] + x[i - 1];
+    }
+    if (modn == 0) {
+      xh[n] = -x[n + 1];
+    }
+  }
+  for (i = 1; i <= n; ++i) {
+    x[i] = war[i];
+    war[i] = xh[i];
+  }
+} /* sint1 */
+
+void sinti(integer n, real *wsave)
+{
+  /* Local variables */
+  integer k;
+  real dt;
+  integer np1, ns2;
+
+  /* Parameter adjustments */
+  --wsave;
+
+  /* Function Body */
+  if (n <= 1) {
+    return;
+  }
+  ns2 = n / 2;
+  np1 = n + 1;
+  dt = M_PI / (real) np1;
+  for (k = 1; k <= ns2; ++k) {
+    wsave[k] = sin(k * dt) * 2.f;
+  }
+  rffti(np1, &wsave[ns2 + 1]);
+} /* sinti */
+
+void sint(integer n, real *x, real *wsave)
+{
+  integer np1, iw1, iw2, iw3;
+
+  /* Parameter adjustments */
+  --wsave;
+  --x;
+
+  /* Function Body */
+  np1 = n + 1;
+  iw1 = n / 2 + 1;
+  iw2 = iw1 + np1;
+  iw3 = iw2 + np1;
+  sint1(n, &x[1], &wsave[1], &wsave[iw1], &wsave[iw2], (int*)&wsave[iw3]);
+} /* sint */
+
+#ifdef TESTING_FFTPACK
+#include <stdio.h>
+
+int main(void)
+{
+  static integer nd[] = { 120,91,54,49,32,28,24,8,4,3,2 };
+
+  /* System generated locals */
+  real r1, r2, r3;
+  f77complex q1, q2, q3;
+
+  /* Local variables */
+  integer i, j, k, n;
+  real w[2000], x[200], y[200], cf, fn, dt;
+  f77complex cx[200], cy[200];
+  real xh[200];
+  integer nz, nm1, np1, ns2;
+  real arg, tfn;
+  real sum, arg1, arg2;
+  real sum1, sum2, dcfb;
+  integer modn;
+  real rftb, rftf;
+  real sqrt2;
+  real rftfb;
+  real costt, sintt, dcfftb, dcfftf, cosqfb, costfb;
+  real sinqfb;
+  real sintfb;
+  real cosqbt, cosqft, sinqbt, sinqft;
+
+
+
+  /*     * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+  /*                       VERSION 4  APRIL 1985 */
+
+  /*                         A TEST DRIVER FOR */
+  /*          A PACKAGE OF FORTRAN SUBPROGRAMS FOR THE FAST FOURIER */
+  /*           TRANSFORM OF PERIODIC AND OTHER SYMMETRIC SEQUENCES */
+
+  /*                              BY */
+
+  /*                       PAUL N SWARZTRAUBER */
+
+  /*       NATIONAL CENTER FOR ATMOSPHERIC RESEARCH  BOULDER,COLORADO 80307 */
+
+  /*        WHICH IS SPONSORED BY THE NATIONAL SCIENCE FOUNDATION */
+
+  /*     * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+
+  /*             THIS PROGRAM TESTS THE PACKAGE OF FAST FOURIER */
+  /*     TRANSFORMS FOR BOTH COMPLEX AND REAL PERIODIC SEQUENCES AND */
+  /*     CERTIAN OTHER SYMMETRIC SEQUENCES THAT ARE LISTED BELOW. */
+
+  /*     1.   RFFTI     INITIALIZE  RFFTF AND RFFTB */
+  /*     2.   RFFTF     FORWARD TRANSFORM OF A REAL PERIODIC SEQUENCE */
+  /*     3.   RFFTB     BACKWARD TRANSFORM OF A REAL COEFFICIENT ARRAY */
+
+  /*     4.   EZFFTI    INITIALIZE EZFFTF AND EZFFTB */
+  /*     5.   EZFFTF    A SIMPLIFIED REAL PERIODIC FORWARD TRANSFORM */
+  /*     6.   EZFFTB    A SIMPLIFIED REAL PERIODIC BACKWARD TRANSFORM */
+
+  /*     7.   SINTI     INITIALIZE SINT */
+  /*     8.   SINT      SINE TRANSFORM OF A REAL ODD SEQUENCE */
+
+  /*     9.   COSTI     INITIALIZE COST */
+  /*     10.  COST      COSINE TRANSFORM OF A REAL EVEN SEQUENCE */
+
+  /*     11.  SINQI     INITIALIZE SINQF AND SINQB */
+  /*     12.  SINQF     FORWARD SINE TRANSFORM WITH ODD WAVE NUMBERS */
+  /*     13.  SINQB     UNNORMALIZED INVERSE OF SINQF */
+
+  /*     14.  COSQI     INITIALIZE COSQF AND COSQB */
+  /*     15.  COSQF     FORWARD COSINE TRANSFORM WITH ODD WAVE NUMBERS */
+  /*     16.  COSQB     UNNORMALIZED INVERSE OF COSQF */
+
+  /*     17.  CFFTI     INITIALIZE CFFTF AND CFFTB */
+  /*     18.  CFFTF     FORWARD TRANSFORM OF A COMPLEX PERIODIC SEQUENCE */
+  /*     19.  CFFTB     UNNORMALIZED INVERSE OF CFFTF */
+
+
+  sqrt2 = sqrt(2.f);
+  int all_ok = 1;
+  for (nz = 1; nz <= (int)(sizeof nd/sizeof nd[0]); ++nz) {
+    n = nd[nz - 1];
+    modn = n % 2;
+    fn = (real) n;
+    tfn = fn + fn;
+    np1 = n + 1;
+    nm1 = n - 1;
+    for (j = 1; j <= np1; ++j) {
+      x[j - 1] = sin((real) j * sqrt2);
+      y[j - 1] = x[j - 1];
+      xh[j - 1] = x[j - 1];
+    }
+
+    /*     TEST SUBROUTINES RFFTI,RFFTF AND RFFTB */
+
+    rffti(n, w);
+    dt = (2*M_PI) / fn;
+    ns2 = (n + 1) / 2;
+    if (ns2 < 2) {
+      goto L104;
+    }
+    for (k = 2; k <= ns2; ++k) {
+      sum1 = 0.f;
+      sum2 = 0.f;
+      arg = (real) (k - 1) * dt;
+      for (i = 1; i <= n; ++i) {
+        arg1 = (real) (i - 1) * arg;
+        sum1 += x[i - 1] * cos(arg1);
+        sum2 += x[i - 1] * sin(arg1);
+      }
+      y[(k << 1) - 3] = sum1;
+      y[(k << 1) - 2] = -sum2;
+    }
+  L104:
+    sum1 = 0.f;
+    sum2 = 0.f;
+    for (i = 1; i <= nm1; i += 2) {
+      sum1 += x[i - 1];
+      sum2 += x[i];
+    }
+    if (modn == 1) {
+      sum1 += x[n - 1];
+    }
+    y[0] = sum1 + sum2;
+    if (modn == 0) {
+      y[n - 1] = sum1 - sum2;
+    }
+    rfftf(n, x, w);
+    rftf = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = rftf, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
+      rftf = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+    }
+    rftf /= fn;
+    for (i = 1; i <= n; ++i) {
+      sum = x[0] * .5f;
+      arg = (real) (i - 1) * dt;
+      if (ns2 < 2) {
+        goto L108;
+      }
+      for (k = 2; k <= ns2; ++k) {
+        arg1 = (real) (k - 1) * arg;
+        sum = sum + x[(k << 1) - 3] * cos(arg1) - x[(k << 1) - 2] * 
+          sin(arg1);
+      }
+    L108:
+      if (modn == 0) {
+        sum += (real)pow(-1, i-1) * .5f * x[n - 1];
+      }
+      y[i - 1] = sum + sum;
+    }
+    rfftb(n, x, w);
+    rftb = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = rftb, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
+      rftb = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+      y[i - 1] = xh[i - 1];
+    }
+    rfftb(n, y, w);
+    rfftf(n, y, w);
+    cf = 1.f / fn;
+    rftfb = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = rftfb, r3 = (r1 = cf * y[i - 1] - x[i - 1], fabs(
+                                                            r1));
+      rftfb = dmax(r2,r3);
+    }
+
+    /*     TEST SUBROUTINES SINTI AND SINT */
+
+    dt = M_PI / fn;
+    for (i = 1; i <= nm1; ++i) {
+      x[i - 1] = xh[i - 1];
+    }
+    for (i = 1; i <= nm1; ++i) {
+      y[i - 1] = 0.f;
+      arg1 = (real) i * dt;
+      for (k = 1; k <= nm1; ++k) {
+        y[i - 1] += x[k - 1] * sin((real) k * arg1);
+      }
+      y[i - 1] += y[i - 1];
+    }
+    sinti(nm1, w);
+    sint(nm1, x, w);
+    cf = .5f / fn;
+    sintt = 0.f;
+    for (i = 1; i <= nm1; ++i) {
+      /* Computing MAX */
+      r2 = sintt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
+      sintt = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+      y[i - 1] = x[i - 1];
+    }
+    sintt = cf * sintt;
+    sint(nm1, x, w);
+    sint(nm1, x, w);
+    sintfb = 0.f;
+    for (i = 1; i <= nm1; ++i) {
+      /* Computing MAX */
+      r2 = sintfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(
+                                                             r1));
+      sintfb = dmax(r2,r3);
+    }
+
+    /*     TEST SUBROUTINES COSTI AND COST */
+
+    for (i = 1; i <= np1; ++i) {
+      x[i - 1] = xh[i - 1];
+    }
+    for (i = 1; i <= np1; ++i) {
+      y[i - 1] = (x[0] + (real) pow(-1, i+1) * x[n]) * .5f;
+      arg = (real) (i - 1) * dt;
+      for (k = 2; k <= n; ++k) {
+        y[i - 1] += x[k - 1] * cos((real) (k - 1) * arg);
+      }
+      y[i - 1] += y[i - 1];
+    }
+    costi(np1, w);
+    cost(np1, x, w);
+    costt = 0.f;
+    for (i = 1; i <= np1; ++i) {
+      /* Computing MAX */
+      r2 = costt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
+      costt = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+      y[i - 1] = xh[i - 1];
+    }
+    costt = cf * costt;
+    cost(np1, x, w);
+    cost(np1, x, w);
+    costfb = 0.f;
+    for (i = 1; i <= np1; ++i) {
+      /* Computing MAX */
+      r2 = costfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(
+                                                             r1));
+      costfb = dmax(r2,r3);
+    }
+
+    /*     TEST SUBROUTINES SINQI,SINQF AND SINQB */
+
+    cf = .25f / fn;
+    for (i = 1; i <= n; ++i) {
+      y[i - 1] = xh[i - 1];
+    }
+    dt = M_PI / (fn + fn);
+    for (i = 1; i <= n; ++i) {
+      x[i - 1] = 0.f;
+      arg = dt * (real) i;
+      for (k = 1; k <= n; ++k) {
+        x[i - 1] += y[k - 1] * sin((real) (k + k - 1) * arg);
+      }
+      x[i - 1] *= 4.f;
+    }
+    sinqi(n, w);
+    sinqb(n, y, w);
+    sinqbt = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = sinqbt, r3 = (r1 = y[i - 1] - x[i - 1], fabs(r1))
+        ;
+      sinqbt = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+    }
+    sinqbt = cf * sinqbt;
+    for (i = 1; i <= n; ++i) {
+      arg = (real) (i + i - 1) * dt;
+      y[i - 1] = (real) pow(-1, i+1) * .5f * x[n - 1];
+      for (k = 1; k <= nm1; ++k) {
+        y[i - 1] += x[k - 1] * sin((real) k * arg);
+      }
+      y[i - 1] += y[i - 1];
+    }
+    sinqf(n, x, w);
+    sinqft = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = sinqft, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1))
+        ;
+      sinqft = dmax(r2,r3);
+      y[i - 1] = xh[i - 1];
+      x[i - 1] = xh[i - 1];
+    }
+    sinqf(n, y, w);
+    sinqb(n, y, w);
+    sinqfb = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = sinqfb, r3 = (r1 = cf * y[i - 1] - x[i - 1], fabs(
+                                                             r1));
+      sinqfb = dmax(r2,r3);
+    }
+
+    /*     TEST SUBROUTINES COSQI,COSQF AND COSQB */
+
+    for (i = 1; i <= n; ++i) {
+      y[i - 1] = xh[i - 1];
+    }
+    for (i = 1; i <= n; ++i) {
+      x[i - 1] = 0.f;
+      arg = (real) (i - 1) * dt;
+      for (k = 1; k <= n; ++k) {
+        x[i - 1] += y[k - 1] * cos((real) (k + k - 1) * arg);
+      }
+      x[i - 1] *= 4.f;
+    }
+    cosqi(n, w);
+    cosqb(n, y, w);
+    cosqbt = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = cosqbt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1))
+        ;
+      cosqbt = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+    }
+    cosqbt = cf * cosqbt;
+    for (i = 1; i <= n; ++i) {
+      y[i - 1] = x[0] * .5f;
+      arg = (real) (i + i - 1) * dt;
+      for (k = 2; k <= n; ++k) {
+        y[i - 1] += x[k - 1] * cos((real) (k - 1) * arg);
+      }
+      y[i - 1] += y[i - 1];
+    }
+    cosqf(n, x, w);
+    cosqft = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = cosqft, r3 = (r1 = y[i - 1] - x[i - 1], fabs(r1))
+        ;
+      cosqft = dmax(r2,r3);
+      x[i - 1] = xh[i - 1];
+      y[i - 1] = xh[i - 1];
+    }
+    cosqft = cf * cosqft;
+    cosqb(n, x, w);
+    cosqf(n, x, w);
+    cosqfb = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      r2 = cosqfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(r1));
+      cosqfb = dmax(r2,r3);
+    }
+
+    /*     TEST  CFFTI,CFFTF,CFFTB */
+
+    for (i = 1; i <= n; ++i) {
+      r1 = cos(sqrt2 * (real) i);
+      r2 = sin(sqrt2 * (real) (i * i));
+      q1.r = r1, q1.i = r2;
+      cx[i-1].r = q1.r, cx[i-1].i = q1.i;
+    }
+    dt = (2*M_PI) / fn;
+    for (i = 1; i <= n; ++i) {
+      arg1 = -((real) (i - 1)) * dt;
+      cy[i-1].r = 0.f, cy[i-1].i = 0.f;
+      for (k = 1; k <= n; ++k) {
+        arg2 = (real) (k - 1) * arg1;
+        r1 = cos(arg2);
+        r2 = sin(arg2);
+        q3.r = r1, q3.i = r2;
+        q2.r = q3.r * cx[k-1].r - q3.i * cx[k-1].i, q2.i = 
+          q3.r * cx[k-1].i + q3.i * cx[k-1].r;
+        q1.r = cy[i-1].r + q2.r, q1.i = cy[i-1].i + q2.i;
+        cy[i-1].r = q1.r, cy[i-1].i = q1.i;
+      }
+    }
+    cffti(n, w);
+    cfftf(n, (real*)cx, w);
+    dcfftf = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      q1.r = cx[i-1].r - cy[i-1].r, q1.i = cx[i-1].i - cy[i-1]
+        .i;
+      r1 = dcfftf, r2 = c_abs(&q1);
+      dcfftf = dmax(r1,r2);
+      q1.r = cx[i-1].r / fn, q1.i = cx[i-1].i / fn;
+      cx[i-1].r = q1.r, cx[i-1].i = q1.i;
+    }
+    dcfftf /= fn;
+    for (i = 1; i <= n; ++i) {
+      arg1 = (real) (i - 1) * dt;
+      cy[i-1].r = 0.f, cy[i-1].i = 0.f;
+      for (k = 1; k <= n; ++k) {
+        arg2 = (real) (k - 1) * arg1;
+        r1 = cos(arg2);
+        r2 = sin(arg2);
+        q3.r = r1, q3.i = r2;
+        q2.r = q3.r * cx[k-1].r - q3.i * cx[k-1].i, q2.i = 
+          q3.r * cx[k-1].i + q3.i * cx[k-1].r;
+        q1.r = cy[i-1].r + q2.r, q1.i = cy[i-1].i + q2.i;
+        cy[i-1].r = q1.r, cy[i-1].i = q1.i;
+      }
+    }
+    cfftb(n, (real*)cx, w);
+    dcfftb = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      q1.r = cx[i-1].r - cy[i-1].r, q1.i = cx[i-1].i - cy[i-1].i;
+      r1 = dcfftb, r2 = c_abs(&q1);
+      dcfftb = dmax(r1,r2);
+      cx[i-1].r = cy[i-1].r, cx[i-1].i = cy[i-1].i;
+    }
+    cf = 1.f / fn;
+    cfftf(n, (real*)cx, w);
+    cfftb(n, (real*)cx, w);
+    dcfb = 0.f;
+    for (i = 1; i <= n; ++i) {
+      /* Computing MAX */
+      q2.r = cf * cx[i-1].r, q2.i = cf * cx[i-1].i;
+      q1.r = q2.r - cy[i-1].r, q1.i = q2.i - cy[i-1].i;
+      r1 = dcfb, r2 = c_abs(&q1);
+      dcfb = dmax(r1,r2);
+    }
+    printf("%d\tRFFTF  %10.3g\tRFFTB  %10.ge\tRFFTFB %10.3g", n, rftf, rftb, rftfb);
+    printf(  "\tSINT   %10.3g\tSINTFB %10.ge\tCOST   %10.3g\n", sintt, sintfb, costt);
+    printf(  "\tCOSTFB %10.3g\tSINQF  %10.ge\tSINQB  %10.3g", costfb, sinqft, sinqbt);
+    printf(  "\tSINQFB %10.3g\tCOSQF  %10.ge\tCOSQB  %10.3g\n", sinqfb, cosqft, cosqbt);
+    printf(  "\tCOSQFB %10.3g\t", cosqfb);
+    printf(  "\tCFFTF  %10.ge\tCFFTB  %10.3g\n", dcfftf, dcfftb);
+    printf(  "\tCFFTFB %10.3g\n", dcfb);
+
+#define CHECK(x) if (x > 1e-3) { printf(#x " failed: %g\n", x); all_ok = 0; }
+    CHECK(rftf); CHECK(rftb); CHECK(rftfb); CHECK(sintt); CHECK(sintfb); CHECK(costt);
+    CHECK(costfb); CHECK(sinqft); CHECK(sinqbt); CHECK(sinqfb); CHECK(cosqft); CHECK(cosqbt);
+    CHECK(cosqfb); CHECK(dcfftf); CHECK(dcfftb);
+  }
+
+  if (all_ok) printf("Everything looks fine.\n"); 
+  else printf("ERRORS WERE DETECTED.\n");
+  /*
+    expected:
+    120     RFFTF   2.786e-06       RFFTB   6.847e-04       RFFTFB  2.795e-07       SINT    1.312e-06       SINTFB  1.237e-06       COST    1.319e-06
+    COSTFB  4.355e-06       SINQF   3.281e-04       SINQB   1.876e-06       SINQFB  2.198e-07       COSQF   6.199e-07       COSQB   2.193e-06
+    COSQFB  2.300e-07       DEZF    5.573e-06       DEZB    1.363e-05       DEZFB   1.371e-06       CFFTF   5.590e-06       CFFTB   4.751e-05
+    CFFTFB  4.215e-07
+    54      RFFTF   4.708e-07       RFFTB   3.052e-05       RFFTFB  3.439e-07       SINT    3.532e-07       SINTFB  4.145e-07       COST    3.002e-07
+    COSTFB  6.343e-07       SINQF   4.959e-05       SINQB   4.415e-07       SINQFB  2.882e-07       COSQF   2.826e-07       COSQB   2.472e-07
+    COSQFB  3.439e-07       DEZF    9.388e-07       DEZB    5.066e-06       DEZFB   5.960e-07       CFFTF   1.426e-06       CFFTB   9.482e-06
+    CFFTFB  2.980e-07
+    49      RFFTF   4.476e-07       RFFTB   5.341e-05       RFFTFB  2.574e-07       SINT    9.196e-07       SINTFB  9.401e-07       COST    8.174e-07
+    COSTFB  1.331e-06       SINQF   4.005e-05       SINQB   9.342e-07       SINQFB  3.057e-07       COSQF   2.530e-07       COSQB   6.228e-07
+    COSQFB  4.826e-07       DEZF    9.071e-07       DEZB    4.590e-06       DEZFB   5.960e-07       CFFTF   2.095e-06       CFFTB   1.414e-05
+    CFFTFB  7.398e-07
+    32      RFFTF   4.619e-07       RFFTB   2.861e-05       RFFTFB  1.192e-07       SINT    3.874e-07       SINTFB  4.172e-07       COST    4.172e-07
+    COSTFB  1.699e-06       SINQF   2.551e-05       SINQB   6.407e-07       SINQFB  2.980e-07       COSQF   1.639e-07       COSQB   1.714e-07
+    COSQFB  2.384e-07       DEZF    1.013e-06       DEZB    2.339e-06       DEZFB   7.749e-07       CFFTF   1.127e-06       CFFTB   6.744e-06
+    CFFTFB  2.666e-07
+    4       RFFTF   1.490e-08       RFFTB   1.490e-07       RFFTFB  5.960e-08       SINT    7.451e-09       SINTFB  0.000e+00       COST    2.980e-08
+    COSTFB  1.192e-07       SINQF   4.768e-07       SINQB   2.980e-08       SINQFB  5.960e-08       COSQF   2.608e-08       COSQB   5.960e-08
+    COSQFB  1.192e-07       DEZF    2.980e-08       DEZB    5.960e-08       DEZFB   0.000e+00       CFFTF   6.664e-08       CFFTB   5.960e-08
+    CFFTFB  6.144e-08
+    3       RFFTF   3.974e-08       RFFTB   1.192e-07       RFFTFB  3.303e-08       SINT    1.987e-08       SINTFB  1.069e-08       COST    4.967e-08
+    COSTFB  5.721e-08       SINQF   8.941e-08       SINQB   2.980e-08       SINQFB  1.259e-07       COSQF   7.451e-09       COSQB   4.967e-08
+    COSQFB  7.029e-08       DEZF    1.192e-07       DEZB    5.960e-08       DEZFB   5.960e-08       CFFTF   7.947e-08       CFFTB   8.429e-08
+    CFFTFB  9.064e-08
+    2       RFFTF   0.000e+00       RFFTB   0.000e+00       RFFTFB  0.000e+00       SINT    0.000e+00       SINTFB  0.000e+00       COST    0.000e+00
+    COSTFB  0.000e+00       SINQF   1.192e-07       SINQB   2.980e-08       SINQFB  5.960e-08       COSQF   7.451e-09       COSQB   1.490e-08
+    COSQFB  0.000e+00       DEZF    0.000e+00       DEZB    0.000e+00       DEZFB   0.000e+00       CFFTF   0.000e+00       CFFTB   5.960e-08
+    CFFTFB  5.960e-08
+    Everything looks fine.
+
+  */
+
+  return all_ok ? 0 : 1;
+}
+#endif //TESTING_FFTPACK
diff --git a/oss-internship-2020/pffft/fftpack.h b/oss-internship-2020/pffft/fftpack.h
new file mode 100644
index 0000000..5971b9f
--- /dev/null
+++ b/oss-internship-2020/pffft/fftpack.h
@@ -0,0 +1,799 @@
+/*
+  Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
+  
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+   ChangeLog:
+   2011/10/02: this is my first release of this file.
+*/
+
+#ifndef FFTPACK_H
+#define FFTPACK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// just define FFTPACK_DOUBLE_PRECISION if you want to build it as a double precision fft
+
+#ifndef FFTPACK_DOUBLE_PRECISION
+  typedef float fftpack_real;
+  typedef int   fftpack_int;
+#else
+  typedef double fftpack_real;
+  typedef int    fftpack_int;
+#endif
+
+  void cffti(fftpack_int n, fftpack_real *wsave);
+
+  void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
+
+  void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave); 
+
+  void rffti(fftpack_int n, fftpack_real *wsave);
+  void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
+  void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
+
+  void cosqi(fftpack_int n, fftpack_real *wsave);
+  void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
+  void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
+
+  void costi(fftpack_int n, fftpack_real *wsave);
+  void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
+
+  void sinqi(fftpack_int n, fftpack_real *wsave);
+  void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
+  void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
+
+  void sinti(fftpack_int n, fftpack_real *wsave);
+  void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FFTPACK_H */
+
+/*
+
+                      FFTPACK
+
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+
+                  version 4  april 1985
+
+     a package of fortran subprograms for the fast fourier
+      transform of periodic and other symmetric sequences
+
+                         by
+
+                  paul n swarztrauber
+
+  national center for atmospheric research  boulder,colorado 80307
+
+   which is sponsored by the national science foundation
+
+* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+
+
+this package consists of programs which perform fast fourier
+transforms for both complex and real periodic sequences and
+certain other symmetric sequences that are listed below.
+
+1.   rffti     initialize  rfftf and rfftb
+2.   rfftf     forward transform of a real periodic sequence
+3.   rfftb     backward transform of a real coefficient array
+
+4.   ezffti    initialize ezfftf and ezfftb
+5.   ezfftf    a simplified real periodic forward transform
+6.   ezfftb    a simplified real periodic backward transform
+
+7.   sinti     initialize sint
+8.   sint      sine transform of a real odd sequence
+
+9.   costi     initialize cost
+10.  cost      cosine transform of a real even sequence
+
+11.  sinqi     initialize sinqf and sinqb
+12.  sinqf     forward sine transform with odd wave numbers
+13.  sinqb     unnormalized inverse of sinqf
+
+14.  cosqi     initialize cosqf and cosqb
+15.  cosqf     forward cosine transform with odd wave numbers
+16.  cosqb     unnormalized inverse of cosqf
+
+17.  cffti     initialize cfftf and cfftb
+18.  cfftf     forward transform of a complex periodic sequence
+19.  cfftb     unnormalized inverse of cfftf
+
+
+******************************************************************
+
+subroutine rffti(n,wsave)
+
+  ****************************************************************
+
+subroutine rffti initializes the array wsave which is used in
+both rfftf and rfftb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 2*n+15.
+        the same work array can be used for both rfftf and rfftb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of rfftf or rfftb.
+
+******************************************************************
+
+subroutine rfftf(n,r,wsave)
+
+******************************************************************
+
+subroutine rfftf computes the fourier coefficients of a real
+perodic sequence (fourier analysis). the transform is defined
+below at output parameter r.
+
+input parameters
+
+n       the length of the array r to be transformed.  the method
+        is most efficient when n is a product of small primes.
+        n may change so long as different work arrays are provided
+
+r       a real array of length n which contains the sequence
+        to be transformed
+
+wsave   a work array which must be dimensioned at least 2*n+15.
+        in the program that calls rfftf. the wsave array must be
+        initialized by calling subroutine rffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by rfftf and rfftb.
+
+
+output parameters
+
+r       r(1) = the sum from i=1 to i=n of r(i)
+
+        if n is even set l =n/2   , if n is odd set l = (n+1)/2
+
+          then for k = 2,...,l
+
+             r(2*k-2) = the sum from i = 1 to i = n of
+
+                  r(i)*cos((k-1)*(i-1)*2*pi/n)
+
+             r(2*k-1) = the sum from i = 1 to i = n of
+
+                 -r(i)*sin((k-1)*(i-1)*2*pi/n)
+
+        if n is even
+
+             r(n) = the sum from i = 1 to i = n of
+
+                  (-1)**(i-1)*r(i)
+
+ *****  note
+             this transform is unnormalized since a call of rfftf
+             followed by a call of rfftb will multiply the input
+             sequence by n.
+
+wsave   contains results which must not be destroyed between
+        calls of rfftf or rfftb.
+
+
+******************************************************************
+
+subroutine rfftb(n,r,wsave)
+
+******************************************************************
+
+subroutine rfftb computes the real perodic sequence from its
+fourier coefficients (fourier synthesis). the transform is defined
+below at output parameter r.
+
+input parameters
+
+n       the length of the array r to be transformed.  the method
+        is most efficient when n is a product of small primes.
+        n may change so long as different work arrays are provided
+
+r       a real array of length n which contains the sequence
+        to be transformed
+
+wsave   a work array which must be dimensioned at least 2*n+15.
+        in the program that calls rfftb. the wsave array must be
+        initialized by calling subroutine rffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by rfftf and rfftb.
+
+
+output parameters
+
+r       for n even and for i = 1,...,n
+
+             r(i) = r(1)+(-1)**(i-1)*r(n)
+
+                  plus the sum from k=2 to k=n/2 of
+
+                   2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
+
+                  -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
+
+        for n odd and for i = 1,...,n
+
+             r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
+
+                  2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
+
+                 -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
+
+ *****  note
+             this transform is unnormalized since a call of rfftf
+             followed by a call of rfftb will multiply the input
+             sequence by n.
+
+wsave   contains results which must not be destroyed between
+        calls of rfftb or rfftf.
+
+******************************************************************
+
+subroutine sinti(n,wsave)
+
+******************************************************************
+
+subroutine sinti initializes the array wsave which is used in
+subroutine sint. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed.  the method
+        is most efficient when n+1 is a product of small primes.
+
+output parameter
+
+wsave   a work array with at least int(2.5*n+15) locations.
+        different wsave arrays are required for different values
+        of n. the contents of wsave must not be changed between
+        calls of sint.
+
+******************************************************************
+
+subroutine sint(n,x,wsave)
+
+******************************************************************
+
+subroutine sint computes the discrete fourier sine transform
+of an odd sequence x(i). the transform is defined below at
+output parameter x.
+
+sint is the unnormalized inverse of itself since a call of sint
+followed by another call of sint will multiply the input sequence
+x by 2*(n+1).
+
+the array wsave which is used by subroutine sint must be
+initialized by calling subroutine sinti(n,wsave).
+
+input parameters
+
+n       the length of the sequence to be transformed.  the method
+        is most efficient when n+1 is the product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+
+wsave   a work array with dimension at least int(2.5*n+15)
+        in the program that calls sint. the wsave array must be
+        initialized by calling subroutine sinti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i)= the sum from k=1 to k=n
+
+                  2*x(k)*sin(k*i*pi/(n+1))
+
+             a call of sint followed by another call of
+             sint will multiply the sequence x by 2*(n+1).
+             hence sint is the unnormalized inverse
+             of itself.
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of sint.
+
+******************************************************************
+
+subroutine costi(n,wsave)
+
+******************************************************************
+
+subroutine costi initializes the array wsave which is used in
+subroutine cost. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed.  the method
+        is most efficient when n-1 is a product of small primes.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        different wsave arrays are required for different values
+        of n. the contents of wsave must not be changed between
+        calls of cost.
+
+******************************************************************
+
+subroutine cost(n,x,wsave)
+
+******************************************************************
+
+subroutine cost computes the discrete fourier cosine transform
+of an even sequence x(i). the transform is defined below at output
+parameter x.
+
+cost is the unnormalized inverse of itself since a call of cost
+followed by another call of cost will multiply the input sequence
+x by 2*(n-1). the transform is defined below at output parameter x
+
+the array wsave which is used by subroutine cost must be
+initialized by calling subroutine costi(n,wsave).
+
+input parameters
+
+n       the length of the sequence x. n must be greater than 1.
+        the method is most efficient when n-1 is a product of
+        small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15
+        in the program that calls cost. the wsave array must be
+        initialized by calling subroutine costi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+            x(i) = x(1)+(-1)**(i-1)*x(n)
+
+             + the sum from k=2 to k=n-1
+
+                 2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
+
+             a call of cost followed by another call of
+             cost will multiply the sequence x by 2*(n-1)
+             hence cost is the unnormalized inverse
+             of itself.
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of cost.
+
+******************************************************************
+
+subroutine sinqi(n,wsave)
+
+******************************************************************
+
+subroutine sinqi initializes the array wsave which is used in
+both sinqf and sinqb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed. the method
+        is most efficient when n is a product of small primes.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        the same work array can be used for both sinqf and sinqb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of sinqf or sinqb.
+
+******************************************************************
+
+subroutine sinqf(n,x,wsave)
+
+******************************************************************
+
+subroutine sinqf computes the fast fourier transform of quarter
+wave data. that is , sinqf computes the coefficients in a sine
+series representation with only odd wave numbers. the transform
+is defined below at output parameter x.
+
+sinqb is the unnormalized inverse of sinqf since a call of sinqf
+followed by a call of sinqb will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine sinqf must be
+initialized by calling subroutine sinqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        in the program that calls sinqf. the wsave array must be
+        initialized by calling subroutine sinqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i) = (-1)**(i-1)*x(n)
+
+                + the sum from k=1 to k=n-1 of
+
+                2*x(k)*sin((2*i-1)*k*pi/(2*n))
+
+             a call of sinqf followed by a call of
+             sinqb will multiply the sequence x by 4*n.
+             therefore sinqb is the unnormalized inverse
+             of sinqf.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of sinqf or sinqb.
+
+******************************************************************
+
+subroutine sinqb(n,x,wsave)
+
+******************************************************************
+
+subroutine sinqb computes the fast fourier transform of quarter
+wave data. that is , sinqb computes a sequence from its
+representation in terms of a sine series with odd wave numbers.
+the transform is defined below at output parameter x.
+
+sinqf is the unnormalized inverse of sinqb since a call of sinqb
+followed by a call of sinqf will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine sinqb must be
+initialized by calling subroutine sinqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        in the program that calls sinqb. the wsave array must be
+        initialized by calling subroutine sinqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i)= the sum from k=1 to k=n of
+
+               4*x(k)*sin((2k-1)*i*pi/(2*n))
+
+             a call of sinqb followed by a call of
+             sinqf will multiply the sequence x by 4*n.
+             therefore sinqf is the unnormalized inverse
+             of sinqb.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of sinqb or sinqf.
+
+******************************************************************
+
+subroutine cosqi(n,wsave)
+
+******************************************************************
+
+subroutine cosqi initializes the array wsave which is used in
+both cosqf and cosqb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the array to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 3*n+15.
+        the same work array can be used for both cosqf and cosqb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of cosqf or cosqb.
+
+******************************************************************
+
+subroutine cosqf(n,x,wsave)
+
+******************************************************************
+
+subroutine cosqf computes the fast fourier transform of quarter
+wave data. that is , cosqf computes the coefficients in a cosine
+series representation with only odd wave numbers. the transform
+is defined below at output parameter x
+
+cosqf is the unnormalized inverse of cosqb since a call of cosqf
+followed by a call of cosqb will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine cosqf must be
+initialized by calling subroutine cosqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array which must be dimensioned at least 3*n+15
+        in the program that calls cosqf. the wsave array must be
+        initialized by calling subroutine cosqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i) = x(1) plus the sum from k=2 to k=n of
+
+                2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
+
+             a call of cosqf followed by a call of
+             cosqb will multiply the sequence x by 4*n.
+             therefore cosqb is the unnormalized inverse
+             of cosqf.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of cosqf or cosqb.
+
+******************************************************************
+
+subroutine cosqb(n,x,wsave)
+
+******************************************************************
+
+subroutine cosqb computes the fast fourier transform of quarter
+wave data. that is , cosqb computes a sequence from its
+representation in terms of a cosine series with odd wave numbers.
+the transform is defined below at output parameter x.
+
+cosqb is the unnormalized inverse of cosqf since a call of cosqb
+followed by a call of cosqf will multiply the input sequence x
+by 4*n.
+
+the array wsave which is used by subroutine cosqb must be
+initialized by calling subroutine cosqi(n,wsave).
+
+
+input parameters
+
+n       the length of the array x to be transformed.  the method
+        is most efficient when n is a product of small primes.
+
+x       an array which contains the sequence to be transformed
+
+wsave   a work array that must be dimensioned at least 3*n+15
+        in the program that calls cosqb. the wsave array must be
+        initialized by calling subroutine cosqi(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+
+output parameters
+
+x       for i=1,...,n
+
+             x(i)= the sum from k=1 to k=n of
+
+               4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
+
+             a call of cosqb followed by a call of
+             cosqf will multiply the sequence x by 4*n.
+             therefore cosqf is the unnormalized inverse
+             of cosqb.
+
+wsave   contains initialization calculations which must not
+        be destroyed between calls of cosqb or cosqf.
+
+******************************************************************
+
+subroutine cffti(n,wsave)
+
+******************************************************************
+
+subroutine cffti initializes the array wsave which is used in
+both cfftf and cfftb. the prime factorization of n together with
+a tabulation of the trigonometric functions are computed and
+stored in wsave.
+
+input parameter
+
+n       the length of the sequence to be transformed
+
+output parameter
+
+wsave   a work array which must be dimensioned at least 4*n+15
+        the same work array can be used for both cfftf and cfftb
+        as long as n remains unchanged. different wsave arrays
+        are required for different values of n. the contents of
+        wsave must not be changed between calls of cfftf or cfftb.
+
+******************************************************************
+
+subroutine cfftf(n,c,wsave)
+
+******************************************************************
+
+subroutine cfftf computes the forward complex discrete fourier
+transform (the fourier analysis). equivalently , cfftf computes
+the fourier coefficients of a complex periodic sequence.
+the transform is defined below at output parameter c.
+
+the transform is not normalized. to obtain a normalized transform
+the output must be divided by n. otherwise a call of cfftf
+followed by a call of cfftb will multiply the sequence by n.
+
+the array wsave which is used by subroutine cfftf must be
+initialized by calling subroutine cffti(n,wsave).
+
+input parameters
+
+
+n      the length of the complex sequence c. the method is
+       more efficient when n is the product of small primes. n
+
+c      a complex array of length n which contains the sequence
+
+wsave   a real work array which must be dimensioned at least 4n+15
+        in the program that calls cfftf. the wsave array must be
+        initialized by calling subroutine cffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by cfftf and cfftb.
+
+output parameters
+
+c      for j=1,...,n
+
+           c(j)=the sum from k=1,...,n of
+
+                 c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
+
+                       where i=sqrt(-1)
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of subroutine cfftf or cfftb
+
+******************************************************************
+
+subroutine cfftb(n,c,wsave)
+
+******************************************************************
+
+subroutine cfftb computes the backward complex discrete fourier
+transform (the fourier synthesis). equivalently , cfftb computes
+a complex periodic sequence from its fourier coefficients.
+the transform is defined below at output parameter c.
+
+a call of cfftf followed by a call of cfftb will multiply the
+sequence by n.
+
+the array wsave which is used by subroutine cfftb must be
+initialized by calling subroutine cffti(n,wsave).
+
+input parameters
+
+
+n      the length of the complex sequence c. the method is
+       more efficient when n is the product of small primes.
+
+c      a complex array of length n which contains the sequence
+
+wsave   a real work array which must be dimensioned at least 4n+15
+        in the program that calls cfftb. the wsave array must be
+        initialized by calling subroutine cffti(n,wsave) and a
+        different wsave array must be used for each different
+        value of n. this initialization does not have to be
+        repeated so long as n remains unchanged thus subsequent
+        transforms can be obtained faster than the first.
+        the same wsave array can be used by cfftf and cfftb.
+
+output parameters
+
+c      for j=1,...,n
+
+           c(j)=the sum from k=1,...,n of
+
+                 c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
+
+                       where i=sqrt(-1)
+
+wsave   contains initialization calculations which must not be
+        destroyed between calls of subroutine cfftf or cfftb
+
+*/
diff --git a/oss-internship-2020/pffft/myNotes.txt b/oss-internship-2020/pffft/myNotes.txt
new file mode 100644
index 0000000..0dd3bcb
--- /dev/null
+++ b/oss-internship-2020/pffft/myNotes.txt
@@ -0,0 +1,101 @@
+About library's functions:
+    * pffft_aligned_malloc(size_t)
+        returns an allocated array considering the alignment offset
+    * pffft_aligned_free(void *)
+        frees the memory
+    * pffft_simd_size()
+        returns the SIMD_SZ = 4 (regarding simd vector)
+    * pffft_new_setup(int, ...)
+        with a fft size (first argument) being a multiple of 16, 32.
+
+Deleted part (validate function)
+    /*for (pass = 0; pass < 2; pass++) {
+      if (pass == 0) {
+        for (k = 0; k < Nfloat; k++) {
+          ref_[k] = in_[k] = frand() * 2 - 1;
+          out_[k] = 1e30;
+        }
+
+        if (!cplx) {
+          api.rffti(N, wrk_.PtrBoth()).IgnoreError();
+          api.rfftf(N, ref_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+
+          {
+            float refN = ref_[N - 1];
+            for (k = N - 2; k >= 1; --k) {
+              ref_[k + 1] = ref_[k];
+            } 
+            ref_[1] = refN;
+          }
+        } else {
+          api.cffti(N, wrk_.PtrBoth()).IgnoreError();
+          api.cfftf(N, ref_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+        } 
+      }
+
+      for (k = 0; k < Nfloat; ++k) {
+        ref_max = MAX(ref_max, fabs(ref_[k]));
+      }
+
+      if (pass == 0) {
+        api.pffft_transform(s_reg.PtrBefore(), in_.PtrBoth(), tmp_.PtrBoth(), wrk_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
+
+        memcpy(tmp2, tmp, Nbytes);
+        memcpy(tmp, in, Nbytes);
+
+        api.pffft_transform(s_reg.PtrBefore(), tmp_.PtrBoth(), tmp_.PtrBoth(), wrk_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
+
+        printf("Forward transformation test passed.\n");
+
+        api.pffft_zreorder(s_reg.PtrBefore(), tmp_.PtrBoth(), out_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
+        api.pffft_zreorder(s_reg.PtrBefore(), out_.PtrBoth(), tmp_.PtrBoth(), PFFFT_BACKWARD).IgnoreError();
+
+        printf("Reordering test passed.\n");
+      } else {
+        api.pffft_transform_ordered(s_reg.PtrBefore(), in_.PtrBoth(), tmp_.PtrBoth(), wrk_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
+
+      }
+    } */
+
+        
+MACRO for testing
+TEST(AssignOrReturn, AssignsMultipleVariablesInSequence) {
+  auto func = []() -> absl::Status {
+    int value1;
+    SAPI_ASSIGN_OR_RETURN(value1, StatusOr<int>(1));
+    EXPECT_EQ(1, value1);
+    int value2;
+    SAPI_ASSIGN_OR_RETURN(value2, StatusOr<int>(2));
+    EXPECT_EQ(2, value2);
+    int value3;
+    SAPI_ASSIGN_OR_RETURN(value3, StatusOr<int>(3));
+    EXPECT_EQ(3, value3);
+    int value4;
+    SAPI_ASSIGN_OR_RETURN(value4,
+                          StatusOr<int>(absl::UnknownError("EXPECTED"
+  int value1;
+    SAPI_ASSIGN_OR_RETURN(value1, StatusOr<int>(1));
+
+
+ // PFFFT benchmark
+    /*{
+      sapi::StatusOr<PFFFT_Setup *> s = api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+      if (s.ok()) {
+        sapi::v::GenericPtr s_reg(s.value());
+
+        t0 = uclock_sec();  
+        for (iter = 0; iter < max_iter; ++iter) {
+          printf("%s 1\n", api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD).ToString().c_str());
+          printf("%s 2\n", api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD).ToString().c_str());
+        }
+        t1 = uclock_sec();
+        printf("%s 3 \n", api.pffft_destroy_setup(s_reg.PtrBoth()).ToString().c_str());
+
+      
+        flops = (max_iter*2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2); 
+        show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+      } else {
+        fprintf(stderr, "s NULL :(\n\n");
+      }
+    }*/
+    
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/pffft.c b/oss-internship-2020/pffft/pffft.c
new file mode 100644
index 0000000..1686e15
--- /dev/null
+++ b/oss-internship-2020/pffft/pffft.c
@@ -0,0 +1,1881 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largerly based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+  ChangeLog: 
+  - 2011/10/02, version 1: This is the very first release of this file.
+*/
+
+#include "pffft.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+/* detect compiler flavour */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+
+#if defined(COMPILER_GCC)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
+#endif
+
+
+/* 
+   vector support macros: the rest of the code is independant of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros 
+*/
+
+
+// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
+//#define PFFFT_SIMD_DISABLE
+
+/*
+   Altivec support macros 
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+typedef vector float v4sf;
+#  define SIMD_SZ 4
+#  define VZERO() ((vector float) vec_splat_u8(0))
+#  define VMUL(a,b) vec_madd(a,b, VZERO())
+#  define VADD(a,b) vec_add(a,b)
+#  define VMADD(a,b,c) vec_madd(a,b,c)
+#  define VSUB(a,b) vec_sub(a,b)
+inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
+#  define LD_PS1(p) ld_ps1(&p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
+    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+  }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
+    v4sf y0 = vec_mergeh(x0, x2);               \
+    v4sf y1 = vec_mergel(x0, x2);               \
+    v4sf y2 = vec_mergeh(x1, x3);               \
+    v4sf y3 = vec_mergel(x1, x3);               \
+    x0 = vec_mergeh(y0, y2);                    \
+    x1 = vec_mergel(y0, y2);                    \
+    x2 = vec_mergeh(y1, y3);                    \
+    x3 = vec_mergel(y1, y3);                    \
+  }
+#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
+
+/*
+  SSE1 support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+#  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
+#  define VZERO() _mm_setzero_ps()
+#  define VMUL(a,b) _mm_mul_ps(a,b)
+#  define VADD(a,b) _mm_add_ps(a,b)
+#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
+#  define VSUB(a,b) _mm_sub_ps(a,b)
+#  define LD_PS1(p) _mm_set1_ps(p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
+
+/*
+  ARM NEON support macros
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
+#  include <arm_neon.h>
+typedef float32x4_t v4sf;
+#  define SIMD_SZ 4
+#  define VZERO() vdupq_n_f32(0)
+#  define VMUL(a,b) vmulq_f32(a,b)
+#  define VADD(a,b) vaddq_f32(a,b)
+#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
+#  define VSUB(a,b) vsubq_f32(a,b)
+#  define LD_PS1(p) vld1q_dup_f32(&(p))
+#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
+    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
+    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
+    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
+    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
+    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
+  }
+// marginally faster version
+//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
+#else
+#  if !defined(PFFFT_SIMD_DISABLE)
+#    warning "building with simd disabled !\n";
+#    define PFFFT_SIMD_DISABLE // fallback to scalar code
+#  endif
+#endif
+
+// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
+#ifdef PFFFT_SIMD_DISABLE
+typedef float v4sf;
+#  define SIMD_SZ 1
+#  define VZERO() 0.f
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
+#endif
+
+// shortcuts for complex multiplcations
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+// multiply a scalar with a vector
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#if !defined(PFFFT_SIMD_DISABLE)
+typedef union v4sf_union {
+  v4sf  v;
+  float f[4];
+} v4sf_union;
+
+#include <string.h>
+
+#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+
+/* detect bugs with the vector support macros */
+void validate_pffft_simd() {
+  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
+  v4sf_union a0, a1, a2, a3, t, u; 
+  memcpy(a0.f, f, 4*sizeof(float));
+  memcpy(a1.f, f+4, 4*sizeof(float));
+  memcpy(a2.f, f+8, 4*sizeof(float));
+  memcpy(a3.f, f+12, 4*sizeof(float));
+
+  t = a0; u = a1; t.v = VZERO();
+  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
+  t.v = VADD(a1.v, a2.v);
+  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
+  t.v = VMUL(a1.v, a2.v);
+  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
+  t.v = VMADD(a1.v, a2.v,a0.v);
+  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
+
+  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
+  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
+
+  t.v=LD_PS1(f[15]);
+  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 15, 15, 15, 15);
+  t.v = VSWAPHL(a1.v, a2.v);
+  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 8, 9, 6, 7);
+  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
+  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", 
+         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], 
+         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); 
+  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+}
+#endif //!PFFFT_SIMD_DISABLE
+
+/* SSE and co like 16-bytes aligned pointers */
+#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
+void *pffft_aligned_malloc(size_t nb_bytes) {
+  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+  if (!p0) return (void *) 0;
+  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+  *((void **) p - 1) = p0;
+  return p;
+}
+
+void pffft_aligned_free(void *p) {
+  if (p) free(*((void **) p - 1));
+}
+
+int pffft_simd_size() { return SIMD_SZ; }
+
+/*
+  passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
+*/
+static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
+  int k, i;
+  int l1ido = l1*ido;
+  if (ido <= 2) {
+    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
+      ch[0]         = VADD(cc[0], cc[ido+0]);
+      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
+      ch[1]         = VADD(cc[1], cc[ido+1]);
+      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
+    }
+  } else {
+    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
+      for (i=0; i<ido-1; i+=2) {
+        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
+        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
+        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
+        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
+        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
+        VCPLXMUL(tr2, ti2, wr, wi);
+        ch[i+l1ido]   = tr2;
+        ch[i+l1ido+1] = ti2;
+      }
+    }
+  }
+}
+
+/*
+  passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
+*/
+static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, float fsign) {
+  static const float taur = -0.5f;
+  float taui = 0.866025403784439f*fsign;
+  int i, k;
+  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
+  int l1ido = l1*ido;
+  float wr1, wi1, wr2, wi2;
+  assert(ido > 2);
+  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
+    for (i=0; i<ido-1; i+=2) {
+      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
+      cr2 = VADD(cc[i], SVMUL(taur,tr2));
+      ch[i]    = VADD(cc[i], tr2);
+      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
+      ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
+      ch[i+1]  = VADD(cc[i+1], ti2);
+      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
+      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
+      dr2 = VSUB(cr2, ci3);
+      dr3 = VADD(cr2, ci3);
+      di2 = VADD(ci2, cr3);
+      di3 = VSUB(ci2, cr3);
+      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; 
+      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+      ch[i+l1ido] = dr2; 
+      ch[i+l1ido + 1] = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+      ch[i+2*l1ido] = dr3;
+      ch[i+2*l1ido+1] = di3;
+    }
+  }
+} /* passf3 */
+
+static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
+  /* isign == -1 for forward transform and +1 for backward transform */
+
+  int i, k;
+  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  int l1ido = l1*ido;
+  if (ido == 2) {
+    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
+      tr1 = VSUB(cc[0], cc[2*ido + 0]);
+      tr2 = VADD(cc[0], cc[2*ido + 0]);
+      ti1 = VSUB(cc[1], cc[2*ido + 1]);
+      ti2 = VADD(cc[1], cc[2*ido + 1]);
+      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
+      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
+      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
+      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
+
+      ch[0*l1ido + 0] = VADD(tr2, tr3);
+      ch[0*l1ido + 1] = VADD(ti2, ti3);
+      ch[1*l1ido + 0] = VADD(tr1, tr4);
+      ch[1*l1ido + 1] = VADD(ti1, ti4);
+      ch[2*l1ido + 0] = VSUB(tr2, tr3);
+      ch[2*l1ido + 1] = VSUB(ti2, ti3);        
+      ch[3*l1ido + 0] = VSUB(tr1, tr4);
+      ch[3*l1ido + 1] = VSUB(ti1, ti4);
+    }
+  } else {
+    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
+      for (i=0; i<ido-1; i+=2) {
+        float wr1, wi1, wr2, wi2, wr3, wi3;
+        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
+        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
+        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
+        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
+        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
+        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
+        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
+        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
+
+        ch[i] = VADD(tr2, tr3);
+        cr3    = VSUB(tr2, tr3);
+        ch[i + 1] = VADD(ti2, ti3);
+        ci3 = VSUB(ti2, ti3);
+
+        cr2 = VADD(tr1, tr4);
+        cr4 = VSUB(tr1, tr4);
+        ci2 = VADD(ti1, ti4);
+        ci4 = VSUB(ti1, ti4);
+        wr1=wa1[i], wi1=fsign*wa1[i+1];
+        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
+        wr2=wa2[i], wi2=fsign*wa2[i+1]; 
+        ch[i + l1ido] = cr2;
+        ch[i + l1ido + 1] = ci2;
+
+        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
+        wr3=wa3[i], wi3=fsign*wa3[i+1]; 
+        ch[i + 2*l1ido] = cr3;
+        ch[i + 2*l1ido + 1] = ci3;
+
+        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
+        ch[i + 3*l1ido] = cr4;
+        ch[i + 3*l1ido + 1] = ci4;
+      }
+    }
+  }
+} /* passf4 */
+
+/*
+  passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5
+*/
+static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, 
+                                    const float *wa3, const float *wa4, float fsign) {  
+  static const float tr11 = .309016994374947f;
+  const float ti11 = .951056516295154f*fsign;
+  static const float tr12 = -.809016994374947f;
+  const float ti12 = .587785252292473f*fsign;
+
+  /* Local variables */
+  int i, k;
+  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+
+  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
+
+#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
+#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
+
+  assert(ido > 2);
+  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
+    for (i = 0; i < ido-1; i += 2) {
+      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
+      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
+      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
+      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
+      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
+      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
+      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
+      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
+      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
+      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
+      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
+      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
+      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
+      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
+      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
+      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
+      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+      dr3 = VSUB(cr3, ci4);
+      dr4 = VADD(cr3, ci4);
+      di3 = VADD(ci3, cr4);
+      di4 = VSUB(ci3, cr4);
+      dr5 = VADD(cr2, ci5);
+      dr2 = VSUB(cr2, ci5);
+      di5 = VSUB(ci2, cr5);
+      di2 = VADD(ci2, cr5);
+      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; 
+      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; 
+      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+      ch_ref(i - 1, 2) = dr2;
+      ch_ref(i, 2)     = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+      ch_ref(i - 1, 3) = dr3;
+      ch_ref(i, 3)     = di3;
+      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
+      ch_ref(i - 1, 4) = dr4;
+      ch_ref(i, 4)     = di4;
+      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
+      ch_ref(i - 1, 5) = dr5;
+      ch_ref(i, 5)     = di5;
+    }
+  }
+#undef ch_ref
+#undef cc_ref
+}
+
+static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
+  static const float minus_one = -1.f;
+  int i, k, l1ido = l1*ido;
+  for (k=0; k < l1ido; k += ido) {
+    v4sf a = cc[k], b = cc[k + l1ido];
+    ch[2*k] = VADD(a, b);
+    ch[2*(k+ido)-1] = VSUB(a, b);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k=0; k < l1ido; k += ido) {
+      for (i=2; i<ido; i+=2) {
+        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
+        v4sf br = cc[i - 1 + k], bi = cc[i + k];
+        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); 
+        ch[i + 2*k] = VADD(bi, ti2);
+        ch[2*(k+ido) - i] = VSUB(ti2, bi);
+        ch[i - 1 + 2*k] = VADD(br, tr2);
+        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
+      }
+    }
+    if (ido % 2 == 1) return;
+  }
+  for (k=0; k < l1ido; k += ido) {
+    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
+    ch[2*k + ido-1] = cc[k + ido-1];
+  }
+} /* radf2 */
+
+
+static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
+  static const float minus_two=-2;
+  int i, k, l1ido = l1*ido;
+  v4sf a,b,c,d, tr2, ti2;
+  for (k=0; k < l1ido; k += ido) {
+    a = cc[2*k]; b = cc[2*(k+ido) - 1];
+    ch[k] = VADD(a, b);
+    ch[k + l1ido] =VSUB(a, b);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      for (i = 2; i < ido; i += 2) {
+        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
+        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
+        ch[i-1 + k] = VADD(a, b);
+        tr2 = VSUB(a, b);
+        ch[i+0 + k] = VSUB(c, d);
+        ti2 = VADD(c, d);
+        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+        ch[i-1 + k + l1ido] = tr2;
+        ch[i+0 + k + l1ido] = ti2;
+      }
+    }
+    if (ido % 2 == 1) return;
+  }
+  for (k = 0; k < l1ido; k += ido) {
+    a = cc[2*k + ido-1]; b = cc[2*k + ido];
+    ch[k + ido-1] = VADD(a,a);
+    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
+  }
+} /* radb2 */
+
+static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                     const float *wa1, const float *wa2) {
+  static const float taur = -0.5f;
+  static const float taui = 0.866025403784439f;
+  int i, k, ic;
+  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
+  for (k=0; k<l1; k++) {
+    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
+    ch[3*k*ido] = VADD(cc[k*ido], cr2);
+    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
+    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
+  }
+  if (ido == 1) return;
+  for (k=0; k<l1; k++) {
+    for (i=2; i<ido; i+=2) {
+      ic = ido - i;
+      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
+      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
+      VCPLXMULCONJ(dr2, di2, wr1, wi1);
+
+      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
+      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
+      VCPLXMULCONJ(dr3, di3, wr2, wi2);
+        
+      cr2 = VADD(dr2, dr3);
+      ci2 = VADD(di2, di3);
+      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
+      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
+      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
+      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
+      tr3 = SVMUL(taui, VSUB(di2, di3));
+      ti3 = SVMUL(taui, VSUB(dr3, dr2));
+      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
+      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
+      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
+      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
+    }
+  }
+} /* radf3 */
+
+
+static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+                     const float *wa1, const float *wa2)
+{
+  static const float taur = -0.5f;
+  static const float taui = 0.866025403784439f;
+  static const float taui_2 = 0.866025403784439f*2;
+  int i, k, ic;
+  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+  for (k=0; k<l1; k++) {
+    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
+    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
+    ch[k*ido] = VADD(cc[3*k*ido], tr2);
+    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
+    ch[(k + l1)*ido] = VSUB(cr2, ci3);
+    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
+  }
+  if (ido == 1) return;
+  for (k=0; k<l1; k++) {
+    for (i=2; i<ido; i+=2) {
+      ic = ido - i;
+      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
+      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
+      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
+      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
+      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
+      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
+      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
+      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
+      dr2 = VSUB(cr2, ci3);
+      dr3 = VADD(cr2, ci3);
+      di2 = VADD(ci2, cr3);
+      di3 = VSUB(ci2, cr3);
+      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+      ch[i - 1 + (k + l1)*ido] = dr2;
+      ch[i + (k + l1)*ido] = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+      ch[i - 1 + (k + 2*l1)*ido] = dr3;
+      ch[i + (k + 2*l1)*ido] = di3;
+    }
+  }
+} /* radb3 */
+
+static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
+                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
+{
+  static const float minus_hsqt2 = (float)-0.7071067811865475;
+  int i, k, l1ido = l1*ido;
+  {
+    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido; 
+    v4sf * RESTRICT ch_ = ch;
+    while (cc < cc_end) {
+      // this loop represents between 25% and 40% of total radf4_ps cost !
+      v4sf a0 = cc[0], a1 = cc[l1ido];
+      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
+      v4sf tr1 = VADD(a1, a3);
+      v4sf tr2 = VADD(a0, a2);
+      ch[2*ido-1] = VSUB(a0, a2);
+      ch[2*ido  ] = VSUB(a3, a1);
+      ch[0      ] = VADD(tr1, tr2);
+      ch[4*ido-1] = VSUB(tr2, tr1);
+      cc += ido; ch += 4*ido;
+    }
+    cc = cc_; ch = ch_;
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
+      for (i=2; i<ido; i += 2, pc += 2) {
+        int ic = ido - i;
+        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
+        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
+
+        cr2 = pc[1*l1ido+0];
+        ci2 = pc[1*l1ido+1];
+        wr=LD_PS1(wa1[i - 2]);
+        wi=LD_PS1(wa1[i - 1]);
+        VCPLXMULCONJ(cr2,ci2,wr,wi);
+
+        cr3 = pc[2*l1ido+0];
+        ci3 = pc[2*l1ido+1];
+        wr = LD_PS1(wa2[i-2]); 
+        wi = LD_PS1(wa2[i-1]);
+        VCPLXMULCONJ(cr3, ci3, wr, wi);
+
+        cr4 = pc[3*l1ido];
+        ci4 = pc[3*l1ido+1];
+        wr = LD_PS1(wa3[i-2]); 
+        wi = LD_PS1(wa3[i-1]);
+        VCPLXMULCONJ(cr4, ci4, wr, wi);
+
+        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
+
+        tr1 = VADD(cr2,cr4);
+        tr4 = VSUB(cr4,cr2); 
+        tr2 = VADD(pc[0],cr3);
+        tr3 = VSUB(pc[0],cr3);
+        ch[i - 1 + 4*k] = VADD(tr1,tr2);
+        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
+        ti1 = VADD(ci2,ci4);
+        ti4 = VSUB(ci2,ci4);
+        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
+        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
+        ti2 = VADD(pc[1],ci3);
+        ti3 = VSUB(pc[1],ci3);
+        ch[i + 4*k] = VADD(ti1, ti2);
+        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
+        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
+        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
+      }
+    }
+    if (ido % 2 == 1) return;
+  }
+  for (k=0; k<l1ido; k += ido) {
+    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
+    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
+    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
+    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
+    ch[ido-1 + 4*k] = VADD(tr1, c);
+    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
+    ch[4*k + 1*ido] = VSUB(ti1, d); 
+    ch[4*k + 3*ido] = VADD(ti1, d); 
+  }
+} /* radf4 */
+
+
+static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
+{
+  static const float minus_sqrt2 = (float)-1.414213562373095;
+  static const float two = 2.f;
+  int i, k, l1ido = l1*ido;
+  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  {
+    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido; 
+    v4sf *ch_ = ch;
+    while (ch < ch_end) {
+      v4sf a = cc[0], b = cc[4*ido-1];
+      v4sf c = cc[2*ido], d = cc[2*ido-1];
+      tr3 = SVMUL(two,d);
+      tr2 = VADD(a,b);
+      tr1 = VSUB(a,b);
+      tr4 = SVMUL(two,c);
+      ch[0*l1ido] = VADD(tr2, tr3);
+      ch[2*l1ido] = VSUB(tr2, tr3);
+      ch[1*l1ido] = VSUB(tr1, tr4);
+      ch[3*l1ido] = VADD(tr1, tr4);
+      
+      cc += 4*ido; ch += ido;
+    }
+    cc = cc_; ch = ch_;
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
+      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
+      for (i = 2; i < ido; i += 2) {
+
+        tr1 = VSUB(pc[i], pc[4*ido - i]);
+        tr2 = VADD(pc[i], pc[4*ido - i]);
+        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
+        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
+        ph[0] = VADD(tr2, tr3);
+        cr3 = VSUB(tr2, tr3);
+
+        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
+        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
+        cr2 = VSUB(tr1, tr4);
+        cr4 = VADD(tr1, tr4);
+
+        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
+        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
+
+        ph[1] = VADD(ti2, ti3); ph += l1ido;
+        ci3 = VSUB(ti2, ti3);
+        ci2 = VADD(ti1, ti4);
+        ci4 = VSUB(ti1, ti4);
+        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+        ph[0] = cr2;
+        ph[1] = ci2; ph += l1ido;
+        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+        ph[0] = cr3;
+        ph[1] = ci3; ph += l1ido;
+        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
+        ph[0] = cr4;
+        ph[1] = ci4; ph = ph - 3*l1ido + 2;
+      }
+    }
+    if (ido % 2 == 1) return;
+  }
+  for (k=0; k < l1ido; k+=ido) {
+    int i0 = 4*k + ido;
+    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
+    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
+    tr1 = VSUB(c,d);
+    tr2 = VADD(c,d);
+    ti1 = VADD(b,a);
+    ti2 = VSUB(b,a);
+    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
+    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
+    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
+    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
+  }
+} /* radb4 */
+
+static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, 
+                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
+{
+  static const float tr11 = .309016994374947f;
+  static const float ti11 = .951056516295154f;
+  static const float tr12 = -.809016994374947f;
+  static const float ti12 = .587785252292473f;
+
+  /* System generated locals */
+  int cc_offset, ch_offset;
+
+  /* Local variables */
+  int i, k, ic;
+  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
+    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+  int idp2;
+
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * 6;
+  ch -= ch_offset;
+  cc_offset = 1 + ido * (1 + l1);
+  cc -= cc_offset;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
+    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
+    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
+    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
+    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
+    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
+    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
+    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
+    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
+    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2;
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
+      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
+      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
+      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
+      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
+      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
+      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
+      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
+      cr2 = VADD(dr2, dr5);
+      ci5 = VSUB(dr5, dr2);
+      cr5 = VSUB(di2, di5);
+      ci2 = VADD(di2, di5);
+      cr3 = VADD(dr3, dr4);
+      ci4 = VSUB(dr4, dr3);
+      cr4 = VSUB(di3, di4);
+      ci3 = VADD(di3, di4);
+      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
+      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
+      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
+      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
+      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
+      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
+      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
+      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
+      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
+      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
+      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
+      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
+      ch_ref(i, 3, k) = VADD(ti2, ti5);
+      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
+      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
+      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
+      ch_ref(i, 5, k) = VADD(ti3, ti4);
+      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
+    }
+  }
+#undef cc_ref
+#undef ch_ref
+} /* radf5 */
+
+static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, 
+                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
+{
+  static const float tr11 = .309016994374947f;
+  static const float ti11 = .951056516295154f;
+  static const float tr12 = -.809016994374947f;
+  static const float ti12 = .587785252292473f;
+
+  int cc_offset, ch_offset;
+
+  /* Local variables */
+  int i, k, ic;
+  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+  int idp2;
+
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 6;
+  cc -= cc_offset;
+
+  /* Function Body */
+  for (k = 1; k <= l1; ++k) {
+    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
+    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
+    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
+    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
+    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
+    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
+    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
+    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+    ch_ref(1, k, 2) = VSUB(cr2, ci5);
+    ch_ref(1, k, 3) = VSUB(cr3, ci4);
+    ch_ref(1, k, 4) = VADD(cr3, ci4);
+    ch_ref(1, k, 5) = VADD(cr2, ci5);
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2;
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
+      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
+      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
+      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
+      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
+      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
+      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
+      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
+      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
+      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
+      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
+      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
+      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
+      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
+      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
+      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
+      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+      dr3 = VSUB(cr3, ci4);
+      dr4 = VADD(cr3, ci4);
+      di3 = VADD(ci3, cr4);
+      di4 = VSUB(ci3, cr4);
+      dr5 = VADD(cr2, ci5);
+      dr2 = VSUB(cr2, ci5);
+      di5 = VSUB(ci2, cr5);
+      di2 = VADD(ci2, cr5);
+      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
+      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
+      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
+      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
+
+      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
+      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
+      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
+      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
+    }
+  }
+#undef cc_ref
+#undef ch_ref
+} /* radb5 */
+
+static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
+                                      const float *wa, const int *ifac) {  
+  v4sf *in  = (v4sf*)input_readonly;
+  v4sf *out = (in == work2 ? work1 : work2);
+  int nf = ifac[1], k1;
+  int l2 = n;
+  int iw = n-1;
+  assert(in != out && work1 != work2);
+  for (k1 = 1; k1 <= nf; ++k1) {
+    int kh = nf - k1;
+    int ip = ifac[kh + 2];
+    int l1 = l2 / ip;
+    int ido = n / l2;
+    iw -= (ip - 1)*ido;
+    switch (ip) {
+      case 5: {
+        int ix2 = iw + ido;
+        int ix3 = ix2 + ido;
+        int ix4 = ix3 + ido;
+        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
+      } break;
+      case 4: {
+        int ix2 = iw + ido;
+        int ix3 = ix2 + ido;
+        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
+      } break;
+      case 3: {
+        int ix2 = iw + ido;
+        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
+      } break;
+      case 2:
+        radf2_ps(ido, l1, in, out, &wa[iw]);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+    l2 = l1;
+    if (out == work2) {
+      out = work1; in = work2;
+    } else {
+      out = work2; in = work1;
+    }
+  }
+  return in; /* this is in fact the output .. */
+} /* rfftf1 */
+
+static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
+                                      const float *wa, const int *ifac) {  
+  v4sf *in  = (v4sf*)input_readonly;
+  v4sf *out = (in == work2 ? work1 : work2);
+  int nf = ifac[1], k1;
+  int l1 = 1;
+  int iw = 0;
+  assert(in != out);
+  for (k1=1; k1<=nf; k1++) {
+    int ip = ifac[k1 + 1];
+    int l2 = ip*l1;
+    int ido = n / l2;
+    switch (ip) {
+      case 5: {
+        int ix2 = iw + ido;
+        int ix3 = ix2 + ido;
+        int ix4 = ix3 + ido;
+        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
+      } break;
+      case 4: {
+        int ix2 = iw + ido;
+        int ix3 = ix2 + ido;
+        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
+      } break;
+      case 3: {
+        int ix2 = iw + ido;
+        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
+      } break;
+      case 2:
+        radb2_ps(ido, l1, in, out, &wa[iw]);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+    l1 = l2;
+    iw += (ip - 1)*ido;
+
+    if (out == work2) {
+      out = work1; in = work2;
+    } else {
+      out = work2; in = work1;
+    }
+  }
+  return in; /* this is in fact the output .. */
+}
+
+static int decompose(int n, int *ifac, const int *ntryh) {
+  int nl = n, nf = 0, i, j = 0;
+  for (j=0; ntryh[j]; ++j) {
+    int ntry = ntryh[j];
+    while (nl != 1) {
+      int nq = nl / ntry;
+      int nr = nl - ntry * nq;
+      if (nr == 0) {
+        ifac[2+nf++] = ntry;
+        nl = nq;
+        if (ntry == 2 && nf != 1) {
+          for (i = 2; i <= nf; ++i) {
+            int ib = nf - i + 2;
+            ifac[ib + 1] = ifac[ib];
+          }
+          ifac[2] = 2;
+        }
+      } else break;
+    }
+  }
+  ifac[0] = n;
+  ifac[1] = nf;  
+  return nf;
+}
+
+
+
+static void rffti1_ps(int n, float *wa, int *ifac)
+{
+  static const int ntryh[] = { 4,2,3,5,0 };
+  int k1, j, ii;
+
+  int nf = decompose(n,ifac,ntryh);
+  float argh = (2*M_PI) / n;
+  int is = 0;
+  int nfm1 = nf - 1;
+  int l1 = 1;
+  for (k1 = 1; k1 <= nfm1; k1++) {
+    int ip = ifac[k1 + 1];
+    int ld = 0;
+    int l2 = l1*ip;
+    int ido = n / l2;
+    int ipm = ip - 1;
+    for (j = 1; j <= ipm; ++j) {
+      float argld;
+      int i = is, fi=0;
+      ld += l1;
+      argld = ld*argh;
+      for (ii = 3; ii <= ido; ii += 2) {
+        i += 2;
+        fi += 1;
+        wa[i - 2] = cos(fi*argld);
+        wa[i - 1] = sin(fi*argld);
+      }
+      is += ido;
+    }
+    l1 = l2;
+  }
+} /* rffti1 */
+
+void cffti1_ps(int n, float *wa, int *ifac)
+{
+  static const int ntryh[] = { 5,3,4,2,0 };
+  int k1, j, ii;
+
+  int nf = decompose(n,ifac,ntryh);
+  float argh = (2*M_PI)/(float)n;
+  int i = 1;
+  int l1 = 1;
+  for (k1=1; k1<=nf; k1++) {
+    int ip = ifac[k1+1];
+    int ld = 0;
+    int l2 = l1*ip;
+    int ido = n / l2;
+    int idot = ido + ido + 2;
+    int ipm = ip - 1;
+    for (j=1; j<=ipm; j++) {
+      float argld;
+      int i1 = i, fi = 0;
+      wa[i-1] = 1;
+      wa[i] = 0;
+      ld += l1;
+      argld = ld*argh;
+      for (ii = 4; ii <= idot; ii += 2) {
+        i += 2;
+        fi += 1;
+        wa[i-1] = cos(fi*argld);
+        wa[i] = sin(fi*argld);
+      }
+      if (ip > 5) {
+        wa[i1-1] = wa[i-1];
+        wa[i1] = wa[i];
+      }
+    }
+    l1 = l2;
+  }
+} /* cffti1 */
+
+
+v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
+  v4sf *in  = (v4sf*)input_readonly;
+  v4sf *out = (in == work2 ? work1 : work2); 
+  int nf = ifac[1], k1;
+  int l1 = 1;
+  int iw = 0;
+  assert(in != out && work1 != work2);
+  for (k1=2; k1<=nf+1; k1++) {
+    int ip = ifac[k1];
+    int l2 = ip*l1;
+    int ido = n / l2;
+    int idot = ido + ido;
+    switch (ip) {
+      case 5: {
+        int ix2 = iw + idot;
+        int ix3 = ix2 + idot;
+        int ix4 = ix3 + idot;
+        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+      } break;
+      case 4: {
+        int ix2 = iw + idot;
+        int ix3 = ix2 + idot;
+        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
+      } break;
+      case 2: {
+        passf2_ps(idot, l1, in, out, &wa[iw], isign);
+      } break;
+      case 3: {
+        int ix2 = iw + idot;
+        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
+      } break;
+      default:
+        assert(0);
+    }
+    l1 = l2;
+    iw += (ip - 1)*idot;
+    if (out == work2) {
+      out = work1; in = work2;
+    } else {
+      out = work2; in = work1;
+    }
+  }
+
+  return in; /* this is in fact the output .. */
+}
+
+
+struct PFFFT_Setup {
+  int     N;
+  int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
+  int ifac[15];
+  pffft_transform_t transform;
+  v4sf *data; // allocated room for twiddle coefs
+  float *e;    // points into 'data' , N/4*3 elements
+  float *twiddle; // points into 'data', N/4 elements
+};
+
+PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
+  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
+  int k, m;
+  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs 
+     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
+     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
+  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
+  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
+  //assert((N % 32) == 0);
+  s->N = N;
+  s->transform = transform;  
+  /* nb of complex simd vectors */
+  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
+  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
+  s->e = (float*)s->data;
+  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);  
+
+  if (transform == PFFFT_REAL) {
+    for (k=0; k < s->Ncvec; ++k) {
+      int i = k/SIMD_SZ;
+      int j = k%SIMD_SZ;
+      for (m=0; m < SIMD_SZ-1; ++m) {
+        float A = -2*M_PI*(m+1)*k / N;
+        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
+        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
+      }
+    }
+    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
+  } else {
+    for (k=0; k < s->Ncvec; ++k) {
+      int i = k/SIMD_SZ;
+      int j = k%SIMD_SZ;
+      for (m=0; m < SIMD_SZ-1; ++m) {
+        float A = -2*M_PI*(m+1)*k / N;
+        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
+        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
+      }
+    }
+    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
+  }
+
+  /* check that N is decomposable with allowed prime factors */
+  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
+  if (m != N/SIMD_SZ) {
+    pffft_destroy_setup(s); s = 0;
+  }
+
+  return s;
+}
+
+
+void pffft_destroy_setup(PFFFT_Setup *s) {
+  pffft_aligned_free(s->data);
+  free(s);
+}
+
+#if !defined(PFFFT_SIMD_DISABLE)
+
+/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
+static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
+  v4sf g0, g1;
+  int k;
+  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
+  
+  *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
+  for (k=1; k < N; ++k) {
+    v4sf h0, h1;
+    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
+    *--out = VSWAPHL(g1, h0);
+    *--out = VSWAPHL(h0, h1);
+    g1 = h1;
+  }
+  *--out = VSWAPHL(g1, g0);
+}
+
+static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
+  v4sf g0, g1, h0, h1;
+  int k;
+  g0 = g1 = in[0]; ++in;
+  for (k=1; k < N; ++k) {
+    h0 = *in++; h1 = *in++;
+    g1 = VSWAPHL(g1, h0);
+    h0 = VSWAPHL(h0, h1);
+    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
+    g1 = h1;
+  }
+  h0 = *in++; h1 = g0;
+  g1 = VSWAPHL(g1, h0);
+  h0 = VSWAPHL(h0, h1);
+  UNINTERLEAVE2(h0, g1, out[0], out[1]);
+}
+
+void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+  int k, N = setup->N, Ncvec = setup->Ncvec;
+  const v4sf *vin = (const v4sf*)in;
+  v4sf *vout = (v4sf*)out;
+  assert(in != out);
+  if (setup->transform == PFFFT_REAL) {
+    int k, dk = N/32;
+    if (direction == PFFFT_FORWARD) {
+      for (k=0; k < dk; ++k) {
+        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
+        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
+      }
+      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
+      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
+    } else {
+      for (k=0; k < dk; ++k) {
+        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
+        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
+      }
+      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
+      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
+    }
+  } else {
+    if (direction == PFFFT_FORWARD) {
+      for (k=0; k < Ncvec; ++k) { 
+        int kk = (k/4) + (k%4)*(Ncvec/4);
+        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
+      }
+    } else {
+      for (k=0; k < Ncvec; ++k) { 
+        int kk = (k/4) + (k%4)*(Ncvec/4);
+        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
+      }
+    }
+  }
+}
+
+void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  assert(in != out);
+  for (k=0; k < dk; ++k) {    
+    r0 = in[8*k+0]; i0 = in[8*k+1];
+    r1 = in[8*k+2]; i1 = in[8*k+3];
+    r2 = in[8*k+4]; i2 = in[8*k+5];
+    r3 = in[8*k+6]; i3 = in[8*k+7];
+    VTRANSPOSE4(r0,r1,r2,r3);
+    VTRANSPOSE4(i0,i1,i2,i3);
+    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
+    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
+    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
+
+    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
+    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
+    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
+    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+
+    /*
+      transformation for each column is:
+      
+      [1   1   1   1   0   0   0   0]   [r0]
+      [1   0  -1   0   0  -1   0   1]   [r1]
+      [1  -1   1  -1   0   0   0   0]   [r2]
+      [1   0  -1   0   0   1   0  -1]   [r3]
+      [0   0   0   0   1   1   1   1] * [i0]
+      [0   1   0  -1   1   0  -1   0]   [i1]
+      [0   0   0   0   1  -1   1  -1]   [i2]
+      [0  -1   0   1   1   0  -1   0]   [i3]    
+    */
+    
+    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
+    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
+    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
+    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
+  
+    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
+    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
+  }
+}
+
+void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  assert(in != out);
+  for (k=0; k < dk; ++k) {    
+    r0 = in[8*k+0]; i0 = in[8*k+1];
+    r1 = in[8*k+2]; i1 = in[8*k+3];
+    r2 = in[8*k+4]; i2 = in[8*k+5];
+    r3 = in[8*k+6]; i3 = in[8*k+7];
+
+    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
+    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
+    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
+    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+
+    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
+    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
+    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
+    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
+
+    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
+    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
+    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
+
+    VTRANSPOSE4(r0,r1,r2,r3);
+    VTRANSPOSE4(i0,i1,i2,i3);
+
+    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
+    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
+  }
+}
+
+
+static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
+                            const v4sf *e, v4sf *out) {
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  r0 = *in0; i0 = *in1;
+  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
+  VTRANSPOSE4(r0,r1,r2,r3);
+  VTRANSPOSE4(i0,i1,i2,i3);
+ 
+  /*
+    transformation for each column is:
+
+    [1   1   1   1   0   0   0   0]   [r0]
+    [1   0  -1   0   0  -1   0   1]   [r1]
+    [1   0  -1   0   0   1   0  -1]   [r2]
+    [1  -1   1  -1   0   0   0   0]   [r3]
+    [0   0   0   0   1   1   1   1] * [i0]
+    [0  -1   0   1  -1   0   1   0]   [i1]
+    [0  -1   0   1   1   0  -1   0]   [i2]
+    [0   0   0   0  -1   1  -1   1]   [i3]    
+  */
+  
+  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
+  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
+
+  VCPLXMUL(r1,i1,e[0],e[1]);
+  VCPLXMUL(r2,i2,e[2],e[3]);
+  VCPLXMUL(r3,i3,e[4],e[5]);
+
+  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
+  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
+
+  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); 
+  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
+  si0 = VADD(i0,i2); di0 = VSUB(i0,i2); 
+  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
+
+  r0 = VADD(sr0, sr1);
+  r3 = VSUB(sr0, sr1);
+  i0 = VADD(si0, si1);
+  i3 = VSUB(si1, si0);
+  r1 = VADD(dr0, di1);
+  r2 = VSUB(dr0, di1);
+  i1 = VSUB(dr1, di0);
+  i2 = VADD(dr1, di0);
+
+  *out++ = r0;
+  *out++ = i0;
+  *out++ = r1;
+  *out++ = i1;
+  *out++ = r2;
+  *out++ = i2;
+  *out++ = r3;
+  *out++ = i3;
+
+}
+
+static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
+  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+  v4sf_union cr, ci, *uout = (v4sf_union*)out;
+  v4sf save = in[7], zero=VZERO();
+  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
+  static const float s = M_SQRT2/2;
+
+  cr.v = in[0]; ci.v = in[Ncvec*2-1];
+  assert(in != out);
+  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
+
+  /*
+    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
+
+    [Xr(1)]  ] [1   1   1   1   0   0   0   0]
+    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
+    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
+    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
+    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
+    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
+    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
+    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
+  */
+
+  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
+  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
+  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
+  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
+  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
+  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
+  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
+  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3; 
+
+  for (k=1; k < dk; ++k) {
+    v4sf save_next = in[8*k+7];
+    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
+                           e + k*6, out + k*8);
+    save = save_next;
+  }
+
+}
+
+static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, 
+                                             const v4sf *e, v4sf *out, int first) {
+  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
+  /*
+    transformation for each column is:
+
+    [1   1   1   1   0   0   0   0]   [r0]
+    [1   0   0  -1   0  -1  -1   0]   [r1]
+    [1  -1  -1   1   0   0   0   0]   [r2]
+    [1   0   0  -1   0   1   1   0]   [r3]
+    [0   0   0   0   1  -1   1  -1] * [i0]
+    [0  -1   1   0   1   0   0   1]   [i1]
+    [0   0   0   0   1   1  -1  -1]   [i2]
+    [0   1  -1   0   1   0   0   1]   [i3]    
+  */
+
+  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); 
+  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
+  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); 
+  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
+
+  r0 = VADD(sr0, sr1);
+  r2 = VSUB(sr0, sr1);
+  r1 = VSUB(dr0, si1);
+  r3 = VADD(dr0, si1);
+  i0 = VSUB(di0, di1);
+  i2 = VADD(di0, di1);
+  i1 = VSUB(si0, dr1);
+  i3 = VADD(si0, dr1);
+
+  VCPLXMULCONJ(r1,i1,e[0],e[1]);
+  VCPLXMULCONJ(r2,i2,e[2],e[3]);
+  VCPLXMULCONJ(r3,i3,e[4],e[5]);
+
+  VTRANSPOSE4(r0,r1,r2,r3);
+  VTRANSPOSE4(i0,i1,i2,i3);
+
+  if (!first) {
+    *out++ = r0;
+    *out++ = i0;
+  }
+  *out++ = r1;
+  *out++ = i1;
+  *out++ = r2;
+  *out++ = i2;
+  *out++ = r3;
+  *out++ = i3;
+}
+
+static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
+  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
+  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
+  static const float s = M_SQRT2;
+  assert(in != out);
+  for (k=0; k < 4; ++k) {
+    Xr.f[k] = ((float*)in)[8*k];
+    Xi.f[k] = ((float*)in)[8*k+4];
+  }
+
+  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
+
+  /*
+    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
+
+    [cr0] [1   0   2   0   1   0   0   0]
+    [cr1] [1   0   0   0  -1   0  -2   0]
+    [cr2] [1   0  -2   0   1   0   0   0]
+    [cr3] [1   0   0   0  -1   0   2   0]
+    [ci0] [0   2   0   2   0   0   0   0]
+    [ci1] [0   s   0  -s   0  -s   0  -s]
+    [ci2] [0   0   0   0   0  -2   0   2]
+    [ci3] [0  -s   0   s   0  -s   0  -s]
+  */
+  for (k=1; k < dk; ++k) {    
+    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
+  }
+
+  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
+  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
+  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
+  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
+  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
+  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
+  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
+  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
+}
+
+
+void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
+                             pffft_direction_t direction, int ordered) {
+  int k, Ncvec   = setup->Ncvec;
+  int nf_odd = (setup->ifac[1] & 1);
+
+  // temporary buffer is allocated on the stack if the scratch pointer is NULL
+  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+
+  const v4sf *vinput = (const v4sf*)finput;
+  v4sf *voutput      = (v4sf*)foutput;
+  v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
+  int ib = (nf_odd ^ ordered ? 1 : 0);
+
+  assert(VALIGNED(finput) && VALIGNED(foutput));
+
+  //assert(finput != foutput);
+  if (direction == PFFFT_FORWARD) {
+    ib = !ib;
+    if (setup->transform == PFFFT_REAL) { 
+      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
+      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
+    } else {
+      v4sf *tmp = buff[ib];
+      for (k=0; k < Ncvec; ++k) {
+        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
+      }
+      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], 
+                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
+      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
+    }
+    if (ordered) {
+      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);       
+    } else ib = !ib;
+  } else {
+    if (vinput == buff[ib]) { 
+      ib = !ib; // may happen when finput == foutput
+    }
+    if (ordered) {
+      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); 
+      vinput = buff[ib]; ib = !ib;
+    }
+    if (setup->transform == PFFFT_REAL) {
+      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
+      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], 
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
+      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], 
+                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
+      for (k=0; k < Ncvec; ++k) {
+        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
+      }
+    }
+  }
+  
+  if (buff[ib] != voutput) {
+    /* extra copy required -- this situation should only happen when finput == foutput */
+    assert(finput==foutput);
+    for (k=0; k < Ncvec; ++k) {
+      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
+      voutput[2*k] = a; voutput[2*k+1] = b;
+    }
+    ib = !ib;
+  }
+  assert(buff[ib] == voutput);
+}
+
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
+  int Ncvec = s->Ncvec;
+  const v4sf * RESTRICT va = (const v4sf*)a;
+  const v4sf * RESTRICT vb = (const v4sf*)b;
+  v4sf * RESTRICT vab = (v4sf*)ab;
+
+#ifdef __arm__
+  __builtin_prefetch(va);
+  __builtin_prefetch(vb);
+  __builtin_prefetch(vab);
+  __builtin_prefetch(va+2);
+  __builtin_prefetch(vb+2);
+  __builtin_prefetch(vab+2);
+  __builtin_prefetch(va+4);
+  __builtin_prefetch(vb+4);
+  __builtin_prefetch(vab+4);
+  __builtin_prefetch(va+6);
+  __builtin_prefetch(vb+6);
+  __builtin_prefetch(vab+6);
+# ifndef __clang__
+#   define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
+#endif
+
+  float ar, ai, br, bi, abr, abi;
+#ifndef ZCONVOLVE_USING_INLINE_ASM
+  v4sf vscal = LD_PS1(scaling);
+  int i;
+#endif
+
+  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+  ar = ((v4sf_union*)va)[0].f[0];
+  ai = ((v4sf_union*)va)[1].f[0];
+  br = ((v4sf_union*)vb)[0].f[0];
+  bi = ((v4sf_union*)vb)[1].f[0];
+  abr = ((v4sf_union*)vab)[0].f[0];
+  abi = ((v4sf_union*)vab)[1].f[0];
+ 
+#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
+  const float *a_ = a, *b_ = b; float *ab_ = ab;
+  int N = Ncvec;
+  asm volatile("mov         r8, %2                  \n"
+               "vdup.f32    q15, %4                 \n"
+               "1:                                  \n"
+               "pld         [%0,#64]                \n"
+               "pld         [%1,#64]                \n"
+               "pld         [%2,#64]                \n"
+               "pld         [%0,#96]                \n"
+               "pld         [%1,#96]                \n"
+               "pld         [%2,#96]                \n"
+               "vld1.f32    {q0,q1},   [%0,:128]!         \n"
+               "vld1.f32    {q4,q5},   [%1,:128]!         \n"
+               "vld1.f32    {q2,q3},   [%0,:128]!         \n"
+               "vld1.f32    {q6,q7},   [%1,:128]!         \n"
+               "vld1.f32    {q8,q9},   [r8,:128]!          \n"
+               
+               "vmul.f32    q10, q0, q4             \n"
+               "vmul.f32    q11, q0, q5             \n"
+               "vmul.f32    q12, q2, q6             \n" 
+               "vmul.f32    q13, q2, q7             \n"                 
+               "vmls.f32    q10, q1, q5             \n"
+               "vmla.f32    q11, q1, q4             \n"
+               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
+               "vmls.f32    q12, q3, q7             \n"
+               "vmla.f32    q13, q3, q6             \n"
+               "vmla.f32    q8, q10, q15            \n"
+               "vmla.f32    q9, q11, q15            \n"
+               "vmla.f32    q0, q12, q15            \n"
+               "vmla.f32    q1, q13, q15            \n"
+               "vst1.f32    {q8,q9},[%2,:128]!    \n"
+               "vst1.f32    {q0,q1},[%2,:128]!    \n"
+               "subs        %3, #2                  \n"
+               "bne         1b                      \n"
+               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
+#else // default routine, works fine for non-arm cpus with current compilers
+  for (i=0; i < Ncvec; i += 2) {
+    v4sf ar, ai, br, bi;
+    ar = va[2*i+0]; ai = va[2*i+1];
+    br = vb[2*i+0]; bi = vb[2*i+1];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
+    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
+    ar = va[2*i+2]; ai = va[2*i+3];
+    br = vb[2*i+2]; bi = vb[2*i+3];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
+    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
+  }
+#endif
+  if (s->transform == PFFFT_REAL) {
+    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
+    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
+  }
+}
+
+
+#else // defined(PFFFT_SIMD_DISABLE)
+
+// standard routine using scalar floats, without SIMD stuff.
+
+#define pffft_zreorder_nosimd pffft_zreorder
+void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+  int k, N = setup->N;
+  if (setup->transform == PFFFT_COMPLEX) {
+    for (k=0; k < 2*N; ++k) out[k] = in[k];
+    return;
+  }
+  else if (direction == PFFFT_FORWARD) {
+    float x_N = in[N-1];
+    for (k=N-1; k > 1; --k) out[k] = in[k-1]; 
+    out[0] = in[0];
+    out[1] = x_N;
+  } else {
+    float x_N = in[1];
+    for (k=1; k < N-1; ++k) out[k] = in[k+1]; 
+    out[0] = in[0];
+    out[N-1] = x_N;
+  }
+}
+
+#define pffft_transform_internal_nosimd pffft_transform_internal
+void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
+                                    pffft_direction_t direction, int ordered) {
+  int Ncvec   = setup->Ncvec;
+  int nf_odd = (setup->ifac[1] & 1);
+
+  // temporary buffer is allocated on the stack if the scratch pointer is NULL
+  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+  float *buff[2];
+  int ib;
+  if (scratch == 0) scratch = scratch_on_stack;
+  buff[0] = output; buff[1] = scratch;
+
+  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
+  ib = (nf_odd ^ ordered ? 1 : 0);
+
+  if (direction == PFFFT_FORWARD) {
+    if (setup->transform == PFFFT_REAL) { 
+      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
+    } else {
+      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
+                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
+    }
+    if (ordered) {
+      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
+    }
+  } else {    
+    if (input == buff[ib]) { 
+      ib = !ib; // may happen when finput == foutput
+    }
+    if (ordered) {
+      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); 
+      input = buff[!ib];
+    }
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], 
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
+                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
+    }
+  }
+  if (buff[ib] != output) {
+    int k;
+    // extra copy required -- this situation should happens only when finput == foutput
+    assert(input==output);
+    for (k=0; k < Ncvec; ++k) {
+      float a = buff[ib][2*k], b = buff[ib][2*k+1];
+      output[2*k] = a; output[2*k+1] = b;
+    }
+    ib = !ib;
+  }
+  assert(buff[ib] == output);
+}
+
+#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
+void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
+                                       float *ab, float scaling) {
+  int i, Ncvec = s->Ncvec;
+
+  if (s->transform == PFFFT_REAL) {
+    // take care of the fftpack ordering
+    ab[0] += a[0]*b[0]*scaling;
+    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
+    ++ab; ++a; ++b; --Ncvec;
+  }
+  for (i=0; i < Ncvec; ++i) {
+    float ar, ai, br, bi;
+    ar = a[2*i+0]; ai = a[2*i+1];
+    br = b[2*i+0]; bi = b[2*i+1];
+    VCPLXMUL(ar, ai, br, bi);
+    ab[2*i+0] += ar*scaling;
+    ab[2*i+1] += ai*scaling;
+  }
+}
+
+#endif // defined(PFFFT_SIMD_DISABLE)
+
+void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
+}
+
+void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
+}
diff --git a/oss-internship-2020/pffft/pffft.h b/oss-internship-2020/pffft/pffft.h
new file mode 100644
index 0000000..2bfa7b3
--- /dev/null
+++ b/oss-internship-2020/pffft/pffft.h
@@ -0,0 +1,177 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.  
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+   
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+   
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.  
+
+   Restrictions: 
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have an "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+  
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h> // for size_t
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.  
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+  
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads. 
+  */
+  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  void pffft_destroy_setup(PFFFT_Setup *);
+  /* 
+     Perform a Fourier transform , The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+     
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384).
+
+     input and output may alias.
+  */
+  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /* 
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+     
+     input and output may alias.
+  */
+  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /* 
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+     
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+     
+     input and output should not alias.
+  */
+  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /* 
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+     
+     the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
+     
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /*
+    the float buffers must have the correct alignment (16-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.  
+  */
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffft_aligned_free(void *);
+
+  /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
+  int pffft_simd_size();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PFFFT_H
diff --git a/oss-internship-2020/pffft/test_pffft.c b/oss-internship-2020/pffft/test_pffft.c
new file mode 100644
index 0000000..a5d20c2
--- /dev/null
+++ b/oss-internship-2020/pffft/test_pffft.c
@@ -0,0 +1,419 @@
+/*
+  Copyright (c) 2013 Julien Pommier.
+
+  Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
+
+  How to build: 
+
+  on linux, with fftw3:
+  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+  on macos, without fftw3:
+  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+  on macos, with fftw3:
+  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+  on windows, with visual c++:
+  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+  
+  build without SIMD instructions:
+  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#include "pffft.h"
+#include "fftpack.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_TIMES
+#  include <sys/times.h>
+#  include <unistd.h>
+#endif
+
+#ifdef HAVE_VECLIB
+#  include <Accelerate/Accelerate.h>
+#endif
+
+#ifdef HAVE_FFTW
+#  include <fftw3.h>
+#endif
+
+#define MAX(x,y) ((x)>(y)?(x):(y))
+
+double frand() {
+  return rand()/(double)RAND_MAX;
+}
+
+#if defined(HAVE_SYS_TIMES)
+  inline double uclock_sec(void) {
+    static double ttclk = 0.;
+    if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
+    struct tms t; return ((double)times(&t)) / ttclk;
+  }
+# else
+  double uclock_sec(void)
+{ return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+/* compare results with the regular fftpack */
+void pffft_validate_N(int N, int cplx) {
+  int Nfloat = N*(cplx?2:1);
+  int Nbytes = Nfloat * sizeof(float);
+  float *ref, *in, *out, *tmp, *tmp2;
+  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+  int pass;
+
+  if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
+  ref = pffft_aligned_malloc(Nbytes);
+  in = pffft_aligned_malloc(Nbytes);
+  out = pffft_aligned_malloc(Nbytes);
+  tmp = pffft_aligned_malloc(Nbytes);
+  tmp2 = pffft_aligned_malloc(Nbytes);
+
+  for (pass=0; pass < 2; ++pass) {
+    float ref_max = 0;
+    int k;
+    //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx);
+    // compute reference solution with FFTPACK
+    if (pass == 0) {
+      float *wrk = malloc(2*Nbytes+15*sizeof(float));
+      for (k=0; k < Nfloat; ++k) {
+        ref[k] = in[k] = frand()*2-1; 
+        out[k] = 1e30;
+      }
+      if (!cplx) {
+        rffti(N, wrk);
+        rfftf(N, ref, wrk);
+        // use our ordering for real ffts instead of the one of fftpack
+        {
+          float refN=ref[N-1];
+          for (k=N-2; k >= 1; --k) ref[k+1] = ref[k]; 
+          ref[1] = refN;
+        }
+      } else {
+        cffti(N, wrk);
+        cfftf(N, ref, wrk);
+      }
+      free(wrk);
+    }
+
+    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, fabs(ref[k]));
+
+      
+    // pass 0 : non canonical ordering of transform coefficients  
+    if (pass == 0) {
+      // test forward transform, with different input / output
+      pffft_transform(s, in, tmp, 0, PFFFT_FORWARD);
+      memcpy(tmp2, tmp, Nbytes);
+      memcpy(tmp, in, Nbytes);
+      pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == tmp[k]);
+      }
+
+      // test reordering
+      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
+      pffft_zreorder(s, out, tmp, PFFFT_BACKWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == tmp[k]);
+      }
+      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
+    } else {
+      // pass 1 : canonical ordering of transform coeffs.
+      pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD);
+      memcpy(tmp2, tmp, Nbytes);
+      memcpy(tmp, in, Nbytes);
+      pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == tmp[k]);
+      }
+      memcpy(out, tmp, Nbytes);
+    }
+
+    {
+      for (k=0; k < Nfloat; ++k) {
+        if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
+          printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
+          exit(1);
+        }
+      }
+        
+      if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD);
+      else   pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD);
+      memcpy(tmp2, out, Nbytes);
+      memcpy(out, tmp, Nbytes);
+      if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
+      else   pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == out[k]);
+        out[k] *= 1.f/N;
+      }
+      for (k = 0; k < Nfloat; ++k) {
+        if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
+          printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
+          exit(1);
+        }
+      }
+    }
+
+    // quick test of the circular convolution in fft domain
+    {
+      float conv_err = 0, conv_max = 0;
+
+      pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
+      memset(out, 0, Nbytes);
+      pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
+      pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
+      
+      for (k=0; k < Nfloat; k += 2) {
+        float ar = tmp[k], ai=tmp[k+1];
+        if (cplx || k > 0) {
+          tmp[k] = ar*ar - ai*ai;
+          tmp[k+1] = 2*ar*ai;
+        } else {
+          tmp[0] = ar*ar;
+          tmp[1] = ai*ai;
+        }
+      }
+      
+      for (k=0; k < Nfloat; ++k) {
+        float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
+        if (d > conv_err) conv_err = d;
+        if (e > conv_max) conv_max = e;
+      }
+      if (conv_err > 1e-5*conv_max) {
+        printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
+      }
+    }
+
+  }
+
+  printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
+  
+  pffft_destroy_setup(s);
+  pffft_aligned_free(ref);
+  pffft_aligned_free(in);
+  pffft_aligned_free(out);
+  pffft_aligned_free(tmp);
+  pffft_aligned_free(tmp2);
+}
+
+void pffft_validate(int cplx) {
+  static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
+  int k;
+  for (k = 0; Ntest[k]; ++k) {
+    int N = Ntest[k];
+    if (N == 16 && !cplx) continue;
+    pffft_validate_N(N, cplx);
+  }
+}
+
+int array_output_format = 0;
+
+void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
+  float mflops = flops/1e6/(t1 - t0 + 1e-16);
+  if (array_output_format) {
+    if (flops != -1) {
+      printf("|%9.0f   ", mflops);
+    } else printf("|      n/a   ");
+  } else {
+    if (flops != -1) {
+      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
+    }
+  }
+  fflush(stdout);
+}
+
+void benchmark_ffts(int N, int cplx) {
+  int Nfloat = (cplx ? N*2 : N);
+  int Nbytes = Nfloat * sizeof(float);
+  float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);
+
+  double t0, t1, flops;
+
+  int k;
+  int max_iter = 5120000/N*4;
+#ifdef __arm__
+  max_iter /= 4;
+#endif
+  int iter;
+
+  for (k = 0; k < Nfloat; ++k) {
+    X[k] = 0; //sqrtf(k+1);
+  }
+
+  // FFTPack benchmark
+  {
+    float *wrk = malloc(2*Nbytes + 15*sizeof(float));
+    int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
+    if (cplx) cffti(N, wrk);
+    else      rffti(N, wrk);
+    t0 = uclock_sec();  
+    
+    for (iter = 0; iter < max_iter_; ++iter) {
+      if (cplx) {
+        cfftf(N, X, wrk);
+        cfftb(N, X, wrk);
+      } else {
+        rfftf(N, X, wrk);
+        rfftb(N, X, wrk);
+      }
+    }
+    t1 = uclock_sec();
+    free(wrk);
+    
+    flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+    show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+  }
+
+#ifdef HAVE_VECLIB
+  int log2N = (int)(log(N)/log(2) + 0.5f);
+  if (N == (1<<log2N)) {
+    FFTSetup setup;
+
+    setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
+    DSPSplitComplex zsamples;
+    zsamples.realp = &X[0];
+    zsamples.imagp = &X[Nfloat/2];
+    t0 = uclock_sec();  
+    for (iter = 0; iter < max_iter; ++iter) {
+      if (cplx) {
+        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
+        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
+      } else {
+        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); 
+        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
+      }
+    }
+    t1 = uclock_sec();
+    vDSP_destroy_fftsetup(setup);
+
+    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+    show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
+  } else {
+    show_output("vDSP", N, cplx, -1, -1, -1, -1);
+  }
+#endif
+  
+#ifdef HAVE_FFTW
+  {
+    fftwf_plan planf, planb;
+    fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
+    fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
+    memset(in, 0, sizeof(fftw_complex) * N);
+    int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE);  // measure takes a lot of time on largest ffts
+    //int flags = FFTW_ESTIMATE;
+    if (cplx) {
+      planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
+      planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
+    } else {
+      planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
+      planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
+    }
+
+    t0 = uclock_sec();  
+    for (iter = 0; iter < max_iter; ++iter) {
+      fftwf_execute(planf);
+      fftwf_execute(planb);
+    }
+    t1 = uclock_sec();
+
+    fftwf_destroy_plan(planf);
+    fftwf_destroy_plan(planb);
+    fftwf_free(in); fftwf_free(out);
+
+    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+    show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
+  }
+#endif  
+
+  // PFFFT benchmark
+  {
+    PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+    if (s) {
+      t0 = uclock_sec();  
+      for (iter = 0; iter < max_iter; ++iter) {
+        pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
+        pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
+      }
+      t1 = uclock_sec();
+      pffft_destroy_setup(s);
+    
+      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+      show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+    }
+  }
+
+  if (!array_output_format) {
+    printf("--\n");
+  }
+
+  pffft_aligned_free(X);
+  pffft_aligned_free(Y);
+  pffft_aligned_free(Z);
+}
+
+#ifndef PFFFT_SIMD_DISABLE
+void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction 
+#endif
+
+int main(int argc, char **argv) {
+  int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
+  int i;
+
+  if (argc > 1 && strcmp(argv[1], "--array-format") == 0) {
+    array_output_format = 1;
+  }
+
+#ifndef PFFFT_SIMD_DISABLE
+  validate_pffft_simd();
+#endif
+  pffft_validate(1);
+  pffft_validate(0);
+  if (!array_output_format) {
+    for (i=0; Nvalues[i] > 0; ++i) {
+      benchmark_ffts(Nvalues[i], 0 /* real fft */);
+    }
+    for (i=0; Nvalues[i] > 0; ++i) {
+      benchmark_ffts(Nvalues[i], 1 /* cplx fft */);
+    }
+  } else {
+    printf("| input len ");
+    printf("|real FFTPack");
+#ifdef HAVE_VECLIB
+    printf("|  real vDSP ");
+#endif
+#ifdef HAVE_FFTW
+    printf("|  real FFTW ");
+#endif
+    printf("| real PFFFT | ");
+
+    printf("|cplx FFTPack");
+#ifdef HAVE_VECLIB
+    printf("|  cplx vDSP ");
+#endif
+#ifdef HAVE_FFTW
+    printf("|  cplx FFTW ");
+#endif
+    printf("| cplx PFFFT |\n");
+    for (i=0; Nvalues[i] > 0; ++i) {
+      printf("|%9d  ", Nvalues[i]);
+      benchmark_ffts(Nvalues[i], 0); 
+      printf("| ");
+      benchmark_ffts(Nvalues[i], 1);
+      printf("|\n");
+    }
+    printf(" (numbers are given in MFlops)\n");
+  }
+
+
+  return 0;
+}
diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
new file mode 100644
index 0000000..dd6dc0b
--- /dev/null
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -0,0 +1,150 @@
+#include "fftpack.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/times.h>
+
+#include "pffft_sapi.sapi.h"
+#include "sandboxed_api/util/flag.h"
+#include "sandboxed_api/vars.h"
+
+ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all);
+ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
+
+class pffftSapiSandbox : public pffftSandbox {
+  public:
+    std::unique_ptr<sandbox2::Policy> ModifyPolicy(sandbox2::PolicyBuilder*) override {
+      return sandbox2::PolicyBuilder()
+        .AllowStaticStartup()
+        .AllowOpen()
+        .AllowRead()
+        .AllowWrite()
+        .AllowSystemMalloc()
+        .AllowExit()
+        .AllowSyscalls({
+          __NR_futex,
+          __NR_close,
+          __NR_getrusage,
+        })
+        .DisableNamespaces()
+        .BuildOrDie();
+    }
+};
+
+double frand() {
+  return rand()/(double)RAND_MAX;
+}
+
+double uclock_sec(void) { 
+  return (double)clock()/(double)CLOCKS_PER_SEC; 
+}
+
+int array_output_format = 0;
+
+void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
+  float mflops = flops/1e6/(t1 - t0 + 1e-16);
+  if (array_output_format) {
+    if (flops != -1) {
+      printf("|%9.0f   ", mflops);
+    } else printf("|      n/a   ");
+  } else {
+    if (flops != -1) {
+      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
+    }
+  }
+  fflush(stdout);
+}
+
+/*  
+  For debug:
+  SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100 --sandbox2_danger_danger_permit_all_and_log my_aux_file
+*/
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
+  int i;
+
+  printf("initializing sandbox...\n");
+
+  pffftSapiSandbox sandbox;
+  sandbox.Init().IgnoreError();
+
+  pffftApi api(&sandbox);
+  
+  int N, cplx;
+
+  cplx = 0;
+
+  for (i = 0; i < 5; i++) {
+    N = Nvalues[i];
+    
+    int Nfloat = N * (cplx ? 2 : 1);
+    int Nbytes = Nfloat * sizeof(float);
+    int pass;
+
+    float ref[Nbytes], in[Nbytes], out[Nbytes], tmp[Nbytes], tmp2[Nbytes];
+
+    sapi::v::Array<float> ref_(ref, Nbytes);
+    sapi::v::Array<float> in_(in, Nbytes);
+    sapi::v::Array<float> out_(out, Nbytes);
+    sapi::v::Array<float> tmp_(tmp, Nbytes);
+    sapi::v::Array<float> tmp2_(tmp2, Nbytes);
+
+    float wrk[2 * Nbytes + 15 * sizeof(float)];
+    sapi::v::Array<float> wrk_(wrk, 2 * Nbytes + 15 * sizeof(float));
+
+    float ref_max = 0;
+    int k;
+
+    Nfloat = (cplx ? N * 2 : N);
+    float X[Nbytes], Y[Nbytes], Z[Nbytes];
+    sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
+
+    double t0, t1, flops;
+
+    int max_iter = 5120000/N*4;
+    #ifdef __arm__
+      max_iter /= 4;
+    #endif
+    int iter;
+
+    for (k = 0; k < Nfloat; ++k) {
+      X[k] = 0; 
+    }
+
+    // FFTPack benchmark
+    {
+      int max_iter_ = max_iter/4; // SIMD_SZ == 4 (returning value of pffft_simd_size())
+      if (max_iter_ == 0) max_iter_ = 1;
+      if (cplx) {
+        api.cffti(N, wrk_.PtrBoth()).IgnoreError();
+      } else {
+        api.rffti(N, wrk_.PtrBoth()).IgnoreError();
+      }
+      t0 = uclock_sec();  
+      
+      for (iter = 0; iter < max_iter_; ++iter) {
+        if (cplx) {
+          api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          api.cfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+        } else {
+          api.rfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+        }
+      }
+      t1 = uclock_sec();
+      
+      flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2); 
+      show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_); 
+      
+    }
+  }
+
+  return 0;
+}
\ No newline at end of file

From deb8869bb660a6f2cae4f44dd14b25a9c746debe Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:11 +0000
Subject: [PATCH 03/29] Added .gitignore

---
 oss-internship-2020/pffft/.gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 oss-internship-2020/pffft/.gitignore

diff --git a/oss-internship-2020/pffft/.gitignore b/oss-internship-2020/pffft/.gitignore
new file mode 100644
index 0000000..f9ed608
--- /dev/null
+++ b/oss-internship-2020/pffft/.gitignore
@@ -0,0 +1 @@
+.hg/

From 50921e222d27198717c3ae174fe93299306ae01b Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:19 +0000
Subject: [PATCH 04/29] Coding style adaptation

---
 .../pffft/test_pffft_sandboxed.cc             | 82 ++++++++++---------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
index dd6dc0b..d50dff3 100644
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -1,14 +1,13 @@
-#include "fftpack.h"
-
+#include <assert.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#include <assert.h>
 #include <string.h>
-#include <syscall.h>
 #include <sys/times.h>
+#include <syscall.h>
+#include <time.h>
 
+#include "fftpack.h"
 #include "pffft_sapi.sapi.h"
 #include "sandboxed_api/util/flag.h"
 #include "sandboxed_api/vars.h"
@@ -17,9 +16,10 @@ ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all);
 ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
 
 class pffftSapiSandbox : public pffftSandbox {
-  public:
-    std::unique_ptr<sandbox2::Policy> ModifyPolicy(sandbox2::PolicyBuilder*) override {
-      return sandbox2::PolicyBuilder()
+ public:
+  std::unique_ptr<sandbox2::Policy> ModifyPolicy(
+      sandbox2::PolicyBuilder*) override {
+    return sandbox2::PolicyBuilder()
         .AllowStaticStartup()
         .AllowOpen()
         .AllowRead()
@@ -27,47 +27,51 @@ class pffftSapiSandbox : public pffftSandbox {
         .AllowSystemMalloc()
         .AllowExit()
         .AllowSyscalls({
-          __NR_futex,
-          __NR_close,
-          __NR_getrusage,
+            __NR_futex,
+            __NR_close,
+            __NR_getrusage,
         })
         .DisableNamespaces()
         .BuildOrDie();
-    }
+  }
 };
 
-double frand() {
-  return rand()/(double)RAND_MAX;
-}
+double frand() { return rand() / (double)RAND_MAX; }
 
-double uclock_sec(void) { 
-  return (double)clock()/(double)CLOCKS_PER_SEC; 
-}
+double uclock_sec(void) { return (double)clock() / (double)CLOCKS_PER_SEC; }
 
 int array_output_format = 0;
 
-void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
-  float mflops = flops/1e6/(t1 - t0 + 1e-16);
+void show_output(const char* name, int N, int cplx, float flops, float t0,
+                 float t1, int max_iter) {
+  float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
   if (array_output_format) {
     if (flops != -1) {
       printf("|%9.0f   ", mflops);
-    } else printf("|      n/a   ");
+    } else
+      printf("|      n/a   ");
   } else {
     if (flops != -1) {
-      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
+      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N,
+             (cplx ? "CPLX" : "REAL"), name, mflops,
+             (t1 - t0) / 2 / max_iter * 1e9, max_iter);
     }
   }
   fflush(stdout);
 }
 
-/*  
+/*
   For debug:
-  SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100 --sandbox2_danger_danger_permit_all_and_log my_aux_file
+  SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100
+  --sandbox2_danger_danger_permit_all_and_log my_aux_file
 */
 
 int main(int argc, char* argv[]) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-  int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
+  int Nvalues[] = {64,    96,     128,        160,         192,     256,
+                   384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
+                   1024,  2048,   2400,       4096,        8192,    9 * 1024,
+                   16384, 32768,  256 * 1024, 1024 * 1024, -1};
   int i;
 
   printf("initializing sandbox...\n");
@@ -76,14 +80,14 @@ int main(int argc, char* argv[]) {
   sandbox.Init().IgnoreError();
 
   pffftApi api(&sandbox);
-  
+
   int N, cplx;
 
   cplx = 0;
 
   for (i = 0; i < 5; i++) {
     N = Nvalues[i];
-    
+
     int Nfloat = N * (cplx ? 2 : 1);
     int Nbytes = Nfloat * sizeof(float);
     int pass;
@@ -108,27 +112,28 @@ int main(int argc, char* argv[]) {
 
     double t0, t1, flops;
 
-    int max_iter = 5120000/N*4;
-    #ifdef __arm__
-      max_iter /= 4;
-    #endif
+    int max_iter = 5120000 / N * 4;
+#ifdef __arm__
+    max_iter /= 4;
+#endif
     int iter;
 
     for (k = 0; k < Nfloat; ++k) {
-      X[k] = 0; 
+      X[k] = 0;
     }
 
     // FFTPack benchmark
     {
-      int max_iter_ = max_iter/4; // SIMD_SZ == 4 (returning value of pffft_simd_size())
+      int max_iter_ =
+          max_iter / 4;  // SIMD_SZ == 4 (returning value of pffft_simd_size())
       if (max_iter_ == 0) max_iter_ = 1;
       if (cplx) {
         api.cffti(N, wrk_.PtrBoth()).IgnoreError();
       } else {
         api.rffti(N, wrk_.PtrBoth()).IgnoreError();
       }
-      t0 = uclock_sec();  
-      
+      t0 = uclock_sec();
+
       for (iter = 0; iter < max_iter_; ++iter) {
         if (cplx) {
           api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
@@ -139,10 +144,9 @@ int main(int argc, char* argv[]) {
         }
       }
       t1 = uclock_sec();
-      
-      flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2); 
-      show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_); 
-      
+
+      flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+      show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
     }
   }
 

From 2ab097f82fe012956e0e5299bd76a5db19530788 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:27 +0000
Subject: [PATCH 05/29] Added README

---
 oss-internship-2020/pffft/README.txt       | 425 +--------------------
 oss-internship-2020/pffft/README_pffft.txt | 416 ++++++++++++++++++++
 2 files changed, 430 insertions(+), 411 deletions(-)
 create mode 100644 oss-internship-2020/pffft/README_pffft.txt

diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
index ee20b42..ef51f1a 100644
--- a/oss-internship-2020/pffft/README.txt
+++ b/oss-internship-2020/pffft/README.txt
@@ -1,416 +1,19 @@
-PFFFT: a pretty fast FFT.
+Sandboxing PFFFT library
 
-TL;DR
---
+Builder: CMake
 
-PFFFT does 1D Fast Fourier Transforms, of single precision real and
-complex vectors. It tries do it fast, it tries to be correct, and it
-tries to be small. Computations do take advantage of SSE1 instructions
-on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
-license is BSD-like.
+For testing: 
+`cd build`, then `./pffft_sandboxed`
 
+For debug:
+`SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100
+--sandbox2_danger_danger_permit_all_and_log <auxiliar file>`
 
-Why does it exist:
---
-
-I was in search of a good performing FFT library , preferably very
-small and with a very liberal license.
-
-When one says "fft library", FFTW ("Fastest Fourier Transform in the
-West") is probably the first name that comes to mind -- I guess that
-99% of open-source projects that need a FFT do use FFTW, and are happy
-with it. However, it is quite a large library , which does everything
-fft related (2d transforms, 3d transforms, other transformations such
-as discrete cosine , or fast hartley). And it is licensed under the
-GNU GPL , which means that it cannot be used in non open-source
-products.
-
-An alternative to FFTW that is really small, is the venerable FFTPACK
-v4, which is available on NETLIB. A more recent version (v5) exists,
-but it is larger as it deals with multi-dimensional transforms. This
-is a library that is written in FORTRAN 77, a language that is now
-considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
-by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
-its age, benchmarks show it that it still a very good performing FFT
-library, see for example the 1d single precision benchmarks here:
-http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
-competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
-Apple vDSP. The reason for that is that those libraries do take
-advantage of the SSE SIMD instructions available on Intel CPUs,
-available since the days of the Pentium III. These instructions deal
-with small vectors of 4 floats at a time, instead of a single float
-for a traditionnal FPU, so when using these instructions one may expect
-a 4-fold performance improvement.
-
-The idea was to take this fortran fftpack v4 code, translate to C,
-modify it to deal with those SSE instructions, and check that the
-final performance is not completely ridiculous when compared to other
-SIMD FFT libraries. Translation to C was performed with f2c (
-http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
-order to remove the thousands of gotos that were introduced by
-f2c. You will find the fftpack.h and fftpack.c sources in the
-repository, this a complete translation of
-http://www.netlib.org/fftpack/ , with the discrete cosine transform
-and the test program. There is no license information in the netlib
-repository, but it was confirmed to me by the fftpack v5 curators that
-the same terms do apply to fftpack v4:
-http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
-"BSD-like" license, it is compatible with proprietary projects.
-
-Adapting fftpack to deal with the SIMD 4-element vectors instead of
-scalar single precision numbers was more complex than I originally
-thought, especially with the real transforms, and I ended up writing
-more code than I planned..
-
-
-The code:
---
-
-Only two files, in good old C, pffft.c and pffft.h . The API is very
-very simple, just make sure that you read the comments in pffft.h.
-
-
-Comparison with other FFTs:
---
-
-The idea was not to break speed records, but to get a decently fast
-fft that is at least 50% as fast as the fastest FFT -- especially on
-slowest computers . I'm more focused on getting the best performance
-on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
-on getting top performance on today fastest cpus.
-
-It can be used in a real-time context as the fft functions do not
-perform any memory allocation -- that is why they accept a 'work'
-array in their arguments.
-
-It is also a bit focused on performing 1D convolutions, that is why it
-provides "unordered" FFTs , and a fourier domain convolution
-operation.
-
-
-Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9, cortex-A15, A8X)
---
-
-The benchmark shows the performance of various fft implementations measured in 
-MFlops, with the number of floating point operations being defined as 5Nlog2(N)
-for a length N complex fft, and 2.5*Nlog2(N) for a real fft. 
-See http://www.fftw.org/speed/method.html for an explanation of these formulas.
-
-MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
-
-Built with:
-
- gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f 
-
-| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-|       64  |     2816   |     8596   |     7329   |     8187   | |     2887   |    14898   |    14668   |    11108   |
-|       96  |     3298   |      n/a   |     8378   |     7727   | |     3953   |      n/a   |    15680   |    10878   |
-|      128  |     3507   |    11575   |     9266   |    10108   | |     4233   |    17598   |    16427   |    12000   |
-|      160  |     3391   |      n/a   |     9838   |    10711   | |     4220   |      n/a   |    16653   |    11187   |
-|      192  |     3919   |      n/a   |     9868   |    10956   | |     4297   |      n/a   |    15770   |    12540   |
-|      256  |     4283   |    13179   |    10694   |    13128   | |     4545   |    19550   |    16350   |    13822   |
-|      384  |     3136   |      n/a   |    10810   |    12061   | |     3600   |      n/a   |    16103   |    13240   |
-|      480  |     3477   |      n/a   |    10632   |    12074   | |     3536   |      n/a   |    11630   |    12522   |
-|      512  |     3783   |    15141   |    11267   |    13838   | |     3649   |    20002   |    16560   |    13580   |
-|      640  |     3639   |      n/a   |    11164   |    13946   | |     3695   |      n/a   |    15416   |    13890   |
-|      768  |     3800   |      n/a   |    11245   |    13495   | |     3590   |      n/a   |    15802   |    14552   |
-|      800  |     3440   |      n/a   |    10499   |    13301   | |     3659   |      n/a   |    12056   |    13268   |
-|     1024  |     3924   |    15605   |    11450   |    15339   | |     3769   |    20963   |    13941   |    15467   |
-|     2048  |     4518   |    16195   |    11551   |    15532   | |     4258   |    20413   |    13723   |    15042   |
-|     2400  |     4294   |      n/a   |    10685   |    13078   | |     4093   |      n/a   |    12777   |    13119   |
-|     4096  |     4750   |    16596   |    11672   |    15817   | |     4157   |    19662   |    14316   |    14336   |
-|     8192  |     3820   |    16227   |    11084   |    12555   | |     3691   |    18132   |    12102   |    13813   |
-|     9216  |     3864   |      n/a   |    10254   |    12870   | |     3586   |      n/a   |    12119   |    13994   |
-|    16384  |     3822   |    15123   |    10454   |    12822   | |     3613   |    16874   |    12370   |    13881   |
-|    32768  |     4175   |    14512   |    10662   |    11095   | |     3881   |    14702   |    11619   |    11524   |
-|   262144  |     3317   |    11429   |     6269   |     9517   | |     2810   |    11729   |     7757   |    10179   |
-|  1048576  |     2913   |    10551   |     4730   |     5867   | |     2661   |     7881   |     3520   |     5350   |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-
-
-Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
-
-| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-|           64     |      3840    |      7680    |      8777    | |      4389    |     20480    |     11171    |
-|           96     |      4214    |      9633    |      8429    | |      4816    |     22477    |     11238    |
-|          128     |      3584    |     10240    |     10240    | |      5120    |     23893    |     11947    |
-|          192     |      4854    |     11095    |     12945    | |      4854    |     22191    |     14121    |
-|          256     |      4096    |     11703    |     16384    | |      5120    |     23406    |     13653    |
-|          384     |      4395    |     14651    |     12558    | |      4884    |     19535    |     14651    |
-|          512     |      5760    |     13166    |     15360    | |      4608    |     23040    |     15360    |
-|          768     |      4907    |     14020    |     16357    | |      4461    |     19628    |     14020    |
-|         1024     |      5120    |     14629    |     14629    | |      5120    |     20480    |     15754    |
-|         2048     |      5632    |     14080    |     18773    | |      4693    |     12516    |     16091    |
-|         4096     |      5120    |     13653    |     17554    | |      4726    |      7680    |     14456    |
-|         8192     |      4160    |      7396    |     13312    | |      4437    |     14791    |     13312    |
-|         9216     |      4210    |      6124    |     13473    | |      4491    |      7282    |     14970    |
-|        16384     |      3976    |     11010    |     14313    | |      4210    |     11450    |     13631    |
-|        32768     |      4260    |     10224    |     10954    | |      4260    |      6816    |     11797    |
-|       262144     |      3736    |      6896    |      9961    | |      2359    |      8965    |      9437    |
-|      1048576     |      2796    |      4534    |      6453    | |      1864    |      3078    |      5592    |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-
-
-
-MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
-
-Built with:
-
- gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
-
-| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-|      64   |      745   |     2145   |     1706   |     2028   | |      961   |     3356   |     3313   |     2300   |
-|      96   |      877   |      n/a   |     1976   |     1978   | |     1059   |      n/a   |     3333   |     2233   |
-|     128   |      951   |     2808   |     2213   |     2279   | |     1202   |     3803   |     3739   |     2494   |
-|     192   |     1002   |      n/a   |     2456   |     2429   | |     1186   |      n/a   |     3701   |     2508   |
-|     256   |     1065   |     3205   |     2641   |     2793   | |     1302   |     4013   |     3912   |     2663   |
-|     384   |      845   |      n/a   |     2759   |     2499   | |      948   |      n/a   |     3729   |     2504   |
-|     512   |      900   |     3476   |     2956   |     2759   | |      974   |     4057   |     3954   |     2645   |
-|     768   |      910   |      n/a   |     2912   |     2737   | |      975   |      n/a   |     3837   |     2614   |
-|    1024   |      936   |     3583   |     3107   |     3009   | |     1006   |     4124   |     3821   |     2697   |
-|    2048   |     1057   |     3585   |     3091   |     2837   | |     1089   |     3889   |     3701   |     2513   |
-|    4096   |     1083   |     3524   |     3092   |     2733   | |     1039   |     3617   |     3462   |     2364   |
-|    8192   |      874   |     3252   |     2967   |     2363   | |      911   |     3106   |     2789   |     2302   |
-|    9216   |      898   |      n/a   |     2420   |     2290   | |      865   |      n/a   |     2676   |     2204   |
-|   16384   |      903   |     2892   |     2506   |     2421   | |      899   |     3026   |     2797   |     2289   |
-|   32768   |      965   |     2837   |     2550   |     2358   | |      920   |     2922   |     2763   |     2240   |
-|  262144   |      738   |     2422   |     1589   |     1708   | |      610   |     2038   |     1436   |     1091   |
-| 1048576   |      528   |     1207   |      845   |      880   | |      606   |     1020   |      669   |     1036   |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-
-
-
-Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |     1920   |     3614   |     5120   | |     2194   |     7680   |     6467   |
-|       96  |     1873   |     3549   |     5187   | |     2107   |     8429   |     5863   |
-|      128  |     2240   |     3773   |     5514   | |     2560   |     7964   |     6827   |
-|      192  |     1765   |     4569   |     7767   | |     2284   |     9137   |     7061   |
-|      256  |     2048   |     5461   |     7447   | |     2731   |     9638   |     7802   |
-|      384  |     1998   |     5861   |     6762   | |     2313   |     9253   |     7644   |
-|      512  |     2095   |     6144   |     7680   | |     2194   |    10240   |     7089   |
-|      768  |     2230   |     5773   |     7549   | |     2045   |    10331   |     7010   |
-|     1024  |     2133   |     6400   |     8533   | |     2133   |    10779   |     7877   |
-|     2048  |     2011   |     7040   |     8665   | |     1942   |    10240   |     7768   |
-|     4096  |     2194   |     6827   |     8777   | |     1755   |     9452   |     6827   |
-|     8192  |     1849   |     6656   |     6656   | |     1752   |     7831   |     6827   |
-|     9216  |     1871   |     5858   |     6416   | |     1643   |     6909   |     6266   |
-|    16384  |     1883   |     6223   |     6506   | |     1664   |     7340   |     6982   |
-|    32768  |     1826   |     6390   |     6667   | |     1631   |     7481   |     6971   |
-|   262144  |     1546   |     4075   |     5977   | |     1299   |     3415   |     3551   |
-|  1048576  |     1104   |     2071   |     1730   | |     1104   |     1149   |     1834   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-
-
-Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-|           64     |       452    |      1041    |      1336    | |       549    |      2318    |      1781    |
-|           96     |       444    |      1297    |      1297    | |       503    |      2408    |      1686    |
-|          128     |       527    |      1525    |      1707    | |       543    |      2655    |      1886    |
-|          192     |       498    |      1653    |      1849    | |       539    |      2678    |      1942    |
-|          256     |       585    |      1862    |      2156    | |       594    |      2777    |      2244    |
-|          384     |       499    |      1870    |      1998    | |       511    |      2586    |      1890    |
-|          512     |       562    |      2095    |      2194    | |       542    |      2973    |      2194    |
-|          768     |       545    |      2045    |      2133    | |       545    |      2365    |      2133    |
-|         1024     |       595    |      2133    |      2438    | |       569    |      2695    |      2179    |
-|         2048     |       587    |      2125    |      2347    | |       521    |      2230    |      1707    |
-|         4096     |       495    |      1890    |      1834    | |       492    |      1876    |      1672    |
-|         8192     |       469    |      1548    |      1729    | |       438    |      1740    |      1664    |
-|         9216     |       468    |      1663    |      1663    | |       446    |      1585    |      1531    |
-|        16384     |       453    |      1608    |      1767    | |       398    |      1476    |      1664    |
-|        32768     |       456    |      1420    |      1503    | |       387    |      1388    |      1345    |
-|       262144     |       309    |       385    |       726    | |       262    |       415    |       840    |
-|      1048576     |       280    |       351    |       739    | |       261    |       313    |       797    |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-
-
-
-Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
-
-Built with:
-cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
-
-(visual c++ is definitively not very good with SSE intrinsics...)
-
-| N (input length) | real FFTPack |  real PFFFT  | | cplx FFTPack |  cplx PFFFT  |
-|------------------+--------------+--------------| |--------------+--------------|
-|           64     |       173    |      1009    | |       174    |      1159    |
-|           96     |       169    |      1029    | |       188    |      1201    |
-|          128     |       195    |      1242    | |       191    |      1275    |
-|          192     |       178    |      1312    | |       184    |      1276    |
-|          256     |       196    |      1591    | |       186    |      1281    |
-|          384     |       172    |      1409    | |       181    |      1281    |
-|          512     |       187    |      1640    | |       181    |      1313    |
-|          768     |       171    |      1614    | |       176    |      1258    |
-|         1024     |       186    |      1812    | |       178    |      1223    |
-|         2048     |       190    |      1707    | |       186    |      1099    |
-|         4096     |       182    |      1446    | |       177    |       975    |
-|         8192     |       175    |      1345    | |       169    |      1034    |
-|         9216     |       165    |      1271    | |       168    |      1023    |
-|        16384     |       166    |      1396    | |       165    |       949    |
-|        32768     |       172    |      1311    | |       161    |       881    |
-|       262144     |       136    |       632    | |       134    |       629    |
-|      1048576     |       134    |       698    | |       127    |       623    |
-|------------------+--------------+--------------| |--------------+--------------|
-
-
-
-Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
-
-Built with:
-gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |      549   |      452   |      731   | |      512   |      602   |      640   |
-|       96  |      421   |      272   |      702   | |      496   |      571   |      602   |
-|      128  |      498   |      512   |      815   | |      597   |      618   |      652   |
-|      160  |      521   |      536   |      815   | |      586   |      669   |      625   |
-|      192  |      539   |      571   |      883   | |      485   |      597   |      626   |
-|      256  |      640   |      539   |      975   | |      569   |      611   |      671   |
-|      384  |      499   |      610   |      879   | |      499   |      602   |      637   |
-|      480  |      518   |      507   |      877   | |      496   |      661   |      616   |
-|      512  |      524   |      591   |     1002   | |      549   |      678   |      668   |
-|      640  |      542   |      612   |      955   | |      568   |      663   |      645   |
-|      768  |      557   |      613   |      981   | |      491   |      663   |      598   |
-|      800  |      514   |      353   |      882   | |      514   |      360   |      574   |
-|     1024  |      640   |      640   |     1067   | |      492   |      683   |      602   |
-|     2048  |      587   |      640   |      908   | |      486   |      640   |      552   |
-|     2400  |      479   |      368   |      777   | |      422   |      376   |      518   |
-|     4096  |      511   |      614   |      853   | |      426   |      640   |      534   |
-|     8192  |      415   |      584   |      708   | |      386   |      622   |      516   |
-|     9216  |      419   |      571   |      687   | |      364   |      586   |      506   |
-|    16384  |      426   |      577   |      716   | |      398   |      606   |      530   |
-|    32768  |      417   |      572   |      673   | |      399   |      572   |      468   |
-|   262144  |      219   |      380   |      293   | |      255   |      431   |      343   |
-|  1048576  |      202   |      274   |      237   | |      265   |      282   |      355   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-Same platform as above, but this time pffft and fftpack are built with clang 3.2:
-
-clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |      427   |      452   |      853   | |      427   |      602   |     1024   |
-|       96  |      351   |      276   |      843   | |      337   |      571   |      963   |
-|      128  |      373   |      512   |      996   | |      390   |      618   |     1054   |
-|      160  |      426   |      536   |      987   | |      375   |      669   |      914   |
-|      192  |      404   |      571   |     1079   | |      388   |      588   |     1079   |
-|      256  |      465   |      539   |     1205   | |      445   |      602   |     1170   |
-|      384  |      366   |      610   |     1099   | |      343   |      594   |     1099   |
-|      480  |      356   |      507   |     1140   | |      335   |      651   |      931   |
-|      512  |      411   |      591   |     1213   | |      384   |      649   |     1124   |
-|      640  |      398   |      612   |     1193   | |      373   |      654   |      901   |
-|      768  |      409   |      613   |     1227   | |      383   |      663   |     1044   |
-|      800  |      411   |      348   |     1073   | |      353   |      358   |      809   |
-|     1024  |      427   |      640   |     1280   | |      413   |      692   |     1004   |
-|     2048  |      414   |      626   |     1126   | |      371   |      640   |      853   |
-|     2400  |      399   |      373   |      898   | |      319   |      368   |      653   |
-|     4096  |      404   |      602   |     1059   | |      357   |      633   |      778   |
-|     8192  |      332   |      584   |      792   | |      308   |      616   |      716   |
-|     9216  |      322   |      561   |      783   | |      299   |      586   |      687   |
-|    16384  |      344   |      568   |      778   | |      314   |      617   |      745   |
-|    32768  |      342   |      564   |      737   | |      314   |      552   |      629   |
-|   262144  |      201   |      383   |      313   | |      227   |      435   |      413   |
-|  1048576  |      187   |      262   |      251   | |      228   |      281   |      409   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
-(the fftpack performance numbers are better with gcc), while clang is
-the best with neon intrinsics (see how pffft perf has improved with
-clang 3.2).
-
-
-NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
-
-Built with:
-gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
-
-| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
-|-----------+------------+------------| |------------+------------|
-|       64  |     1735   |     3308   | |     1994   |     3744   |
-|       96  |     1596   |     3448   | |     1987   |     3572   |
-|      128  |     1807   |     4076   | |     2255   |     3960   |
-|      160  |     1769   |     4083   | |     2071   |     3845   |
-|      192  |     1990   |     4233   | |     2017   |     3939   |
-|      256  |     2191   |     4882   | |     2254   |     4346   |
-|      384  |     1878   |     4492   | |     2073   |     4012   |
-|      480  |     1748   |     4398   | |     1923   |     3951   |
-|      512  |     2030   |     5064   | |     2267   |     4195   |
-|      640  |     1918   |     4756   | |     2094   |     4184   |
-|      768  |     2099   |     4907   | |     2048   |     4297   |
-|      800  |     1822   |     4555   | |     1880   |     4063   |
-|     1024  |     2232   |     5355   | |     2187   |     4420   |
-|     2048  |     2176   |     4983   | |     2027   |     3602   |
-|     2400  |     1741   |     4256   | |     1710   |     3344   |
-|     4096  |     1816   |     3914   | |     1851   |     3349   |
-|     8192  |     1716   |     3481   | |     1700   |     3255   |
-|     9216  |     1735   |     3589   | |     1653   |     3094   |
-|    16384  |     1567   |     3483   | |     1637   |     3244   |
-|    32768  |     1624   |     3240   | |     1655   |     3156   |
-|   262144  |     1012   |     1898   | |      983   |     1503   |
-|  1048576  |      876   |     1154   | |      868   |     1341   |
-|-----------+------------+------------| |------------+------------|
-
-The performance on the tegra K1 is pretty impressive. I'm not
-including the FFTW numbers as they as slightly below the scalar
-fftpack numbers, so something must be wrong (however it seems to be
-correctly configured and is using neon simd instructions).
-
-When using clang 3.4 the pffft version is even a bit faster, reaching
-5.7 GFlops for real ffts of size 1024.
-
-
-iPad Air 2 with iOS9, xcode 8.0, arm64. The cpu is an Apple A8X, supposedly running at 1.5GHz.
-
-| input len |real FFTPack|  real vDSP | real PFFFT | |cplx FFTPack|  cplx vDSP | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |     2517   |     7995   |     6086   | |     2725   |    13006   |     8495   |
-|       96  |     2442   |      n/a   |     6691   | |     2256   |      n/a   |     7991   |
-|      128  |     2664   |    10186   |     7877   | |     2575   |    15115   |     9115   |
-|      160  |     2638   |      n/a   |     8283   | |     2682   |      n/a   |     8806   |
-|      192  |     2903   |      n/a   |     9083   | |     2634   |      n/a   |     8980   |
-|      256  |     3184   |    11452   |    10039   | |     3026   |    15410   |    10199   |
-|      384  |     2665   |      n/a   |    10100   | |     2275   |      n/a   |     9247   |
-|      480  |     2546   |      n/a   |     9863   | |     2341   |      n/a   |     8892   |
-|      512  |     2832   |    12197   |    10989   | |     2547   |    16768   |    10154   |
-|      640  |     2755   |      n/a   |    10461   | |     2569   |      n/a   |     9666   |
-|      768  |     2998   |      n/a   |    11355   | |     2585   |      n/a   |     9813   |
-|      800  |     2516   |      n/a   |    10332   | |     2433   |      n/a   |     9164   |
-|     1024  |     3109   |    12965   |    12114   | |     2869   |    16448   |    10519   |
-|     2048  |     3027   |    12996   |    12023   | |     2648   |    17304   |    10307   |
-|     2400  |     2515   |      n/a   |    10372   | |     2355   |      n/a   |     8443   |
-|     4096  |     3204   |    13603   |    12359   | |     2814   |    16570   |     9780   |
-|     8192  |     2759   |    13422   |    10824   | |     2153   |    15652   |     7884   |
-|     9216  |     2700   |      n/a   |     9938   | |     2241   |      n/a   |     7900   |
-|    16384  |     2280   |    13057   |     7976   | |      593   |     4272   |     2534   |
-|    32768  |      768   |     4269   |     2882   | |      606   |     4405   |     2604   |
-|   262144  |      724   |     3527   |     2630   | |      534   |     2418   |     2157   |
-|  1048576  |      674   |     1467   |     2135   | |      530   |     1621   |     2055   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-I double-checked to make sure I did not make a mistake in the time
-measurements, as the numbers are much higher than what I initially
-expected. They are in fact higher than the number I get on the 2.8GHz
-Xeon of my 2008 mac pro.. (except for FFT lengths >= 32768 where
-having a big cache is useful). A good surprise is also that the perf
-is not too far from apple's vDSP (at least for the real FFT).
+CMake observations:
+    * linking pffft and fftpack (which contains necessary functions for pffft)
+    * set math library 
 
+Sandboxed main observations:
+    * containing two testing parts (fft / pffft benchmarks)
+        ! current stage: fft - works :)
+                         pffft - not implemented
diff --git a/oss-internship-2020/pffft/README_pffft.txt b/oss-internship-2020/pffft/README_pffft.txt
new file mode 100644
index 0000000..ee20b42
--- /dev/null
+++ b/oss-internship-2020/pffft/README_pffft.txt
@@ -0,0 +1,416 @@
+PFFFT: a pretty fast FFT.
+
+TL;DR
+--
+
+PFFFT does 1D Fast Fourier Transforms, of single precision real and
+complex vectors. It tries do it fast, it tries to be correct, and it
+tries to be small. Computations do take advantage of SSE1 instructions
+on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
+license is BSD-like.
+
+
+Why does it exist:
+--
+
+I was in search of a good performing FFT library , preferably very
+small and with a very liberal license.
+
+When one says "fft library", FFTW ("Fastest Fourier Transform in the
+West") is probably the first name that comes to mind -- I guess that
+99% of open-source projects that need a FFT do use FFTW, and are happy
+with it. However, it is quite a large library , which does everything
+fft related (2d transforms, 3d transforms, other transformations such
+as discrete cosine , or fast hartley). And it is licensed under the
+GNU GPL , which means that it cannot be used in non open-source
+products.
+
+An alternative to FFTW that is really small, is the venerable FFTPACK
+v4, which is available on NETLIB. A more recent version (v5) exists,
+but it is larger as it deals with multi-dimensional transforms. This
+is a library that is written in FORTRAN 77, a language that is now
+considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
+by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
+its age, benchmarks show it that it still a very good performing FFT
+library, see for example the 1d single precision benchmarks here:
+http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
+competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
+Apple vDSP. The reason for that is that those libraries do take
+advantage of the SSE SIMD instructions available on Intel CPUs,
+available since the days of the Pentium III. These instructions deal
+with small vectors of 4 floats at a time, instead of a single float
+for a traditionnal FPU, so when using these instructions one may expect
+a 4-fold performance improvement.
+
+The idea was to take this fortran fftpack v4 code, translate to C,
+modify it to deal with those SSE instructions, and check that the
+final performance is not completely ridiculous when compared to other
+SIMD FFT libraries. Translation to C was performed with f2c (
+http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
+order to remove the thousands of gotos that were introduced by
+f2c. You will find the fftpack.h and fftpack.c sources in the
+repository, this a complete translation of
+http://www.netlib.org/fftpack/ , with the discrete cosine transform
+and the test program. There is no license information in the netlib
+repository, but it was confirmed to me by the fftpack v5 curators that
+the same terms do apply to fftpack v4:
+http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
+"BSD-like" license, it is compatible with proprietary projects.
+
+Adapting fftpack to deal with the SIMD 4-element vectors instead of
+scalar single precision numbers was more complex than I originally
+thought, especially with the real transforms, and I ended up writing
+more code than I planned..
+
+
+The code:
+--
+
+Only two files, in good old C, pffft.c and pffft.h . The API is very
+very simple, just make sure that you read the comments in pffft.h.
+
+
+Comparison with other FFTs:
+--
+
+The idea was not to break speed records, but to get a decently fast
+fft that is at least 50% as fast as the fastest FFT -- especially on
+slowest computers . I'm more focused on getting the best performance
+on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
+on getting top performance on today fastest cpus.
+
+It can be used in a real-time context as the fft functions do not
+perform any memory allocation -- that is why they accept a 'work'
+array in their arguments.
+
+It is also a bit focused on performing 1D convolutions, that is why it
+provides "unordered" FFTs , and a fourier domain convolution
+operation.
+
+
+Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9, cortex-A15, A8X)
+--
+
+The benchmark shows the performance of various fft implementations measured in 
+MFlops, with the number of floating point operations being defined as 5Nlog2(N)
+for a length N complex fft, and 2.5*Nlog2(N) for a real fft. 
+See http://www.fftw.org/speed/method.html for an explanation of these formulas.
+
+MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
+
+Built with:
+
+ gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f 
+
+| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+|       64  |     2816   |     8596   |     7329   |     8187   | |     2887   |    14898   |    14668   |    11108   |
+|       96  |     3298   |      n/a   |     8378   |     7727   | |     3953   |      n/a   |    15680   |    10878   |
+|      128  |     3507   |    11575   |     9266   |    10108   | |     4233   |    17598   |    16427   |    12000   |
+|      160  |     3391   |      n/a   |     9838   |    10711   | |     4220   |      n/a   |    16653   |    11187   |
+|      192  |     3919   |      n/a   |     9868   |    10956   | |     4297   |      n/a   |    15770   |    12540   |
+|      256  |     4283   |    13179   |    10694   |    13128   | |     4545   |    19550   |    16350   |    13822   |
+|      384  |     3136   |      n/a   |    10810   |    12061   | |     3600   |      n/a   |    16103   |    13240   |
+|      480  |     3477   |      n/a   |    10632   |    12074   | |     3536   |      n/a   |    11630   |    12522   |
+|      512  |     3783   |    15141   |    11267   |    13838   | |     3649   |    20002   |    16560   |    13580   |
+|      640  |     3639   |      n/a   |    11164   |    13946   | |     3695   |      n/a   |    15416   |    13890   |
+|      768  |     3800   |      n/a   |    11245   |    13495   | |     3590   |      n/a   |    15802   |    14552   |
+|      800  |     3440   |      n/a   |    10499   |    13301   | |     3659   |      n/a   |    12056   |    13268   |
+|     1024  |     3924   |    15605   |    11450   |    15339   | |     3769   |    20963   |    13941   |    15467   |
+|     2048  |     4518   |    16195   |    11551   |    15532   | |     4258   |    20413   |    13723   |    15042   |
+|     2400  |     4294   |      n/a   |    10685   |    13078   | |     4093   |      n/a   |    12777   |    13119   |
+|     4096  |     4750   |    16596   |    11672   |    15817   | |     4157   |    19662   |    14316   |    14336   |
+|     8192  |     3820   |    16227   |    11084   |    12555   | |     3691   |    18132   |    12102   |    13813   |
+|     9216  |     3864   |      n/a   |    10254   |    12870   | |     3586   |      n/a   |    12119   |    13994   |
+|    16384  |     3822   |    15123   |    10454   |    12822   | |     3613   |    16874   |    12370   |    13881   |
+|    32768  |     4175   |    14512   |    10662   |    11095   | |     3881   |    14702   |    11619   |    11524   |
+|   262144  |     3317   |    11429   |     6269   |     9517   | |     2810   |    11729   |     7757   |    10179   |
+|  1048576  |     2913   |    10551   |     4730   |     5867   | |     2661   |     7881   |     3520   |     5350   |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+
+
+Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
+
+| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+|           64     |      3840    |      7680    |      8777    | |      4389    |     20480    |     11171    |
+|           96     |      4214    |      9633    |      8429    | |      4816    |     22477    |     11238    |
+|          128     |      3584    |     10240    |     10240    | |      5120    |     23893    |     11947    |
+|          192     |      4854    |     11095    |     12945    | |      4854    |     22191    |     14121    |
+|          256     |      4096    |     11703    |     16384    | |      5120    |     23406    |     13653    |
+|          384     |      4395    |     14651    |     12558    | |      4884    |     19535    |     14651    |
+|          512     |      5760    |     13166    |     15360    | |      4608    |     23040    |     15360    |
+|          768     |      4907    |     14020    |     16357    | |      4461    |     19628    |     14020    |
+|         1024     |      5120    |     14629    |     14629    | |      5120    |     20480    |     15754    |
+|         2048     |      5632    |     14080    |     18773    | |      4693    |     12516    |     16091    |
+|         4096     |      5120    |     13653    |     17554    | |      4726    |      7680    |     14456    |
+|         8192     |      4160    |      7396    |     13312    | |      4437    |     14791    |     13312    |
+|         9216     |      4210    |      6124    |     13473    | |      4491    |      7282    |     14970    |
+|        16384     |      3976    |     11010    |     14313    | |      4210    |     11450    |     13631    |
+|        32768     |      4260    |     10224    |     10954    | |      4260    |      6816    |     11797    |
+|       262144     |      3736    |      6896    |      9961    | |      2359    |      8965    |      9437    |
+|      1048576     |      2796    |      4534    |      6453    | |      1864    |      3078    |      5592    |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+
+
+
+MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
+
+Built with:
+
+ gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
+
+| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+|      64   |      745   |     2145   |     1706   |     2028   | |      961   |     3356   |     3313   |     2300   |
+|      96   |      877   |      n/a   |     1976   |     1978   | |     1059   |      n/a   |     3333   |     2233   |
+|     128   |      951   |     2808   |     2213   |     2279   | |     1202   |     3803   |     3739   |     2494   |
+|     192   |     1002   |      n/a   |     2456   |     2429   | |     1186   |      n/a   |     3701   |     2508   |
+|     256   |     1065   |     3205   |     2641   |     2793   | |     1302   |     4013   |     3912   |     2663   |
+|     384   |      845   |      n/a   |     2759   |     2499   | |      948   |      n/a   |     3729   |     2504   |
+|     512   |      900   |     3476   |     2956   |     2759   | |      974   |     4057   |     3954   |     2645   |
+|     768   |      910   |      n/a   |     2912   |     2737   | |      975   |      n/a   |     3837   |     2614   |
+|    1024   |      936   |     3583   |     3107   |     3009   | |     1006   |     4124   |     3821   |     2697   |
+|    2048   |     1057   |     3585   |     3091   |     2837   | |     1089   |     3889   |     3701   |     2513   |
+|    4096   |     1083   |     3524   |     3092   |     2733   | |     1039   |     3617   |     3462   |     2364   |
+|    8192   |      874   |     3252   |     2967   |     2363   | |      911   |     3106   |     2789   |     2302   |
+|    9216   |      898   |      n/a   |     2420   |     2290   | |      865   |      n/a   |     2676   |     2204   |
+|   16384   |      903   |     2892   |     2506   |     2421   | |      899   |     3026   |     2797   |     2289   |
+|   32768   |      965   |     2837   |     2550   |     2358   | |      920   |     2922   |     2763   |     2240   |
+|  262144   |      738   |     2422   |     1589   |     1708   | |      610   |     2038   |     1436   |     1091   |
+| 1048576   |      528   |     1207   |      845   |      880   | |      606   |     1020   |      669   |     1036   |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+
+
+
+Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |     1920   |     3614   |     5120   | |     2194   |     7680   |     6467   |
+|       96  |     1873   |     3549   |     5187   | |     2107   |     8429   |     5863   |
+|      128  |     2240   |     3773   |     5514   | |     2560   |     7964   |     6827   |
+|      192  |     1765   |     4569   |     7767   | |     2284   |     9137   |     7061   |
+|      256  |     2048   |     5461   |     7447   | |     2731   |     9638   |     7802   |
+|      384  |     1998   |     5861   |     6762   | |     2313   |     9253   |     7644   |
+|      512  |     2095   |     6144   |     7680   | |     2194   |    10240   |     7089   |
+|      768  |     2230   |     5773   |     7549   | |     2045   |    10331   |     7010   |
+|     1024  |     2133   |     6400   |     8533   | |     2133   |    10779   |     7877   |
+|     2048  |     2011   |     7040   |     8665   | |     1942   |    10240   |     7768   |
+|     4096  |     2194   |     6827   |     8777   | |     1755   |     9452   |     6827   |
+|     8192  |     1849   |     6656   |     6656   | |     1752   |     7831   |     6827   |
+|     9216  |     1871   |     5858   |     6416   | |     1643   |     6909   |     6266   |
+|    16384  |     1883   |     6223   |     6506   | |     1664   |     7340   |     6982   |
+|    32768  |     1826   |     6390   |     6667   | |     1631   |     7481   |     6971   |
+|   262144  |     1546   |     4075   |     5977   | |     1299   |     3415   |     3551   |
+|  1048576  |     1104   |     2071   |     1730   | |     1104   |     1149   |     1834   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+
+
+Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+|           64     |       452    |      1041    |      1336    | |       549    |      2318    |      1781    |
+|           96     |       444    |      1297    |      1297    | |       503    |      2408    |      1686    |
+|          128     |       527    |      1525    |      1707    | |       543    |      2655    |      1886    |
+|          192     |       498    |      1653    |      1849    | |       539    |      2678    |      1942    |
+|          256     |       585    |      1862    |      2156    | |       594    |      2777    |      2244    |
+|          384     |       499    |      1870    |      1998    | |       511    |      2586    |      1890    |
+|          512     |       562    |      2095    |      2194    | |       542    |      2973    |      2194    |
+|          768     |       545    |      2045    |      2133    | |       545    |      2365    |      2133    |
+|         1024     |       595    |      2133    |      2438    | |       569    |      2695    |      2179    |
+|         2048     |       587    |      2125    |      2347    | |       521    |      2230    |      1707    |
+|         4096     |       495    |      1890    |      1834    | |       492    |      1876    |      1672    |
+|         8192     |       469    |      1548    |      1729    | |       438    |      1740    |      1664    |
+|         9216     |       468    |      1663    |      1663    | |       446    |      1585    |      1531    |
+|        16384     |       453    |      1608    |      1767    | |       398    |      1476    |      1664    |
+|        32768     |       456    |      1420    |      1503    | |       387    |      1388    |      1345    |
+|       262144     |       309    |       385    |       726    | |       262    |       415    |       840    |
+|      1048576     |       280    |       351    |       739    | |       261    |       313    |       797    |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+
+
+
+Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
+
+Built with:
+cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+(visual c++ is definitively not very good with SSE intrinsics...)
+
+| N (input length) | real FFTPack |  real PFFFT  | | cplx FFTPack |  cplx PFFFT  |
+|------------------+--------------+--------------| |--------------+--------------|
+|           64     |       173    |      1009    | |       174    |      1159    |
+|           96     |       169    |      1029    | |       188    |      1201    |
+|          128     |       195    |      1242    | |       191    |      1275    |
+|          192     |       178    |      1312    | |       184    |      1276    |
+|          256     |       196    |      1591    | |       186    |      1281    |
+|          384     |       172    |      1409    | |       181    |      1281    |
+|          512     |       187    |      1640    | |       181    |      1313    |
+|          768     |       171    |      1614    | |       176    |      1258    |
+|         1024     |       186    |      1812    | |       178    |      1223    |
+|         2048     |       190    |      1707    | |       186    |      1099    |
+|         4096     |       182    |      1446    | |       177    |       975    |
+|         8192     |       175    |      1345    | |       169    |      1034    |
+|         9216     |       165    |      1271    | |       168    |      1023    |
+|        16384     |       166    |      1396    | |       165    |       949    |
+|        32768     |       172    |      1311    | |       161    |       881    |
+|       262144     |       136    |       632    | |       134    |       629    |
+|      1048576     |       134    |       698    | |       127    |       623    |
+|------------------+--------------+--------------| |--------------+--------------|
+
+
+
+Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
+
+Built with:
+gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |      549   |      452   |      731   | |      512   |      602   |      640   |
+|       96  |      421   |      272   |      702   | |      496   |      571   |      602   |
+|      128  |      498   |      512   |      815   | |      597   |      618   |      652   |
+|      160  |      521   |      536   |      815   | |      586   |      669   |      625   |
+|      192  |      539   |      571   |      883   | |      485   |      597   |      626   |
+|      256  |      640   |      539   |      975   | |      569   |      611   |      671   |
+|      384  |      499   |      610   |      879   | |      499   |      602   |      637   |
+|      480  |      518   |      507   |      877   | |      496   |      661   |      616   |
+|      512  |      524   |      591   |     1002   | |      549   |      678   |      668   |
+|      640  |      542   |      612   |      955   | |      568   |      663   |      645   |
+|      768  |      557   |      613   |      981   | |      491   |      663   |      598   |
+|      800  |      514   |      353   |      882   | |      514   |      360   |      574   |
+|     1024  |      640   |      640   |     1067   | |      492   |      683   |      602   |
+|     2048  |      587   |      640   |      908   | |      486   |      640   |      552   |
+|     2400  |      479   |      368   |      777   | |      422   |      376   |      518   |
+|     4096  |      511   |      614   |      853   | |      426   |      640   |      534   |
+|     8192  |      415   |      584   |      708   | |      386   |      622   |      516   |
+|     9216  |      419   |      571   |      687   | |      364   |      586   |      506   |
+|    16384  |      426   |      577   |      716   | |      398   |      606   |      530   |
+|    32768  |      417   |      572   |      673   | |      399   |      572   |      468   |
+|   262144  |      219   |      380   |      293   | |      255   |      431   |      343   |
+|  1048576  |      202   |      274   |      237   | |      265   |      282   |      355   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+Same platform as above, but this time pffft and fftpack are built with clang 3.2:
+
+clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |      427   |      452   |      853   | |      427   |      602   |     1024   |
+|       96  |      351   |      276   |      843   | |      337   |      571   |      963   |
+|      128  |      373   |      512   |      996   | |      390   |      618   |     1054   |
+|      160  |      426   |      536   |      987   | |      375   |      669   |      914   |
+|      192  |      404   |      571   |     1079   | |      388   |      588   |     1079   |
+|      256  |      465   |      539   |     1205   | |      445   |      602   |     1170   |
+|      384  |      366   |      610   |     1099   | |      343   |      594   |     1099   |
+|      480  |      356   |      507   |     1140   | |      335   |      651   |      931   |
+|      512  |      411   |      591   |     1213   | |      384   |      649   |     1124   |
+|      640  |      398   |      612   |     1193   | |      373   |      654   |      901   |
+|      768  |      409   |      613   |     1227   | |      383   |      663   |     1044   |
+|      800  |      411   |      348   |     1073   | |      353   |      358   |      809   |
+|     1024  |      427   |      640   |     1280   | |      413   |      692   |     1004   |
+|     2048  |      414   |      626   |     1126   | |      371   |      640   |      853   |
+|     2400  |      399   |      373   |      898   | |      319   |      368   |      653   |
+|     4096  |      404   |      602   |     1059   | |      357   |      633   |      778   |
+|     8192  |      332   |      584   |      792   | |      308   |      616   |      716   |
+|     9216  |      322   |      561   |      783   | |      299   |      586   |      687   |
+|    16384  |      344   |      568   |      778   | |      314   |      617   |      745   |
+|    32768  |      342   |      564   |      737   | |      314   |      552   |      629   |
+|   262144  |      201   |      383   |      313   | |      227   |      435   |      413   |
+|  1048576  |      187   |      262   |      251   | |      228   |      281   |      409   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
+(the fftpack performance numbers are better with gcc), while clang is
+the best with neon intrinsics (see how pffft perf has improved with
+clang 3.2).
+
+
+NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
+
+Built with:
+gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
+
+| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
+|-----------+------------+------------| |------------+------------|
+|       64  |     1735   |     3308   | |     1994   |     3744   |
+|       96  |     1596   |     3448   | |     1987   |     3572   |
+|      128  |     1807   |     4076   | |     2255   |     3960   |
+|      160  |     1769   |     4083   | |     2071   |     3845   |
+|      192  |     1990   |     4233   | |     2017   |     3939   |
+|      256  |     2191   |     4882   | |     2254   |     4346   |
+|      384  |     1878   |     4492   | |     2073   |     4012   |
+|      480  |     1748   |     4398   | |     1923   |     3951   |
+|      512  |     2030   |     5064   | |     2267   |     4195   |
+|      640  |     1918   |     4756   | |     2094   |     4184   |
+|      768  |     2099   |     4907   | |     2048   |     4297   |
+|      800  |     1822   |     4555   | |     1880   |     4063   |
+|     1024  |     2232   |     5355   | |     2187   |     4420   |
+|     2048  |     2176   |     4983   | |     2027   |     3602   |
+|     2400  |     1741   |     4256   | |     1710   |     3344   |
+|     4096  |     1816   |     3914   | |     1851   |     3349   |
+|     8192  |     1716   |     3481   | |     1700   |     3255   |
+|     9216  |     1735   |     3589   | |     1653   |     3094   |
+|    16384  |     1567   |     3483   | |     1637   |     3244   |
+|    32768  |     1624   |     3240   | |     1655   |     3156   |
+|   262144  |     1012   |     1898   | |      983   |     1503   |
+|  1048576  |      876   |     1154   | |      868   |     1341   |
+|-----------+------------+------------| |------------+------------|
+
+The performance on the tegra K1 is pretty impressive. I'm not
+including the FFTW numbers as they as slightly below the scalar
+fftpack numbers, so something must be wrong (however it seems to be
+correctly configured and is using neon simd instructions).
+
+When using clang 3.4 the pffft version is even a bit faster, reaching
+5.7 GFlops for real ffts of size 1024.
+
+
+iPad Air 2 with iOS9, xcode 8.0, arm64. The cpu is an Apple A8X, supposedly running at 1.5GHz.
+
+| input len |real FFTPack|  real vDSP | real PFFFT | |cplx FFTPack|  cplx vDSP | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |     2517   |     7995   |     6086   | |     2725   |    13006   |     8495   |
+|       96  |     2442   |      n/a   |     6691   | |     2256   |      n/a   |     7991   |
+|      128  |     2664   |    10186   |     7877   | |     2575   |    15115   |     9115   |
+|      160  |     2638   |      n/a   |     8283   | |     2682   |      n/a   |     8806   |
+|      192  |     2903   |      n/a   |     9083   | |     2634   |      n/a   |     8980   |
+|      256  |     3184   |    11452   |    10039   | |     3026   |    15410   |    10199   |
+|      384  |     2665   |      n/a   |    10100   | |     2275   |      n/a   |     9247   |
+|      480  |     2546   |      n/a   |     9863   | |     2341   |      n/a   |     8892   |
+|      512  |     2832   |    12197   |    10989   | |     2547   |    16768   |    10154   |
+|      640  |     2755   |      n/a   |    10461   | |     2569   |      n/a   |     9666   |
+|      768  |     2998   |      n/a   |    11355   | |     2585   |      n/a   |     9813   |
+|      800  |     2516   |      n/a   |    10332   | |     2433   |      n/a   |     9164   |
+|     1024  |     3109   |    12965   |    12114   | |     2869   |    16448   |    10519   |
+|     2048  |     3027   |    12996   |    12023   | |     2648   |    17304   |    10307   |
+|     2400  |     2515   |      n/a   |    10372   | |     2355   |      n/a   |     8443   |
+|     4096  |     3204   |    13603   |    12359   | |     2814   |    16570   |     9780   |
+|     8192  |     2759   |    13422   |    10824   | |     2153   |    15652   |     7884   |
+|     9216  |     2700   |      n/a   |     9938   | |     2241   |      n/a   |     7900   |
+|    16384  |     2280   |    13057   |     7976   | |      593   |     4272   |     2534   |
+|    32768  |      768   |     4269   |     2882   | |      606   |     4405   |     2604   |
+|   262144  |      724   |     3527   |     2630   | |      534   |     2418   |     2157   |
+|  1048576  |      674   |     1467   |     2135   | |      530   |     1621   |     2055   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+I double-checked to make sure I did not make a mistake in the time
+measurements, as the numbers are much higher than what I initially
+expected. They are in fact higher than the number I get on the 2.8GHz
+Xeon of my 2008 mac pro.. (except for FFT lengths >= 32768 where
+having a big cache is useful). A good surprise is also that the perf
+is not too far from apple's vDSP (at least for the real FFT).
+

From c39787ddc7b874553ecd60c641ce9dd0dceb51e2 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:33 +0000
Subject: [PATCH 06/29] Testing pffft - elaborate output

---
 oss-internship-2020/pffft/README.txt          |  6 ++-
 .../pffft/test_pffft_sandboxed.cc             | 41 ++++++++++++++++---
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
index ef51f1a..5688ac9 100644
--- a/oss-internship-2020/pffft/README.txt
+++ b/oss-internship-2020/pffft/README.txt
@@ -16,4 +16,8 @@ CMake observations:
 Sandboxed main observations:
     * containing two testing parts (fft / pffft benchmarks)
         ! current stage: fft - works :)
-                         pffft - not implemented
+                         pffft - implemented
+    * pffft benchmark bug: "Sandbox not active" 
+                            => loop in pffft_transform for N = 64 (why?); 
+                               N = 64, status OK, pffft_transform generates error 
+                               N > 64, status not OK
diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
index d50dff3..5a56a6f 100644
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -60,12 +60,6 @@ void show_output(const char* name, int N, int cplx, float flops, float t0,
   fflush(stdout);
 }
 
-/*
-  For debug:
-  SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100
-  --sandbox2_danger_danger_permit_all_and_log my_aux_file
-*/
-
 int main(int argc, char* argv[]) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   int Nvalues[] = {64,    96,     128,        160,         192,     256,
@@ -148,6 +142,41 @@ int main(int argc, char* argv[]) {
       flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
       show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
     }
+
+    // PFFFT benchmark
+    {
+      sapi::StatusOr<PFFFT_Setup*> s =
+          api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+
+      printf("%s\n", s.status().ToString().c_str());
+
+      if (s.ok()) {
+        sapi::v::GenericPtr s_reg(s.value());
+
+        t0 = uclock_sec();
+        for (iter = 0; iter < max_iter; ++iter) {
+          printf("%s\n",
+                 api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(),
+                                     Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD)
+                     .ToString()
+                     .c_str());
+          printf("%s\n",
+                 api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(),
+                                     Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD)
+                     .ToString()
+                     .c_str());
+        }
+
+        t1 = uclock_sec();
+        printf("%s\n",
+               api.pffft_destroy_setup(s_reg.PtrBoth()).ToString().c_str());
+
+        flops =
+            (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+        show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+      }
+      printf("\n\n");
+    }
   }
 
   return 0;

From d51d558083b57481971b46998f20928b57e9b59e Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:42 +0000
Subject: [PATCH 07/29] Update .gitignore

---
 oss-internship-2020/pffft/.gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/oss-internship-2020/pffft/.gitignore b/oss-internship-2020/pffft/.gitignore
index f9ed608..0469302 100644
--- a/oss-internship-2020/pffft/.gitignore
+++ b/oss-internship-2020/pffft/.gitignore
@@ -1 +1,6 @@
 .hg/
+pffft.o
+test_pffft.o
+fftpack.o
+libpffft.a
+pffft_main
\ No newline at end of file

From 3fb4d5954513dbdc43272a7220ebdb3aca9c68ff Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:49 +0000
Subject: [PATCH 08/29] Sandbox not active error tracking observations

---
 oss-internship-2020/pffft/README.txt              | 7 +++++++
 oss-internship-2020/pffft/test_pffft_sandboxed.cc | 6 ++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
index 5688ac9..9c2556a 100644
--- a/oss-internship-2020/pffft/README.txt
+++ b/oss-internship-2020/pffft/README.txt
@@ -21,3 +21,10 @@ Sandboxed main observations:
                             => loop in pffft_transform for N = 64 (why?); 
                                N = 64, status OK, pffft_transform generates error 
                                N > 64, status not OK
+                               Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; 
+                               the memory that stays for s is not the same with the address passed
+                               in pffft_transform function. 
+                               (sapi::v::GenericPtr to be changed?)
+                               
+                               Temporary solution (not done): change the generated files to accept
+                               uintptr_t instead of PFFFT_Setup
diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
index 5a56a6f..8c91a77 100644
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -73,6 +73,8 @@ int main(int argc, char* argv[]) {
   pffftSapiSandbox sandbox;
   sandbox.Init().IgnoreError();
 
+  printf("%s\n", sandbox.Init().ToString().c_str());
+
   pffftApi api(&sandbox);
 
   int N, cplx;
@@ -164,7 +166,7 @@ int main(int argc, char* argv[]) {
                  api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(),
                                      Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD)
                      .ToString()
-                     .c_str());
+                     .c_str()); 
         }
 
         t1 = uclock_sec();
@@ -175,7 +177,7 @@ int main(int argc, char* argv[]) {
             (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
         show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
       }
-      printf("\n\n");
+      printf("\n\n"); 
     }
   }
 

From 06bf6cdd34a172bdd0048720d62bbda856d5593f Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 17 Aug 2020 11:21:56 +0000
Subject: [PATCH 09/29] Sandbox not active error - resolved

---
 oss-internship-2020/pffft/README.txt          | 26 +++++++++--------
 .../pffft/test_pffft_sandboxed.cc             | 29 ++++++++-----------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
index 9c2556a..c7ac70b 100644
--- a/oss-internship-2020/pffft/README.txt
+++ b/oss-internship-2020/pffft/README.txt
@@ -16,15 +16,17 @@ CMake observations:
 Sandboxed main observations:
     * containing two testing parts (fft / pffft benchmarks)
         ! current stage: fft - works :)
-                         pffft - implemented
-    * pffft benchmark bug: "Sandbox not active" 
-                            => loop in pffft_transform for N = 64 (why?); 
-                               N = 64, status OK, pffft_transform generates error 
-                               N > 64, status not OK
-                               Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; 
-                               the memory that stays for s is not the same with the address passed
-                               in pffft_transform function. 
-                               (sapi::v::GenericPtr to be changed?)
-                               
-                               Temporary solution (not done): change the generated files to accept
-                               uintptr_t instead of PFFFT_Setup
+                        pffft - implemented
+    * (Solved) pffft benchmark bug: "Sandbox not active"  
+                        N = 64, status OK, pffft_transform generates error 
+                        N > 64, status not OK
+                        Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; 
+                        the memory that stays for s is not the same with the address passed
+                        in pffft_transform function. 
+                        (sapi :: v :: GenericPtr - to be changed)
+
+                        Temporary solution: change the generated files to accept
+                        uintptr_t instead of PFFFT_Setup
+
+                        Solution: using "sapi :: v :: RemotePtr" instead of "sapi :: v :: GenericPtr"
+                                  to access the memory of object s
diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
index 8c91a77..e4b4602 100644
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -73,7 +73,7 @@ int main(int argc, char* argv[]) {
   pffftSapiSandbox sandbox;
   sandbox.Init().IgnoreError();
 
-  printf("%s\n", sandbox.Init().ToString().c_str());
+  printf("Initialization: %s\n", sandbox.Init().ToString().c_str());
 
   pffftApi api(&sandbox);
 
@@ -81,7 +81,7 @@ int main(int argc, char* argv[]) {
 
   cplx = 0;
 
-  for (i = 0; i < 5; i++) {
+  for (i = 0; i < 23; i++) {
     N = Nvalues[i];
 
     int Nfloat = N * (cplx ? 2 : 1);
@@ -150,34 +150,29 @@ int main(int argc, char* argv[]) {
       sapi::StatusOr<PFFFT_Setup*> s =
           api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
 
-      printf("%s\n", s.status().ToString().c_str());
+      printf("Setup status is: %s\n", s.status().ToString().c_str());
 
       if (s.ok()) {
-        sapi::v::GenericPtr s_reg(s.value());
+        sapi::v::RemotePtr s_reg(s.value());
 
         t0 = uclock_sec();
         for (iter = 0; iter < max_iter; ++iter) {
-          printf("%s\n",
-                 api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(),
-                                     Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD)
-                     .ToString()
-                     .c_str());
-          printf("%s\n",
-                 api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(),
-                                     Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD)
-                     .ToString()
-                     .c_str()); 
+          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
+                              PFFFT_FORWARD)
+              .IgnoreError();
+          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
+                              PFFFT_FORWARD)
+              .IgnoreError();
         }
 
         t1 = uclock_sec();
-        printf("%s\n",
-               api.pffft_destroy_setup(s_reg.PtrBoth()).ToString().c_str());
+        api.pffft_destroy_setup(&s_reg).IgnoreError();
 
         flops =
             (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
         show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
       }
-      printf("\n\n"); 
+      printf("\n\n");
     }
   }
 

From 096d02625dd616fadb536cd29ed204aa8e2452ea Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 07:44:27 +0000
Subject: [PATCH 10/29] Modified sapi root to a general path

---
 oss-internship-2020/pffft/.gitignore     | 7 ++-----
 oss-internship-2020/pffft/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/oss-internship-2020/pffft/.gitignore b/oss-internship-2020/pffft/.gitignore
index 0469302..ad64a9d 100644
--- a/oss-internship-2020/pffft/.gitignore
+++ b/oss-internship-2020/pffft/.gitignore
@@ -1,6 +1,3 @@
-.hg/
-pffft.o
-test_pffft.o
-fftpack.o
-libpffft.a
+*.o
+*.a
 pffft_main
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index c0d62cb..a9ba38a 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -31,7 +31,7 @@ target_link_libraries(pffft PUBLIC ${MATH_LIBS})
 
 
 # Adding dependencies
-set(SAPI_ROOT "/usr/local/google/home/inach/sandboxed-api" CACHE PATH "Path to the Sandboxed API source tree")
+set(SAPI_ROOT "../.." CACHE PATH "Path to the Sandboxed API source tree")
 # Then configure:
 #   mkdir -p build && cd build
 #   cmake .. -G Ninja -DSAPI_ROOT=$HOME/sapi_root

From 94fcf82dd3aaf68481f0145338ec2f4b3d9941e6 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 08:21:32 +0000
Subject: [PATCH 11/29] Added comment on Nvalues[] and N purpose

---
 .../pffft/test_pffft_sandboxed.cc             | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
index e4b4602..e1b897e 100644
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -62,6 +62,13 @@ void show_output(const char* name, int N, int cplx, float flops, float t0,
 
 int main(int argc, char* argv[]) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
+  /*
+  * Nvalues is a vector keeping the values by which iterates N, its value 
+  * representing the input length. More concrete, N is the number of
+  * data points the caclulus is up to (determinating its accuracy).
+  * To show the performance of Fast-Fourier Transformations the program is 
+  * testing for various values of N.
+  */
   int Nvalues[] = {64,    96,     128,        160,         192,     256,
                    384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
                    1024,  2048,   2400,       4096,        8192,    9 * 1024,
@@ -118,10 +125,16 @@ int main(int argc, char* argv[]) {
       X[k] = 0;
     }
 
-    // FFTPack benchmark
+    /*
+    * FFTPack benchmark
+    */
     {
+      /*
+      * SIMD_SZ == 4 (returning value of pffft_simd_size())
+      */
       int max_iter_ =
-          max_iter / 4;  // SIMD_SZ == 4 (returning value of pffft_simd_size())
+          max_iter / 4; 
+
       if (max_iter_ == 0) max_iter_ = 1;
       if (cplx) {
         api.cffti(N, wrk_.PtrBoth()).IgnoreError();
@@ -145,7 +158,9 @@ int main(int argc, char* argv[]) {
       show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
     }
 
-    // PFFFT benchmark
+    /*
+    * PFFFT benchmark
+    */ 
     {
       sapi::StatusOr<PFFFT_Setup*> s =
           api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);

From 257e87e07669c29bc8fb3b32bddd66b6a43d44d7 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 08:58:30 +0000
Subject: [PATCH 12/29] Added log printing for debug

---
 .../pffft/test_pffft_sandboxed.cc             | 179 ++++++++++--------
 1 file changed, 95 insertions(+), 84 deletions(-)

diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
index e1b897e..523dce0 100644
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
@@ -1,4 +1,5 @@
 #include <assert.h>
+#include <glog/logging.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -61,6 +62,11 @@ void show_output(const char* name, int N, int cplx, float flops, float t0,
 }
 
 int main(int argc, char* argv[]) {
+  /*
+  * Initialize Google's logging library.
+  */
+  google::InitGoogleLogging(argv[0]);
+
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   /*
   * Nvalues is a vector keeping the values by which iterates N, its value 
@@ -75,12 +81,12 @@ int main(int argc, char* argv[]) {
                    16384, 32768,  256 * 1024, 1024 * 1024, -1};
   int i;
 
-  printf("initializing sandbox...\n");
+  VLOG(1) << "Initializing sandbox...\n";
 
   pffftSapiSandbox sandbox;
   sandbox.Init().IgnoreError();
 
-  printf("Initialization: %s\n", sandbox.Init().ToString().c_str());
+  VLOG(1) << "Initialization: " << sandbox.Init().ToString().c_str() << "\n";
 
   pffftApi api(&sandbox);
 
@@ -88,108 +94,113 @@ int main(int argc, char* argv[]) {
 
   cplx = 0;
 
-  for (i = 0; i < 23; i++) {
-    N = Nvalues[i];
+  do {
+    for (i = 0; i < 23; i++) {
+      N = Nvalues[i];
 
-    int Nfloat = N * (cplx ? 2 : 1);
-    int Nbytes = Nfloat * sizeof(float);
-    int pass;
+      int Nfloat = N * (cplx ? 2 : 1);
+      int Nbytes = Nfloat * sizeof(float);
+      int pass;
 
-    float ref[Nbytes], in[Nbytes], out[Nbytes], tmp[Nbytes], tmp2[Nbytes];
+      float ref[Nbytes], in[Nbytes], out[Nbytes], tmp[Nbytes], tmp2[Nbytes];
 
-    sapi::v::Array<float> ref_(ref, Nbytes);
-    sapi::v::Array<float> in_(in, Nbytes);
-    sapi::v::Array<float> out_(out, Nbytes);
-    sapi::v::Array<float> tmp_(tmp, Nbytes);
-    sapi::v::Array<float> tmp2_(tmp2, Nbytes);
+      sapi::v::Array<float> ref_(ref, Nbytes);
+      sapi::v::Array<float> in_(in, Nbytes);
+      sapi::v::Array<float> out_(out, Nbytes);
+      sapi::v::Array<float> tmp_(tmp, Nbytes);
+      sapi::v::Array<float> tmp2_(tmp2, Nbytes);
 
-    float wrk[2 * Nbytes + 15 * sizeof(float)];
-    sapi::v::Array<float> wrk_(wrk, 2 * Nbytes + 15 * sizeof(float));
+      float wrk[2 * Nbytes + 15 * sizeof(float)];
+      sapi::v::Array<float> wrk_(wrk, 2 * Nbytes + 15 * sizeof(float));
 
-    float ref_max = 0;
-    int k;
+      float ref_max = 0;
+      int k;
 
-    Nfloat = (cplx ? N * 2 : N);
-    float X[Nbytes], Y[Nbytes], Z[Nbytes];
-    sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
+      Nfloat = (cplx ? N * 2 : N);
+      float X[Nbytes], Y[Nbytes], Z[Nbytes];
+      sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
 
-    double t0, t1, flops;
+      double t0, t1, flops;
 
-    int max_iter = 5120000 / N * 4;
-#ifdef __arm__
-    max_iter /= 4;
-#endif
-    int iter;
+      int max_iter = 5120000 / N * 4;
+  #ifdef __arm__
+      max_iter /= 4;
+  #endif
+      int iter;
 
-    for (k = 0; k < Nfloat; ++k) {
-      X[k] = 0;
-    }
+      for (k = 0; k < Nfloat; ++k) {
+        X[k] = 0;
+      }
 
-    /*
-    * FFTPack benchmark
-    */
-    {
       /*
-      * SIMD_SZ == 4 (returning value of pffft_simd_size())
+      * FFTPack benchmark
       */
-      int max_iter_ =
-          max_iter / 4; 
+      {
+        /*
+        * SIMD_SZ == 4 (returning value of pffft_simd_size())
+        */
+        int max_iter_ =
+            max_iter / 4; 
 
-      if (max_iter_ == 0) max_iter_ = 1;
-      if (cplx) {
-        api.cffti(N, wrk_.PtrBoth()).IgnoreError();
-      } else {
-        api.rffti(N, wrk_.PtrBoth()).IgnoreError();
-      }
-      t0 = uclock_sec();
-
-      for (iter = 0; iter < max_iter_; ++iter) {
+        if (max_iter_ == 0) max_iter_ = 1;
         if (cplx) {
-          api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-          api.cfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          api.cffti(N, wrk_.PtrBoth()).IgnoreError();
         } else {
-          api.rfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-          api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          api.rffti(N, wrk_.PtrBoth()).IgnoreError();
         }
-      }
-      t1 = uclock_sec();
-
-      flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-      show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
-    }
-
-    /*
-    * PFFFT benchmark
-    */ 
-    {
-      sapi::StatusOr<PFFFT_Setup*> s =
-          api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-
-      printf("Setup status is: %s\n", s.status().ToString().c_str());
-
-      if (s.ok()) {
-        sapi::v::RemotePtr s_reg(s.value());
-
         t0 = uclock_sec();
-        for (iter = 0; iter < max_iter; ++iter) {
-          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
-                              PFFFT_FORWARD)
-              .IgnoreError();
-          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
-                              PFFFT_FORWARD)
-              .IgnoreError();
+
+        for (iter = 0; iter < max_iter_; ++iter) {
+          if (cplx) {
+            api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+            api.cfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          } else {
+            api.rfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+            api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          }
+        }
+        t1 = uclock_sec();
+
+        flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+        show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+      }
+
+      /*
+      * PFFFT benchmark
+      */ 
+      {
+        sapi::StatusOr<PFFFT_Setup*> s =
+            api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+
+        VLOG(1) << "Setup status is: " << s.status().ToString().c_str() << "\n";
+
+        if (s.ok()) {
+          sapi::v::RemotePtr s_reg(s.value());
+
+          t0 = uclock_sec();
+          for (iter = 0; iter < max_iter; ++iter) {
+            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
+                                PFFFT_FORWARD)
+                .IgnoreError();
+            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
+                                PFFFT_FORWARD)
+                .IgnoreError();
+          }
+
+          t1 = uclock_sec();
+          api.pffft_destroy_setup(&s_reg).IgnoreError();
+
+          flops =
+              (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+          show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
         }
 
-        t1 = uclock_sec();
-        api.pffft_destroy_setup(&s_reg).IgnoreError();
-
-        flops =
-            (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-        show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+        VLOG(1) << "N = " << N << " SUCCESSFULLY\n\n";
       }
-      printf("\n\n");
-    }
-  }
+    } 
+
+    cplx = !cplx;
+  } while (cplx);
 
   return 0;
 }
\ No newline at end of file

From 25d18f985d258a34bc9b89fb41151174ab8f729b Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 11:19:49 +0000
Subject: [PATCH 13/29] Updated README and change names of main files

---
 oss-internship-2020/pffft/README.md           |  67 +++
 oss-internship-2020/pffft/main_pffft.c        | 419 ++++++++++++++++++
 .../pffft/main_pffft_sandboxed.cc             | 208 +++++++++
 .../pffft/pffft_library_notes.txt             | 416 +++++++++++++++++
 4 files changed, 1110 insertions(+)
 create mode 100644 oss-internship-2020/pffft/README.md
 create mode 100644 oss-internship-2020/pffft/main_pffft.c
 create mode 100644 oss-internship-2020/pffft/main_pffft_sandboxed.cc
 create mode 100644 oss-internship-2020/pffft/pffft_library_notes.txt

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
new file mode 100644
index 0000000..52facf3
--- /dev/null
+++ b/oss-internship-2020/pffft/README.md
@@ -0,0 +1,67 @@
+# Sandboxing PFFFT library
+
+Builder: CMake  
+OS: Linux
+
+### For testing: 
+`cd build`, then `./pffft_sandboxed`
+
+### For debug:
+`SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100
+--sandbox2_danger_danger_permit_all_and_log <auxiliar file>`
+
+## ***About the project*** 
+*PFFFT library is concerned with 1D Fast-Fourier Transformations finding a
+compromise between accuracy and speed. It deals with real and complex
+vectors, both cases being illustrated in the testing part (`main_pffft.c` 
+for initially and original version, `main_pffft_sandboxed.cc` for our 
+currently implemented sandboxed version).
+The original files can be found at: https://bitbucket.org/jpommier/pffft/src.*
+
+*The purpose of sandboxing is to limit the permissions and capabilities of 
+library’s methods, in order to secure the usage of them. 
+After obtaining the sandbox, the functions will be called through an 
+Sandbox API (being called `api` in the current test) and so, the 
+operations, system calls or namspaces access may be controlled. 
+From both `pffft.h` and `fftpack.h` headers, useful methods are added to 
+sapi library builded with CMake. There is also a need to link math library 
+as the transformations made require mathematical operators. 
+Regarding the testing of the methods, one main is doing this job by 
+iterating through a set of values, that represents the accuracy of 
+transformations and print the speed for each value and type of 
+transformation. More specifically, the input length is the target for 
+accuracy (named as `N`) and it stands for the number of data points from 
+the series that calculate the result of transformation. It is also 
+important to mention that the `cplx` variable stands for a boolean value 
+that tells the type of transformation (0 for REAL and 1 for COMPLEX) and 
+it is taken into account while testing.
+In the end, the performance of PFFFT library it is outlined by the output.*
+
+#### CMake observations resume:
+    * linking pffft and fftpack (which contains necessary functions for pffft)
+    * set math library 
+
+#### Sandboxed main observations resume:
+    * containing two testing parts (fft / pffft benchmarks)
+    * showing the performance of the transformations implies 
+    testing them through various FFT dimenstions. 
+    Variable N, the input length, will take specific values 
+    meaning the number of points to which it is set the calculus 
+    (more details of mathematical purpose of N - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm). 
+    * output shows speed depending on the input length
+
+    
+
+### Bugs history
+    - [Solved] pffft benchmark bug: "Sandbox not active"  
+    N = 64, status OK, pffft_transform generates error 
+    N > 64, status not OK
+    Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; the memory that stays 
+    for s is not the same with the address passed in pffft_transform function. 
+    (sapi :: v :: GenericPtr - to be changed)
+
+    Temporary solution: change the generated files to accept 
+    uintptr_t instead of PFFFT_Setup
+
+    Solution: using "sapi::v::RemotePtr" instead of "sapi::v::GenericPtr" 
+    to access the memory of object s
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/main_pffft.c b/oss-internship-2020/pffft/main_pffft.c
new file mode 100644
index 0000000..a5d20c2
--- /dev/null
+++ b/oss-internship-2020/pffft/main_pffft.c
@@ -0,0 +1,419 @@
+/*
+  Copyright (c) 2013 Julien Pommier.
+
+  Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
+
+  How to build: 
+
+  on linux, with fftw3:
+  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+  on macos, without fftw3:
+  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
+
+  on macos, with fftw3:
+  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
+
+  on windows, with visual c++:
+  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+  
+  build without SIMD instructions:
+  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
+
+ */
+
+#include "pffft.h"
+#include "fftpack.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_TIMES
+#  include <sys/times.h>
+#  include <unistd.h>
+#endif
+
+#ifdef HAVE_VECLIB
+#  include <Accelerate/Accelerate.h>
+#endif
+
+#ifdef HAVE_FFTW
+#  include <fftw3.h>
+#endif
+
+#define MAX(x,y) ((x)>(y)?(x):(y))
+
+double frand() {
+  return rand()/(double)RAND_MAX;
+}
+
+#if defined(HAVE_SYS_TIMES)
+  inline double uclock_sec(void) {
+    static double ttclk = 0.;
+    if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
+    struct tms t; return ((double)times(&t)) / ttclk;
+  }
+# else
+  double uclock_sec(void)
+{ return (double)clock()/(double)CLOCKS_PER_SEC; }
+#endif
+
+
+/* compare results with the regular fftpack */
+void pffft_validate_N(int N, int cplx) {
+  int Nfloat = N*(cplx?2:1);
+  int Nbytes = Nfloat * sizeof(float);
+  float *ref, *in, *out, *tmp, *tmp2;
+  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+  int pass;
+
+  if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
+  ref = pffft_aligned_malloc(Nbytes);
+  in = pffft_aligned_malloc(Nbytes);
+  out = pffft_aligned_malloc(Nbytes);
+  tmp = pffft_aligned_malloc(Nbytes);
+  tmp2 = pffft_aligned_malloc(Nbytes);
+
+  for (pass=0; pass < 2; ++pass) {
+    float ref_max = 0;
+    int k;
+    //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx);
+    // compute reference solution with FFTPACK
+    if (pass == 0) {
+      float *wrk = malloc(2*Nbytes+15*sizeof(float));
+      for (k=0; k < Nfloat; ++k) {
+        ref[k] = in[k] = frand()*2-1; 
+        out[k] = 1e30;
+      }
+      if (!cplx) {
+        rffti(N, wrk);
+        rfftf(N, ref, wrk);
+        // use our ordering for real ffts instead of the one of fftpack
+        {
+          float refN=ref[N-1];
+          for (k=N-2; k >= 1; --k) ref[k+1] = ref[k]; 
+          ref[1] = refN;
+        }
+      } else {
+        cffti(N, wrk);
+        cfftf(N, ref, wrk);
+      }
+      free(wrk);
+    }
+
+    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, fabs(ref[k]));
+
+      
+    // pass 0 : non canonical ordering of transform coefficients  
+    if (pass == 0) {
+      // test forward transform, with different input / output
+      pffft_transform(s, in, tmp, 0, PFFFT_FORWARD);
+      memcpy(tmp2, tmp, Nbytes);
+      memcpy(tmp, in, Nbytes);
+      pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == tmp[k]);
+      }
+
+      // test reordering
+      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
+      pffft_zreorder(s, out, tmp, PFFFT_BACKWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == tmp[k]);
+      }
+      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
+    } else {
+      // pass 1 : canonical ordering of transform coeffs.
+      pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD);
+      memcpy(tmp2, tmp, Nbytes);
+      memcpy(tmp, in, Nbytes);
+      pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == tmp[k]);
+      }
+      memcpy(out, tmp, Nbytes);
+    }
+
+    {
+      for (k=0; k < Nfloat; ++k) {
+        if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
+          printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
+          exit(1);
+        }
+      }
+        
+      if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD);
+      else   pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD);
+      memcpy(tmp2, out, Nbytes);
+      memcpy(out, tmp, Nbytes);
+      if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
+      else   pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD);
+      for (k = 0; k < Nfloat; ++k) {
+        assert(tmp2[k] == out[k]);
+        out[k] *= 1.f/N;
+      }
+      for (k = 0; k < Nfloat; ++k) {
+        if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
+          printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
+          exit(1);
+        }
+      }
+    }
+
+    // quick test of the circular convolution in fft domain
+    {
+      float conv_err = 0, conv_max = 0;
+
+      pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
+      memset(out, 0, Nbytes);
+      pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
+      pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
+      
+      for (k=0; k < Nfloat; k += 2) {
+        float ar = tmp[k], ai=tmp[k+1];
+        if (cplx || k > 0) {
+          tmp[k] = ar*ar - ai*ai;
+          tmp[k+1] = 2*ar*ai;
+        } else {
+          tmp[0] = ar*ar;
+          tmp[1] = ai*ai;
+        }
+      }
+      
+      for (k=0; k < Nfloat; ++k) {
+        float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
+        if (d > conv_err) conv_err = d;
+        if (e > conv_max) conv_max = e;
+      }
+      if (conv_err > 1e-5*conv_max) {
+        printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
+      }
+    }
+
+  }
+
+  printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
+  
+  pffft_destroy_setup(s);
+  pffft_aligned_free(ref);
+  pffft_aligned_free(in);
+  pffft_aligned_free(out);
+  pffft_aligned_free(tmp);
+  pffft_aligned_free(tmp2);
+}
+
+void pffft_validate(int cplx) {
+  static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
+  int k;
+  for (k = 0; Ntest[k]; ++k) {
+    int N = Ntest[k];
+    if (N == 16 && !cplx) continue;
+    pffft_validate_N(N, cplx);
+  }
+}
+
+int array_output_format = 0;
+
+void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
+  float mflops = flops/1e6/(t1 - t0 + 1e-16);
+  if (array_output_format) {
+    if (flops != -1) {
+      printf("|%9.0f   ", mflops);
+    } else printf("|      n/a   ");
+  } else {
+    if (flops != -1) {
+      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
+    }
+  }
+  fflush(stdout);
+}
+
+void benchmark_ffts(int N, int cplx) {
+  int Nfloat = (cplx ? N*2 : N);
+  int Nbytes = Nfloat * sizeof(float);
+  float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);
+
+  double t0, t1, flops;
+
+  int k;
+  int max_iter = 5120000/N*4;
+#ifdef __arm__
+  max_iter /= 4;
+#endif
+  int iter;
+
+  for (k = 0; k < Nfloat; ++k) {
+    X[k] = 0; //sqrtf(k+1);
+  }
+
+  // FFTPack benchmark
+  {
+    float *wrk = malloc(2*Nbytes + 15*sizeof(float));
+    int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
+    if (cplx) cffti(N, wrk);
+    else      rffti(N, wrk);
+    t0 = uclock_sec();  
+    
+    for (iter = 0; iter < max_iter_; ++iter) {
+      if (cplx) {
+        cfftf(N, X, wrk);
+        cfftb(N, X, wrk);
+      } else {
+        rfftf(N, X, wrk);
+        rfftb(N, X, wrk);
+      }
+    }
+    t1 = uclock_sec();
+    free(wrk);
+    
+    flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+    show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+  }
+
+#ifdef HAVE_VECLIB
+  int log2N = (int)(log(N)/log(2) + 0.5f);
+  if (N == (1<<log2N)) {
+    FFTSetup setup;
+
+    setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
+    DSPSplitComplex zsamples;
+    zsamples.realp = &X[0];
+    zsamples.imagp = &X[Nfloat/2];
+    t0 = uclock_sec();  
+    for (iter = 0; iter < max_iter; ++iter) {
+      if (cplx) {
+        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
+        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
+      } else {
+        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); 
+        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
+      }
+    }
+    t1 = uclock_sec();
+    vDSP_destroy_fftsetup(setup);
+
+    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+    show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
+  } else {
+    show_output("vDSP", N, cplx, -1, -1, -1, -1);
+  }
+#endif
+  
+#ifdef HAVE_FFTW
+  {
+    fftwf_plan planf, planb;
+    fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
+    fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
+    memset(in, 0, sizeof(fftw_complex) * N);
+    int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE);  // measure takes a lot of time on largest ffts
+    //int flags = FFTW_ESTIMATE;
+    if (cplx) {
+      planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
+      planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
+    } else {
+      planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
+      planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
+    }
+
+    t0 = uclock_sec();  
+    for (iter = 0; iter < max_iter; ++iter) {
+      fftwf_execute(planf);
+      fftwf_execute(planb);
+    }
+    t1 = uclock_sec();
+
+    fftwf_destroy_plan(planf);
+    fftwf_destroy_plan(planb);
+    fftwf_free(in); fftwf_free(out);
+
+    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+    show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
+  }
+#endif  
+
+  // PFFFT benchmark
+  {
+    PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+    if (s) {
+      t0 = uclock_sec();  
+      for (iter = 0; iter < max_iter; ++iter) {
+        pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
+        pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
+      }
+      t1 = uclock_sec();
+      pffft_destroy_setup(s);
+    
+      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
+      show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+    }
+  }
+
+  if (!array_output_format) {
+    printf("--\n");
+  }
+
+  pffft_aligned_free(X);
+  pffft_aligned_free(Y);
+  pffft_aligned_free(Z);
+}
+
+#ifndef PFFFT_SIMD_DISABLE
+void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction 
+#endif
+
+int main(int argc, char **argv) {
+  int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
+  int i;
+
+  if (argc > 1 && strcmp(argv[1], "--array-format") == 0) {
+    array_output_format = 1;
+  }
+
+#ifndef PFFFT_SIMD_DISABLE
+  validate_pffft_simd();
+#endif
+  pffft_validate(1);
+  pffft_validate(0);
+  if (!array_output_format) {
+    for (i=0; Nvalues[i] > 0; ++i) {
+      benchmark_ffts(Nvalues[i], 0 /* real fft */);
+    }
+    for (i=0; Nvalues[i] > 0; ++i) {
+      benchmark_ffts(Nvalues[i], 1 /* cplx fft */);
+    }
+  } else {
+    printf("| input len ");
+    printf("|real FFTPack");
+#ifdef HAVE_VECLIB
+    printf("|  real vDSP ");
+#endif
+#ifdef HAVE_FFTW
+    printf("|  real FFTW ");
+#endif
+    printf("| real PFFFT | ");
+
+    printf("|cplx FFTPack");
+#ifdef HAVE_VECLIB
+    printf("|  cplx vDSP ");
+#endif
+#ifdef HAVE_FFTW
+    printf("|  cplx FFTW ");
+#endif
+    printf("| cplx PFFFT |\n");
+    for (i=0; Nvalues[i] > 0; ++i) {
+      printf("|%9d  ", Nvalues[i]);
+      benchmark_ffts(Nvalues[i], 0); 
+      printf("| ");
+      benchmark_ffts(Nvalues[i], 1);
+      printf("|\n");
+    }
+    printf(" (numbers are given in MFlops)\n");
+  }
+
+
+  return 0;
+}
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
new file mode 100644
index 0000000..662b949
--- /dev/null
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -0,0 +1,208 @@
+#define GOOGLE_STRIP_LOG 1
+
+#include <assert.h>
+#include <glog/logging.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/times.h>
+#include <syscall.h>
+#include <time.h>
+
+#include "fftpack.h"
+#include "pffft_sapi.sapi.h"
+#include "sandboxed_api/util/flag.h"
+#include "sandboxed_api/vars.h"
+
+ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all);
+ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
+
+class pffftSapiSandbox : public pffftSandbox {
+ public:
+  std::unique_ptr<sandbox2::Policy> ModifyPolicy(
+      sandbox2::PolicyBuilder*) override {
+    return sandbox2::PolicyBuilder()
+        .AllowStaticStartup()
+        .AllowOpen()
+        .AllowRead()
+        .AllowWrite()
+        .AllowSystemMalloc()
+        .AllowExit()
+        .AllowSyscalls({
+            __NR_futex,
+            __NR_close,
+            __NR_getrusage,
+        })
+        .DisableNamespaces()
+        .BuildOrDie();
+  }
+};
+
+double frand() { return rand() / (double)RAND_MAX; }
+
+double uclock_sec(void) { return (double)clock() / (double)CLOCKS_PER_SEC; }
+
+int array_output_format = 0;
+
+void show_output(const char* name, int N, int cplx, float flops, float t0,
+                 float t1, int max_iter) {
+  float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
+  if (array_output_format) {
+    if (flops != -1) {
+      printf("|%9.0f   ", mflops);
+    } else
+      printf("|      n/a   ");
+  } else {
+    if (flops != -1) {
+      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N,
+             (cplx ? "CPLX" : "REAL"), name, mflops,
+             (t1 - t0) / 2 / max_iter * 1e9, max_iter);
+    }
+  }
+  fflush(stdout);
+}
+
+int main(int argc, char* argv[]) {
+  /*
+   * Initialize Google's logging library.
+   */
+  google::InitGoogleLogging(argv[0]);
+
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  /*
+   * Nvalues is a vector keeping the values by which iterates N, its value
+   * representing the input length. More concrete, N is the number of
+   * data points the caclulus is up to (determinating its accuracy).
+   * To show the performance of Fast-Fourier Transformations the program is
+   * testing for various values of N.
+   */
+  int Nvalues[] = {64,    96,     128,        160,         192,     256,
+                   384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
+                   1024,  2048,   2400,       4096,        8192,    9 * 1024,
+                   16384, 32768,  256 * 1024, 1024 * 1024, -1};
+  int i;
+
+  VLOG(1) << "Initializing sandbox...\n";
+
+  pffftSapiSandbox sandbox;
+  sandbox.Init().IgnoreError();
+
+  VLOG(1) << "Initialization: " << sandbox.Init().ToString().c_str() << "\n";
+
+  pffftApi api(&sandbox);
+
+  int N, cplx;
+
+  cplx = 0;
+
+  do {
+    for (i = 0; i < 23; i++) {
+      N = Nvalues[i];
+
+      int Nfloat = N * (cplx ? 2 : 1);
+      int Nbytes = Nfloat * sizeof(float);
+      int pass;
+
+      float ref[Nbytes], in[Nbytes], out[Nbytes], tmp[Nbytes], tmp2[Nbytes];
+
+      sapi::v::Array<float> ref_(ref, Nbytes);
+      sapi::v::Array<float> in_(in, Nbytes);
+      sapi::v::Array<float> out_(out, Nbytes);
+      sapi::v::Array<float> tmp_(tmp, Nbytes);
+      sapi::v::Array<float> tmp2_(tmp2, Nbytes);
+
+      float wrk[2 * Nbytes + 15 * sizeof(float)];
+      sapi::v::Array<float> wrk_(wrk, 2 * Nbytes + 15 * sizeof(float));
+
+      float ref_max = 0;
+      int k;
+
+      Nfloat = (cplx ? N * 2 : N);
+      float X[Nbytes], Y[Nbytes], Z[Nbytes];
+      sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
+
+      double t0, t1, flops;
+
+      int max_iter = 5120000 / N * 4;
+#ifdef __arm__
+      max_iter /= 4;
+#endif
+      int iter;
+
+      for (k = 0; k < Nfloat; ++k) {
+        X[k] = 0;
+      }
+
+      /*
+       * FFTPack benchmark
+       */
+      {
+        /*
+         * SIMD_SZ == 4 (returning value of pffft_simd_size())
+         */
+        int max_iter_ = max_iter / 4;
+
+        if (max_iter_ == 0) max_iter_ = 1;
+        if (cplx) {
+          api.cffti(N, wrk_.PtrBoth()).IgnoreError();
+        } else {
+          api.rffti(N, wrk_.PtrBoth()).IgnoreError();
+        }
+        t0 = uclock_sec();
+
+        for (iter = 0; iter < max_iter_; ++iter) {
+          if (cplx) {
+            api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+            api.cfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          } else {
+            api.rfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+            api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+          }
+        }
+        t1 = uclock_sec();
+
+        flops =
+            (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+        show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+      }
+
+      /*
+       * PFFFT benchmark
+       */
+      {
+        sapi::StatusOr<PFFFT_Setup*> s =
+            api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+
+        VLOG(1) << "Setup status is: " << s.status().ToString().c_str() << "\n";
+
+        if (s.ok()) {
+          sapi::v::RemotePtr s_reg(s.value());
+
+          t0 = uclock_sec();
+          for (iter = 0; iter < max_iter; ++iter) {
+            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
+                                Y_.PtrBoth(), PFFFT_FORWARD)
+                .IgnoreError();
+            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
+                                Y_.PtrBoth(), PFFFT_FORWARD)
+                .IgnoreError();
+          }
+
+          t1 = uclock_sec();
+          api.pffft_destroy_setup(&s_reg).IgnoreError();
+
+          flops =
+              (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+          show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+        }
+
+        VLOG(1) << "N = " << N << " SUCCESSFULLY\n\n";
+      }
+    }
+
+    cplx = !cplx;
+  } while (cplx);
+
+  return 0;
+}
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/pffft_library_notes.txt b/oss-internship-2020/pffft/pffft_library_notes.txt
new file mode 100644
index 0000000..ee20b42
--- /dev/null
+++ b/oss-internship-2020/pffft/pffft_library_notes.txt
@@ -0,0 +1,416 @@
+PFFFT: a pretty fast FFT.
+
+TL;DR
+--
+
+PFFFT does 1D Fast Fourier Transforms, of single precision real and
+complex vectors. It tries do it fast, it tries to be correct, and it
+tries to be small. Computations do take advantage of SSE1 instructions
+on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
+license is BSD-like.
+
+
+Why does it exist:
+--
+
+I was in search of a good performing FFT library , preferably very
+small and with a very liberal license.
+
+When one says "fft library", FFTW ("Fastest Fourier Transform in the
+West") is probably the first name that comes to mind -- I guess that
+99% of open-source projects that need a FFT do use FFTW, and are happy
+with it. However, it is quite a large library , which does everything
+fft related (2d transforms, 3d transforms, other transformations such
+as discrete cosine , or fast hartley). And it is licensed under the
+GNU GPL , which means that it cannot be used in non open-source
+products.
+
+An alternative to FFTW that is really small, is the venerable FFTPACK
+v4, which is available on NETLIB. A more recent version (v5) exists,
+but it is larger as it deals with multi-dimensional transforms. This
+is a library that is written in FORTRAN 77, a language that is now
+considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
+by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
+its age, benchmarks show it that it still a very good performing FFT
+library, see for example the 1d single precision benchmarks here:
+http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
+competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
+Apple vDSP. The reason for that is that those libraries do take
+advantage of the SSE SIMD instructions available on Intel CPUs,
+available since the days of the Pentium III. These instructions deal
+with small vectors of 4 floats at a time, instead of a single float
+for a traditionnal FPU, so when using these instructions one may expect
+a 4-fold performance improvement.
+
+The idea was to take this fortran fftpack v4 code, translate to C,
+modify it to deal with those SSE instructions, and check that the
+final performance is not completely ridiculous when compared to other
+SIMD FFT libraries. Translation to C was performed with f2c (
+http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
+order to remove the thousands of gotos that were introduced by
+f2c. You will find the fftpack.h and fftpack.c sources in the
+repository, this a complete translation of
+http://www.netlib.org/fftpack/ , with the discrete cosine transform
+and the test program. There is no license information in the netlib
+repository, but it was confirmed to me by the fftpack v5 curators that
+the same terms do apply to fftpack v4:
+http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
+"BSD-like" license, it is compatible with proprietary projects.
+
+Adapting fftpack to deal with the SIMD 4-element vectors instead of
+scalar single precision numbers was more complex than I originally
+thought, especially with the real transforms, and I ended up writing
+more code than I planned..
+
+
+The code:
+--
+
+Only two files, in good old C, pffft.c and pffft.h . The API is very
+very simple, just make sure that you read the comments in pffft.h.
+
+
+Comparison with other FFTs:
+--
+
+The idea was not to break speed records, but to get a decently fast
+fft that is at least 50% as fast as the fastest FFT -- especially on
+slowest computers . I'm more focused on getting the best performance
+on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
+on getting top performance on today fastest cpus.
+
+It can be used in a real-time context as the fft functions do not
+perform any memory allocation -- that is why they accept a 'work'
+array in their arguments.
+
+It is also a bit focused on performing 1D convolutions, that is why it
+provides "unordered" FFTs , and a fourier domain convolution
+operation.
+
+
+Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9, cortex-A15, A8X)
+--
+
+The benchmark shows the performance of various fft implementations measured in 
+MFlops, with the number of floating point operations being defined as 5Nlog2(N)
+for a length N complex fft, and 2.5*Nlog2(N) for a real fft. 
+See http://www.fftw.org/speed/method.html for an explanation of these formulas.
+
+MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
+
+Built with:
+
+ gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f 
+
+| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+|       64  |     2816   |     8596   |     7329   |     8187   | |     2887   |    14898   |    14668   |    11108   |
+|       96  |     3298   |      n/a   |     8378   |     7727   | |     3953   |      n/a   |    15680   |    10878   |
+|      128  |     3507   |    11575   |     9266   |    10108   | |     4233   |    17598   |    16427   |    12000   |
+|      160  |     3391   |      n/a   |     9838   |    10711   | |     4220   |      n/a   |    16653   |    11187   |
+|      192  |     3919   |      n/a   |     9868   |    10956   | |     4297   |      n/a   |    15770   |    12540   |
+|      256  |     4283   |    13179   |    10694   |    13128   | |     4545   |    19550   |    16350   |    13822   |
+|      384  |     3136   |      n/a   |    10810   |    12061   | |     3600   |      n/a   |    16103   |    13240   |
+|      480  |     3477   |      n/a   |    10632   |    12074   | |     3536   |      n/a   |    11630   |    12522   |
+|      512  |     3783   |    15141   |    11267   |    13838   | |     3649   |    20002   |    16560   |    13580   |
+|      640  |     3639   |      n/a   |    11164   |    13946   | |     3695   |      n/a   |    15416   |    13890   |
+|      768  |     3800   |      n/a   |    11245   |    13495   | |     3590   |      n/a   |    15802   |    14552   |
+|      800  |     3440   |      n/a   |    10499   |    13301   | |     3659   |      n/a   |    12056   |    13268   |
+|     1024  |     3924   |    15605   |    11450   |    15339   | |     3769   |    20963   |    13941   |    15467   |
+|     2048  |     4518   |    16195   |    11551   |    15532   | |     4258   |    20413   |    13723   |    15042   |
+|     2400  |     4294   |      n/a   |    10685   |    13078   | |     4093   |      n/a   |    12777   |    13119   |
+|     4096  |     4750   |    16596   |    11672   |    15817   | |     4157   |    19662   |    14316   |    14336   |
+|     8192  |     3820   |    16227   |    11084   |    12555   | |     3691   |    18132   |    12102   |    13813   |
+|     9216  |     3864   |      n/a   |    10254   |    12870   | |     3586   |      n/a   |    12119   |    13994   |
+|    16384  |     3822   |    15123   |    10454   |    12822   | |     3613   |    16874   |    12370   |    13881   |
+|    32768  |     4175   |    14512   |    10662   |    11095   | |     3881   |    14702   |    11619   |    11524   |
+|   262144  |     3317   |    11429   |     6269   |     9517   | |     2810   |    11729   |     7757   |    10179   |
+|  1048576  |     2913   |    10551   |     4730   |     5867   | |     2661   |     7881   |     3520   |     5350   |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+
+
+Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
+
+| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+|           64     |      3840    |      7680    |      8777    | |      4389    |     20480    |     11171    |
+|           96     |      4214    |      9633    |      8429    | |      4816    |     22477    |     11238    |
+|          128     |      3584    |     10240    |     10240    | |      5120    |     23893    |     11947    |
+|          192     |      4854    |     11095    |     12945    | |      4854    |     22191    |     14121    |
+|          256     |      4096    |     11703    |     16384    | |      5120    |     23406    |     13653    |
+|          384     |      4395    |     14651    |     12558    | |      4884    |     19535    |     14651    |
+|          512     |      5760    |     13166    |     15360    | |      4608    |     23040    |     15360    |
+|          768     |      4907    |     14020    |     16357    | |      4461    |     19628    |     14020    |
+|         1024     |      5120    |     14629    |     14629    | |      5120    |     20480    |     15754    |
+|         2048     |      5632    |     14080    |     18773    | |      4693    |     12516    |     16091    |
+|         4096     |      5120    |     13653    |     17554    | |      4726    |      7680    |     14456    |
+|         8192     |      4160    |      7396    |     13312    | |      4437    |     14791    |     13312    |
+|         9216     |      4210    |      6124    |     13473    | |      4491    |      7282    |     14970    |
+|        16384     |      3976    |     11010    |     14313    | |      4210    |     11450    |     13631    |
+|        32768     |      4260    |     10224    |     10954    | |      4260    |      6816    |     11797    |
+|       262144     |      3736    |      6896    |      9961    | |      2359    |      8965    |      9437    |
+|      1048576     |      2796    |      4534    |      6453    | |      1864    |      3078    |      5592    |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+
+
+
+MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
+
+Built with:
+
+ gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
+
+| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+|      64   |      745   |     2145   |     1706   |     2028   | |      961   |     3356   |     3313   |     2300   |
+|      96   |      877   |      n/a   |     1976   |     1978   | |     1059   |      n/a   |     3333   |     2233   |
+|     128   |      951   |     2808   |     2213   |     2279   | |     1202   |     3803   |     3739   |     2494   |
+|     192   |     1002   |      n/a   |     2456   |     2429   | |     1186   |      n/a   |     3701   |     2508   |
+|     256   |     1065   |     3205   |     2641   |     2793   | |     1302   |     4013   |     3912   |     2663   |
+|     384   |      845   |      n/a   |     2759   |     2499   | |      948   |      n/a   |     3729   |     2504   |
+|     512   |      900   |     3476   |     2956   |     2759   | |      974   |     4057   |     3954   |     2645   |
+|     768   |      910   |      n/a   |     2912   |     2737   | |      975   |      n/a   |     3837   |     2614   |
+|    1024   |      936   |     3583   |     3107   |     3009   | |     1006   |     4124   |     3821   |     2697   |
+|    2048   |     1057   |     3585   |     3091   |     2837   | |     1089   |     3889   |     3701   |     2513   |
+|    4096   |     1083   |     3524   |     3092   |     2733   | |     1039   |     3617   |     3462   |     2364   |
+|    8192   |      874   |     3252   |     2967   |     2363   | |      911   |     3106   |     2789   |     2302   |
+|    9216   |      898   |      n/a   |     2420   |     2290   | |      865   |      n/a   |     2676   |     2204   |
+|   16384   |      903   |     2892   |     2506   |     2421   | |      899   |     3026   |     2797   |     2289   |
+|   32768   |      965   |     2837   |     2550   |     2358   | |      920   |     2922   |     2763   |     2240   |
+|  262144   |      738   |     2422   |     1589   |     1708   | |      610   |     2038   |     1436   |     1091   |
+| 1048576   |      528   |     1207   |      845   |      880   | |      606   |     1020   |      669   |     1036   |
+|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
+
+
+
+Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |     1920   |     3614   |     5120   | |     2194   |     7680   |     6467   |
+|       96  |     1873   |     3549   |     5187   | |     2107   |     8429   |     5863   |
+|      128  |     2240   |     3773   |     5514   | |     2560   |     7964   |     6827   |
+|      192  |     1765   |     4569   |     7767   | |     2284   |     9137   |     7061   |
+|      256  |     2048   |     5461   |     7447   | |     2731   |     9638   |     7802   |
+|      384  |     1998   |     5861   |     6762   | |     2313   |     9253   |     7644   |
+|      512  |     2095   |     6144   |     7680   | |     2194   |    10240   |     7089   |
+|      768  |     2230   |     5773   |     7549   | |     2045   |    10331   |     7010   |
+|     1024  |     2133   |     6400   |     8533   | |     2133   |    10779   |     7877   |
+|     2048  |     2011   |     7040   |     8665   | |     1942   |    10240   |     7768   |
+|     4096  |     2194   |     6827   |     8777   | |     1755   |     9452   |     6827   |
+|     8192  |     1849   |     6656   |     6656   | |     1752   |     7831   |     6827   |
+|     9216  |     1871   |     5858   |     6416   | |     1643   |     6909   |     6266   |
+|    16384  |     1883   |     6223   |     6506   | |     1664   |     7340   |     6982   |
+|    32768  |     1826   |     6390   |     6667   | |     1631   |     7481   |     6971   |
+|   262144  |     1546   |     4075   |     5977   | |     1299   |     3415   |     3551   |
+|  1048576  |     1104   |     2071   |     1730   | |     1104   |     1149   |     1834   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+
+
+Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
+
+Built with:
+gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
+
+| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+|           64     |       452    |      1041    |      1336    | |       549    |      2318    |      1781    |
+|           96     |       444    |      1297    |      1297    | |       503    |      2408    |      1686    |
+|          128     |       527    |      1525    |      1707    | |       543    |      2655    |      1886    |
+|          192     |       498    |      1653    |      1849    | |       539    |      2678    |      1942    |
+|          256     |       585    |      1862    |      2156    | |       594    |      2777    |      2244    |
+|          384     |       499    |      1870    |      1998    | |       511    |      2586    |      1890    |
+|          512     |       562    |      2095    |      2194    | |       542    |      2973    |      2194    |
+|          768     |       545    |      2045    |      2133    | |       545    |      2365    |      2133    |
+|         1024     |       595    |      2133    |      2438    | |       569    |      2695    |      2179    |
+|         2048     |       587    |      2125    |      2347    | |       521    |      2230    |      1707    |
+|         4096     |       495    |      1890    |      1834    | |       492    |      1876    |      1672    |
+|         8192     |       469    |      1548    |      1729    | |       438    |      1740    |      1664    |
+|         9216     |       468    |      1663    |      1663    | |       446    |      1585    |      1531    |
+|        16384     |       453    |      1608    |      1767    | |       398    |      1476    |      1664    |
+|        32768     |       456    |      1420    |      1503    | |       387    |      1388    |      1345    |
+|       262144     |       309    |       385    |       726    | |       262    |       415    |       840    |
+|      1048576     |       280    |       351    |       739    | |       261    |       313    |       797    |
+|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
+
+
+
+Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
+
+Built with:
+cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
+
+(visual c++ is definitively not very good with SSE intrinsics...)
+
+| N (input length) | real FFTPack |  real PFFFT  | | cplx FFTPack |  cplx PFFFT  |
+|------------------+--------------+--------------| |--------------+--------------|
+|           64     |       173    |      1009    | |       174    |      1159    |
+|           96     |       169    |      1029    | |       188    |      1201    |
+|          128     |       195    |      1242    | |       191    |      1275    |
+|          192     |       178    |      1312    | |       184    |      1276    |
+|          256     |       196    |      1591    | |       186    |      1281    |
+|          384     |       172    |      1409    | |       181    |      1281    |
+|          512     |       187    |      1640    | |       181    |      1313    |
+|          768     |       171    |      1614    | |       176    |      1258    |
+|         1024     |       186    |      1812    | |       178    |      1223    |
+|         2048     |       190    |      1707    | |       186    |      1099    |
+|         4096     |       182    |      1446    | |       177    |       975    |
+|         8192     |       175    |      1345    | |       169    |      1034    |
+|         9216     |       165    |      1271    | |       168    |      1023    |
+|        16384     |       166    |      1396    | |       165    |       949    |
+|        32768     |       172    |      1311    | |       161    |       881    |
+|       262144     |       136    |       632    | |       134    |       629    |
+|      1048576     |       134    |       698    | |       127    |       623    |
+|------------------+--------------+--------------| |--------------+--------------|
+
+
+
+Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
+
+Built with:
+gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |      549   |      452   |      731   | |      512   |      602   |      640   |
+|       96  |      421   |      272   |      702   | |      496   |      571   |      602   |
+|      128  |      498   |      512   |      815   | |      597   |      618   |      652   |
+|      160  |      521   |      536   |      815   | |      586   |      669   |      625   |
+|      192  |      539   |      571   |      883   | |      485   |      597   |      626   |
+|      256  |      640   |      539   |      975   | |      569   |      611   |      671   |
+|      384  |      499   |      610   |      879   | |      499   |      602   |      637   |
+|      480  |      518   |      507   |      877   | |      496   |      661   |      616   |
+|      512  |      524   |      591   |     1002   | |      549   |      678   |      668   |
+|      640  |      542   |      612   |      955   | |      568   |      663   |      645   |
+|      768  |      557   |      613   |      981   | |      491   |      663   |      598   |
+|      800  |      514   |      353   |      882   | |      514   |      360   |      574   |
+|     1024  |      640   |      640   |     1067   | |      492   |      683   |      602   |
+|     2048  |      587   |      640   |      908   | |      486   |      640   |      552   |
+|     2400  |      479   |      368   |      777   | |      422   |      376   |      518   |
+|     4096  |      511   |      614   |      853   | |      426   |      640   |      534   |
+|     8192  |      415   |      584   |      708   | |      386   |      622   |      516   |
+|     9216  |      419   |      571   |      687   | |      364   |      586   |      506   |
+|    16384  |      426   |      577   |      716   | |      398   |      606   |      530   |
+|    32768  |      417   |      572   |      673   | |      399   |      572   |      468   |
+|   262144  |      219   |      380   |      293   | |      255   |      431   |      343   |
+|  1048576  |      202   |      274   |      237   | |      265   |      282   |      355   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+Same platform as above, but this time pffft and fftpack are built with clang 3.2:
+
+clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
+
+| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |      427   |      452   |      853   | |      427   |      602   |     1024   |
+|       96  |      351   |      276   |      843   | |      337   |      571   |      963   |
+|      128  |      373   |      512   |      996   | |      390   |      618   |     1054   |
+|      160  |      426   |      536   |      987   | |      375   |      669   |      914   |
+|      192  |      404   |      571   |     1079   | |      388   |      588   |     1079   |
+|      256  |      465   |      539   |     1205   | |      445   |      602   |     1170   |
+|      384  |      366   |      610   |     1099   | |      343   |      594   |     1099   |
+|      480  |      356   |      507   |     1140   | |      335   |      651   |      931   |
+|      512  |      411   |      591   |     1213   | |      384   |      649   |     1124   |
+|      640  |      398   |      612   |     1193   | |      373   |      654   |      901   |
+|      768  |      409   |      613   |     1227   | |      383   |      663   |     1044   |
+|      800  |      411   |      348   |     1073   | |      353   |      358   |      809   |
+|     1024  |      427   |      640   |     1280   | |      413   |      692   |     1004   |
+|     2048  |      414   |      626   |     1126   | |      371   |      640   |      853   |
+|     2400  |      399   |      373   |      898   | |      319   |      368   |      653   |
+|     4096  |      404   |      602   |     1059   | |      357   |      633   |      778   |
+|     8192  |      332   |      584   |      792   | |      308   |      616   |      716   |
+|     9216  |      322   |      561   |      783   | |      299   |      586   |      687   |
+|    16384  |      344   |      568   |      778   | |      314   |      617   |      745   |
+|    32768  |      342   |      564   |      737   | |      314   |      552   |      629   |
+|   262144  |      201   |      383   |      313   | |      227   |      435   |      413   |
+|  1048576  |      187   |      262   |      251   | |      228   |      281   |      409   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
+(the fftpack performance numbers are better with gcc), while clang is
+the best with neon intrinsics (see how pffft perf has improved with
+clang 3.2).
+
+
+NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
+
+Built with:
+gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
+
+| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
+|-----------+------------+------------| |------------+------------|
+|       64  |     1735   |     3308   | |     1994   |     3744   |
+|       96  |     1596   |     3448   | |     1987   |     3572   |
+|      128  |     1807   |     4076   | |     2255   |     3960   |
+|      160  |     1769   |     4083   | |     2071   |     3845   |
+|      192  |     1990   |     4233   | |     2017   |     3939   |
+|      256  |     2191   |     4882   | |     2254   |     4346   |
+|      384  |     1878   |     4492   | |     2073   |     4012   |
+|      480  |     1748   |     4398   | |     1923   |     3951   |
+|      512  |     2030   |     5064   | |     2267   |     4195   |
+|      640  |     1918   |     4756   | |     2094   |     4184   |
+|      768  |     2099   |     4907   | |     2048   |     4297   |
+|      800  |     1822   |     4555   | |     1880   |     4063   |
+|     1024  |     2232   |     5355   | |     2187   |     4420   |
+|     2048  |     2176   |     4983   | |     2027   |     3602   |
+|     2400  |     1741   |     4256   | |     1710   |     3344   |
+|     4096  |     1816   |     3914   | |     1851   |     3349   |
+|     8192  |     1716   |     3481   | |     1700   |     3255   |
+|     9216  |     1735   |     3589   | |     1653   |     3094   |
+|    16384  |     1567   |     3483   | |     1637   |     3244   |
+|    32768  |     1624   |     3240   | |     1655   |     3156   |
+|   262144  |     1012   |     1898   | |      983   |     1503   |
+|  1048576  |      876   |     1154   | |      868   |     1341   |
+|-----------+------------+------------| |------------+------------|
+
+The performance on the tegra K1 is pretty impressive. I'm not
+including the FFTW numbers as they as slightly below the scalar
+fftpack numbers, so something must be wrong (however it seems to be
+correctly configured and is using neon simd instructions).
+
+When using clang 3.4 the pffft version is even a bit faster, reaching
+5.7 GFlops for real ffts of size 1024.
+
+
+iPad Air 2 with iOS9, xcode 8.0, arm64. The cpu is an Apple A8X, supposedly running at 1.5GHz.
+
+| input len |real FFTPack|  real vDSP | real PFFFT | |cplx FFTPack|  cplx vDSP | cplx PFFFT |
+|-----------+------------+------------+------------| |------------+------------+------------|
+|       64  |     2517   |     7995   |     6086   | |     2725   |    13006   |     8495   |
+|       96  |     2442   |      n/a   |     6691   | |     2256   |      n/a   |     7991   |
+|      128  |     2664   |    10186   |     7877   | |     2575   |    15115   |     9115   |
+|      160  |     2638   |      n/a   |     8283   | |     2682   |      n/a   |     8806   |
+|      192  |     2903   |      n/a   |     9083   | |     2634   |      n/a   |     8980   |
+|      256  |     3184   |    11452   |    10039   | |     3026   |    15410   |    10199   |
+|      384  |     2665   |      n/a   |    10100   | |     2275   |      n/a   |     9247   |
+|      480  |     2546   |      n/a   |     9863   | |     2341   |      n/a   |     8892   |
+|      512  |     2832   |    12197   |    10989   | |     2547   |    16768   |    10154   |
+|      640  |     2755   |      n/a   |    10461   | |     2569   |      n/a   |     9666   |
+|      768  |     2998   |      n/a   |    11355   | |     2585   |      n/a   |     9813   |
+|      800  |     2516   |      n/a   |    10332   | |     2433   |      n/a   |     9164   |
+|     1024  |     3109   |    12965   |    12114   | |     2869   |    16448   |    10519   |
+|     2048  |     3027   |    12996   |    12023   | |     2648   |    17304   |    10307   |
+|     2400  |     2515   |      n/a   |    10372   | |     2355   |      n/a   |     8443   |
+|     4096  |     3204   |    13603   |    12359   | |     2814   |    16570   |     9780   |
+|     8192  |     2759   |    13422   |    10824   | |     2153   |    15652   |     7884   |
+|     9216  |     2700   |      n/a   |     9938   | |     2241   |      n/a   |     7900   |
+|    16384  |     2280   |    13057   |     7976   | |      593   |     4272   |     2534   |
+|    32768  |      768   |     4269   |     2882   | |      606   |     4405   |     2604   |
+|   262144  |      724   |     3527   |     2630   | |      534   |     2418   |     2157   |
+|  1048576  |      674   |     1467   |     2135   | |      530   |     1621   |     2055   |
+|-----------+------------+------------+------------| |------------+------------+------------|
+
+I double-checked to make sure I did not make a mistake in the time
+measurements, as the numbers are much higher than what I initially
+expected. They are in fact higher than the number I get on the 2.8GHz
+Xeon of my 2008 mac pro.. (except for FFT lengths >= 32768 where
+having a big cache is useful). A good surprise is also that the perf
+is not too far from apple's vDSP (at least for the real FFT).
+

From fc6e9e82c60987271cbb286327ea6dcf2ca4656d Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 11:24:19 +0000
Subject: [PATCH 14/29] Unnecessary files removed

---
 oss-internship-2020/pffft/CMakeLists.txt      |   4 +-
 oss-internship-2020/pffft/README.txt          |  32 --
 oss-internship-2020/pffft/README_pffft.txt    | 416 -----------------
 oss-internship-2020/pffft/myNotes.txt         | 101 -----
 oss-internship-2020/pffft/test_pffft.c        | 419 ------------------
 .../pffft/test_pffft_sandboxed.cc             | 206 ---------
 6 files changed, 2 insertions(+), 1176 deletions(-)
 delete mode 100644 oss-internship-2020/pffft/README.txt
 delete mode 100644 oss-internship-2020/pffft/README_pffft.txt
 delete mode 100644 oss-internship-2020/pffft/myNotes.txt
 delete mode 100644 oss-internship-2020/pffft/test_pffft.c
 delete mode 100644 oss-internship-2020/pffft/test_pffft_sandboxed.cc

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index a9ba38a..6c7e776 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -13,7 +13,7 @@ add_library(pffft STATIC
 )
 
 add_executable(pffft_main
-    test_pffft.c
+    main_pffft.c
 )
 
 target_link_libraries(pffft_main PRIVATE 
@@ -82,7 +82,7 @@ target_include_directories(pffft_sapi INTERFACE
 )
 
 add_executable(pffft_sandboxed
-    test_pffft_sandboxed.cc
+    main_pffft_sandboxed.cc
 )
 
 target_link_libraries(pffft_sandboxed PRIVATE  
diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
deleted file mode 100644
index c7ac70b..0000000
--- a/oss-internship-2020/pffft/README.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Sandboxing PFFFT library
-
-Builder: CMake
-
-For testing: 
-`cd build`, then `./pffft_sandboxed`
-
-For debug:
-`SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100
---sandbox2_danger_danger_permit_all_and_log <auxiliar file>`
-
-CMake observations:
-    * linking pffft and fftpack (which contains necessary functions for pffft)
-    * set math library 
-
-Sandboxed main observations:
-    * containing two testing parts (fft / pffft benchmarks)
-        ! current stage: fft - works :)
-                        pffft - implemented
-    * (Solved) pffft benchmark bug: "Sandbox not active"  
-                        N = 64, status OK, pffft_transform generates error 
-                        N > 64, status not OK
-                        Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; 
-                        the memory that stays for s is not the same with the address passed
-                        in pffft_transform function. 
-                        (sapi :: v :: GenericPtr - to be changed)
-
-                        Temporary solution: change the generated files to accept
-                        uintptr_t instead of PFFFT_Setup
-
-                        Solution: using "sapi :: v :: RemotePtr" instead of "sapi :: v :: GenericPtr"
-                                  to access the memory of object s
diff --git a/oss-internship-2020/pffft/README_pffft.txt b/oss-internship-2020/pffft/README_pffft.txt
deleted file mode 100644
index ee20b42..0000000
--- a/oss-internship-2020/pffft/README_pffft.txt
+++ /dev/null
@@ -1,416 +0,0 @@
-PFFFT: a pretty fast FFT.
-
-TL;DR
---
-
-PFFFT does 1D Fast Fourier Transforms, of single precision real and
-complex vectors. It tries do it fast, it tries to be correct, and it
-tries to be small. Computations do take advantage of SSE1 instructions
-on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
-license is BSD-like.
-
-
-Why does it exist:
---
-
-I was in search of a good performing FFT library , preferably very
-small and with a very liberal license.
-
-When one says "fft library", FFTW ("Fastest Fourier Transform in the
-West") is probably the first name that comes to mind -- I guess that
-99% of open-source projects that need a FFT do use FFTW, and are happy
-with it. However, it is quite a large library , which does everything
-fft related (2d transforms, 3d transforms, other transformations such
-as discrete cosine , or fast hartley). And it is licensed under the
-GNU GPL , which means that it cannot be used in non open-source
-products.
-
-An alternative to FFTW that is really small, is the venerable FFTPACK
-v4, which is available on NETLIB. A more recent version (v5) exists,
-but it is larger as it deals with multi-dimensional transforms. This
-is a library that is written in FORTRAN 77, a language that is now
-considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
-by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
-its age, benchmarks show it that it still a very good performing FFT
-library, see for example the 1d single precision benchmarks here:
-http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
-competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
-Apple vDSP. The reason for that is that those libraries do take
-advantage of the SSE SIMD instructions available on Intel CPUs,
-available since the days of the Pentium III. These instructions deal
-with small vectors of 4 floats at a time, instead of a single float
-for a traditionnal FPU, so when using these instructions one may expect
-a 4-fold performance improvement.
-
-The idea was to take this fortran fftpack v4 code, translate to C,
-modify it to deal with those SSE instructions, and check that the
-final performance is not completely ridiculous when compared to other
-SIMD FFT libraries. Translation to C was performed with f2c (
-http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
-order to remove the thousands of gotos that were introduced by
-f2c. You will find the fftpack.h and fftpack.c sources in the
-repository, this a complete translation of
-http://www.netlib.org/fftpack/ , with the discrete cosine transform
-and the test program. There is no license information in the netlib
-repository, but it was confirmed to me by the fftpack v5 curators that
-the same terms do apply to fftpack v4:
-http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
-"BSD-like" license, it is compatible with proprietary projects.
-
-Adapting fftpack to deal with the SIMD 4-element vectors instead of
-scalar single precision numbers was more complex than I originally
-thought, especially with the real transforms, and I ended up writing
-more code than I planned..
-
-
-The code:
---
-
-Only two files, in good old C, pffft.c and pffft.h . The API is very
-very simple, just make sure that you read the comments in pffft.h.
-
-
-Comparison with other FFTs:
---
-
-The idea was not to break speed records, but to get a decently fast
-fft that is at least 50% as fast as the fastest FFT -- especially on
-slowest computers . I'm more focused on getting the best performance
-on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
-on getting top performance on today fastest cpus.
-
-It can be used in a real-time context as the fft functions do not
-perform any memory allocation -- that is why they accept a 'work'
-array in their arguments.
-
-It is also a bit focused on performing 1D convolutions, that is why it
-provides "unordered" FFTs , and a fourier domain convolution
-operation.
-
-
-Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9, cortex-A15, A8X)
---
-
-The benchmark shows the performance of various fft implementations measured in 
-MFlops, with the number of floating point operations being defined as 5Nlog2(N)
-for a length N complex fft, and 2.5*Nlog2(N) for a real fft. 
-See http://www.fftw.org/speed/method.html for an explanation of these formulas.
-
-MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
-
-Built with:
-
- gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f 
-
-| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-|       64  |     2816   |     8596   |     7329   |     8187   | |     2887   |    14898   |    14668   |    11108   |
-|       96  |     3298   |      n/a   |     8378   |     7727   | |     3953   |      n/a   |    15680   |    10878   |
-|      128  |     3507   |    11575   |     9266   |    10108   | |     4233   |    17598   |    16427   |    12000   |
-|      160  |     3391   |      n/a   |     9838   |    10711   | |     4220   |      n/a   |    16653   |    11187   |
-|      192  |     3919   |      n/a   |     9868   |    10956   | |     4297   |      n/a   |    15770   |    12540   |
-|      256  |     4283   |    13179   |    10694   |    13128   | |     4545   |    19550   |    16350   |    13822   |
-|      384  |     3136   |      n/a   |    10810   |    12061   | |     3600   |      n/a   |    16103   |    13240   |
-|      480  |     3477   |      n/a   |    10632   |    12074   | |     3536   |      n/a   |    11630   |    12522   |
-|      512  |     3783   |    15141   |    11267   |    13838   | |     3649   |    20002   |    16560   |    13580   |
-|      640  |     3639   |      n/a   |    11164   |    13946   | |     3695   |      n/a   |    15416   |    13890   |
-|      768  |     3800   |      n/a   |    11245   |    13495   | |     3590   |      n/a   |    15802   |    14552   |
-|      800  |     3440   |      n/a   |    10499   |    13301   | |     3659   |      n/a   |    12056   |    13268   |
-|     1024  |     3924   |    15605   |    11450   |    15339   | |     3769   |    20963   |    13941   |    15467   |
-|     2048  |     4518   |    16195   |    11551   |    15532   | |     4258   |    20413   |    13723   |    15042   |
-|     2400  |     4294   |      n/a   |    10685   |    13078   | |     4093   |      n/a   |    12777   |    13119   |
-|     4096  |     4750   |    16596   |    11672   |    15817   | |     4157   |    19662   |    14316   |    14336   |
-|     8192  |     3820   |    16227   |    11084   |    12555   | |     3691   |    18132   |    12102   |    13813   |
-|     9216  |     3864   |      n/a   |    10254   |    12870   | |     3586   |      n/a   |    12119   |    13994   |
-|    16384  |     3822   |    15123   |    10454   |    12822   | |     3613   |    16874   |    12370   |    13881   |
-|    32768  |     4175   |    14512   |    10662   |    11095   | |     3881   |    14702   |    11619   |    11524   |
-|   262144  |     3317   |    11429   |     6269   |     9517   | |     2810   |    11729   |     7757   |    10179   |
-|  1048576  |     2913   |    10551   |     4730   |     5867   | |     2661   |     7881   |     3520   |     5350   |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-
-
-Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
-
-| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-|           64     |      3840    |      7680    |      8777    | |      4389    |     20480    |     11171    |
-|           96     |      4214    |      9633    |      8429    | |      4816    |     22477    |     11238    |
-|          128     |      3584    |     10240    |     10240    | |      5120    |     23893    |     11947    |
-|          192     |      4854    |     11095    |     12945    | |      4854    |     22191    |     14121    |
-|          256     |      4096    |     11703    |     16384    | |      5120    |     23406    |     13653    |
-|          384     |      4395    |     14651    |     12558    | |      4884    |     19535    |     14651    |
-|          512     |      5760    |     13166    |     15360    | |      4608    |     23040    |     15360    |
-|          768     |      4907    |     14020    |     16357    | |      4461    |     19628    |     14020    |
-|         1024     |      5120    |     14629    |     14629    | |      5120    |     20480    |     15754    |
-|         2048     |      5632    |     14080    |     18773    | |      4693    |     12516    |     16091    |
-|         4096     |      5120    |     13653    |     17554    | |      4726    |      7680    |     14456    |
-|         8192     |      4160    |      7396    |     13312    | |      4437    |     14791    |     13312    |
-|         9216     |      4210    |      6124    |     13473    | |      4491    |      7282    |     14970    |
-|        16384     |      3976    |     11010    |     14313    | |      4210    |     11450    |     13631    |
-|        32768     |      4260    |     10224    |     10954    | |      4260    |      6816    |     11797    |
-|       262144     |      3736    |      6896    |      9961    | |      2359    |      8965    |      9437    |
-|      1048576     |      2796    |      4534    |      6453    | |      1864    |      3078    |      5592    |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-
-
-
-MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
-
-Built with:
-
- gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
-
-| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-|      64   |      745   |     2145   |     1706   |     2028   | |      961   |     3356   |     3313   |     2300   |
-|      96   |      877   |      n/a   |     1976   |     1978   | |     1059   |      n/a   |     3333   |     2233   |
-|     128   |      951   |     2808   |     2213   |     2279   | |     1202   |     3803   |     3739   |     2494   |
-|     192   |     1002   |      n/a   |     2456   |     2429   | |     1186   |      n/a   |     3701   |     2508   |
-|     256   |     1065   |     3205   |     2641   |     2793   | |     1302   |     4013   |     3912   |     2663   |
-|     384   |      845   |      n/a   |     2759   |     2499   | |      948   |      n/a   |     3729   |     2504   |
-|     512   |      900   |     3476   |     2956   |     2759   | |      974   |     4057   |     3954   |     2645   |
-|     768   |      910   |      n/a   |     2912   |     2737   | |      975   |      n/a   |     3837   |     2614   |
-|    1024   |      936   |     3583   |     3107   |     3009   | |     1006   |     4124   |     3821   |     2697   |
-|    2048   |     1057   |     3585   |     3091   |     2837   | |     1089   |     3889   |     3701   |     2513   |
-|    4096   |     1083   |     3524   |     3092   |     2733   | |     1039   |     3617   |     3462   |     2364   |
-|    8192   |      874   |     3252   |     2967   |     2363   | |      911   |     3106   |     2789   |     2302   |
-|    9216   |      898   |      n/a   |     2420   |     2290   | |      865   |      n/a   |     2676   |     2204   |
-|   16384   |      903   |     2892   |     2506   |     2421   | |      899   |     3026   |     2797   |     2289   |
-|   32768   |      965   |     2837   |     2550   |     2358   | |      920   |     2922   |     2763   |     2240   |
-|  262144   |      738   |     2422   |     1589   |     1708   | |      610   |     2038   |     1436   |     1091   |
-| 1048576   |      528   |     1207   |      845   |      880   | |      606   |     1020   |      669   |     1036   |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-
-
-
-Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |     1920   |     3614   |     5120   | |     2194   |     7680   |     6467   |
-|       96  |     1873   |     3549   |     5187   | |     2107   |     8429   |     5863   |
-|      128  |     2240   |     3773   |     5514   | |     2560   |     7964   |     6827   |
-|      192  |     1765   |     4569   |     7767   | |     2284   |     9137   |     7061   |
-|      256  |     2048   |     5461   |     7447   | |     2731   |     9638   |     7802   |
-|      384  |     1998   |     5861   |     6762   | |     2313   |     9253   |     7644   |
-|      512  |     2095   |     6144   |     7680   | |     2194   |    10240   |     7089   |
-|      768  |     2230   |     5773   |     7549   | |     2045   |    10331   |     7010   |
-|     1024  |     2133   |     6400   |     8533   | |     2133   |    10779   |     7877   |
-|     2048  |     2011   |     7040   |     8665   | |     1942   |    10240   |     7768   |
-|     4096  |     2194   |     6827   |     8777   | |     1755   |     9452   |     6827   |
-|     8192  |     1849   |     6656   |     6656   | |     1752   |     7831   |     6827   |
-|     9216  |     1871   |     5858   |     6416   | |     1643   |     6909   |     6266   |
-|    16384  |     1883   |     6223   |     6506   | |     1664   |     7340   |     6982   |
-|    32768  |     1826   |     6390   |     6667   | |     1631   |     7481   |     6971   |
-|   262144  |     1546   |     4075   |     5977   | |     1299   |     3415   |     3551   |
-|  1048576  |     1104   |     2071   |     1730   | |     1104   |     1149   |     1834   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-
-
-Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-|           64     |       452    |      1041    |      1336    | |       549    |      2318    |      1781    |
-|           96     |       444    |      1297    |      1297    | |       503    |      2408    |      1686    |
-|          128     |       527    |      1525    |      1707    | |       543    |      2655    |      1886    |
-|          192     |       498    |      1653    |      1849    | |       539    |      2678    |      1942    |
-|          256     |       585    |      1862    |      2156    | |       594    |      2777    |      2244    |
-|          384     |       499    |      1870    |      1998    | |       511    |      2586    |      1890    |
-|          512     |       562    |      2095    |      2194    | |       542    |      2973    |      2194    |
-|          768     |       545    |      2045    |      2133    | |       545    |      2365    |      2133    |
-|         1024     |       595    |      2133    |      2438    | |       569    |      2695    |      2179    |
-|         2048     |       587    |      2125    |      2347    | |       521    |      2230    |      1707    |
-|         4096     |       495    |      1890    |      1834    | |       492    |      1876    |      1672    |
-|         8192     |       469    |      1548    |      1729    | |       438    |      1740    |      1664    |
-|         9216     |       468    |      1663    |      1663    | |       446    |      1585    |      1531    |
-|        16384     |       453    |      1608    |      1767    | |       398    |      1476    |      1664    |
-|        32768     |       456    |      1420    |      1503    | |       387    |      1388    |      1345    |
-|       262144     |       309    |       385    |       726    | |       262    |       415    |       840    |
-|      1048576     |       280    |       351    |       739    | |       261    |       313    |       797    |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-
-
-
-Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
-
-Built with:
-cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
-
-(visual c++ is definitively not very good with SSE intrinsics...)
-
-| N (input length) | real FFTPack |  real PFFFT  | | cplx FFTPack |  cplx PFFFT  |
-|------------------+--------------+--------------| |--------------+--------------|
-|           64     |       173    |      1009    | |       174    |      1159    |
-|           96     |       169    |      1029    | |       188    |      1201    |
-|          128     |       195    |      1242    | |       191    |      1275    |
-|          192     |       178    |      1312    | |       184    |      1276    |
-|          256     |       196    |      1591    | |       186    |      1281    |
-|          384     |       172    |      1409    | |       181    |      1281    |
-|          512     |       187    |      1640    | |       181    |      1313    |
-|          768     |       171    |      1614    | |       176    |      1258    |
-|         1024     |       186    |      1812    | |       178    |      1223    |
-|         2048     |       190    |      1707    | |       186    |      1099    |
-|         4096     |       182    |      1446    | |       177    |       975    |
-|         8192     |       175    |      1345    | |       169    |      1034    |
-|         9216     |       165    |      1271    | |       168    |      1023    |
-|        16384     |       166    |      1396    | |       165    |       949    |
-|        32768     |       172    |      1311    | |       161    |       881    |
-|       262144     |       136    |       632    | |       134    |       629    |
-|      1048576     |       134    |       698    | |       127    |       623    |
-|------------------+--------------+--------------| |--------------+--------------|
-
-
-
-Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
-
-Built with:
-gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |      549   |      452   |      731   | |      512   |      602   |      640   |
-|       96  |      421   |      272   |      702   | |      496   |      571   |      602   |
-|      128  |      498   |      512   |      815   | |      597   |      618   |      652   |
-|      160  |      521   |      536   |      815   | |      586   |      669   |      625   |
-|      192  |      539   |      571   |      883   | |      485   |      597   |      626   |
-|      256  |      640   |      539   |      975   | |      569   |      611   |      671   |
-|      384  |      499   |      610   |      879   | |      499   |      602   |      637   |
-|      480  |      518   |      507   |      877   | |      496   |      661   |      616   |
-|      512  |      524   |      591   |     1002   | |      549   |      678   |      668   |
-|      640  |      542   |      612   |      955   | |      568   |      663   |      645   |
-|      768  |      557   |      613   |      981   | |      491   |      663   |      598   |
-|      800  |      514   |      353   |      882   | |      514   |      360   |      574   |
-|     1024  |      640   |      640   |     1067   | |      492   |      683   |      602   |
-|     2048  |      587   |      640   |      908   | |      486   |      640   |      552   |
-|     2400  |      479   |      368   |      777   | |      422   |      376   |      518   |
-|     4096  |      511   |      614   |      853   | |      426   |      640   |      534   |
-|     8192  |      415   |      584   |      708   | |      386   |      622   |      516   |
-|     9216  |      419   |      571   |      687   | |      364   |      586   |      506   |
-|    16384  |      426   |      577   |      716   | |      398   |      606   |      530   |
-|    32768  |      417   |      572   |      673   | |      399   |      572   |      468   |
-|   262144  |      219   |      380   |      293   | |      255   |      431   |      343   |
-|  1048576  |      202   |      274   |      237   | |      265   |      282   |      355   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-Same platform as above, but this time pffft and fftpack are built with clang 3.2:
-
-clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |      427   |      452   |      853   | |      427   |      602   |     1024   |
-|       96  |      351   |      276   |      843   | |      337   |      571   |      963   |
-|      128  |      373   |      512   |      996   | |      390   |      618   |     1054   |
-|      160  |      426   |      536   |      987   | |      375   |      669   |      914   |
-|      192  |      404   |      571   |     1079   | |      388   |      588   |     1079   |
-|      256  |      465   |      539   |     1205   | |      445   |      602   |     1170   |
-|      384  |      366   |      610   |     1099   | |      343   |      594   |     1099   |
-|      480  |      356   |      507   |     1140   | |      335   |      651   |      931   |
-|      512  |      411   |      591   |     1213   | |      384   |      649   |     1124   |
-|      640  |      398   |      612   |     1193   | |      373   |      654   |      901   |
-|      768  |      409   |      613   |     1227   | |      383   |      663   |     1044   |
-|      800  |      411   |      348   |     1073   | |      353   |      358   |      809   |
-|     1024  |      427   |      640   |     1280   | |      413   |      692   |     1004   |
-|     2048  |      414   |      626   |     1126   | |      371   |      640   |      853   |
-|     2400  |      399   |      373   |      898   | |      319   |      368   |      653   |
-|     4096  |      404   |      602   |     1059   | |      357   |      633   |      778   |
-|     8192  |      332   |      584   |      792   | |      308   |      616   |      716   |
-|     9216  |      322   |      561   |      783   | |      299   |      586   |      687   |
-|    16384  |      344   |      568   |      778   | |      314   |      617   |      745   |
-|    32768  |      342   |      564   |      737   | |      314   |      552   |      629   |
-|   262144  |      201   |      383   |      313   | |      227   |      435   |      413   |
-|  1048576  |      187   |      262   |      251   | |      228   |      281   |      409   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
-(the fftpack performance numbers are better with gcc), while clang is
-the best with neon intrinsics (see how pffft perf has improved with
-clang 3.2).
-
-
-NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
-
-Built with:
-gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
-
-| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
-|-----------+------------+------------| |------------+------------|
-|       64  |     1735   |     3308   | |     1994   |     3744   |
-|       96  |     1596   |     3448   | |     1987   |     3572   |
-|      128  |     1807   |     4076   | |     2255   |     3960   |
-|      160  |     1769   |     4083   | |     2071   |     3845   |
-|      192  |     1990   |     4233   | |     2017   |     3939   |
-|      256  |     2191   |     4882   | |     2254   |     4346   |
-|      384  |     1878   |     4492   | |     2073   |     4012   |
-|      480  |     1748   |     4398   | |     1923   |     3951   |
-|      512  |     2030   |     5064   | |     2267   |     4195   |
-|      640  |     1918   |     4756   | |     2094   |     4184   |
-|      768  |     2099   |     4907   | |     2048   |     4297   |
-|      800  |     1822   |     4555   | |     1880   |     4063   |
-|     1024  |     2232   |     5355   | |     2187   |     4420   |
-|     2048  |     2176   |     4983   | |     2027   |     3602   |
-|     2400  |     1741   |     4256   | |     1710   |     3344   |
-|     4096  |     1816   |     3914   | |     1851   |     3349   |
-|     8192  |     1716   |     3481   | |     1700   |     3255   |
-|     9216  |     1735   |     3589   | |     1653   |     3094   |
-|    16384  |     1567   |     3483   | |     1637   |     3244   |
-|    32768  |     1624   |     3240   | |     1655   |     3156   |
-|   262144  |     1012   |     1898   | |      983   |     1503   |
-|  1048576  |      876   |     1154   | |      868   |     1341   |
-|-----------+------------+------------| |------------+------------|
-
-The performance on the tegra K1 is pretty impressive. I'm not
-including the FFTW numbers as they as slightly below the scalar
-fftpack numbers, so something must be wrong (however it seems to be
-correctly configured and is using neon simd instructions).
-
-When using clang 3.4 the pffft version is even a bit faster, reaching
-5.7 GFlops for real ffts of size 1024.
-
-
-iPad Air 2 with iOS9, xcode 8.0, arm64. The cpu is an Apple A8X, supposedly running at 1.5GHz.
-
-| input len |real FFTPack|  real vDSP | real PFFFT | |cplx FFTPack|  cplx vDSP | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |     2517   |     7995   |     6086   | |     2725   |    13006   |     8495   |
-|       96  |     2442   |      n/a   |     6691   | |     2256   |      n/a   |     7991   |
-|      128  |     2664   |    10186   |     7877   | |     2575   |    15115   |     9115   |
-|      160  |     2638   |      n/a   |     8283   | |     2682   |      n/a   |     8806   |
-|      192  |     2903   |      n/a   |     9083   | |     2634   |      n/a   |     8980   |
-|      256  |     3184   |    11452   |    10039   | |     3026   |    15410   |    10199   |
-|      384  |     2665   |      n/a   |    10100   | |     2275   |      n/a   |     9247   |
-|      480  |     2546   |      n/a   |     9863   | |     2341   |      n/a   |     8892   |
-|      512  |     2832   |    12197   |    10989   | |     2547   |    16768   |    10154   |
-|      640  |     2755   |      n/a   |    10461   | |     2569   |      n/a   |     9666   |
-|      768  |     2998   |      n/a   |    11355   | |     2585   |      n/a   |     9813   |
-|      800  |     2516   |      n/a   |    10332   | |     2433   |      n/a   |     9164   |
-|     1024  |     3109   |    12965   |    12114   | |     2869   |    16448   |    10519   |
-|     2048  |     3027   |    12996   |    12023   | |     2648   |    17304   |    10307   |
-|     2400  |     2515   |      n/a   |    10372   | |     2355   |      n/a   |     8443   |
-|     4096  |     3204   |    13603   |    12359   | |     2814   |    16570   |     9780   |
-|     8192  |     2759   |    13422   |    10824   | |     2153   |    15652   |     7884   |
-|     9216  |     2700   |      n/a   |     9938   | |     2241   |      n/a   |     7900   |
-|    16384  |     2280   |    13057   |     7976   | |      593   |     4272   |     2534   |
-|    32768  |      768   |     4269   |     2882   | |      606   |     4405   |     2604   |
-|   262144  |      724   |     3527   |     2630   | |      534   |     2418   |     2157   |
-|  1048576  |      674   |     1467   |     2135   | |      530   |     1621   |     2055   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-I double-checked to make sure I did not make a mistake in the time
-measurements, as the numbers are much higher than what I initially
-expected. They are in fact higher than the number I get on the 2.8GHz
-Xeon of my 2008 mac pro.. (except for FFT lengths >= 32768 where
-having a big cache is useful). A good surprise is also that the perf
-is not too far from apple's vDSP (at least for the real FFT).
-
diff --git a/oss-internship-2020/pffft/myNotes.txt b/oss-internship-2020/pffft/myNotes.txt
deleted file mode 100644
index 0dd3bcb..0000000
--- a/oss-internship-2020/pffft/myNotes.txt
+++ /dev/null
@@ -1,101 +0,0 @@
-About library's functions:
-    * pffft_aligned_malloc(size_t)
-        returns an allocated array considering the alignment offset
-    * pffft_aligned_free(void *)
-        frees the memory
-    * pffft_simd_size()
-        returns the SIMD_SZ = 4 (regarding simd vector)
-    * pffft_new_setup(int, ...)
-        with a fft size (first argument) being a multiple of 16, 32.
-
-Deleted part (validate function)
-    /*for (pass = 0; pass < 2; pass++) {
-      if (pass == 0) {
-        for (k = 0; k < Nfloat; k++) {
-          ref_[k] = in_[k] = frand() * 2 - 1;
-          out_[k] = 1e30;
-        }
-
-        if (!cplx) {
-          api.rffti(N, wrk_.PtrBoth()).IgnoreError();
-          api.rfftf(N, ref_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-
-          {
-            float refN = ref_[N - 1];
-            for (k = N - 2; k >= 1; --k) {
-              ref_[k + 1] = ref_[k];
-            } 
-            ref_[1] = refN;
-          }
-        } else {
-          api.cffti(N, wrk_.PtrBoth()).IgnoreError();
-          api.cfftf(N, ref_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-        } 
-      }
-
-      for (k = 0; k < Nfloat; ++k) {
-        ref_max = MAX(ref_max, fabs(ref_[k]));
-      }
-
-      if (pass == 0) {
-        api.pffft_transform(s_reg.PtrBefore(), in_.PtrBoth(), tmp_.PtrBoth(), wrk_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
-
-        memcpy(tmp2, tmp, Nbytes);
-        memcpy(tmp, in, Nbytes);
-
-        api.pffft_transform(s_reg.PtrBefore(), tmp_.PtrBoth(), tmp_.PtrBoth(), wrk_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
-
-        printf("Forward transformation test passed.\n");
-
-        api.pffft_zreorder(s_reg.PtrBefore(), tmp_.PtrBoth(), out_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
-        api.pffft_zreorder(s_reg.PtrBefore(), out_.PtrBoth(), tmp_.PtrBoth(), PFFFT_BACKWARD).IgnoreError();
-
-        printf("Reordering test passed.\n");
-      } else {
-        api.pffft_transform_ordered(s_reg.PtrBefore(), in_.PtrBoth(), tmp_.PtrBoth(), wrk_.PtrBoth(), PFFFT_FORWARD).IgnoreError();
-
-      }
-    } */
-
-        
-MACRO for testing
-TEST(AssignOrReturn, AssignsMultipleVariablesInSequence) {
-  auto func = []() -> absl::Status {
-    int value1;
-    SAPI_ASSIGN_OR_RETURN(value1, StatusOr<int>(1));
-    EXPECT_EQ(1, value1);
-    int value2;
-    SAPI_ASSIGN_OR_RETURN(value2, StatusOr<int>(2));
-    EXPECT_EQ(2, value2);
-    int value3;
-    SAPI_ASSIGN_OR_RETURN(value3, StatusOr<int>(3));
-    EXPECT_EQ(3, value3);
-    int value4;
-    SAPI_ASSIGN_OR_RETURN(value4,
-                          StatusOr<int>(absl::UnknownError("EXPECTED"
-  int value1;
-    SAPI_ASSIGN_OR_RETURN(value1, StatusOr<int>(1));
-
-
- // PFFFT benchmark
-    /*{
-      sapi::StatusOr<PFFFT_Setup *> s = api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-      if (s.ok()) {
-        sapi::v::GenericPtr s_reg(s.value());
-
-        t0 = uclock_sec();  
-        for (iter = 0; iter < max_iter; ++iter) {
-          printf("%s 1\n", api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD).ToString().c_str());
-          printf("%s 2\n", api.pffft_transform(s_reg.PtrBoth(), X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(), PFFFT_FORWARD).ToString().c_str());
-        }
-        t1 = uclock_sec();
-        printf("%s 3 \n", api.pffft_destroy_setup(s_reg.PtrBoth()).ToString().c_str());
-
-      
-        flops = (max_iter*2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2); 
-        show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
-      } else {
-        fprintf(stderr, "s NULL :(\n\n");
-      }
-    }*/
-    
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/test_pffft.c b/oss-internship-2020/pffft/test_pffft.c
deleted file mode 100644
index a5d20c2..0000000
--- a/oss-internship-2020/pffft/test_pffft.c
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
-  Copyright (c) 2013 Julien Pommier.
-
-  Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
-
-  How to build: 
-
-  on linux, with fftw3:
-  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-  on macos, without fftw3:
-  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
-
-  on macos, with fftw3:
-  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
-
-  on windows, with visual c++:
-  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
-  
-  build without SIMD instructions:
-  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
-
- */
-
-#include "pffft.h"
-#include "fftpack.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <assert.h>
-#include <string.h>
-
-#ifdef HAVE_SYS_TIMES
-#  include <sys/times.h>
-#  include <unistd.h>
-#endif
-
-#ifdef HAVE_VECLIB
-#  include <Accelerate/Accelerate.h>
-#endif
-
-#ifdef HAVE_FFTW
-#  include <fftw3.h>
-#endif
-
-#define MAX(x,y) ((x)>(y)?(x):(y))
-
-double frand() {
-  return rand()/(double)RAND_MAX;
-}
-
-#if defined(HAVE_SYS_TIMES)
-  inline double uclock_sec(void) {
-    static double ttclk = 0.;
-    if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
-    struct tms t; return ((double)times(&t)) / ttclk;
-  }
-# else
-  double uclock_sec(void)
-{ return (double)clock()/(double)CLOCKS_PER_SEC; }
-#endif
-
-
-/* compare results with the regular fftpack */
-void pffft_validate_N(int N, int cplx) {
-  int Nfloat = N*(cplx?2:1);
-  int Nbytes = Nfloat * sizeof(float);
-  float *ref, *in, *out, *tmp, *tmp2;
-  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-  int pass;
-
-  if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
-  ref = pffft_aligned_malloc(Nbytes);
-  in = pffft_aligned_malloc(Nbytes);
-  out = pffft_aligned_malloc(Nbytes);
-  tmp = pffft_aligned_malloc(Nbytes);
-  tmp2 = pffft_aligned_malloc(Nbytes);
-
-  for (pass=0; pass < 2; ++pass) {
-    float ref_max = 0;
-    int k;
-    //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx);
-    // compute reference solution with FFTPACK
-    if (pass == 0) {
-      float *wrk = malloc(2*Nbytes+15*sizeof(float));
-      for (k=0; k < Nfloat; ++k) {
-        ref[k] = in[k] = frand()*2-1; 
-        out[k] = 1e30;
-      }
-      if (!cplx) {
-        rffti(N, wrk);
-        rfftf(N, ref, wrk);
-        // use our ordering for real ffts instead of the one of fftpack
-        {
-          float refN=ref[N-1];
-          for (k=N-2; k >= 1; --k) ref[k+1] = ref[k]; 
-          ref[1] = refN;
-        }
-      } else {
-        cffti(N, wrk);
-        cfftf(N, ref, wrk);
-      }
-      free(wrk);
-    }
-
-    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, fabs(ref[k]));
-
-      
-    // pass 0 : non canonical ordering of transform coefficients  
-    if (pass == 0) {
-      // test forward transform, with different input / output
-      pffft_transform(s, in, tmp, 0, PFFFT_FORWARD);
-      memcpy(tmp2, tmp, Nbytes);
-      memcpy(tmp, in, Nbytes);
-      pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == tmp[k]);
-      }
-
-      // test reordering
-      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
-      pffft_zreorder(s, out, tmp, PFFFT_BACKWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == tmp[k]);
-      }
-      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
-    } else {
-      // pass 1 : canonical ordering of transform coeffs.
-      pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD);
-      memcpy(tmp2, tmp, Nbytes);
-      memcpy(tmp, in, Nbytes);
-      pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == tmp[k]);
-      }
-      memcpy(out, tmp, Nbytes);
-    }
-
-    {
-      for (k=0; k < Nfloat; ++k) {
-        if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
-          printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
-          exit(1);
-        }
-      }
-        
-      if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD);
-      else   pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD);
-      memcpy(tmp2, out, Nbytes);
-      memcpy(out, tmp, Nbytes);
-      if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
-      else   pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == out[k]);
-        out[k] *= 1.f/N;
-      }
-      for (k = 0; k < Nfloat; ++k) {
-        if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
-          printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
-          exit(1);
-        }
-      }
-    }
-
-    // quick test of the circular convolution in fft domain
-    {
-      float conv_err = 0, conv_max = 0;
-
-      pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
-      memset(out, 0, Nbytes);
-      pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
-      pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
-      
-      for (k=0; k < Nfloat; k += 2) {
-        float ar = tmp[k], ai=tmp[k+1];
-        if (cplx || k > 0) {
-          tmp[k] = ar*ar - ai*ai;
-          tmp[k+1] = 2*ar*ai;
-        } else {
-          tmp[0] = ar*ar;
-          tmp[1] = ai*ai;
-        }
-      }
-      
-      for (k=0; k < Nfloat; ++k) {
-        float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
-        if (d > conv_err) conv_err = d;
-        if (e > conv_max) conv_max = e;
-      }
-      if (conv_err > 1e-5*conv_max) {
-        printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
-      }
-    }
-
-  }
-
-  printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
-  
-  pffft_destroy_setup(s);
-  pffft_aligned_free(ref);
-  pffft_aligned_free(in);
-  pffft_aligned_free(out);
-  pffft_aligned_free(tmp);
-  pffft_aligned_free(tmp2);
-}
-
-void pffft_validate(int cplx) {
-  static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
-  int k;
-  for (k = 0; Ntest[k]; ++k) {
-    int N = Ntest[k];
-    if (N == 16 && !cplx) continue;
-    pffft_validate_N(N, cplx);
-  }
-}
-
-int array_output_format = 0;
-
-void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
-  float mflops = flops/1e6/(t1 - t0 + 1e-16);
-  if (array_output_format) {
-    if (flops != -1) {
-      printf("|%9.0f   ", mflops);
-    } else printf("|      n/a   ");
-  } else {
-    if (flops != -1) {
-      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
-    }
-  }
-  fflush(stdout);
-}
-
-void benchmark_ffts(int N, int cplx) {
-  int Nfloat = (cplx ? N*2 : N);
-  int Nbytes = Nfloat * sizeof(float);
-  float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);
-
-  double t0, t1, flops;
-
-  int k;
-  int max_iter = 5120000/N*4;
-#ifdef __arm__
-  max_iter /= 4;
-#endif
-  int iter;
-
-  for (k = 0; k < Nfloat; ++k) {
-    X[k] = 0; //sqrtf(k+1);
-  }
-
-  // FFTPack benchmark
-  {
-    float *wrk = malloc(2*Nbytes + 15*sizeof(float));
-    int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
-    if (cplx) cffti(N, wrk);
-    else      rffti(N, wrk);
-    t0 = uclock_sec();  
-    
-    for (iter = 0; iter < max_iter_; ++iter) {
-      if (cplx) {
-        cfftf(N, X, wrk);
-        cfftb(N, X, wrk);
-      } else {
-        rfftf(N, X, wrk);
-        rfftb(N, X, wrk);
-      }
-    }
-    t1 = uclock_sec();
-    free(wrk);
-    
-    flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-    show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
-  }
-
-#ifdef HAVE_VECLIB
-  int log2N = (int)(log(N)/log(2) + 0.5f);
-  if (N == (1<<log2N)) {
-    FFTSetup setup;
-
-    setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
-    DSPSplitComplex zsamples;
-    zsamples.realp = &X[0];
-    zsamples.imagp = &X[Nfloat/2];
-    t0 = uclock_sec();  
-    for (iter = 0; iter < max_iter; ++iter) {
-      if (cplx) {
-        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
-        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
-      } else {
-        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); 
-        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
-      }
-    }
-    t1 = uclock_sec();
-    vDSP_destroy_fftsetup(setup);
-
-    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-    show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
-  } else {
-    show_output("vDSP", N, cplx, -1, -1, -1, -1);
-  }
-#endif
-  
-#ifdef HAVE_FFTW
-  {
-    fftwf_plan planf, planb;
-    fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
-    fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
-    memset(in, 0, sizeof(fftw_complex) * N);
-    int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE);  // measure takes a lot of time on largest ffts
-    //int flags = FFTW_ESTIMATE;
-    if (cplx) {
-      planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
-      planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
-    } else {
-      planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
-      planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
-    }
-
-    t0 = uclock_sec();  
-    for (iter = 0; iter < max_iter; ++iter) {
-      fftwf_execute(planf);
-      fftwf_execute(planb);
-    }
-    t1 = uclock_sec();
-
-    fftwf_destroy_plan(planf);
-    fftwf_destroy_plan(planb);
-    fftwf_free(in); fftwf_free(out);
-
-    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-    show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
-  }
-#endif  
-
-  // PFFFT benchmark
-  {
-    PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-    if (s) {
-      t0 = uclock_sec();  
-      for (iter = 0; iter < max_iter; ++iter) {
-        pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
-        pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
-      }
-      t1 = uclock_sec();
-      pffft_destroy_setup(s);
-    
-      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-      show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
-    }
-  }
-
-  if (!array_output_format) {
-    printf("--\n");
-  }
-
-  pffft_aligned_free(X);
-  pffft_aligned_free(Y);
-  pffft_aligned_free(Z);
-}
-
-#ifndef PFFFT_SIMD_DISABLE
-void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction 
-#endif
-
-int main(int argc, char **argv) {
-  int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
-  int i;
-
-  if (argc > 1 && strcmp(argv[1], "--array-format") == 0) {
-    array_output_format = 1;
-  }
-
-#ifndef PFFFT_SIMD_DISABLE
-  validate_pffft_simd();
-#endif
-  pffft_validate(1);
-  pffft_validate(0);
-  if (!array_output_format) {
-    for (i=0; Nvalues[i] > 0; ++i) {
-      benchmark_ffts(Nvalues[i], 0 /* real fft */);
-    }
-    for (i=0; Nvalues[i] > 0; ++i) {
-      benchmark_ffts(Nvalues[i], 1 /* cplx fft */);
-    }
-  } else {
-    printf("| input len ");
-    printf("|real FFTPack");
-#ifdef HAVE_VECLIB
-    printf("|  real vDSP ");
-#endif
-#ifdef HAVE_FFTW
-    printf("|  real FFTW ");
-#endif
-    printf("| real PFFFT | ");
-
-    printf("|cplx FFTPack");
-#ifdef HAVE_VECLIB
-    printf("|  cplx vDSP ");
-#endif
-#ifdef HAVE_FFTW
-    printf("|  cplx FFTW ");
-#endif
-    printf("| cplx PFFFT |\n");
-    for (i=0; Nvalues[i] > 0; ++i) {
-      printf("|%9d  ", Nvalues[i]);
-      benchmark_ffts(Nvalues[i], 0); 
-      printf("| ");
-      benchmark_ffts(Nvalues[i], 1);
-      printf("|\n");
-    }
-    printf(" (numbers are given in MFlops)\n");
-  }
-
-
-  return 0;
-}
diff --git a/oss-internship-2020/pffft/test_pffft_sandboxed.cc b/oss-internship-2020/pffft/test_pffft_sandboxed.cc
deleted file mode 100644
index 523dce0..0000000
--- a/oss-internship-2020/pffft/test_pffft_sandboxed.cc
+++ /dev/null
@@ -1,206 +0,0 @@
-#include <assert.h>
-#include <glog/logging.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/times.h>
-#include <syscall.h>
-#include <time.h>
-
-#include "fftpack.h"
-#include "pffft_sapi.sapi.h"
-#include "sandboxed_api/util/flag.h"
-#include "sandboxed_api/vars.h"
-
-ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all);
-ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
-
-class pffftSapiSandbox : public pffftSandbox {
- public:
-  std::unique_ptr<sandbox2::Policy> ModifyPolicy(
-      sandbox2::PolicyBuilder*) override {
-    return sandbox2::PolicyBuilder()
-        .AllowStaticStartup()
-        .AllowOpen()
-        .AllowRead()
-        .AllowWrite()
-        .AllowSystemMalloc()
-        .AllowExit()
-        .AllowSyscalls({
-            __NR_futex,
-            __NR_close,
-            __NR_getrusage,
-        })
-        .DisableNamespaces()
-        .BuildOrDie();
-  }
-};
-
-double frand() { return rand() / (double)RAND_MAX; }
-
-double uclock_sec(void) { return (double)clock() / (double)CLOCKS_PER_SEC; }
-
-int array_output_format = 0;
-
-void show_output(const char* name, int N, int cplx, float flops, float t0,
-                 float t1, int max_iter) {
-  float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
-  if (array_output_format) {
-    if (flops != -1) {
-      printf("|%9.0f   ", mflops);
-    } else
-      printf("|      n/a   ");
-  } else {
-    if (flops != -1) {
-      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N,
-             (cplx ? "CPLX" : "REAL"), name, mflops,
-             (t1 - t0) / 2 / max_iter * 1e9, max_iter);
-    }
-  }
-  fflush(stdout);
-}
-
-int main(int argc, char* argv[]) {
-  /*
-  * Initialize Google's logging library.
-  */
-  google::InitGoogleLogging(argv[0]);
-
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  /*
-  * Nvalues is a vector keeping the values by which iterates N, its value 
-  * representing the input length. More concrete, N is the number of
-  * data points the caclulus is up to (determinating its accuracy).
-  * To show the performance of Fast-Fourier Transformations the program is 
-  * testing for various values of N.
-  */
-  int Nvalues[] = {64,    96,     128,        160,         192,     256,
-                   384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
-                   1024,  2048,   2400,       4096,        8192,    9 * 1024,
-                   16384, 32768,  256 * 1024, 1024 * 1024, -1};
-  int i;
-
-  VLOG(1) << "Initializing sandbox...\n";
-
-  pffftSapiSandbox sandbox;
-  sandbox.Init().IgnoreError();
-
-  VLOG(1) << "Initialization: " << sandbox.Init().ToString().c_str() << "\n";
-
-  pffftApi api(&sandbox);
-
-  int N, cplx;
-
-  cplx = 0;
-
-  do {
-    for (i = 0; i < 23; i++) {
-      N = Nvalues[i];
-
-      int Nfloat = N * (cplx ? 2 : 1);
-      int Nbytes = Nfloat * sizeof(float);
-      int pass;
-
-      float ref[Nbytes], in[Nbytes], out[Nbytes], tmp[Nbytes], tmp2[Nbytes];
-
-      sapi::v::Array<float> ref_(ref, Nbytes);
-      sapi::v::Array<float> in_(in, Nbytes);
-      sapi::v::Array<float> out_(out, Nbytes);
-      sapi::v::Array<float> tmp_(tmp, Nbytes);
-      sapi::v::Array<float> tmp2_(tmp2, Nbytes);
-
-      float wrk[2 * Nbytes + 15 * sizeof(float)];
-      sapi::v::Array<float> wrk_(wrk, 2 * Nbytes + 15 * sizeof(float));
-
-      float ref_max = 0;
-      int k;
-
-      Nfloat = (cplx ? N * 2 : N);
-      float X[Nbytes], Y[Nbytes], Z[Nbytes];
-      sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
-
-      double t0, t1, flops;
-
-      int max_iter = 5120000 / N * 4;
-  #ifdef __arm__
-      max_iter /= 4;
-  #endif
-      int iter;
-
-      for (k = 0; k < Nfloat; ++k) {
-        X[k] = 0;
-      }
-
-      /*
-      * FFTPack benchmark
-      */
-      {
-        /*
-        * SIMD_SZ == 4 (returning value of pffft_simd_size())
-        */
-        int max_iter_ =
-            max_iter / 4; 
-
-        if (max_iter_ == 0) max_iter_ = 1;
-        if (cplx) {
-          api.cffti(N, wrk_.PtrBoth()).IgnoreError();
-        } else {
-          api.rffti(N, wrk_.PtrBoth()).IgnoreError();
-        }
-        t0 = uclock_sec();
-
-        for (iter = 0; iter < max_iter_; ++iter) {
-          if (cplx) {
-            api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-            api.cfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-          } else {
-            api.rfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-            api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-          }
-        }
-        t1 = uclock_sec();
-
-        flops = (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-        show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
-      }
-
-      /*
-      * PFFFT benchmark
-      */ 
-      {
-        sapi::StatusOr<PFFFT_Setup*> s =
-            api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-
-        VLOG(1) << "Setup status is: " << s.status().ToString().c_str() << "\n";
-
-        if (s.ok()) {
-          sapi::v::RemotePtr s_reg(s.value());
-
-          t0 = uclock_sec();
-          for (iter = 0; iter < max_iter; ++iter) {
-            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
-                                PFFFT_FORWARD)
-                .IgnoreError();
-            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(), Y_.PtrBoth(),
-                                PFFFT_FORWARD)
-                .IgnoreError();
-          }
-
-          t1 = uclock_sec();
-          api.pffft_destroy_setup(&s_reg).IgnoreError();
-
-          flops =
-              (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-          show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
-        }
-
-        VLOG(1) << "N = " << N << " SUCCESSFULLY\n\n";
-      }
-    } 
-
-    cplx = !cplx;
-  } while (cplx);
-
-  return 0;
-}
\ No newline at end of file

From 726a9345fffac8c152cc9f36e1a3b0d5b437f3e7 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 12:37:34 +0000
Subject: [PATCH 15/29] Enable namespaces

---
 oss-internship-2020/pffft/README.md               | 8 +++++++-
 oss-internship-2020/pffft/main_pffft_sandboxed.cc | 9 ++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index 52facf3..b3ceef8 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -64,4 +64,10 @@ In the end, the performance of PFFFT library it is outlined by the output.*
     uintptr_t instead of PFFFT_Setup
 
     Solution: using "sapi::v::RemotePtr" instead of "sapi::v::GenericPtr" 
-    to access the memory of object s
\ No newline at end of file
+    to access the memory of object s
+
+    - [Unresolved] compiling bug: "No spave left on device"
+    The building process creates some `embed` files that use lots of 
+    memory, trying to write them on /tmp.
+    
+    Temporary solution: clean /tmp directory by `sudo rm -rf /tmp/*`.
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index 662b949..b9a5a90 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -34,7 +34,6 @@ class pffftSapiSandbox : public pffftSandbox {
             __NR_close,
             __NR_getrusage,
         })
-        .DisableNamespaces()
         .BuildOrDie();
   }
 };
@@ -83,12 +82,12 @@ int main(int argc, char* argv[]) {
                    16384, 32768,  256 * 1024, 1024 * 1024, -1};
   int i;
 
-  VLOG(1) << "Initializing sandbox...\n";
+  LOG(INFO) << "Initializing sandbox...\n";
 
   pffftSapiSandbox sandbox;
   sandbox.Init().IgnoreError();
 
-  VLOG(1) << "Initialization: " << sandbox.Init().ToString().c_str() << "\n";
+  LOG(INFO) << "Initialization: " << sandbox.Init().ToString().c_str() << "\n";
 
   pffftApi api(&sandbox);
 
@@ -174,7 +173,7 @@ int main(int argc, char* argv[]) {
         sapi::StatusOr<PFFFT_Setup*> s =
             api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
 
-        VLOG(1) << "Setup status is: " << s.status().ToString().c_str() << "\n";
+        LOG(INFO) << "Setup status is: " << s.status().ToString().c_str() << "\n";
 
         if (s.ok()) {
           sapi::v::RemotePtr s_reg(s.value());
@@ -197,7 +196,7 @@ int main(int argc, char* argv[]) {
           show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
         }
 
-        VLOG(1) << "N = " << N << " SUCCESSFULLY\n\n";
+        LOG(INFO) << "N = " << N << " SUCCESSFULLY\n\n";
       }
     }
 

From a2873ac099bd08ddd08427ebeb74e6e6f334cf58 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 20 Aug 2020 13:15:02 +0000
Subject: [PATCH 16/29] Update README.md

---
 oss-internship-2020/pffft/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index b3ceef8..8b8b2bd 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -58,7 +58,7 @@ In the end, the performance of PFFFT library it is outlined by the output.*
     N > 64, status not OK
     Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; the memory that stays 
     for s is not the same with the address passed in pffft_transform function. 
-    (sapi :: v :: GenericPtr - to be changed)
+    (sapi::v::GenericPtr - to be changed)
 
     Temporary solution: change the generated files to accept 
     uintptr_t instead of PFFFT_Setup
@@ -66,8 +66,8 @@ In the end, the performance of PFFFT library it is outlined by the output.*
     Solution: using "sapi::v::RemotePtr" instead of "sapi::v::GenericPtr" 
     to access the memory of object s
 
-    - [Unresolved] compiling bug: "No spave left on device"
+    - [Unresolved] compiling bug: "No space left on device"
     The building process creates some `embed` files that use lots of 
     memory, trying to write them on /tmp.
-    
+
     Temporary solution: clean /tmp directory by `sudo rm -rf /tmp/*`.
\ No newline at end of file

From 8f21b0e931ed702876a5421bf2bbf13374a58987 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Wed, 26 Aug 2020 11:23:33 +0000
Subject: [PATCH 17/29] Coding style update

---
 .../pffft/main_pffft_sandboxed.cc             | 66 +++++++------------
 1 file changed, 23 insertions(+), 43 deletions(-)

diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index b9a5a90..4da5abd 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -1,15 +1,14 @@
-#define GOOGLE_STRIP_LOG 1
-
-#include <assert.h>
 #include <glog/logging.h>
-#include <math.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/times.h>
 #include <syscall.h>
 #include <time.h>
 
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+
 #include "fftpack.h"
 #include "pffft_sapi.sapi.h"
 #include "sandboxed_api/util/flag.h"
@@ -38,14 +37,12 @@ class pffftSapiSandbox : public pffftSandbox {
   }
 };
 
-double frand() { return rand() / (double)RAND_MAX; }
-
-double uclock_sec(void) { return (double)clock() / (double)CLOCKS_PER_SEC; }
+double UclockSec(void) { return (double)clock() / (double)CLOCKS_PER_SEC; }
 
 int array_output_format = 0;
 
-void show_output(const char* name, int N, int cplx, float flops, float t0,
-                 float t1, int max_iter) {
+void ShowOutput(const char* name, int N, int cplx, float flops, float t0,
+                float t1, int max_iter) {
   float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
   if (array_output_format) {
     if (flops != -1) {
@@ -85,39 +82,21 @@ int main(int argc, char* argv[]) {
   LOG(INFO) << "Initializing sandbox...\n";
 
   pffftSapiSandbox sandbox;
-  sandbox.Init().IgnoreError();
+  absl::Status init_status = sandbox.Init();
 
-  LOG(INFO) << "Initialization: " << sandbox.Init().ToString().c_str() << "\n";
+  LOG(INFO) << "Initialization: " << init_status.ToString().c_str() << "\n";
 
   pffftApi api(&sandbox);
-
-  int N, cplx;
-
-  cplx = 0;
+  int cplx = 0;
 
   do {
-    for (i = 0; i < 23; i++) {
-      N = Nvalues[i];
-
-      int Nfloat = N * (cplx ? 2 : 1);
+    for (int N : Nvalues) {
+      const int Nfloat = N * (cplx ? 2 : 1);
       int Nbytes = Nfloat * sizeof(float);
-      int pass;
 
-      float ref[Nbytes], in[Nbytes], out[Nbytes], tmp[Nbytes], tmp2[Nbytes];
+      float wrk[2 * Nfloat + 15 * sizeof(float)];
+      sapi::v::Array<float> wrk_(wrk, 2 * Nfloat + 15 * sizeof(float));
 
-      sapi::v::Array<float> ref_(ref, Nbytes);
-      sapi::v::Array<float> in_(in, Nbytes);
-      sapi::v::Array<float> out_(out, Nbytes);
-      sapi::v::Array<float> tmp_(tmp, Nbytes);
-      sapi::v::Array<float> tmp2_(tmp2, Nbytes);
-
-      float wrk[2 * Nbytes + 15 * sizeof(float)];
-      sapi::v::Array<float> wrk_(wrk, 2 * Nbytes + 15 * sizeof(float));
-
-      float ref_max = 0;
-      int k;
-
-      Nfloat = (cplx ? N * 2 : N);
       float X[Nbytes], Y[Nbytes], Z[Nbytes];
       sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
 
@@ -127,7 +106,7 @@ int main(int argc, char* argv[]) {
 #ifdef __arm__
       max_iter /= 4;
 #endif
-      int iter;
+      int iter, k;
 
       for (k = 0; k < Nfloat; ++k) {
         X[k] = 0;
@@ -148,7 +127,7 @@ int main(int argc, char* argv[]) {
         } else {
           api.rffti(N, wrk_.PtrBoth()).IgnoreError();
         }
-        t0 = uclock_sec();
+        t0 = UclockSec();
 
         for (iter = 0; iter < max_iter_; ++iter) {
           if (cplx) {
@@ -159,11 +138,11 @@ int main(int argc, char* argv[]) {
             api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
           }
         }
-        t1 = uclock_sec();
+        t1 = UclockSec();
 
         flops =
             (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-        show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+        ShowOutput("FFTPack", N, cplx, flops, t0, t1, max_iter_);
       }
 
       /*
@@ -173,12 +152,13 @@ int main(int argc, char* argv[]) {
         sapi::StatusOr<PFFFT_Setup*> s =
             api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
 
-        LOG(INFO) << "Setup status is: " << s.status().ToString().c_str() << "\n";
+        LOG(INFO) << "Setup status is: " << s.status().ToString().c_str()
+                  << "\n";
 
         if (s.ok()) {
           sapi::v::RemotePtr s_reg(s.value());
 
-          t0 = uclock_sec();
+          t0 = UclockSec();
           for (iter = 0; iter < max_iter; ++iter) {
             api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
                                 Y_.PtrBoth(), PFFFT_FORWARD)
@@ -188,12 +168,12 @@ int main(int argc, char* argv[]) {
                 .IgnoreError();
           }
 
-          t1 = uclock_sec();
+          t1 = UclockSec();
           api.pffft_destroy_setup(&s_reg).IgnoreError();
 
           flops =
               (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-          show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
+          ShowOutput("PFFFT", N, cplx, flops, t0, t1, max_iter);
         }
 
         LOG(INFO) << "N = " << N << " SUCCESSFULLY\n\n";

From 139723d3b8ea803417ed1c654302d26991bbdbd0 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Wed, 26 Aug 2020 14:18:31 +0000
Subject: [PATCH 18/29] Added LICENSE & coding changes required

---
 oss-internship-2020/pffft/CMakeLists.txt      | 16 +++-
 oss-internship-2020/pffft/Makefile            | 15 ---
 oss-internship-2020/pffft/README.md           | 29 ++++--
 .../{pffft_library_notes.txt => README.txt}   |  0
 .../pffft/main_pffft_sandboxed.cc             | 93 ++++++++++---------
 .../pffft/{main_pffft.c => test_pffft.c}      |  0
 6 files changed, 85 insertions(+), 68 deletions(-)
 delete mode 100644 oss-internship-2020/pffft/Makefile
 rename oss-internship-2020/pffft/{pffft_library_notes.txt => README.txt} (100%)
 rename oss-internship-2020/pffft/{main_pffft.c => test_pffft.c} (100%)

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index 6c7e776..442bb74 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -1,3 +1,17 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 cmake_minimum_required(VERSION 3.10)
 
 project(pffft CXX C)
@@ -13,7 +27,7 @@ add_library(pffft STATIC
 )
 
 add_executable(pffft_main
-    main_pffft.c
+    test_pffft.c
 )
 
 target_link_libraries(pffft_main PRIVATE 
diff --git a/oss-internship-2020/pffft/Makefile b/oss-internship-2020/pffft/Makefile
deleted file mode 100644
index 326fd90..0000000
--- a/oss-internship-2020/pffft/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-CXXFLAGS ?= -std=c++17
-
-pffft_main: test_pffft.o libpffft.a
-	$(CXX) -o $@ $(CXXFLAGS) $(LDFLAGS) $^
-
-libpffft.a: pffft.o fftpack.o
-	ar rcs $@ $^
-
-pffft.c: pffft.h
-
-fftpack.c: fftpack.h
-
-.PHONY: clean
-clean: 
-	rm -f *.o *.a pffft_main
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index 8b8b2bd..6cef583 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -1,14 +1,23 @@
 # Sandboxing PFFFT library
 
-Builder: CMake  
+Build System: CMake  
 OS: Linux
 
+### Check out the PFFFT library & CMake set up
+`mkdir -p build && cd build`
+
+`git clone https://bitbucket.org/jpommier/pffft.git`
+
+`cmake .. -G Ninja -DPFFFT_ROOT_DIR=$PWD`
+
+`ninja`
+
 ### For testing: 
 `cd build`, then `./pffft_sandboxed`
 
 ### For debug:
-`SAPI_VLOG_LEVEL=1 ./pffft_sandboxed --v=100
---sandbox2_danger_danger_permit_all_and_log <auxiliar file>`
+display custom info with
+`./pffft_sandboxed --logtostderr`
 
 ## ***About the project*** 
 *PFFFT library is concerned with 1D Fast-Fourier Transformations finding a
@@ -38,22 +47,22 @@ it is taken into account while testing.
 In the end, the performance of PFFFT library it is outlined by the output.*
 
 #### CMake observations resume:
-    * linking pffft and fftpack (which contains necessary functions for pffft)
-    * set math library 
+* linking pffft and fftpack (which contains necessary functions for pffft)
+* set math library 
 
 #### Sandboxed main observations resume:
-    * containing two testing parts (fft / pffft benchmarks)
-    * showing the performance of the transformations implies 
+* containing two testing parts (fft / pffft benchmarks)
+* showing the performance of the transformations implies 
     testing them through various FFT dimenstions. 
     Variable N, the input length, will take specific values 
     meaning the number of points to which it is set the calculus 
     (more details of mathematical purpose of N - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm). 
-    * output shows speed depending on the input length
+* output shows speed depending on the input length
 
     
 ### Bugs history
-    - [Solved] pffft benchmark bug: "Sandbox not active"  
+    1. [Solved] pffft benchmark bug: "Sandbox not active"  
     N = 64, status OK, pffft_transform generates error 
     N > 64, status not OK
     Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; the memory that stays 
@@ -66,7 +75,7 @@ In the end, the performance of PFFFT library it is outlined by the output.*
     Solution: using "sapi::v::RemotePtr" instead of "sapi::v::GenericPtr" 
     to access the memory of object s
 
-    - [Unresolved] compiling bug: "No space left on device"
+    2. [Unresolved] compiling bug: "No space left on device"
     The building process creates some `embed` files that use lots of 
     memory, trying to write them on /tmp.
 
diff --git a/oss-internship-2020/pffft/pffft_library_notes.txt b/oss-internship-2020/pffft/README.txt
similarity index 100%
rename from oss-internship-2020/pffft/pffft_library_notes.txt
rename to oss-internship-2020/pffft/README.txt
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index 4da5abd..8b7e6d1 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -1,3 +1,17 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <glog/logging.h>
 #include <stdlib.h>
 #include <string.h>
@@ -17,7 +31,7 @@
 ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all);
 ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
 
-class pffftSapiSandbox : public pffftSandbox {
+class PffftSapiSandbox : public pffftSandbox {
  public:
   std::unique_ptr<sandbox2::Policy> ModifyPolicy(
       sandbox2::PolicyBuilder*) override {
@@ -60,28 +74,26 @@ void ShowOutput(const char* name, int N, int cplx, float flops, float t0,
 }
 
 int main(int argc, char* argv[]) {
-  /*
-   * Initialize Google's logging library.
-   */
+  
+  // Initialize Google's logging library.
   google::InitGoogleLogging(argv[0]);
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-  /*
-   * Nvalues is a vector keeping the values by which iterates N, its value
-   * representing the input length. More concrete, N is the number of
-   * data points the caclulus is up to (determinating its accuracy).
-   * To show the performance of Fast-Fourier Transformations the program is
-   * testing for various values of N.
-   */
+
+  // Nvalues is a vector keeping the values by which iterates N, its value
+  // representing the input length. More concrete, N is the number of
+  // data points the caclulus is up to (determinating its accuracy).
+  // To show the performance of Fast-Fourier Transformations the program is
+  // testing for various values of N.
   int Nvalues[] = {64,    96,     128,        160,         192,     256,
                    384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
                    1024,  2048,   2400,       4096,        8192,    9 * 1024,
-                   16384, 32768,  256 * 1024, 1024 * 1024, -1};
+                   16384, 32768};
   int i;
 
   LOG(INFO) << "Initializing sandbox...\n";
 
-  pffftSapiSandbox sandbox;
+  PffftSapiSandbox sandbox;
   absl::Status init_status = sandbox.Init();
 
   LOG(INFO) << "Initialization: " << init_status.ToString().c_str() << "\n";
@@ -112,13 +124,9 @@ int main(int argc, char* argv[]) {
         X[k] = 0;
       }
 
-      /*
-       * FFTPack benchmark
-       */
+      // FFTPack benchmark
       {
-        /*
-         * SIMD_SZ == 4 (returning value of pffft_simd_size())
-         */
+        // SIMD_SZ == 4 (returning value of pffft_simd_size())
         int max_iter_ = max_iter / 4;
 
         if (max_iter_ == 0) max_iter_ = 1;
@@ -144,10 +152,8 @@ int main(int argc, char* argv[]) {
             (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
         ShowOutput("FFTPack", N, cplx, flops, t0, t1, max_iter_);
       }
-
-      /*
-       * PFFFT benchmark
-       */
+      
+      // PFFFT benchmark
       {
         sapi::StatusOr<PFFFT_Setup*> s =
             api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
@@ -155,27 +161,30 @@ int main(int argc, char* argv[]) {
         LOG(INFO) << "Setup status is: " << s.status().ToString().c_str()
                   << "\n";
 
-        if (s.ok()) {
-          sapi::v::RemotePtr s_reg(s.value());
-
-          t0 = UclockSec();
-          for (iter = 0; iter < max_iter; ++iter) {
-            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
-                                Y_.PtrBoth(), PFFFT_FORWARD)
-                .IgnoreError();
-            api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
-                                Y_.PtrBoth(), PFFFT_FORWARD)
-                .IgnoreError();
-          }
-
-          t1 = UclockSec();
-          api.pffft_destroy_setup(&s_reg).IgnoreError();
-
-          flops =
-              (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-          ShowOutput("PFFFT", N, cplx, flops, t0, t1, max_iter);
+        if (!s.ok()) {
+          printf("Sandbox failed.\n");
+          return 1;
         }
 
+        sapi::v::RemotePtr s_reg(s.value());
+
+        t0 = UclockSec();
+        for (iter = 0; iter < max_iter; ++iter) {
+          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
+                              Y_.PtrBoth(), PFFFT_FORWARD)
+              .IgnoreError();
+          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
+                              Y_.PtrBoth(), PFFFT_FORWARD)
+              .IgnoreError();
+        }
+
+        t1 = UclockSec();
+        api.pffft_destroy_setup(&s_reg).IgnoreError();
+
+        flops =
+            (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
+        ShowOutput("PFFFT", N, cplx, flops, t0, t1, max_iter);
+
         LOG(INFO) << "N = " << N << " SUCCESSFULLY\n\n";
       }
     }
diff --git a/oss-internship-2020/pffft/main_pffft.c b/oss-internship-2020/pffft/test_pffft.c
similarity index 100%
rename from oss-internship-2020/pffft/main_pffft.c
rename to oss-internship-2020/pffft/test_pffft.c

From b08726540a76c290e8c339e0f2b6bee65ddaf889 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Wed, 26 Aug 2020 14:25:11 +0000
Subject: [PATCH 19/29] Added PFFFT submodule - master directory

---
 .gitmodules                      | 3 +++
 oss-internship-2020/pffft/master | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 oss-internship-2020/pffft/master

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..793361c
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "oss-internship-2020/pffft/master"]
+	path = oss-internship-2020/pffft/master
+	url = https://bitbucket.org/jpommier/pffft/src/master/
diff --git a/oss-internship-2020/pffft/master b/oss-internship-2020/pffft/master
new file mode 160000
index 0000000..74d7261
--- /dev/null
+++ b/oss-internship-2020/pffft/master
@@ -0,0 +1 @@
+Subproject commit 74d7261be17cf659d5930d4830609406bd7553e3

From c8d07aeaa59e5850d212644357af31a54f4787fb Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Wed, 26 Aug 2020 14:56:19 +0000
Subject: [PATCH 20/29] Changing paths for propertly usage of the submodule

---
 oss-internship-2020/pffft/CMakeLists.txt      |   12 +-
 oss-internship-2020/pffft/README.md           |    4 +-
 oss-internship-2020/pffft/README.txt          |  416 ---
 oss-internship-2020/pffft/fftpack.c           | 3112 -----------------
 oss-internship-2020/pffft/fftpack.h           |  799 -----
 .../pffft/main_pffft_sandboxed.cc             |    1 -
 oss-internship-2020/pffft/pffft.c             | 1881 ----------
 oss-internship-2020/pffft/pffft.h             |  177 -
 oss-internship-2020/pffft/test_pffft.c        |  419 ---
 9 files changed, 8 insertions(+), 6813 deletions(-)
 delete mode 100644 oss-internship-2020/pffft/README.txt
 delete mode 100644 oss-internship-2020/pffft/fftpack.c
 delete mode 100644 oss-internship-2020/pffft/fftpack.h
 delete mode 100644 oss-internship-2020/pffft/pffft.c
 delete mode 100644 oss-internship-2020/pffft/pffft.h
 delete mode 100644 oss-internship-2020/pffft/test_pffft.c

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index 442bb74..c2a7bcd 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -20,14 +20,14 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
 add_library(pffft STATIC 
-    pffft.c 
-    pffft.h 
-    fftpack.c 
-    fftpack.h
+    master/pffft.c 
+    master/pffft.h 
+    master/fftpack.c 
+    master/fftpack.h
 )
 
 add_executable(pffft_main
-    test_pffft.c
+    master/test_pffft.c
 )
 
 target_link_libraries(pffft_main PRIVATE 
@@ -84,7 +84,7 @@ add_sapi_library(pffft_sapi
               sinti 
               sint
 
-    INPUTS pffft.h fftpack.h
+    INPUTS master/pffft.h master/fftpack.h
     LIBRARY pffft
     LIBRARY_NAME pffft
 
diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index 6cef583..2d2a9ca 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -4,9 +4,9 @@ Build System: CMake
 OS: Linux
 
 ### Check out the PFFFT library & CMake set up
-`mkdir -p build && cd build`
+`git submodule add https://bitbucket.org/jpommier/pffft.git`
 
-`git clone https://bitbucket.org/jpommier/pffft.git`
+`mkdir -p build && cd build`
 
 `cmake .. -G Ninja -DPFFFT_ROOT_DIR=$PWD`
 
diff --git a/oss-internship-2020/pffft/README.txt b/oss-internship-2020/pffft/README.txt
deleted file mode 100644
index ee20b42..0000000
--- a/oss-internship-2020/pffft/README.txt
+++ /dev/null
@@ -1,416 +0,0 @@
-PFFFT: a pretty fast FFT.
-
-TL;DR
---
-
-PFFFT does 1D Fast Fourier Transforms, of single precision real and
-complex vectors. It tries do it fast, it tries to be correct, and it
-tries to be small. Computations do take advantage of SSE1 instructions
-on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
-license is BSD-like.
-
-
-Why does it exist:
---
-
-I was in search of a good performing FFT library , preferably very
-small and with a very liberal license.
-
-When one says "fft library", FFTW ("Fastest Fourier Transform in the
-West") is probably the first name that comes to mind -- I guess that
-99% of open-source projects that need a FFT do use FFTW, and are happy
-with it. However, it is quite a large library , which does everything
-fft related (2d transforms, 3d transforms, other transformations such
-as discrete cosine , or fast hartley). And it is licensed under the
-GNU GPL , which means that it cannot be used in non open-source
-products.
-
-An alternative to FFTW that is really small, is the venerable FFTPACK
-v4, which is available on NETLIB. A more recent version (v5) exists,
-but it is larger as it deals with multi-dimensional transforms. This
-is a library that is written in FORTRAN 77, a language that is now
-considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
-by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite
-its age, benchmarks show it that it still a very good performing FFT
-library, see for example the 1d single precision benchmarks here:
-http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
-competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
-Apple vDSP. The reason for that is that those libraries do take
-advantage of the SSE SIMD instructions available on Intel CPUs,
-available since the days of the Pentium III. These instructions deal
-with small vectors of 4 floats at a time, instead of a single float
-for a traditionnal FPU, so when using these instructions one may expect
-a 4-fold performance improvement.
-
-The idea was to take this fortran fftpack v4 code, translate to C,
-modify it to deal with those SSE instructions, and check that the
-final performance is not completely ridiculous when compared to other
-SIMD FFT libraries. Translation to C was performed with f2c (
-http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
-order to remove the thousands of gotos that were introduced by
-f2c. You will find the fftpack.h and fftpack.c sources in the
-repository, this a complete translation of
-http://www.netlib.org/fftpack/ , with the discrete cosine transform
-and the test program. There is no license information in the netlib
-repository, but it was confirmed to me by the fftpack v5 curators that
-the same terms do apply to fftpack v4:
-http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
-"BSD-like" license, it is compatible with proprietary projects.
-
-Adapting fftpack to deal with the SIMD 4-element vectors instead of
-scalar single precision numbers was more complex than I originally
-thought, especially with the real transforms, and I ended up writing
-more code than I planned..
-
-
-The code:
---
-
-Only two files, in good old C, pffft.c and pffft.h . The API is very
-very simple, just make sure that you read the comments in pffft.h.
-
-
-Comparison with other FFTs:
---
-
-The idea was not to break speed records, but to get a decently fast
-fft that is at least 50% as fast as the fastest FFT -- especially on
-slowest computers . I'm more focused on getting the best performance
-on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
-on getting top performance on today fastest cpus.
-
-It can be used in a real-time context as the fft functions do not
-perform any memory allocation -- that is why they accept a 'work'
-array in their arguments.
-
-It is also a bit focused on performing 1D convolutions, that is why it
-provides "unordered" FFTs , and a fourier domain convolution
-operation.
-
-
-Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9, cortex-A15, A8X)
---
-
-The benchmark shows the performance of various fft implementations measured in 
-MFlops, with the number of floating point operations being defined as 5Nlog2(N)
-for a length N complex fft, and 2.5*Nlog2(N) for a real fft. 
-See http://www.fftw.org/speed/method.html for an explanation of these formulas.
-
-MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
-
-Built with:
-
- gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f 
-
-| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-|       64  |     2816   |     8596   |     7329   |     8187   | |     2887   |    14898   |    14668   |    11108   |
-|       96  |     3298   |      n/a   |     8378   |     7727   | |     3953   |      n/a   |    15680   |    10878   |
-|      128  |     3507   |    11575   |     9266   |    10108   | |     4233   |    17598   |    16427   |    12000   |
-|      160  |     3391   |      n/a   |     9838   |    10711   | |     4220   |      n/a   |    16653   |    11187   |
-|      192  |     3919   |      n/a   |     9868   |    10956   | |     4297   |      n/a   |    15770   |    12540   |
-|      256  |     4283   |    13179   |    10694   |    13128   | |     4545   |    19550   |    16350   |    13822   |
-|      384  |     3136   |      n/a   |    10810   |    12061   | |     3600   |      n/a   |    16103   |    13240   |
-|      480  |     3477   |      n/a   |    10632   |    12074   | |     3536   |      n/a   |    11630   |    12522   |
-|      512  |     3783   |    15141   |    11267   |    13838   | |     3649   |    20002   |    16560   |    13580   |
-|      640  |     3639   |      n/a   |    11164   |    13946   | |     3695   |      n/a   |    15416   |    13890   |
-|      768  |     3800   |      n/a   |    11245   |    13495   | |     3590   |      n/a   |    15802   |    14552   |
-|      800  |     3440   |      n/a   |    10499   |    13301   | |     3659   |      n/a   |    12056   |    13268   |
-|     1024  |     3924   |    15605   |    11450   |    15339   | |     3769   |    20963   |    13941   |    15467   |
-|     2048  |     4518   |    16195   |    11551   |    15532   | |     4258   |    20413   |    13723   |    15042   |
-|     2400  |     4294   |      n/a   |    10685   |    13078   | |     4093   |      n/a   |    12777   |    13119   |
-|     4096  |     4750   |    16596   |    11672   |    15817   | |     4157   |    19662   |    14316   |    14336   |
-|     8192  |     3820   |    16227   |    11084   |    12555   | |     3691   |    18132   |    12102   |    13813   |
-|     9216  |     3864   |      n/a   |    10254   |    12870   | |     3586   |      n/a   |    12119   |    13994   |
-|    16384  |     3822   |    15123   |    10454   |    12822   | |     3613   |    16874   |    12370   |    13881   |
-|    32768  |     4175   |    14512   |    10662   |    11095   | |     3881   |    14702   |    11619   |    11524   |
-|   262144  |     3317   |    11429   |     6269   |     9517   | |     2810   |    11729   |     7757   |    10179   |
-|  1048576  |     2913   |    10551   |     4730   |     5867   | |     2661   |     7881   |     3520   |     5350   |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-
-
-Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
-
-| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-|           64     |      3840    |      7680    |      8777    | |      4389    |     20480    |     11171    |
-|           96     |      4214    |      9633    |      8429    | |      4816    |     22477    |     11238    |
-|          128     |      3584    |     10240    |     10240    | |      5120    |     23893    |     11947    |
-|          192     |      4854    |     11095    |     12945    | |      4854    |     22191    |     14121    |
-|          256     |      4096    |     11703    |     16384    | |      5120    |     23406    |     13653    |
-|          384     |      4395    |     14651    |     12558    | |      4884    |     19535    |     14651    |
-|          512     |      5760    |     13166    |     15360    | |      4608    |     23040    |     15360    |
-|          768     |      4907    |     14020    |     16357    | |      4461    |     19628    |     14020    |
-|         1024     |      5120    |     14629    |     14629    | |      5120    |     20480    |     15754    |
-|         2048     |      5632    |     14080    |     18773    | |      4693    |     12516    |     16091    |
-|         4096     |      5120    |     13653    |     17554    | |      4726    |      7680    |     14456    |
-|         8192     |      4160    |      7396    |     13312    | |      4437    |     14791    |     13312    |
-|         9216     |      4210    |      6124    |     13473    | |      4491    |      7282    |     14970    |
-|        16384     |      3976    |     11010    |     14313    | |      4210    |     11450    |     13631    |
-|        32768     |      4260    |     10224    |     10954    | |      4260    |      6816    |     11797    |
-|       262144     |      3736    |      6896    |      9961    | |      2359    |      8965    |      9437    |
-|      1048576     |      2796    |      4534    |      6453    | |      1864    |      3078    |      5592    |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-
-
-
-MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
-
-Built with:
-
- gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
-
-| input len |real FFTPack|  real vDSP |  real FFTW | real PFFFT | |cplx FFTPack|  cplx vDSP |  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-|      64   |      745   |     2145   |     1706   |     2028   | |      961   |     3356   |     3313   |     2300   |
-|      96   |      877   |      n/a   |     1976   |     1978   | |     1059   |      n/a   |     3333   |     2233   |
-|     128   |      951   |     2808   |     2213   |     2279   | |     1202   |     3803   |     3739   |     2494   |
-|     192   |     1002   |      n/a   |     2456   |     2429   | |     1186   |      n/a   |     3701   |     2508   |
-|     256   |     1065   |     3205   |     2641   |     2793   | |     1302   |     4013   |     3912   |     2663   |
-|     384   |      845   |      n/a   |     2759   |     2499   | |      948   |      n/a   |     3729   |     2504   |
-|     512   |      900   |     3476   |     2956   |     2759   | |      974   |     4057   |     3954   |     2645   |
-|     768   |      910   |      n/a   |     2912   |     2737   | |      975   |      n/a   |     3837   |     2614   |
-|    1024   |      936   |     3583   |     3107   |     3009   | |     1006   |     4124   |     3821   |     2697   |
-|    2048   |     1057   |     3585   |     3091   |     2837   | |     1089   |     3889   |     3701   |     2513   |
-|    4096   |     1083   |     3524   |     3092   |     2733   | |     1039   |     3617   |     3462   |     2364   |
-|    8192   |      874   |     3252   |     2967   |     2363   | |      911   |     3106   |     2789   |     2302   |
-|    9216   |      898   |      n/a   |     2420   |     2290   | |      865   |      n/a   |     2676   |     2204   |
-|   16384   |      903   |     2892   |     2506   |     2421   | |      899   |     3026   |     2797   |     2289   |
-|   32768   |      965   |     2837   |     2550   |     2358   | |      920   |     2922   |     2763   |     2240   |
-|  262144   |      738   |     2422   |     1589   |     1708   | |      610   |     2038   |     1436   |     1091   |
-| 1048576   |      528   |     1207   |      845   |      880   | |      606   |     1020   |      669   |     1036   |
-|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
-
-
-
-Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |     1920   |     3614   |     5120   | |     2194   |     7680   |     6467   |
-|       96  |     1873   |     3549   |     5187   | |     2107   |     8429   |     5863   |
-|      128  |     2240   |     3773   |     5514   | |     2560   |     7964   |     6827   |
-|      192  |     1765   |     4569   |     7767   | |     2284   |     9137   |     7061   |
-|      256  |     2048   |     5461   |     7447   | |     2731   |     9638   |     7802   |
-|      384  |     1998   |     5861   |     6762   | |     2313   |     9253   |     7644   |
-|      512  |     2095   |     6144   |     7680   | |     2194   |    10240   |     7089   |
-|      768  |     2230   |     5773   |     7549   | |     2045   |    10331   |     7010   |
-|     1024  |     2133   |     6400   |     8533   | |     2133   |    10779   |     7877   |
-|     2048  |     2011   |     7040   |     8665   | |     1942   |    10240   |     7768   |
-|     4096  |     2194   |     6827   |     8777   | |     1755   |     9452   |     6827   |
-|     8192  |     1849   |     6656   |     6656   | |     1752   |     7831   |     6827   |
-|     9216  |     1871   |     5858   |     6416   | |     1643   |     6909   |     6266   |
-|    16384  |     1883   |     6223   |     6506   | |     1664   |     7340   |     6982   |
-|    32768  |     1826   |     6390   |     6667   | |     1631   |     7481   |     6971   |
-|   262144  |     1546   |     4075   |     5977   | |     1299   |     3415   |     3551   |
-|  1048576  |     1104   |     2071   |     1730   | |     1104   |     1149   |     1834   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-
-
-Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
-
-Built with:
-gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-| N (input length) | real FFTPack |   real FFTW  |  real PFFFT  | | cplx FFTPack |   cplx FFTW  |  cplx PFFFT  |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-|           64     |       452    |      1041    |      1336    | |       549    |      2318    |      1781    |
-|           96     |       444    |      1297    |      1297    | |       503    |      2408    |      1686    |
-|          128     |       527    |      1525    |      1707    | |       543    |      2655    |      1886    |
-|          192     |       498    |      1653    |      1849    | |       539    |      2678    |      1942    |
-|          256     |       585    |      1862    |      2156    | |       594    |      2777    |      2244    |
-|          384     |       499    |      1870    |      1998    | |       511    |      2586    |      1890    |
-|          512     |       562    |      2095    |      2194    | |       542    |      2973    |      2194    |
-|          768     |       545    |      2045    |      2133    | |       545    |      2365    |      2133    |
-|         1024     |       595    |      2133    |      2438    | |       569    |      2695    |      2179    |
-|         2048     |       587    |      2125    |      2347    | |       521    |      2230    |      1707    |
-|         4096     |       495    |      1890    |      1834    | |       492    |      1876    |      1672    |
-|         8192     |       469    |      1548    |      1729    | |       438    |      1740    |      1664    |
-|         9216     |       468    |      1663    |      1663    | |       446    |      1585    |      1531    |
-|        16384     |       453    |      1608    |      1767    | |       398    |      1476    |      1664    |
-|        32768     |       456    |      1420    |      1503    | |       387    |      1388    |      1345    |
-|       262144     |       309    |       385    |       726    | |       262    |       415    |       840    |
-|      1048576     |       280    |       351    |       739    | |       261    |       313    |       797    |
-|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
-
-
-
-Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
-
-Built with:
-cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
-
-(visual c++ is definitively not very good with SSE intrinsics...)
-
-| N (input length) | real FFTPack |  real PFFFT  | | cplx FFTPack |  cplx PFFFT  |
-|------------------+--------------+--------------| |--------------+--------------|
-|           64     |       173    |      1009    | |       174    |      1159    |
-|           96     |       169    |      1029    | |       188    |      1201    |
-|          128     |       195    |      1242    | |       191    |      1275    |
-|          192     |       178    |      1312    | |       184    |      1276    |
-|          256     |       196    |      1591    | |       186    |      1281    |
-|          384     |       172    |      1409    | |       181    |      1281    |
-|          512     |       187    |      1640    | |       181    |      1313    |
-|          768     |       171    |      1614    | |       176    |      1258    |
-|         1024     |       186    |      1812    | |       178    |      1223    |
-|         2048     |       190    |      1707    | |       186    |      1099    |
-|         4096     |       182    |      1446    | |       177    |       975    |
-|         8192     |       175    |      1345    | |       169    |      1034    |
-|         9216     |       165    |      1271    | |       168    |      1023    |
-|        16384     |       166    |      1396    | |       165    |       949    |
-|        32768     |       172    |      1311    | |       161    |       881    |
-|       262144     |       136    |       632    | |       134    |       629    |
-|      1048576     |       134    |       698    | |       127    |       623    |
-|------------------+--------------+--------------| |--------------+--------------|
-
-
-
-Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
-
-Built with:
-gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |      549   |      452   |      731   | |      512   |      602   |      640   |
-|       96  |      421   |      272   |      702   | |      496   |      571   |      602   |
-|      128  |      498   |      512   |      815   | |      597   |      618   |      652   |
-|      160  |      521   |      536   |      815   | |      586   |      669   |      625   |
-|      192  |      539   |      571   |      883   | |      485   |      597   |      626   |
-|      256  |      640   |      539   |      975   | |      569   |      611   |      671   |
-|      384  |      499   |      610   |      879   | |      499   |      602   |      637   |
-|      480  |      518   |      507   |      877   | |      496   |      661   |      616   |
-|      512  |      524   |      591   |     1002   | |      549   |      678   |      668   |
-|      640  |      542   |      612   |      955   | |      568   |      663   |      645   |
-|      768  |      557   |      613   |      981   | |      491   |      663   |      598   |
-|      800  |      514   |      353   |      882   | |      514   |      360   |      574   |
-|     1024  |      640   |      640   |     1067   | |      492   |      683   |      602   |
-|     2048  |      587   |      640   |      908   | |      486   |      640   |      552   |
-|     2400  |      479   |      368   |      777   | |      422   |      376   |      518   |
-|     4096  |      511   |      614   |      853   | |      426   |      640   |      534   |
-|     8192  |      415   |      584   |      708   | |      386   |      622   |      516   |
-|     9216  |      419   |      571   |      687   | |      364   |      586   |      506   |
-|    16384  |      426   |      577   |      716   | |      398   |      606   |      530   |
-|    32768  |      417   |      572   |      673   | |      399   |      572   |      468   |
-|   262144  |      219   |      380   |      293   | |      255   |      431   |      343   |
-|  1048576  |      202   |      274   |      237   | |      265   |      282   |      355   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-Same platform as above, but this time pffft and fftpack are built with clang 3.2:
-
-clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
-
-| input len |real FFTPack|  real FFTW | real PFFFT | |cplx FFTPack|  cplx FFTW | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |      427   |      452   |      853   | |      427   |      602   |     1024   |
-|       96  |      351   |      276   |      843   | |      337   |      571   |      963   |
-|      128  |      373   |      512   |      996   | |      390   |      618   |     1054   |
-|      160  |      426   |      536   |      987   | |      375   |      669   |      914   |
-|      192  |      404   |      571   |     1079   | |      388   |      588   |     1079   |
-|      256  |      465   |      539   |     1205   | |      445   |      602   |     1170   |
-|      384  |      366   |      610   |     1099   | |      343   |      594   |     1099   |
-|      480  |      356   |      507   |     1140   | |      335   |      651   |      931   |
-|      512  |      411   |      591   |     1213   | |      384   |      649   |     1124   |
-|      640  |      398   |      612   |     1193   | |      373   |      654   |      901   |
-|      768  |      409   |      613   |     1227   | |      383   |      663   |     1044   |
-|      800  |      411   |      348   |     1073   | |      353   |      358   |      809   |
-|     1024  |      427   |      640   |     1280   | |      413   |      692   |     1004   |
-|     2048  |      414   |      626   |     1126   | |      371   |      640   |      853   |
-|     2400  |      399   |      373   |      898   | |      319   |      368   |      653   |
-|     4096  |      404   |      602   |     1059   | |      357   |      633   |      778   |
-|     8192  |      332   |      584   |      792   | |      308   |      616   |      716   |
-|     9216  |      322   |      561   |      783   | |      299   |      586   |      687   |
-|    16384  |      344   |      568   |      778   | |      314   |      617   |      745   |
-|    32768  |      342   |      564   |      737   | |      314   |      552   |      629   |
-|   262144  |      201   |      383   |      313   | |      227   |      435   |      413   |
-|  1048576  |      187   |      262   |      251   | |      228   |      281   |      409   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
-(the fftpack performance numbers are better with gcc), while clang is
-the best with neon intrinsics (see how pffft perf has improved with
-clang 3.2).
-
-
-NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
-
-Built with:
-gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
-
-| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
-|-----------+------------+------------| |------------+------------|
-|       64  |     1735   |     3308   | |     1994   |     3744   |
-|       96  |     1596   |     3448   | |     1987   |     3572   |
-|      128  |     1807   |     4076   | |     2255   |     3960   |
-|      160  |     1769   |     4083   | |     2071   |     3845   |
-|      192  |     1990   |     4233   | |     2017   |     3939   |
-|      256  |     2191   |     4882   | |     2254   |     4346   |
-|      384  |     1878   |     4492   | |     2073   |     4012   |
-|      480  |     1748   |     4398   | |     1923   |     3951   |
-|      512  |     2030   |     5064   | |     2267   |     4195   |
-|      640  |     1918   |     4756   | |     2094   |     4184   |
-|      768  |     2099   |     4907   | |     2048   |     4297   |
-|      800  |     1822   |     4555   | |     1880   |     4063   |
-|     1024  |     2232   |     5355   | |     2187   |     4420   |
-|     2048  |     2176   |     4983   | |     2027   |     3602   |
-|     2400  |     1741   |     4256   | |     1710   |     3344   |
-|     4096  |     1816   |     3914   | |     1851   |     3349   |
-|     8192  |     1716   |     3481   | |     1700   |     3255   |
-|     9216  |     1735   |     3589   | |     1653   |     3094   |
-|    16384  |     1567   |     3483   | |     1637   |     3244   |
-|    32768  |     1624   |     3240   | |     1655   |     3156   |
-|   262144  |     1012   |     1898   | |      983   |     1503   |
-|  1048576  |      876   |     1154   | |      868   |     1341   |
-|-----------+------------+------------| |------------+------------|
-
-The performance on the tegra K1 is pretty impressive. I'm not
-including the FFTW numbers as they as slightly below the scalar
-fftpack numbers, so something must be wrong (however it seems to be
-correctly configured and is using neon simd instructions).
-
-When using clang 3.4 the pffft version is even a bit faster, reaching
-5.7 GFlops for real ffts of size 1024.
-
-
-iPad Air 2 with iOS9, xcode 8.0, arm64. The cpu is an Apple A8X, supposedly running at 1.5GHz.
-
-| input len |real FFTPack|  real vDSP | real PFFFT | |cplx FFTPack|  cplx vDSP | cplx PFFFT |
-|-----------+------------+------------+------------| |------------+------------+------------|
-|       64  |     2517   |     7995   |     6086   | |     2725   |    13006   |     8495   |
-|       96  |     2442   |      n/a   |     6691   | |     2256   |      n/a   |     7991   |
-|      128  |     2664   |    10186   |     7877   | |     2575   |    15115   |     9115   |
-|      160  |     2638   |      n/a   |     8283   | |     2682   |      n/a   |     8806   |
-|      192  |     2903   |      n/a   |     9083   | |     2634   |      n/a   |     8980   |
-|      256  |     3184   |    11452   |    10039   | |     3026   |    15410   |    10199   |
-|      384  |     2665   |      n/a   |    10100   | |     2275   |      n/a   |     9247   |
-|      480  |     2546   |      n/a   |     9863   | |     2341   |      n/a   |     8892   |
-|      512  |     2832   |    12197   |    10989   | |     2547   |    16768   |    10154   |
-|      640  |     2755   |      n/a   |    10461   | |     2569   |      n/a   |     9666   |
-|      768  |     2998   |      n/a   |    11355   | |     2585   |      n/a   |     9813   |
-|      800  |     2516   |      n/a   |    10332   | |     2433   |      n/a   |     9164   |
-|     1024  |     3109   |    12965   |    12114   | |     2869   |    16448   |    10519   |
-|     2048  |     3027   |    12996   |    12023   | |     2648   |    17304   |    10307   |
-|     2400  |     2515   |      n/a   |    10372   | |     2355   |      n/a   |     8443   |
-|     4096  |     3204   |    13603   |    12359   | |     2814   |    16570   |     9780   |
-|     8192  |     2759   |    13422   |    10824   | |     2153   |    15652   |     7884   |
-|     9216  |     2700   |      n/a   |     9938   | |     2241   |      n/a   |     7900   |
-|    16384  |     2280   |    13057   |     7976   | |      593   |     4272   |     2534   |
-|    32768  |      768   |     4269   |     2882   | |      606   |     4405   |     2604   |
-|   262144  |      724   |     3527   |     2630   | |      534   |     2418   |     2157   |
-|  1048576  |      674   |     1467   |     2135   | |      530   |     1621   |     2055   |
-|-----------+------------+------------+------------| |------------+------------+------------|
-
-I double-checked to make sure I did not make a mistake in the time
-measurements, as the numbers are much higher than what I initially
-expected. They are in fact higher than the number I get on the 2.8GHz
-Xeon of my 2008 mac pro.. (except for FFT lengths >= 32768 where
-having a big cache is useful). A good surprise is also that the perf
-is not too far from apple's vDSP (at least for the real FFT).
-
diff --git a/oss-internship-2020/pffft/fftpack.c b/oss-internship-2020/pffft/fftpack.c
deleted file mode 100644
index b6375a8..0000000
--- a/oss-internship-2020/pffft/fftpack.c
+++ /dev/null
@@ -1,3112 +0,0 @@
-/*
-  compile with cc -DTESTING_FFTPACK fftpack.c in order to build the
-  test application.
-
-  This is an f2c translation of the full fftpack sources as found on
-  http://www.netlib.org/fftpack/ The translated code has been
-  slightlty edited to remove the ugliest artefacts of the translation
-  (a hundred of wild GOTOs were wiped during that operation).
-
-  The original fftpack file was written by Paul N. Swarztrauber
-  (Version 4, 1985), in fortran 77.
-
-   FFTPACK license:
-
-   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
-
-   Copyright (c) 2004 the University Corporation for Atmospheric
-   Research ("UCAR"). All rights reserved. Developed by NCAR's
-   Computational and Information Systems Laboratory, UCAR,
-   www.cisl.ucar.edu.
-
-   Redistribution and use of the Software in source and binary forms,
-   with or without modification, is permitted provided that the
-   following conditions are met:
-
-   - Neither the names of NCAR's Computational and Information Systems
-   Laboratory, the University Corporation for Atmospheric Research,
-   nor the names of its sponsors or contributors may be used to
-   endorse or promote products derived from this Software without
-   specific prior written permission.  
-
-   - Redistributions of source code must retain the above copyright
-   notices, this list of conditions, and the disclaimer below.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions, and the disclaimer below in the
-   documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
-   SOFTWARE.
-
-   ChangeLog:
-   2011/10/02: this is my first release of this file.
-*/
-
-#include "fftpack.h"
-#include <math.h>
-
-typedef fftpack_real real;
-typedef fftpack_int  integer;
-
-typedef struct f77complex {    
-  real r, i;
-} f77complex;   
-
-#ifdef TESTING_FFTPACK
-static real c_abs(f77complex *c) { return sqrt(c->r*c->r + c->i*c->i); }
-static double dmax(double a, double b) { return a < b ? b : a; }
-#endif
-
-/* translated by f2c (version 20061008), and slightly edited */
-
-static void passfb(integer *nac, integer ido, integer ip, integer l1, integer idl1, 
-                   real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa, real fsign)
-{
-  /* System generated locals */
-  integer ch_offset, cc_offset,
-    c1_offset, c2_offset, ch2_offset;
-
-  /* Local variables */
-  integer i, j, k, l, jc, lc, ik, idj, idl, inc, idp;
-  real wai, war;
-  integer ipp2, idij, idlj, idot, ipph;
-
-
-#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1]
-#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1]
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  c1_offset = 1 + ido * (1 + l1);
-  c1 -= c1_offset;
-  cc_offset = 1 + ido * (1 + ip);
-  cc -= cc_offset;
-  ch2_offset = 1 + idl1;
-  ch2 -= ch2_offset;
-  c2_offset = 1 + idl1;
-  c2 -= c2_offset;
-  --wa;
-
-  /* Function Body */
-  idot = ido / 2;
-  ipp2 = ip + 2;
-  ipph = (ip + 1) / 2;
-  idp = ip * ido;
-
-  if (ido >= l1) {
-    for (j = 2; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      for (k = 1; k <= l1; ++k) {
-        for (i = 1; i <= ido; ++i) {
-          ch_ref(i, k, j) = cc_ref(i, j, k) + cc_ref(i, jc, k);
-          ch_ref(i, k, jc) = cc_ref(i, j, k) - cc_ref(i, jc, k);
-        }
-      }
-    }
-    for (k = 1; k <= l1; ++k) {
-      for (i = 1; i <= ido; ++i) {
-        ch_ref(i, k, 1) = cc_ref(i, 1, k);
-      }
-    }
-  } else {
-    for (j = 2; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      for (i = 1; i <= ido; ++i) {
-        for (k = 1; k <= l1; ++k) {
-          ch_ref(i, k, j) = cc_ref(i, j, k) + cc_ref(i, jc, k);
-          ch_ref(i, k, jc) = cc_ref(i, j, k) - cc_ref(i, jc, k);
-        }
-      }
-    }
-    for (i = 1; i <= ido; ++i) {
-      for (k = 1; k <= l1; ++k) {
-        ch_ref(i, k, 1) = cc_ref(i, 1, k);
-      }
-    }
-  }
-  idl = 2 - ido;
-  inc = 0;
-  for (l = 2; l <= ipph; ++l) {
-    lc = ipp2 - l;
-    idl += ido;
-    for (ik = 1; ik <= idl1; ++ik) {
-      c2_ref(ik, l) = ch2_ref(ik, 1) + wa[idl - 1] * ch2_ref(ik, 2);
-      c2_ref(ik, lc) = fsign*wa[idl] * ch2_ref(ik, ip);
-    }
-    idlj = idl;
-    inc += ido;
-    for (j = 3; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      idlj += inc;
-      if (idlj > idp) {
-        idlj -= idp;
-      }
-      war = wa[idlj - 1];
-      wai = wa[idlj];
-      for (ik = 1; ik <= idl1; ++ik) {
-        c2_ref(ik, l) = c2_ref(ik, l) + war * ch2_ref(ik, j);
-        c2_ref(ik, lc) = c2_ref(ik, lc) + fsign*wai * ch2_ref(ik, jc);
-      }
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    for (ik = 1; ik <= idl1; ++ik) {
-      ch2_ref(ik, 1) = ch2_ref(ik, 1) + ch2_ref(ik, j);
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    jc = ipp2 - j;
-    for (ik = 2; ik <= idl1; ik += 2) {
-      ch2_ref(ik - 1, j) = c2_ref(ik - 1, j) - c2_ref(ik, jc);
-      ch2_ref(ik - 1, jc) = c2_ref(ik - 1, j) + c2_ref(ik, jc);
-      ch2_ref(ik, j) = c2_ref(ik, j) + c2_ref(ik - 1, jc);
-      ch2_ref(ik, jc) = c2_ref(ik, j) - c2_ref(ik - 1, jc);
-    }
-  }
-  *nac = 1;
-  if (ido == 2) {
-    return;
-  }
-  *nac = 0;
-  for (ik = 1; ik <= idl1; ++ik) {
-    c2_ref(ik, 1) = ch2_ref(ik, 1);
-  }
-  for (j = 2; j <= ip; ++j) {
-    for (k = 1; k <= l1; ++k) {
-      c1_ref(1, k, j) = ch_ref(1, k, j);
-      c1_ref(2, k, j) = ch_ref(2, k, j);
-    }
-  }
-  if (idot <= l1) {
-    idij = 0;
-    for (j = 2; j <= ip; ++j) {
-      idij += 2;
-      for (i = 4; i <= ido; i += 2) {
-        idij += 2;
-        for (k = 1; k <= l1; ++k) {
-          c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) - fsign*wa[idij] * ch_ref(i, k, j);
-          c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + fsign*wa[idij] * ch_ref(i - 1, k, j);
-        }
-      }
-    }
-    return;
-  }
-  idj = 2 - ido;
-  for (j = 2; j <= ip; ++j) {
-    idj += ido;
-    for (k = 1; k <= l1; ++k) {
-      idij = idj;
-      for (i = 4; i <= ido; i += 2) {
-        idij += 2;
-        c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) - fsign*wa[idij] * ch_ref(i, k, j);
-        c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + fsign*wa[idij] * ch_ref(i - 1, k, j);
-      }
-    }
-  }
-} /* passb */
-
-#undef ch2_ref
-#undef ch_ref
-#undef cc_ref
-#undef c2_ref
-#undef c1_ref
-
-
-static void passb2(integer ido, integer l1, const real *cc, real *ch, const real *wa1)
-{
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ti2, tr2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 3;
-  cc -= cc_offset;
-  --wa1;
-
-  /* Function Body */
-  if (ido <= 2) {
-    for (k = 1; k <= l1; ++k) {
-      ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(1, 2, k);
-      ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(1, 2, k);
-      ch_ref(2, k, 1) = cc_ref(2, 1, k) + cc_ref(2, 2, k);
-      ch_ref(2, k, 2) = cc_ref(2, 1, k) - cc_ref(2, 2, k);
-    }
-    return;
-  }
-  for (k = 1; k <= l1; ++k) {
-    for (i = 2; i <= ido; i += 2) {
-      ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 2, k);
-      tr2 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 2, k);
-      ch_ref(i, k, 1) = cc_ref(i, 1, k) + cc_ref(i, 2, k);
-      ti2 = cc_ref(i, 1, k) - cc_ref(i, 2, k);
-      ch_ref(i, k, 2) = wa1[i - 1] * ti2 + wa1[i] * tr2;
-      ch_ref(i - 1, k, 2) = wa1[i - 1] * tr2 - wa1[i] * ti2;
-    }
-  }
-} /* passb2 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void passb3(integer ido, integer l1, const real *cc, real *ch, const real *wa1, const real *wa2)
-{
-  static const real taur = -.5f;
-  static const real taui = .866025403784439f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + (ido << 2);
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-
-  /* Function Body */
-  if (ido == 2) {
-    for (k = 1; k <= l1; ++k) {
-      tr2 = cc_ref(1, 2, k) + cc_ref(1, 3, k);
-      cr2 = cc_ref(1, 1, k) + taur * tr2;
-      ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2;
-      ti2 = cc_ref(2, 2, k) + cc_ref(2, 3, k);
-      ci2 = cc_ref(2, 1, k) + taur * ti2;
-      ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2;
-      cr3 = taui * (cc_ref(1, 2, k) - cc_ref(1, 3, k));
-      ci3 = taui * (cc_ref(2, 2, k) - cc_ref(2, 3, k));
-      ch_ref(1, k, 2) = cr2 - ci3;
-      ch_ref(1, k, 3) = cr2 + ci3;
-      ch_ref(2, k, 2) = ci2 + cr3;
-      ch_ref(2, k, 3) = ci2 - cr3;
-    }
-  } else {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 2; i <= ido; i += 2) {
-        tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 3, k);
-        cr2 = cc_ref(i - 1, 1, k) + taur * tr2;
-        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2;
-        ti2 = cc_ref(i, 2, k) + cc_ref(i, 3, k);
-        ci2 = cc_ref(i, 1, k) + taur * ti2;
-        ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2;
-        cr3 = taui * (cc_ref(i - 1, 2, k) - cc_ref(i - 1, 3, k));
-        ci3 = taui * (cc_ref(i, 2, k) - cc_ref(i, 3, k));
-        dr2 = cr2 - ci3;
-        dr3 = cr2 + ci3;
-        di2 = ci2 + cr3;
-        di3 = ci2 - cr3;
-        ch_ref(i, k, 2) = wa1[i - 1] * di2 + wa1[i] * dr2;
-        ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 - wa1[i] * di2;
-        ch_ref(i, k, 3) = wa2[i - 1] * di3 + wa2[i] * dr3;
-        ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 - wa2[i] * di3;
-      }
-    }
-  }
-} /* passb3 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void passb4(integer ido, integer l1, const real *cc, real *ch, 
-                   const real *wa1, const real *wa2, const real *wa3)
-{
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 5;
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-
-  /* Function Body */
-  if (ido == 2) {
-    for (k = 1; k <= l1; ++k) {
-      ti1 = cc_ref(2, 1, k) - cc_ref(2, 3, k);
-      ti2 = cc_ref(2, 1, k) + cc_ref(2, 3, k);
-      tr4 = cc_ref(2, 4, k) - cc_ref(2, 2, k);
-      ti3 = cc_ref(2, 2, k) + cc_ref(2, 4, k);
-      tr1 = cc_ref(1, 1, k) - cc_ref(1, 3, k);
-      tr2 = cc_ref(1, 1, k) + cc_ref(1, 3, k);
-      ti4 = cc_ref(1, 2, k) - cc_ref(1, 4, k);
-      tr3 = cc_ref(1, 2, k) + cc_ref(1, 4, k);
-      ch_ref(1, k, 1) = tr2 + tr3;
-      ch_ref(1, k, 3) = tr2 - tr3;
-      ch_ref(2, k, 1) = ti2 + ti3;
-      ch_ref(2, k, 3) = ti2 - ti3;
-      ch_ref(1, k, 2) = tr1 + tr4;
-      ch_ref(1, k, 4) = tr1 - tr4;
-      ch_ref(2, k, 2) = ti1 + ti4;
-      ch_ref(2, k, 4) = ti1 - ti4;
-    }
-  } else {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 2; i <= ido; i += 2) {
-        ti1 = cc_ref(i, 1, k) - cc_ref(i, 3, k);
-        ti2 = cc_ref(i, 1, k) + cc_ref(i, 3, k);
-        ti3 = cc_ref(i, 2, k) + cc_ref(i, 4, k);
-        tr4 = cc_ref(i, 4, k) - cc_ref(i, 2, k);
-        tr1 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 3, k);
-        tr2 = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 3, k);
-        ti4 = cc_ref(i - 1, 2, k) - cc_ref(i - 1, 4, k);
-        tr3 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 4, k);
-        ch_ref(i - 1, k, 1) = tr2 + tr3;
-        cr3 = tr2 - tr3;
-        ch_ref(i, k, 1) = ti2 + ti3;
-        ci3 = ti2 - ti3;
-        cr2 = tr1 + tr4;
-        cr4 = tr1 - tr4;
-        ci2 = ti1 + ti4;
-        ci4 = ti1 - ti4;
-        ch_ref(i - 1, k, 2) = wa1[i - 1] * cr2 - wa1[i] * ci2;
-        ch_ref(i, k, 2) = wa1[i - 1] * ci2 + wa1[i] * cr2;
-        ch_ref(i - 1, k, 3) = wa2[i - 1] * cr3 - wa2[i] * ci3;
-        ch_ref(i, k, 3) = wa2[i - 1] * ci3 + wa2[i] * cr3;
-        ch_ref(i - 1, k, 4) = wa3[i - 1] * cr4 - wa3[i] * ci4;
-        ch_ref(i, k, 4) = wa3[i - 1] * ci4 + wa3[i] * cr4;
-      }
-    }
-  }
-} /* passb4 */
-
-#undef ch_ref
-#undef cc_ref
-
-/* passf5 and passb5 merged */
-static void passfb5(integer ido, integer l1, const real *cc, real *ch, 
-                    const real *wa1, const real *wa2, const real *wa3, const real *wa4, real fsign)
-{
-  const real tr11 = .309016994374947f;
-  const real ti11 = .951056516295154f*fsign;
-  const real tr12 = -.809016994374947f;
-  const real ti12 = .587785252292473f*fsign;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 6;
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-  --wa4;
-
-  /* Function Body */
-  if (ido == 2) {
-    for (k = 1; k <= l1; ++k) {
-      ti5 = cc_ref(2, 2, k) - cc_ref(2, 5, k);
-      ti2 = cc_ref(2, 2, k) + cc_ref(2, 5, k);
-      ti4 = cc_ref(2, 3, k) - cc_ref(2, 4, k);
-      ti3 = cc_ref(2, 3, k) + cc_ref(2, 4, k);
-      tr5 = cc_ref(1, 2, k) - cc_ref(1, 5, k);
-      tr2 = cc_ref(1, 2, k) + cc_ref(1, 5, k);
-      tr4 = cc_ref(1, 3, k) - cc_ref(1, 4, k);
-      tr3 = cc_ref(1, 3, k) + cc_ref(1, 4, k);
-      ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2 + tr3;
-      ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2 + ti3;
-      cr2 = cc_ref(1, 1, k) + tr11 * tr2 + tr12 * tr3;
-      ci2 = cc_ref(2, 1, k) + tr11 * ti2 + tr12 * ti3;
-      cr3 = cc_ref(1, 1, k) + tr12 * tr2 + tr11 * tr3;
-      ci3 = cc_ref(2, 1, k) + tr12 * ti2 + tr11 * ti3;
-      cr5 = ti11 * tr5 + ti12 * tr4;
-      ci5 = ti11 * ti5 + ti12 * ti4;
-      cr4 = ti12 * tr5 - ti11 * tr4;
-      ci4 = ti12 * ti5 - ti11 * ti4;
-      ch_ref(1, k, 2) = cr2 - ci5;
-      ch_ref(1, k, 5) = cr2 + ci5;
-      ch_ref(2, k, 2) = ci2 + cr5;
-      ch_ref(2, k, 3) = ci3 + cr4;
-      ch_ref(1, k, 3) = cr3 - ci4;
-      ch_ref(1, k, 4) = cr3 + ci4;
-      ch_ref(2, k, 4) = ci3 - cr4;
-      ch_ref(2, k, 5) = ci2 - cr5;
-    }
-  } else {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 2; i <= ido; i += 2) {
-        ti5 = cc_ref(i, 2, k) - cc_ref(i, 5, k);
-        ti2 = cc_ref(i, 2, k) + cc_ref(i, 5, k);
-        ti4 = cc_ref(i, 3, k) - cc_ref(i, 4, k);
-        ti3 = cc_ref(i, 3, k) + cc_ref(i, 4, k);
-        tr5 = cc_ref(i - 1, 2, k) - cc_ref(i - 1, 5, k);
-        tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 5, k);
-        tr4 = cc_ref(i - 1, 3, k) - cc_ref(i - 1, 4, k);
-        tr3 = cc_ref(i - 1, 3, k) + cc_ref(i - 1, 4, k);
-        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2 + tr3;
-        ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2 + ti3;
-        cr2 = cc_ref(i - 1, 1, k) + tr11 * tr2 + tr12 * tr3;
-        ci2 = cc_ref(i, 1, k) + tr11 * ti2 + tr12 * ti3;
-        cr3 = cc_ref(i - 1, 1, k) + tr12 * tr2 + tr11 * tr3;
-        ci3 = cc_ref(i, 1, k) + tr12 * ti2 + tr11 * ti3;
-        cr5 = ti11 * tr5 + ti12 * tr4;
-        ci5 = ti11 * ti5 + ti12 * ti4;
-        cr4 = ti12 * tr5 - ti11 * tr4;
-        ci4 = ti12 * ti5 - ti11 * ti4;
-        dr3 = cr3 - ci4;
-        dr4 = cr3 + ci4;
-        di3 = ci3 + cr4;
-        di4 = ci3 - cr4;
-        dr5 = cr2 + ci5;
-        dr2 = cr2 - ci5;
-        di5 = ci2 - cr5;
-        di2 = ci2 + cr5;
-        ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 - fsign*wa1[i] * di2;
-        ch_ref(i, k, 2) = wa1[i - 1] * di2 + fsign*wa1[i] * dr2;
-        ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 - fsign*wa2[i] * di3;
-        ch_ref(i, k, 3) = wa2[i - 1] * di3 + fsign*wa2[i] * dr3;
-        ch_ref(i - 1, k, 4) = wa3[i - 1] * dr4 - fsign*wa3[i] * di4;
-        ch_ref(i, k, 4) = wa3[i - 1] * di4 + fsign*wa3[i] * dr4;
-        ch_ref(i - 1, k, 5) = wa4[i - 1] * dr5 - fsign*wa4[i] * di5;
-        ch_ref(i, k, 5) = wa4[i - 1] * di5 + fsign*wa4[i] * dr5;
-      }
-    }
-  }
-} /* passb5 */
-
-#undef ch_ref
-#undef cc_ref
-
-static void passf2(integer ido, integer l1, const real *cc, real *ch, const real *wa1)
-{
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ti2, tr2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 3;
-  cc -= cc_offset;
-  --wa1;
-
-  /* Function Body */
-  if (ido == 2) {
-    for (k = 1; k <= l1; ++k) {
-      ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(1, 2, k);
-      ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(1, 2, k);
-      ch_ref(2, k, 1) = cc_ref(2, 1, k) + cc_ref(2, 2, k);
-      ch_ref(2, k, 2) = cc_ref(2, 1, k) - cc_ref(2, 2, k);
-    }
-  } else {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 2; i <= ido; i += 2) {
-        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 2,
-                                                           k);
-        tr2 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 2, k);
-        ch_ref(i, k, 1) = cc_ref(i, 1, k) + cc_ref(i, 2, k);
-        ti2 = cc_ref(i, 1, k) - cc_ref(i, 2, k);
-        ch_ref(i, k, 2) = wa1[i - 1] * ti2 - wa1[i] * tr2;
-        ch_ref(i - 1, k, 2) = wa1[i - 1] * tr2 + wa1[i] * ti2;
-      }
-    }
-  }
-} /* passf2 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void passf3(integer ido, integer l1, const real *cc, real *ch, 
-                   const real *wa1, const real *wa2)
-{
-  static const real taur = -.5f;
-  static const real taui = -.866025403784439f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + (ido << 2);
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-
-  /* Function Body */
-  if (ido == 2) {
-    for (k = 1; k <= l1; ++k) {
-      tr2 = cc_ref(1, 2, k) + cc_ref(1, 3, k);
-      cr2 = cc_ref(1, 1, k) + taur * tr2;
-      ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2;
-      ti2 = cc_ref(2, 2, k) + cc_ref(2, 3, k);
-      ci2 = cc_ref(2, 1, k) + taur * ti2;
-      ch_ref(2, k, 1) = cc_ref(2, 1, k) + ti2;
-      cr3 = taui * (cc_ref(1, 2, k) - cc_ref(1, 3, k));
-      ci3 = taui * (cc_ref(2, 2, k) - cc_ref(2, 3, k));
-      ch_ref(1, k, 2) = cr2 - ci3;
-      ch_ref(1, k, 3) = cr2 + ci3;
-      ch_ref(2, k, 2) = ci2 + cr3;
-      ch_ref(2, k, 3) = ci2 - cr3;
-    }
-  } else {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 2; i <= ido; i += 2) {
-        tr2 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 3, k);
-        cr2 = cc_ref(i - 1, 1, k) + taur * tr2;
-        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2;
-        ti2 = cc_ref(i, 2, k) + cc_ref(i, 3, k);
-        ci2 = cc_ref(i, 1, k) + taur * ti2;
-        ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2;
-        cr3 = taui * (cc_ref(i - 1, 2, k) - cc_ref(i - 1, 3, k));
-        ci3 = taui * (cc_ref(i, 2, k) - cc_ref(i, 3, k));
-        dr2 = cr2 - ci3;
-        dr3 = cr2 + ci3;
-        di2 = ci2 + cr3;
-        di3 = ci2 - cr3;
-        ch_ref(i, k, 2) = wa1[i - 1] * di2 - wa1[i] * dr2;
-        ch_ref(i - 1, k, 2) = wa1[i - 1] * dr2 + wa1[i] * di2;
-        ch_ref(i, k, 3) = wa2[i - 1] * di3 - wa2[i] * dr3;
-        ch_ref(i - 1, k, 3) = wa2[i - 1] * dr3 + wa2[i] * di3;
-      }
-    }
-  }
-} /* passf3 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void passf4(integer ido, integer l1, const real *cc, real *ch, 
-                   const real *wa1, const real *wa2, const real *wa3)
-{
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k;
-  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 5;
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-
-  /* Function Body */
-  if (ido == 2) {
-    for (k = 1; k <= l1; ++k) {
-      ti1 = cc_ref(2, 1, k) - cc_ref(2, 3, k);
-      ti2 = cc_ref(2, 1, k) + cc_ref(2, 3, k);
-      tr4 = cc_ref(2, 2, k) - cc_ref(2, 4, k);
-      ti3 = cc_ref(2, 2, k) + cc_ref(2, 4, k);
-      tr1 = cc_ref(1, 1, k) - cc_ref(1, 3, k);
-      tr2 = cc_ref(1, 1, k) + cc_ref(1, 3, k);
-      ti4 = cc_ref(1, 4, k) - cc_ref(1, 2, k);
-      tr3 = cc_ref(1, 2, k) + cc_ref(1, 4, k);
-      ch_ref(1, k, 1) = tr2 + tr3;
-      ch_ref(1, k, 3) = tr2 - tr3;
-      ch_ref(2, k, 1) = ti2 + ti3;
-      ch_ref(2, k, 3) = ti2 - ti3;
-      ch_ref(1, k, 2) = tr1 + tr4;
-      ch_ref(1, k, 4) = tr1 - tr4;
-      ch_ref(2, k, 2) = ti1 + ti4;
-      ch_ref(2, k, 4) = ti1 - ti4;
-    }
-  } else {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 2; i <= ido; i += 2) {
-        ti1 = cc_ref(i, 1, k) - cc_ref(i, 3, k);
-        ti2 = cc_ref(i, 1, k) + cc_ref(i, 3, k);
-        ti3 = cc_ref(i, 2, k) + cc_ref(i, 4, k);
-        tr4 = cc_ref(i, 2, k) - cc_ref(i, 4, k);
-        tr1 = cc_ref(i - 1, 1, k) - cc_ref(i - 1, 3, k);
-        tr2 = cc_ref(i - 1, 1, k) + cc_ref(i - 1, 3, k);
-        ti4 = cc_ref(i - 1, 4, k) - cc_ref(i - 1, 2, k);
-        tr3 = cc_ref(i - 1, 2, k) + cc_ref(i - 1, 4, k);
-        ch_ref(i - 1, k, 1) = tr2 + tr3;
-        cr3 = tr2 - tr3;
-        ch_ref(i, k, 1) = ti2 + ti3;
-        ci3 = ti2 - ti3;
-        cr2 = tr1 + tr4;
-        cr4 = tr1 - tr4;
-        ci2 = ti1 + ti4;
-        ci4 = ti1 - ti4;
-        ch_ref(i - 1, k, 2) = wa1[i - 1] * cr2 + wa1[i] * ci2;
-        ch_ref(i, k, 2) = wa1[i - 1] * ci2 - wa1[i] * cr2;
-        ch_ref(i - 1, k, 3) = wa2[i - 1] * cr3 + wa2[i] * ci3;
-        ch_ref(i, k, 3) = wa2[i - 1] * ci3 - wa2[i] * cr3;
-        ch_ref(i - 1, k, 4) = wa3[i - 1] * cr4 + wa3[i] * ci4;
-        ch_ref(i, k, 4) = wa3[i - 1] * ci4 - wa3[i] * cr4;
-      }
-    }
-  }
-} /* passf4 */
-
-#undef ch_ref
-#undef cc_ref
-
-static void radb2(integer ido, integer l1, const real *cc, real *ch, const real *wa1)
-{
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ti2, tr2;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*2 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 3;
-  cc -= cc_offset;
-  --wa1;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    ch_ref(1, k, 1) = cc_ref(1, 1, k) + cc_ref(ido, 2, k);
-    ch_ref(1, k, 2) = cc_ref(1, 1, k) - cc_ref(ido, 2, k);
-  }
-  if (ido < 2) return;
-  else if (ido != 2) {
-    idp2 = ido + 2;
-    for (k = 1; k <= l1; ++k) {
-      for (i = 3; i <= ido; i += 2) {
-        ic = idp2 - i;
-        ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + cc_ref(ic - 1, 2, 
-                                                           k);
-        tr2 = cc_ref(i - 1, 1, k) - cc_ref(ic - 1, 2, k);
-        ch_ref(i, k, 1) = cc_ref(i, 1, k) - cc_ref(ic, 2, k);
-        ti2 = cc_ref(i, 1, k) + cc_ref(ic, 2, k);
-        ch_ref(i - 1, k, 2) = wa1[i - 2] * tr2 - wa1[i - 1] * ti2;
-        ch_ref(i, k, 2) = wa1[i - 2] * ti2 + wa1[i - 1] * tr2;
-      }
-    }
-    if (ido % 2 == 1) return;
-  }
-  for (k = 1; k <= l1; ++k) {
-    ch_ref(ido, k, 1) = cc_ref(ido, 1, k) + cc_ref(ido, 1, k);
-    ch_ref(ido, k, 2) = -(cc_ref(1, 2, k) + cc_ref(1, 2, k));
-  }
-} /* radb2 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radb3(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1, const real *wa2)
-{
-  /* Initialized data */
-
-  static const real taur = -.5f;
-  static const real taui = .866025403784439f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*3 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + (ido << 2);
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    tr2 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k);
-    cr2 = cc_ref(1, 1, k) + taur * tr2;
-    ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2;
-    ci3 = taui * (cc_ref(1, 3, k) + cc_ref(1, 3, k));
-    ch_ref(1, k, 2) = cr2 - ci3;
-    ch_ref(1, k, 3) = cr2 + ci3;
-  }
-  if (ido == 1) {
-    return;
-  }
-  idp2 = ido + 2;
-  for (k = 1; k <= l1; ++k) {
-    for (i = 3; i <= ido; i += 2) {
-      ic = idp2 - i;
-      tr2 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k);
-      cr2 = cc_ref(i - 1, 1, k) + taur * tr2;
-      ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2;
-      ti2 = cc_ref(i, 3, k) - cc_ref(ic, 2, k);
-      ci2 = cc_ref(i, 1, k) + taur * ti2;
-      ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2;
-      cr3 = taui * (cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k));
-      ci3 = taui * (cc_ref(i, 3, k) + cc_ref(ic, 2, k));
-      dr2 = cr2 - ci3;
-      dr3 = cr2 + ci3;
-      di2 = ci2 + cr3;
-      di3 = ci2 - cr3;
-      ch_ref(i - 1, k, 2) = wa1[i - 2] * dr2 - wa1[i - 1] * di2;
-      ch_ref(i, k, 2) = wa1[i - 2] * di2 + wa1[i - 1] * dr2;
-      ch_ref(i - 1, k, 3) = wa2[i - 2] * dr3 - wa2[i - 1] * di3;
-      ch_ref(i, k, 3) = wa2[i - 2] * di3 + wa2[i - 1] * dr3;
-    }
-  }
-} /* radb3 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radb4(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1, const real *wa2, const real *wa3)
-{
-  /* Initialized data */
-
-  static const real sqrt2 = 1.414213562373095f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*4 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 5;
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    tr1 = cc_ref(1, 1, k) - cc_ref(ido, 4, k);
-    tr2 = cc_ref(1, 1, k) + cc_ref(ido, 4, k);
-    tr3 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k);
-    tr4 = cc_ref(1, 3, k) + cc_ref(1, 3, k);
-    ch_ref(1, k, 1) = tr2 + tr3;
-    ch_ref(1, k, 2) = tr1 - tr4;
-    ch_ref(1, k, 3) = tr2 - tr3;
-    ch_ref(1, k, 4) = tr1 + tr4;
-  }
-  if (ido < 2) return;
-  if (ido != 2) {
-    idp2 = ido + 2;
-    for (k = 1; k <= l1; ++k) {
-      for (i = 3; i <= ido; i += 2) {
-        ic = idp2 - i;
-        ti1 = cc_ref(i, 1, k) + cc_ref(ic, 4, k);
-        ti2 = cc_ref(i, 1, k) - cc_ref(ic, 4, k);
-        ti3 = cc_ref(i, 3, k) - cc_ref(ic, 2, k);
-        tr4 = cc_ref(i, 3, k) + cc_ref(ic, 2, k);
-        tr1 = cc_ref(i - 1, 1, k) - cc_ref(ic - 1, 4, k);
-        tr2 = cc_ref(i - 1, 1, k) + cc_ref(ic - 1, 4, k);
-        ti4 = cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k);
-        tr3 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k);
-        ch_ref(i - 1, k, 1) = tr2 + tr3;
-        cr3 = tr2 - tr3;
-        ch_ref(i, k, 1) = ti2 + ti3;
-        ci3 = ti2 - ti3;
-        cr2 = tr1 - tr4;
-        cr4 = tr1 + tr4;
-        ci2 = ti1 + ti4;
-        ci4 = ti1 - ti4;
-        ch_ref(i - 1, k, 2) = wa1[i - 2] * cr2 - wa1[i - 1] * ci2;
-        ch_ref(i, k, 2) = wa1[i - 2] * ci2 + wa1[i - 1] * cr2;
-        ch_ref(i - 1, k, 3) = wa2[i - 2] * cr3 - wa2[i - 1] * ci3;
-        ch_ref(i, k, 3) = wa2[i - 2] * ci3 + wa2[i - 1] * cr3;
-        ch_ref(i - 1, k, 4) = wa3[i - 2] * cr4 - wa3[i - 1] * ci4;
-        ch_ref(i, k, 4) = wa3[i - 2] * ci4 + wa3[i - 1] * cr4;
-      }
-    }
-    if (ido % 2 == 1) return;
-  }
-  for (k = 1; k <= l1; ++k) {
-    ti1 = cc_ref(1, 2, k) + cc_ref(1, 4, k);
-    ti2 = cc_ref(1, 4, k) - cc_ref(1, 2, k);
-    tr1 = cc_ref(ido, 1, k) - cc_ref(ido, 3, k);
-    tr2 = cc_ref(ido, 1, k) + cc_ref(ido, 3, k);
-    ch_ref(ido, k, 1) = tr2 + tr2;
-    ch_ref(ido, k, 2) = sqrt2 * (tr1 - ti1);
-    ch_ref(ido, k, 3) = ti2 + ti2;
-    ch_ref(ido, k, 4) = -sqrt2 * (tr1 + ti1);
-  }
-} /* radb4 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radb5(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1, const real *wa2, const real *wa3, const real *wa4)
-{
-  /* Initialized data */
-
-  static const real tr11 = .309016994374947f;
-  static const real ti11 = .951056516295154f;
-  static const real tr12 = -.809016994374947f;
-  static const real ti12 = .587785252292473f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 6;
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-  --wa4;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    ti5 = cc_ref(1, 3, k) + cc_ref(1, 3, k);
-    ti4 = cc_ref(1, 5, k) + cc_ref(1, 5, k);
-    tr2 = cc_ref(ido, 2, k) + cc_ref(ido, 2, k);
-    tr3 = cc_ref(ido, 4, k) + cc_ref(ido, 4, k);
-    ch_ref(1, k, 1) = cc_ref(1, 1, k) + tr2 + tr3;
-    cr2 = cc_ref(1, 1, k) + tr11 * tr2 + tr12 * tr3;
-    cr3 = cc_ref(1, 1, k) + tr12 * tr2 + tr11 * tr3;
-    ci5 = ti11 * ti5 + ti12 * ti4;
-    ci4 = ti12 * ti5 - ti11 * ti4;
-    ch_ref(1, k, 2) = cr2 - ci5;
-    ch_ref(1, k, 3) = cr3 - ci4;
-    ch_ref(1, k, 4) = cr3 + ci4;
-    ch_ref(1, k, 5) = cr2 + ci5;
-  }
-  if (ido == 1) {
-    return;
-  }
-  idp2 = ido + 2;
-  for (k = 1; k <= l1; ++k) {
-    for (i = 3; i <= ido; i += 2) {
-      ic = idp2 - i;
-      ti5 = cc_ref(i, 3, k) + cc_ref(ic, 2, k);
-      ti2 = cc_ref(i, 3, k) - cc_ref(ic, 2, k);
-      ti4 = cc_ref(i, 5, k) + cc_ref(ic, 4, k);
-      ti3 = cc_ref(i, 5, k) - cc_ref(ic, 4, k);
-      tr5 = cc_ref(i - 1, 3, k) - cc_ref(ic - 1, 2, k);
-      tr2 = cc_ref(i - 1, 3, k) + cc_ref(ic - 1, 2, k);
-      tr4 = cc_ref(i - 1, 5, k) - cc_ref(ic - 1, 4, k);
-      tr3 = cc_ref(i - 1, 5, k) + cc_ref(ic - 1, 4, k);
-      ch_ref(i - 1, k, 1) = cc_ref(i - 1, 1, k) + tr2 + tr3;
-      ch_ref(i, k, 1) = cc_ref(i, 1, k) + ti2 + ti3;
-      cr2 = cc_ref(i - 1, 1, k) + tr11 * tr2 + tr12 * tr3;
-      ci2 = cc_ref(i, 1, k) + tr11 * ti2 + tr12 * ti3;
-      cr3 = cc_ref(i - 1, 1, k) + tr12 * tr2 + tr11 * tr3;
-      ci3 = cc_ref(i, 1, k) + tr12 * ti2 + tr11 * ti3;
-      cr5 = ti11 * tr5 + ti12 * tr4;
-      ci5 = ti11 * ti5 + ti12 * ti4;
-      cr4 = ti12 * tr5 - ti11 * tr4;
-      ci4 = ti12 * ti5 - ti11 * ti4;
-      dr3 = cr3 - ci4;
-      dr4 = cr3 + ci4;
-      di3 = ci3 + cr4;
-      di4 = ci3 - cr4;
-      dr5 = cr2 + ci5;
-      dr2 = cr2 - ci5;
-      di5 = ci2 - cr5;
-      di2 = ci2 + cr5;
-      ch_ref(i - 1, k, 2) = wa1[i - 2] * dr2 - wa1[i - 1] * di2;
-      ch_ref(i, k, 2) = wa1[i - 2] * di2 + wa1[i - 1] * dr2;
-      ch_ref(i - 1, k, 3) = wa2[i - 2] * dr3 - wa2[i - 1] * di3;
-      ch_ref(i, k, 3) = wa2[i - 2] * di3 + wa2[i - 1] * dr3;
-      ch_ref(i - 1, k, 4) = wa3[i - 2] * dr4 - wa3[i - 1] * di4;
-      ch_ref(i, k, 4) = wa3[i - 2] * di4 + wa3[i - 1] * dr4;
-      ch_ref(i - 1, k, 5) = wa4[i - 2] * dr5 - wa4[i - 1] * di5;
-      ch_ref(i, k, 5) = wa4[i - 2] * di5 + wa4[i - 1] * dr5;
-    }
-  }
-} /* radb5 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radbg(integer ido, integer ip, integer l1, integer idl1, 
-                  const real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa)
-{
-  /* System generated locals */
-  integer ch_offset, cc_offset,
-    c1_offset, c2_offset, ch2_offset;
-
-  /* Local variables */
-  integer i, j, k, l, j2, ic, jc, lc, ik, is;
-  real dc2, ai1, ai2, ar1, ar2, ds2;
-  integer nbd;
-  real dcp, arg, dsp, ar1h, ar2h;
-  integer idp2, ipp2, idij, ipph;
-
-
-#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1]
-#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1]
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  c1_offset = 1 + ido * (1 + l1);
-  c1 -= c1_offset;
-  cc_offset = 1 + ido * (1 + ip);
-  cc -= cc_offset;
-  ch2_offset = 1 + idl1;
-  ch2 -= ch2_offset;
-  c2_offset = 1 + idl1;
-  c2 -= c2_offset;
-  --wa;
-
-  /* Function Body */
-  arg = (2*M_PI) / (real) (ip);
-  dcp = cos(arg);
-  dsp = sin(arg);
-  idp2 = ido + 2;
-  nbd = (ido - 1) / 2;
-  ipp2 = ip + 2;
-  ipph = (ip + 1) / 2;
-  if (ido >= l1) {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 1; i <= ido; ++i) {
-        ch_ref(i, k, 1) = cc_ref(i, 1, k);
-      }
-    }
-  } else {
-    for (i = 1; i <= ido; ++i) {
-      for (k = 1; k <= l1; ++k) {
-        ch_ref(i, k, 1) = cc_ref(i, 1, k);
-      }
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    jc = ipp2 - j;
-    j2 = j + j;
-    for (k = 1; k <= l1; ++k) {
-      ch_ref(1, k, j) = cc_ref(ido, j2 - 2, k) + cc_ref(ido, j2 - 2, k);
-      ch_ref(1, k, jc) = cc_ref(1, j2 - 1, k) + cc_ref(1, j2 - 1, k);
-    }
-  }
-  if (ido != 1) {
-    if (nbd >= l1) {
-      for (j = 2; j <= ipph; ++j) {
-        jc = ipp2 - j;
-        for (k = 1; k <= l1; ++k) {
-          for (i = 3; i <= ido; i += 2) {
-            ic = idp2 - i;
-            ch_ref(i - 1, k, j) = cc_ref(i - 1, (j << 1) - 1, k) + cc_ref(ic - 1, (j << 1) - 2, k);
-            ch_ref(i - 1, k, jc) = cc_ref(i - 1, (j << 1) - 1, k) - cc_ref(ic - 1, (j << 1) - 2, k);
-            ch_ref(i, k, j) = cc_ref(i, (j << 1) - 1, k) - cc_ref(ic, (j << 1) - 2, k);
-            ch_ref(i, k, jc) = cc_ref(i, (j << 1) - 1, k) + cc_ref(ic, (j << 1) - 2, k);
-          }
-        }
-      }
-    } else {
-      for (j = 2; j <= ipph; ++j) {
-        jc = ipp2 - j;
-        for (i = 3; i <= ido; i += 2) {
-          ic = idp2 - i;
-          for (k = 1; k <= l1; ++k) {
-            ch_ref(i - 1, k, j) = cc_ref(i - 1, (j << 1) - 1, k) + cc_ref(ic - 1, (j << 1) - 2, k);
-            ch_ref(i - 1, k, jc) = cc_ref(i - 1, (j << 1) - 1, k) - cc_ref(ic - 1, (j << 1) - 2, k);
-            ch_ref(i, k, j) = cc_ref(i, (j << 1) - 1, k) - cc_ref(ic, (j << 1) - 2, k);
-            ch_ref(i, k, jc) = cc_ref(i, (j << 1) - 1, k) + cc_ref(ic, (j << 1) - 2, k);
-          }
-        }
-      }
-    }
-  }
-  ar1 = 1.f;
-  ai1 = 0.f;
-  for (l = 2; l <= ipph; ++l) {
-    lc = ipp2 - l;
-    ar1h = dcp * ar1 - dsp * ai1;
-    ai1 = dcp * ai1 + dsp * ar1;
-    ar1 = ar1h;
-    for (ik = 1; ik <= idl1; ++ik) {
-      c2_ref(ik, l) = ch2_ref(ik, 1) + ar1 * ch2_ref(ik, 2);
-      c2_ref(ik, lc) = ai1 * ch2_ref(ik, ip);
-    }
-    dc2 = ar1;
-    ds2 = ai1;
-    ar2 = ar1;
-    ai2 = ai1;
-    for (j = 3; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      ar2h = dc2 * ar2 - ds2 * ai2;
-      ai2 = dc2 * ai2 + ds2 * ar2;
-      ar2 = ar2h;
-      for (ik = 1; ik <= idl1; ++ik) {
-        c2_ref(ik, l) = c2_ref(ik, l) + ar2 * ch2_ref(ik, j);
-        c2_ref(ik, lc) = c2_ref(ik, lc) + ai2 * ch2_ref(ik, jc);
-      }
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    for (ik = 1; ik <= idl1; ++ik) {
-      ch2_ref(ik, 1) = ch2_ref(ik, 1) + ch2_ref(ik, j);
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    jc = ipp2 - j;
-    for (k = 1; k <= l1; ++k) {
-      ch_ref(1, k, j) = c1_ref(1, k, j) - c1_ref(1, k, jc);
-      ch_ref(1, k, jc) = c1_ref(1, k, j) + c1_ref(1, k, jc);
-    }
-  }
-  if (ido != 1) {
-    if (nbd >= l1) {
-      for (j = 2; j <= ipph; ++j) {
-        jc = ipp2 - j;
-        for (k = 1; k <= l1; ++k) {
-          for (i = 3; i <= ido; i += 2) {
-            ch_ref(i - 1, k, j) = c1_ref(i - 1, k, j) - c1_ref(i, k, jc);
-            ch_ref(i - 1, k, jc) = c1_ref(i - 1, k, j) + c1_ref(i, k, jc);
-            ch_ref(i, k, j) = c1_ref(i, k, j) + c1_ref(i - 1, k, jc);
-            ch_ref(i, k, jc) = c1_ref(i, k, j) - c1_ref(i - 1, k, jc);
-          }
-        }
-      }
-    } else {
-      for (j = 2; j <= ipph; ++j) {
-        jc = ipp2 - j;
-        for (i = 3; i <= ido; i += 2) {
-          for (k = 1; k <= l1; ++k) {
-            ch_ref(i - 1, k, j) = c1_ref(i - 1, k, j) - c1_ref(i, k, jc);
-            ch_ref(i - 1, k, jc) = c1_ref(i - 1, k, j) + c1_ref(i, k, jc);
-            ch_ref(i, k, j) = c1_ref(i, k, j) + c1_ref(i - 1, k, jc);
-            ch_ref(i, k, jc) = c1_ref(i, k, j) - c1_ref(i - 1, k, jc);
-          }
-        }
-      }
-    }
-  }
-  if (ido == 1) {
-    return;
-  }
-  for (ik = 1; ik <= idl1; ++ik) {
-    c2_ref(ik, 1) = ch2_ref(ik, 1);
-  }
-  for (j = 2; j <= ip; ++j) {
-    for (k = 1; k <= l1; ++k) {
-      c1_ref(1, k, j) = ch_ref(1, k, j);
-    }
-  }
-  if (nbd <= l1) {
-    is = -(ido);
-    for (j = 2; j <= ip; ++j) {
-      is += ido;
-      idij = is;
-      for (i = 3; i <= ido; i += 2) {
-        idij += 2;
-        for (k = 1; k <= l1; ++k) {
-          c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) 
-            - wa[idij] * ch_ref(i, k, j);
-          c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + wa[idij] * ch_ref(i - 1, k, j);
-        }
-      }
-    }
-  } else {
-    is = -(ido);
-    for (j = 2; j <= ip; ++j) {
-      is += ido;
-      for (k = 1; k <= l1; ++k) {
-        idij = is;
-        for (i = 3; i <= ido; i += 2) {
-          idij += 2;
-          c1_ref(i - 1, k, j) = wa[idij - 1] * ch_ref(i - 1, k, j) 
-            - wa[idij] * ch_ref(i, k, j);
-          c1_ref(i, k, j) = wa[idij - 1] * ch_ref(i, k, j) + wa[idij] * ch_ref(i - 1, k, j);
-        }
-      }
-    }
-  }
-} /* radbg */
-
-#undef ch2_ref
-#undef ch_ref
-#undef cc_ref
-#undef c2_ref
-#undef c1_ref
-
-
-static void radf2(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1)
-{
-  /* System generated locals */
-  integer ch_offset, cc_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ti2, tr2;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*2 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * 3;
-  ch -= ch_offset;
-  cc_offset = 1 + ido * (1 + l1);
-  cc -= cc_offset;
-  --wa1;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    ch_ref(1, 1, k) = cc_ref(1, k, 1) + cc_ref(1, k, 2);
-    ch_ref(ido, 2, k) = cc_ref(1, k, 1) - cc_ref(1, k, 2);
-  }
-  if (ido < 2) return;
-  if (ido != 2) {
-    idp2 = ido + 2;
-    for (k = 1; k <= l1; ++k) {
-      for (i = 3; i <= ido; i += 2) {
-        ic = idp2 - i;
-        tr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * 
-          cc_ref(i, k, 2);
-        ti2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
-                                                                 i - 1, k, 2);
-        ch_ref(i, 1, k) = cc_ref(i, k, 1) + ti2;
-        ch_ref(ic, 2, k) = ti2 - cc_ref(i, k, 1);
-        ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + tr2;
-        ch_ref(ic - 1, 2, k) = cc_ref(i - 1, k, 1) - tr2;
-      }
-    }
-    if (ido % 2 == 1) {
-      return;
-    }
-  }
-  for (k = 1; k <= l1; ++k) {
-    ch_ref(1, 2, k) = -cc_ref(ido, k, 2);
-    ch_ref(ido, 1, k) = cc_ref(ido, k, 1);
-  }
-} /* radf2 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radf3(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1, const real *wa2)
-{
-  static const real taur = -.5f;
-  static const real taui = .866025403784439f;
-
-  /* System generated locals */
-  integer ch_offset, cc_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*3 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + (ido << 2);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * (1 + l1);
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    cr2 = cc_ref(1, k, 2) + cc_ref(1, k, 3);
-    ch_ref(1, 1, k) = cc_ref(1, k, 1) + cr2;
-    ch_ref(1, 3, k) = taui * (cc_ref(1, k, 3) - cc_ref(1, k, 2));
-    ch_ref(ido, 2, k) = cc_ref(1, k, 1) + taur * cr2;
-  }
-  if (ido == 1) {
-    return;
-  }
-  idp2 = ido + 2;
-  for (k = 1; k <= l1; ++k) {
-    for (i = 3; i <= ido; i += 2) {
-      ic = idp2 - i;
-      dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * 
-        cc_ref(i, k, 2);
-      di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
-                                                               i - 1, k, 2);
-      dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * 
-        cc_ref(i, k, 3);
-      di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(
-                                                               i - 1, k, 3);
-      cr2 = dr2 + dr3;
-      ci2 = di2 + di3;
-      ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + cr2;
-      ch_ref(i, 1, k) = cc_ref(i, k, 1) + ci2;
-      tr2 = cc_ref(i - 1, k, 1) + taur * cr2;
-      ti2 = cc_ref(i, k, 1) + taur * ci2;
-      tr3 = taui * (di2 - di3);
-      ti3 = taui * (dr3 - dr2);
-      ch_ref(i - 1, 3, k) = tr2 + tr3;
-      ch_ref(ic - 1, 2, k) = tr2 - tr3;
-      ch_ref(i, 3, k) = ti2 + ti3;
-      ch_ref(ic, 2, k) = ti3 - ti2;
-    }
-  }
-} /* radf3 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radf4(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1, const real *wa2, const real *wa3)
-{
-  /* Initialized data */
-
-  static const real hsqt2 = .7071067811865475f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*4 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * 5;
-  ch -= ch_offset;
-  cc_offset = 1 + ido * (1 + l1);
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    tr1 = cc_ref(1, k, 2) + cc_ref(1, k, 4);
-    tr2 = cc_ref(1, k, 1) + cc_ref(1, k, 3);
-    ch_ref(1, 1, k) = tr1 + tr2;
-    ch_ref(ido, 4, k) = tr2 - tr1;
-    ch_ref(ido, 2, k) = cc_ref(1, k, 1) - cc_ref(1, k, 3);
-    ch_ref(1, 3, k) = cc_ref(1, k, 4) - cc_ref(1, k, 2);
-  }  
-  if (ido < 2) return;
-  if (ido != 2) {
-    idp2 = ido + 2;
-    for (k = 1; k <= l1; ++k) {
-      for (i = 3; i <= ido; i += 2) {
-        ic = idp2 - i;
-        cr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * 
-          cc_ref(i, k, 2);
-        ci2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(
-                                                                 i - 1, k, 2);
-        cr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * 
-          cc_ref(i, k, 3);
-        ci3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(
-                                                                 i - 1, k, 3);
-        cr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * 
-          cc_ref(i, k, 4);
-        ci4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(
-                                                                 i - 1, k, 4);
-        tr1 = cr2 + cr4;
-        tr4 = cr4 - cr2;
-        ti1 = ci2 + ci4;
-        ti4 = ci2 - ci4;
-        ti2 = cc_ref(i, k, 1) + ci3;
-        ti3 = cc_ref(i, k, 1) - ci3;
-        tr2 = cc_ref(i - 1, k, 1) + cr3;
-        tr3 = cc_ref(i - 1, k, 1) - cr3;
-        ch_ref(i - 1, 1, k) = tr1 + tr2;
-        ch_ref(ic - 1, 4, k) = tr2 - tr1;
-        ch_ref(i, 1, k) = ti1 + ti2;
-        ch_ref(ic, 4, k) = ti1 - ti2;
-        ch_ref(i - 1, 3, k) = ti4 + tr3;
-        ch_ref(ic - 1, 2, k) = tr3 - ti4;
-        ch_ref(i, 3, k) = tr4 + ti3;
-        ch_ref(ic, 2, k) = tr4 - ti3;
-      }
-    }
-    if (ido % 2 == 1) {
-      return;
-    }
-  }
-  for (k = 1; k <= l1; ++k) {
-    ti1 = -hsqt2 * (cc_ref(ido, k, 2) + cc_ref(ido, k, 4));
-    tr1 = hsqt2 * (cc_ref(ido, k, 2) - cc_ref(ido, k, 4));
-    ch_ref(ido, 1, k) = tr1 + cc_ref(ido, k, 1);
-    ch_ref(ido, 3, k) = cc_ref(ido, k, 1) - tr1;
-    ch_ref(1, 2, k) = ti1 - cc_ref(ido, k, 3);
-    ch_ref(1, 4, k) = ti1 + cc_ref(ido, k, 3);
-  }
-} /* radf4 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radf5(integer ido, integer l1, const real *cc, real *ch, 
-                  const real *wa1, const real *wa2, const real *wa3, const real *wa4)
-{
-  /* Initialized data */
-
-  static const real tr11 = .309016994374947f;
-  static const real ti11 = .951056516295154f;
-  static const real tr12 = -.809016994374947f;
-  static const real ti12 = .587785252292473f;
-
-  /* System generated locals */
-  integer cc_offset, ch_offset;
-
-  /* Local variables */
-  integer i, k, ic;
-  real ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
-    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-  integer idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * 6;
-  ch -= ch_offset;
-  cc_offset = 1 + ido * (1 + l1);
-  cc -= cc_offset;
-  --wa1;
-  --wa2;
-  --wa3;
-  --wa4;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    cr2 = cc_ref(1, k, 5) + cc_ref(1, k, 2);
-    ci5 = cc_ref(1, k, 5) - cc_ref(1, k, 2);
-    cr3 = cc_ref(1, k, 4) + cc_ref(1, k, 3);
-    ci4 = cc_ref(1, k, 4) - cc_ref(1, k, 3);
-    ch_ref(1, 1, k) = cc_ref(1, k, 1) + cr2 + cr3;
-    ch_ref(ido, 2, k) = cc_ref(1, k, 1) + tr11 * cr2 + tr12 * cr3;
-    ch_ref(1, 3, k) = ti11 * ci5 + ti12 * ci4;
-    ch_ref(ido, 4, k) = cc_ref(1, k, 1) + tr12 * cr2 + tr11 * cr3;
-    ch_ref(1, 5, k) = ti12 * ci5 - ti11 * ci4;
-  }
-  if (ido == 1) {
-    return;
-  }
-  idp2 = ido + 2;
-  for (k = 1; k <= l1; ++k) {
-    for (i = 3; i <= ido; i += 2) {
-      ic = idp2 - i;
-      dr2 = wa1[i - 2] * cc_ref(i - 1, k, 2) + wa1[i - 1] * cc_ref(i, k, 2);
-      di2 = wa1[i - 2] * cc_ref(i, k, 2) - wa1[i - 1] * cc_ref(i - 1, k, 2);
-      dr3 = wa2[i - 2] * cc_ref(i - 1, k, 3) + wa2[i - 1] * cc_ref(i, k, 3);
-      di3 = wa2[i - 2] * cc_ref(i, k, 3) - wa2[i - 1] * cc_ref(i - 1, k, 3);
-      dr4 = wa3[i - 2] * cc_ref(i - 1, k, 4) + wa3[i - 1] * cc_ref(i, k, 4);
-      di4 = wa3[i - 2] * cc_ref(i, k, 4) - wa3[i - 1] * cc_ref(i - 1, k, 4);
-      dr5 = wa4[i - 2] * cc_ref(i - 1, k, 5) + wa4[i - 1] * cc_ref(i, k, 5);
-      di5 = wa4[i - 2] * cc_ref(i, k, 5) - wa4[i - 1] * cc_ref(i - 1, k, 5);
-      cr2 = dr2 + dr5;
-      ci5 = dr5 - dr2;
-      cr5 = di2 - di5;
-      ci2 = di2 + di5;
-      cr3 = dr3 + dr4;
-      ci4 = dr4 - dr3;
-      cr4 = di3 - di4;
-      ci3 = di3 + di4;
-      ch_ref(i - 1, 1, k) = cc_ref(i - 1, k, 1) + cr2 + cr3;
-      ch_ref(i, 1, k) = cc_ref(i, k, 1) + ci2 + ci3;
-      tr2 = cc_ref(i - 1, k, 1) + tr11 * cr2 + tr12 * cr3;
-      ti2 = cc_ref(i, k, 1) + tr11 * ci2 + tr12 * ci3;
-      tr3 = cc_ref(i - 1, k, 1) + tr12 * cr2 + tr11 * cr3;
-      ti3 = cc_ref(i, k, 1) + tr12 * ci2 + tr11 * ci3;
-      tr5 = ti11 * cr5 + ti12 * cr4;
-      ti5 = ti11 * ci5 + ti12 * ci4;
-      tr4 = ti12 * cr5 - ti11 * cr4;
-      ti4 = ti12 * ci5 - ti11 * ci4;
-      ch_ref(i - 1, 3, k) = tr2 + tr5;
-      ch_ref(ic - 1, 2, k) = tr2 - tr5;
-      ch_ref(i, 3, k) = ti2 + ti5;
-      ch_ref(ic, 2, k) = ti5 - ti2;
-      ch_ref(i - 1, 5, k) = tr3 + tr4;
-      ch_ref(ic - 1, 4, k) = tr3 - tr4;
-      ch_ref(i, 5, k) = ti3 + ti4;
-      ch_ref(ic, 4, k) = ti4 - ti3;
-    }
-  }
-} /* radf5 */
-
-#undef ch_ref
-#undef cc_ref
-
-
-static void radfg(integer ido, integer ip, integer l1, integer idl1, 
-                  real *cc, real *c1, real *c2, real *ch, real *ch2, const real *wa)
-{
-  /* System generated locals */
-  integer ch_offset, cc_offset,
-    c1_offset, c2_offset, ch2_offset;
-
-  /* Local variables */
-  integer i, j, k, l, j2, ic, jc, lc, ik, is;
-  real dc2, ai1, ai2, ar1, ar2, ds2;
-  integer nbd;
-  real dcp, arg, dsp, ar1h, ar2h;
-  integer idp2, ipp2, idij, ipph;
-
-
-#define c1_ref(a_1,a_2,a_3) c1[((a_3)*l1 + (a_2))*ido + a_1]
-#define c2_ref(a_1,a_2) c2[(a_2)*idl1 + a_1]
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*ip + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch2_ref(a_1,a_2) ch2[(a_2)*idl1 + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  c1_offset = 1 + ido * (1 + l1);
-  c1 -= c1_offset;
-  cc_offset = 1 + ido * (1 + ip);
-  cc -= cc_offset;
-  ch2_offset = 1 + idl1;
-  ch2 -= ch2_offset;
-  c2_offset = 1 + idl1;
-  c2 -= c2_offset;
-  --wa;
-
-  /* Function Body */
-  arg = (2*M_PI) / (real) (ip);
-  dcp = cos(arg);
-  dsp = sin(arg);
-  ipph = (ip + 1) / 2;
-  ipp2 = ip + 2;
-  idp2 = ido + 2;
-  nbd = (ido - 1) / 2;
-  if (ido == 1) {
-    for (ik = 1; ik <= idl1; ++ik) {
-      c2_ref(ik, 1) = ch2_ref(ik, 1);
-    }
-  } else {
-    for (ik = 1; ik <= idl1; ++ik) {
-      ch2_ref(ik, 1) = c2_ref(ik, 1);
-    }
-    for (j = 2; j <= ip; ++j) {
-      for (k = 1; k <= l1; ++k) {
-        ch_ref(1, k, j) = c1_ref(1, k, j);
-      }
-    }
-    if (nbd <= l1) {
-      is = -(ido);
-      for (j = 2; j <= ip; ++j) {
-        is += ido;
-        idij = is;
-        for (i = 3; i <= ido; i += 2) {
-          idij += 2;
-          for (k = 1; k <= l1; ++k) {
-            ch_ref(i - 1, k, j) = wa[idij - 1] * c1_ref(i - 1, k, j)
-              + wa[idij] * c1_ref(i, k, j);
-            ch_ref(i, k, j) = wa[idij - 1] * c1_ref(i, k, j) - wa[
-                                                                  idij] * c1_ref(i - 1, k, j);
-          }
-        }
-      }
-    } else {
-      is = -(ido);
-      for (j = 2; j <= ip; ++j) {
-        is += ido;
-        for (k = 1; k <= l1; ++k) {
-          idij = is;
-          for (i = 3; i <= ido; i += 2) {
-            idij += 2;
-            ch_ref(i - 1, k, j) = wa[idij - 1] * c1_ref(i - 1, k, j) 
-              + wa[idij] * c1_ref(i, k, j);
-            ch_ref(i, k, j) = wa[idij - 1] * c1_ref(i, k, j) - wa[
-                                                                  idij] * c1_ref(i - 1, k, j);
-          }
-        }
-      }
-    }
-    if (nbd >= l1) {
-      for (j = 2; j <= ipph; ++j) {
-        jc = ipp2 - j;
-        for (k = 1; k <= l1; ++k) {
-          for (i = 3; i <= ido; i += 2) {
-            c1_ref(i - 1, k, j) = ch_ref(i - 1, k, j) + ch_ref(i - 
-                                                               1, k, jc);
-            c1_ref(i - 1, k, jc) = ch_ref(i, k, j) - ch_ref(i, k, 
-                                                            jc);
-            c1_ref(i, k, j) = ch_ref(i, k, j) + ch_ref(i, k, jc);
-            c1_ref(i, k, jc) = ch_ref(i - 1, k, jc) - ch_ref(i - 1, 
-                                                             k, j);
-          }
-        }
-      }
-    } else {
-      for (j = 2; j <= ipph; ++j) {
-        jc = ipp2 - j;
-        for (i = 3; i <= ido; i += 2) {
-          for (k = 1; k <= l1; ++k) {
-            c1_ref(i - 1, k, j) = ch_ref(i - 1, k, j) + ch_ref(i - 
-                                                               1, k, jc);
-            c1_ref(i - 1, k, jc) = ch_ref(i, k, j) - ch_ref(i, k, 
-                                                            jc);
-            c1_ref(i, k, j) = ch_ref(i, k, j) + ch_ref(i, k, jc);
-            c1_ref(i, k, jc) = ch_ref(i - 1, k, jc) - ch_ref(i - 1, 
-                                                             k, j);
-          }
-        }
-      }
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    jc = ipp2 - j;
-    for (k = 1; k <= l1; ++k) {
-      c1_ref(1, k, j) = ch_ref(1, k, j) + ch_ref(1, k, jc);
-      c1_ref(1, k, jc) = ch_ref(1, k, jc) - ch_ref(1, k, j);
-    }
-  }
-
-  ar1 = 1.f;
-  ai1 = 0.f;
-  for (l = 2; l <= ipph; ++l) {
-    lc = ipp2 - l;
-    ar1h = dcp * ar1 - dsp * ai1;
-    ai1 = dcp * ai1 + dsp * ar1;
-    ar1 = ar1h;
-    for (ik = 1; ik <= idl1; ++ik) {
-      ch2_ref(ik, l) = c2_ref(ik, 1) + ar1 * c2_ref(ik, 2);
-      ch2_ref(ik, lc) = ai1 * c2_ref(ik, ip);
-    }
-    dc2 = ar1;
-    ds2 = ai1;
-    ar2 = ar1;
-    ai2 = ai1;
-    for (j = 3; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      ar2h = dc2 * ar2 - ds2 * ai2;
-      ai2 = dc2 * ai2 + ds2 * ar2;
-      ar2 = ar2h;
-      for (ik = 1; ik <= idl1; ++ik) {
-        ch2_ref(ik, l) = ch2_ref(ik, l) + ar2 * c2_ref(ik, j);
-        ch2_ref(ik, lc) = ch2_ref(ik, lc) + ai2 * c2_ref(ik, jc);
-      }
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    for (ik = 1; ik <= idl1; ++ik) {
-      ch2_ref(ik, 1) = ch2_ref(ik, 1) + c2_ref(ik, j);
-    }
-  }
-
-  if (ido >= l1) {
-    for (k = 1; k <= l1; ++k) {
-      for (i = 1; i <= ido; ++i) {
-        cc_ref(i, 1, k) = ch_ref(i, k, 1);
-      }
-    }
-  } else {
-    for (i = 1; i <= ido; ++i) {
-      for (k = 1; k <= l1; ++k) {
-        cc_ref(i, 1, k) = ch_ref(i, k, 1);
-      }
-    }
-  }
-  for (j = 2; j <= ipph; ++j) {
-    jc = ipp2 - j;
-    j2 = j + j;
-    for (k = 1; k <= l1; ++k) {
-      cc_ref(ido, j2 - 2, k) = ch_ref(1, k, j);
-      cc_ref(1, j2 - 1, k) = ch_ref(1, k, jc);
-    }
-  }
-  if (ido == 1) {
-    return;
-  }
-  if (nbd >= l1) {
-    for (j = 2; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      j2 = j + j;
-      for (k = 1; k <= l1; ++k) {
-        for (i = 3; i <= ido; i += 2) {
-          ic = idp2 - i;
-          cc_ref(i - 1, j2 - 1, k) = ch_ref(i - 1, k, j) + ch_ref(
-                                                                  i - 1, k, jc);
-          cc_ref(ic - 1, j2 - 2, k) = ch_ref(i - 1, k, j) - ch_ref(
-                                                                   i - 1, k, jc);
-          cc_ref(i, j2 - 1, k) = ch_ref(i, k, j) + ch_ref(i, k, 
-                                                          jc);
-          cc_ref(ic, j2 - 2, k) = ch_ref(i, k, jc) - ch_ref(i, k, j)
-            ;
-        }
-      }
-    }
-  } else {
-    for (j = 2; j <= ipph; ++j) {
-      jc = ipp2 - j;
-      j2 = j + j;
-      for (i = 3; i <= ido; i += 2) {
-        ic = idp2 - i;
-        for (k = 1; k <= l1; ++k) {
-          cc_ref(i - 1, j2 - 1, k) = ch_ref(i - 1, k, j) + ch_ref(
-                                                                  i - 1, k, jc);
-          cc_ref(ic - 1, j2 - 2, k) = ch_ref(i - 1, k, j) - ch_ref(
-                                                                   i - 1, k, jc);
-          cc_ref(i, j2 - 1, k) = ch_ref(i, k, j) + ch_ref(i, k, 
-                                                          jc);
-          cc_ref(ic, j2 - 2, k) = ch_ref(i, k, jc) - ch_ref(i, k, j)
-            ;
-        }
-      }
-    }
-  }
-} /* radfg */
-
-#undef ch2_ref
-#undef ch_ref
-#undef cc_ref
-#undef c2_ref
-#undef c1_ref
-
-
-static void cfftb1(integer n, real *c, real *ch, const real *wa, integer *ifac)
-{
-  integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, nac, ido, 
-    idl1, idot;
-
-  /* Function Body */
-  nf = ifac[1];
-  na = 0;
-  l1 = 1;
-  iw = 0;
-  for (k1 = 1; k1 <= nf; ++k1) {
-    ip = ifac[k1 + 1];
-    l2 = ip * l1;
-    ido = n / l2;
-    idot = ido + ido;
-    idl1 = idot * l1;
-    switch (ip) {
-      case 4:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        passb4(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]);
-        na = 1 - na;
-        break;
-      case 2:
-        passb2(idot, l1, na?ch:c, na?c:ch, &wa[iw]);
-        na = 1 - na;
-        break;
-      case 3:
-        ix2 = iw + idot;
-        passb3(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]);
-        na = 1 - na;
-        break;
-      case 5:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        ix4 = ix3 + idot;
-        passfb5(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], +1);
-        na = 1 - na;
-        break;
-      default:
-        if (na == 0) {
-          passfb(&nac, idot, ip, l1, idl1, c, c, c, ch, ch, &wa[iw], +1);
-        } else {
-          passfb(&nac, idot, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw], +1);
-        }
-        if (nac != 0) {
-          na = 1 - na;
-        }
-        break;
-    }
-    l1 = l2;
-    iw += (ip - 1) * idot;
-  }
-  if (na == 0) {
-    return;
-  }
-  for (i = 0; i < 2*n; ++i) {
-    c[i] = ch[i];
-  }
-} /* cfftb1 */
-
-void cfftb(integer n, real *c, real *wsave)
-{
-  integer iw1, iw2;
-
-  /* Parameter adjustments */
-  --wsave;
-  --c;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  iw1 = 2*n + 1;
-  iw2 = iw1 + 2*n;
-  cfftb1(n, &c[1], &wsave[1], &wsave[iw1], (int*)&wsave[iw2]);
-} /* cfftb */
-
-static void cfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac)
-{
-  /* Local variables */
-  integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, nac, ido, 
-    idl1, idot;
-
-  /* Function Body */
-  nf = ifac[1];
-  na = 0;
-  l1 = 1;
-  iw = 0;
-  for (k1 = 1; k1 <= nf; ++k1) {
-    ip = ifac[k1 + 1];
-    l2 = ip * l1;
-    ido = n / l2;
-    idot = ido + ido;
-    idl1 = idot * l1;
-    switch (ip) {
-      case 4:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        passf4(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]);
-        na = 1 - na;
-        break;
-      case 2:
-        passf2(idot, l1, na?ch:c, na?c:ch, &wa[iw]);
-        na = 1 - na;
-        break;
-      case 3:
-        ix2 = iw + idot;
-        passf3(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]);
-        na = 1 - na;
-        break;
-      case 5:
-        ix2 = iw + idot;
-        ix3 = ix2 + idot;
-        ix4 = ix3 + idot;
-        passfb5(idot, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], -1);
-        na = 1 - na;
-        break;
-      default:
-        if (na == 0) {
-          passfb(&nac, idot, ip, l1, idl1, c, c, c, ch, ch, &wa[iw], -1);
-        } else {
-          passfb(&nac, idot, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw], -1);
-        }
-        if (nac != 0) {
-          na = 1 - na;
-        }
-        break;
-    }
-    l1 = l2;
-    iw += (ip - 1)*idot;
-  }
-  if (na == 0) {
-    return;
-  }
-  for (i = 0; i < 2*n; ++i) {
-    c[i] = ch[i];
-  }
-} /* cfftf1 */
-
-void cfftf(integer n, real *c, real *wsave)
-{
-  integer iw1, iw2;
-
-  /* Parameter adjustments */
-  --wsave;
-  --c;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  iw1 = 2*n + 1;
-  iw2 = iw1 + 2*n;
-  cfftf1(n, &c[1], &wsave[1], &wsave[iw1], (int*)&wsave[iw2]);
-} /* cfftf */
-
-static int decompose(integer n, integer *ifac, integer ntryh[4]) {  
-  integer ntry=0, nl = n, nf = 0, nq, nr, i, j = 0;
-  do {
-    if (j < 4) {
-      ntry = ntryh[j];
-    } else {
-      ntry += 2;
-    }
-    ++j;
-  L104:
-    nq = nl / ntry;
-    nr = nl - ntry * nq;
-    if (nr != 0) continue;
-    ++nf;
-    ifac[nf + 2] = ntry;
-    nl = nq;
-    if (ntry == 2 && nf != 1) {
-      for (i = 2; i <= nf; ++i) {
-        integer ib = nf - i + 2;
-        ifac[ib + 2] = ifac[ib + 1];
-      }
-      ifac[3] = 2;
-    }
-    if (nl != 1) {
-      goto L104;
-    }
-  } while (nl != 1);
-  ifac[1] = n;
-  ifac[2] = nf;  
-  return nf;
-}
-
-static void cffti1(integer n, real *wa, integer *ifac)
-{
-  static integer ntryh[4] = { 3,4,2,5 };
-
-  /* Local variables */
-  integer i, j, i1, k1, l1, l2;
-  real fi;
-  integer ld, ii, nf, ip;
-  real arg;
-  integer ido, ipm;
-  real argh;
-  integer idot;
-  real argld;
-
-  /* Parameter adjustments */
-  --ifac;
-  --wa;
-
-  nf = decompose(n, ifac, ntryh);
-
-  argh = (2*M_PI) / (real) (n);
-  i = 2;
-  l1 = 1;
-  for (k1 = 1; k1 <= nf; ++k1) {
-    ip = ifac[k1 + 2];
-    ld = 0;
-    l2 = l1 * ip;
-    ido = n / l2;
-    idot = ido + ido + 2;
-    ipm = ip - 1;
-    for (j = 1; j <= ipm; ++j) {
-      i1 = i;
-      wa[i - 1] = 1.f;
-      wa[i] = 0.f;
-      ld += l1;
-      fi = 0.f;
-      argld = (real) ld * argh;
-      for (ii = 4; ii <= idot; ii += 2) {
-        i += 2;
-        fi += 1.f;
-        arg = fi * argld;
-        wa[i - 1] = cos(arg);
-        wa[i] = sin(arg);
-      }
-      if (ip > 5) {
-        wa[i1 - 1] = wa[i - 1];
-        wa[i1] = wa[i];
-      };
-    }
-    l1 = l2;
-  }
-} /* cffti1 */
-
-void cffti(integer n, real *wsave)
-{
-  integer iw1, iw2;
-  /* Parameter adjustments */
-  --wsave;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  iw1 = 2*n + 1;
-  iw2 = iw1 + 2*n;
-  cffti1(n, &wsave[iw1], (int*)&wsave[iw2]);
-  return;
-} /* cffti */
-
-static void rfftb1(integer n, real *c, real *ch, const real *wa, integer *ifac)
-{
-  /* Local variables */
-  integer i, k1, l1, l2, na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
-
-  /* Function Body */
-  nf = ifac[1];
-  na = 0;
-  l1 = 1;
-  iw = 0;
-  for (k1 = 1; k1 <= nf; ++k1) {
-    ip = ifac[k1 + 1];
-    l2 = ip * l1;
-    ido = n / l2;
-    idl1 = ido * l1;
-    switch (ip) {
-      case 4:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        radb4(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3]);
-        na = 1 - na;
-        break;
-      case 2:
-        radb2(ido, l1, na?ch:c, na?c:ch, &wa[iw]);
-        na = 1 - na;
-        break;
-      case 3:
-        ix2 = iw + ido;
-        radb3(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2]);
-        na = 1 - na;
-        break;
-      case 5:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        ix4 = ix3 + ido;
-        radb5(ido, l1, na?ch:c, na?c:ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-        na = 1 - na;
-        break;
-      default:
-        if (na == 0) {
-          radbg(ido, ip, l1, idl1, c, c, c, ch, ch, &wa[iw]);
-        } else {
-          radbg(ido, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw]);
-        }
-        if (ido == 1) {
-          na = 1 - na;
-        }
-        break;
-    }
-    l1 = l2;
-    iw += (ip - 1) * ido;
-  }
-  if (na == 0) {
-    return;
-  }
-  for (i = 0; i < n; ++i) {
-    c[i] = ch[i];
-  }
-} /* rfftb1 */
-
-static void rfftf1(integer n, real *c, real *ch, const real *wa, integer *ifac)
-{
-  /* Local variables */
-  integer i, k1, l1, l2, na, kh, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
-
-  /* Function Body */
-  nf = ifac[1];
-  na = 1;
-  l2 = n;
-  iw = n-1;
-  for (k1 = 1; k1 <= nf; ++k1) {
-    kh = nf - k1;
-    ip = ifac[kh + 2];
-    l1 = l2 / ip;
-    ido = n / l2;
-    idl1 = ido * l1;
-    iw -= (ip - 1) * ido;
-    na = 1 - na;
-    switch (ip) {
-      case 4:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        radf4(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3]);
-        break;
-      case 2:
-        radf2(ido, l1, na ? ch : c, na ? c : ch, &wa[iw]);
-        break;
-      case 3:        
-        ix2 = iw + ido;
-        radf3(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2]);
-        break;
-      case 5:
-        ix2 = iw + ido;
-        ix3 = ix2 + ido;
-        ix4 = ix3 + ido;
-        radf5(ido, l1, na ? ch : c, na ? c : ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-        break;
-      default:
-        if (ido == 1) {
-          na = 1 - na;
-        }
-        if (na == 0) {
-          radfg(ido, ip, l1, idl1, c, c, c, ch, ch, &wa[iw]);
-          na = 1;
-        } else {
-          radfg(ido, ip, l1, idl1, ch, ch, ch, c, c, &wa[iw]);
-          na = 0;
-        }
-        break;
-    }
-    l2 = l1;
-  }
-  if (na == 1) {
-    return;
-  }
-  for (i = 0; i < n; ++i) {
-    c[i] = ch[i];
-  }
-}
-
-void rfftb(integer n, real *r, real *wsave)
-{
-
-  /* Parameter adjustments */
-  --wsave;
-  --r;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  rfftb1(n, &r[1], &wsave[1], &wsave[n + 1], (int*)&wsave[(n << 1) + 1]);
-} /* rfftb */
-
-static void rffti1(integer n, real *wa, integer *ifac)
-{
-  static integer ntryh[4] = { 4,2,3,5 };
-
-  /* Local variables */
-  integer i, j, k1, l1, l2;
-  real fi;
-  integer ld, ii, nf, ip, is;
-  real arg;
-  integer ido, ipm;
-  integer nfm1;
-  real argh;
-  real argld;
-
-  /* Parameter adjustments */
-  --ifac;
-  --wa;
-
-  nf = decompose(n, ifac, ntryh);
-
-  argh = (2*M_PI) / (real) (n);
-  is = 0;
-  nfm1 = nf - 1;
-  l1 = 1;
-  if (nfm1 == 0) {
-    return;
-  }
-  for (k1 = 1; k1 <= nfm1; ++k1) {
-    ip = ifac[k1 + 2];
-    ld = 0;
-    l2 = l1 * ip;
-    ido = n / l2;
-    ipm = ip - 1;
-    for (j = 1; j <= ipm; ++j) {
-      ld += l1;
-      i = is;
-      argld = (real) ld * argh;
-      fi = 0.f;
-      for (ii = 3; ii <= ido; ii += 2) {
-        i += 2;
-        fi += 1.f;
-        arg = fi * argld;
-        wa[i - 1] = cos(arg);
-        wa[i] = sin(arg);
-      }
-      is += ido;
-    }
-    l1 = l2;
-  }
-} /* rffti1 */
-
-void rfftf(integer n, real *r, real *wsave)
-{
-
-  /* Parameter adjustments */
-  --wsave;
-  --r;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  rfftf1(n, &r[1], &wsave[1], &wsave[n + 1], (int*)&wsave[(n << 1) + 1]);
-} /* rfftf */
-
-void rffti(integer n, real *wsave)
-{
-  /* Parameter adjustments */
-  --wsave;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  rffti1(n, &wsave[n + 1], (int*)&wsave[(n << 1) + 1]);
-  return;
-} /* rffti */
-
-static void cosqb1(integer n, real *x, real *w, real *xh)
-{
-  /* Local variables */
-  integer i, k, kc, np2, ns2;
-  real xim1;
-  integer modn;
-
-  /* Parameter adjustments */
-  --xh;
-  --w;
-  --x;
-
-  /* Function Body */
-  ns2 = (n + 1) / 2;
-  np2 = n + 2;
-  for (i = 3; i <= n; i += 2) {
-    xim1 = x[i - 1] + x[i];
-    x[i] -= x[i - 1];
-    x[i - 1] = xim1;
-  }
-  x[1] += x[1];
-  modn = n % 2;
-  if (modn == 0) {
-    x[n] += x[n];
-  }
-  rfftb(n, &x[1], &xh[1]);
-  for (k = 2; k <= ns2; ++k) {
-    kc = np2 - k;
-    xh[k] = w[k - 1] * x[kc] + w[kc - 1] * x[k];
-    xh[kc] = w[k - 1] * x[k] - w[kc - 1] * x[kc];
-  }
-  if (modn == 0) {
-    x[ns2 + 1] = w[ns2] * (x[ns2 + 1] + x[ns2 + 1]);
-  }
-  for (k = 2; k <= ns2; ++k) {
-    kc = np2 - k;
-    x[k] = xh[k] + xh[kc];
-    x[kc] = xh[k] - xh[kc];
-  }
-  x[1] += x[1];
-} /* cosqb1 */
-
-void cosqb(integer n, real *x, real *wsave)
-{
-  static const real tsqrt2 = 2.82842712474619f;
-
-  /* Local variables */
-  real x1;
-
-  /* Parameter adjustments */
-  --wsave;
-  --x;
-
-  if (n < 2) {
-    x[1] *= 4.f;
-  } else if (n == 2) {
-    x1 = (x[1] + x[2]) * 4.f;
-    x[2] = tsqrt2 * (x[1] - x[2]);
-    x[1] = x1;
-  } else {
-    cosqb1(n, &x[1], &wsave[1], &wsave[n + 1]);
-  }
-} /* cosqb */
-
-static void cosqf1(integer n, real *x, real *w, real *xh)
-{
-  /* Local variables */
-  integer i, k, kc, np2, ns2;
-  real xim1;
-  integer modn;
-
-  /* Parameter adjustments */
-  --xh;
-  --w;
-  --x;
-
-  /* Function Body */
-  ns2 = (n + 1) / 2;
-  np2 = n + 2;
-  for (k = 2; k <= ns2; ++k) {
-    kc = np2 - k;
-    xh[k] = x[k] + x[kc];
-    xh[kc] = x[k] - x[kc];
-  }
-  modn = n % 2;
-  if (modn == 0) {
-    xh[ns2 + 1] = x[ns2 + 1] + x[ns2 + 1];
-  }
-  for (k = 2; k <= ns2; ++k) {
-    kc = np2 - k;
-    x[k] = w[k - 1] * xh[kc] + w[kc - 1] * xh[k];
-    x[kc] = w[k - 1] * xh[k] - w[kc - 1] * xh[kc];
-  }
-  if (modn == 0) {
-    x[ns2 + 1] = w[ns2] * xh[ns2 + 1];
-  }
-  rfftf(n, &x[1], &xh[1]);
-  for (i = 3; i <= n; i += 2) {
-    xim1 = x[i - 1] - x[i];
-    x[i] = x[i - 1] + x[i];
-    x[i - 1] = xim1;
-  }
-} /* cosqf1 */
-
-void cosqf(integer n, real *x, real *wsave)
-{
-  static const real sqrt2 = 1.4142135623731f;
-
-  /* Local variables */
-  real tsqx;
-
-  /* Parameter adjustments */
-  --wsave;
-  --x;
-
-  if (n == 2) {
-    tsqx = sqrt2 * x[2];
-    x[2] = x[1] - tsqx;
-    x[1] += tsqx;
-  } else if (n > 2) {
-    cosqf1(n, &x[1], &wsave[1], &wsave[n + 1]);
-  }
-} /* cosqf */
-
-void cosqi(integer n, real *wsave)
-{
-  /* Local variables */
-  integer k;
-  real fk, dt;
-
-  /* Parameter adjustments */
-  --wsave;
-
-  dt = M_PI/2 / (real) (n);
-  fk = 0.f;
-  for (k = 1; k <= n; ++k) {
-    fk += 1.f;
-    wsave[k] = cos(fk * dt);
-  }
-  rffti(n, &wsave[n + 1]);
-} /* cosqi */
-
-void cost(integer n, real *x, real *wsave)
-{
-  /* Local variables */
-  integer i, k;
-  real c1, t1, t2;
-  integer kc;
-  real xi;
-  integer nm1, np1;
-  real x1h;
-  integer ns2;
-  real tx2, x1p3, xim2;
-  integer modn;
-
-  /* Parameter adjustments */
-  --wsave;
-  --x;
-
-  /* Function Body */
-  nm1 = n - 1;
-  np1 = n + 1;
-  ns2 = n / 2;
-  if (n < 2) {
-  } else if (n == 2) {
-    x1h = x[1] + x[2];
-    x[2] = x[1] - x[2];
-    x[1] = x1h;
-  } else if (n == 3) {
-    x1p3 = x[1] + x[3];
-    tx2 = x[2] + x[2];
-    x[2] = x[1] - x[3];
-    x[1] = x1p3 + tx2;
-    x[3] = x1p3 - tx2;
-  } else {
-    c1 = x[1] - x[n];
-    x[1] += x[n];
-    for (k = 2; k <= ns2; ++k) {
-      kc = np1 - k;
-      t1 = x[k] + x[kc];
-      t2 = x[k] - x[kc];
-      c1 += wsave[kc] * t2;
-      t2 = wsave[k] * t2;
-      x[k] = t1 - t2;
-      x[kc] = t1 + t2;
-    }
-    modn = n % 2;
-    if (modn != 0) {
-      x[ns2 + 1] += x[ns2 + 1];
-    }
-    rfftf(nm1, &x[1], &wsave[n + 1]);
-    xim2 = x[2];
-    x[2] = c1;
-    for (i = 4; i <= n; i += 2) {
-      xi = x[i];
-      x[i] = x[i - 2] - x[i - 1];
-      x[i - 1] = xim2;
-      xim2 = xi;
-    }
-    if (modn != 0) {
-      x[n] = xim2;
-    }
-  }
-} /* cost */
-
-void costi(integer n, real *wsave)
-{
-  /* Initialized data */
-
-  /* Local variables */
-  integer k, kc;
-  real fk, dt;
-  integer nm1, np1, ns2;
-
-  /* Parameter adjustments */
-  --wsave;
-
-  /* Function Body */
-  if (n <= 3) {
-    return;
-  }
-  nm1 = n - 1;
-  np1 = n + 1;
-  ns2 = n / 2;
-  dt = M_PI / (real) nm1;
-  fk = 0.f;
-  for (k = 2; k <= ns2; ++k) {
-    kc = np1 - k;
-    fk += 1.f;
-    wsave[k] = sin(fk * dt) * 2.f;
-    wsave[kc] = cos(fk * dt) * 2.f;
-  }
-  rffti(nm1, &wsave[n + 1]);
-} /* costi */
-
-void sinqb(integer n, real *x, real *wsave)
-{
-  /* Local variables */
-  integer k, kc, ns2;
-  real xhold;
-
-  /* Parameter adjustments */
-  --wsave;
-  --x;
-
-  /* Function Body */
-  if (n <= 1) {
-    x[1] *= 4.f;
-    return;
-  }
-  ns2 = n / 2;
-  for (k = 2; k <= n; k += 2) {
-    x[k] = -x[k];
-  }
-  cosqb(n, &x[1], &wsave[1]);
-  for (k = 1; k <= ns2; ++k) {
-    kc = n - k;
-    xhold = x[k];
-    x[k] = x[kc + 1];
-    x[kc + 1] = xhold;
-  }
-} /* sinqb */
-
-void sinqf(integer n, real *x, real *wsave)
-{
-  /* Local variables */
-  integer k, kc, ns2;
-  real xhold;
-
-  /* Parameter adjustments */
-  --wsave;
-  --x;
-
-  /* Function Body */
-  if (n == 1) {
-    return;
-  }
-  ns2 = n / 2;
-  for (k = 1; k <= ns2; ++k) {
-    kc = n - k;
-    xhold = x[k];
-    x[k] = x[kc + 1];
-    x[kc + 1] = xhold;
-  }
-  cosqf(n, &x[1], &wsave[1]);
-  for (k = 2; k <= n; k += 2) {
-    x[k] = -x[k];
-  }
-} /* sinqf */
-
-void sinqi(integer n, real *wsave)
-{
-
-  /* Parameter adjustments */
-  --wsave;
-
-  /* Function Body */
-  cosqi(n, &wsave[1]);
-} /* sinqi */
-
-static void sint1(integer n, real *war, real *was, real *xh, real *
-                  x, integer *ifac)
-{
-  /* Initialized data */
-
-  static const real sqrt3 = 1.73205080756888f;
-
-  /* Local variables */
-  integer i, k;
-  real t1, t2;
-  integer kc, np1, ns2, modn;
-  real xhold;
-
-  /* Parameter adjustments */
-  --ifac;
-  --x;
-  --xh;
-  --was;
-  --war;
-
-  /* Function Body */
-  for (i = 1; i <= n; ++i) {
-    xh[i] = war[i];
-    war[i] = x[i];
-  }
-  
-  if (n < 2) {
-    xh[1] += xh[1];
-  } else if (n == 2) {
-    xhold = sqrt3 * (xh[1] + xh[2]);
-    xh[2] = sqrt3 * (xh[1] - xh[2]);
-    xh[1] = xhold;
-  } else {
-    np1 = n + 1;
-    ns2 = n / 2;
-    x[1] = 0.f;
-    for (k = 1; k <= ns2; ++k) {
-      kc = np1 - k;
-      t1 = xh[k] - xh[kc];
-      t2 = was[k] * (xh[k] + xh[kc]);
-      x[k + 1] = t1 + t2;
-      x[kc + 1] = t2 - t1;
-    }
-    modn = n % 2;
-    if (modn != 0) {
-      x[ns2 + 2] = xh[ns2 + 1] * 4.f;
-    }
-    rfftf1(np1, &x[1], &xh[1], &war[1], &ifac[1]);
-    xh[1] = x[1] * .5f;
-    for (i = 3; i <= n; i += 2) {
-      xh[i - 1] = -x[i];
-      xh[i] = xh[i - 2] + x[i - 1];
-    }
-    if (modn == 0) {
-      xh[n] = -x[n + 1];
-    }
-  }
-  for (i = 1; i <= n; ++i) {
-    x[i] = war[i];
-    war[i] = xh[i];
-  }
-} /* sint1 */
-
-void sinti(integer n, real *wsave)
-{
-  /* Local variables */
-  integer k;
-  real dt;
-  integer np1, ns2;
-
-  /* Parameter adjustments */
-  --wsave;
-
-  /* Function Body */
-  if (n <= 1) {
-    return;
-  }
-  ns2 = n / 2;
-  np1 = n + 1;
-  dt = M_PI / (real) np1;
-  for (k = 1; k <= ns2; ++k) {
-    wsave[k] = sin(k * dt) * 2.f;
-  }
-  rffti(np1, &wsave[ns2 + 1]);
-} /* sinti */
-
-void sint(integer n, real *x, real *wsave)
-{
-  integer np1, iw1, iw2, iw3;
-
-  /* Parameter adjustments */
-  --wsave;
-  --x;
-
-  /* Function Body */
-  np1 = n + 1;
-  iw1 = n / 2 + 1;
-  iw2 = iw1 + np1;
-  iw3 = iw2 + np1;
-  sint1(n, &x[1], &wsave[1], &wsave[iw1], &wsave[iw2], (int*)&wsave[iw3]);
-} /* sint */
-
-#ifdef TESTING_FFTPACK
-#include <stdio.h>
-
-int main(void)
-{
-  static integer nd[] = { 120,91,54,49,32,28,24,8,4,3,2 };
-
-  /* System generated locals */
-  real r1, r2, r3;
-  f77complex q1, q2, q3;
-
-  /* Local variables */
-  integer i, j, k, n;
-  real w[2000], x[200], y[200], cf, fn, dt;
-  f77complex cx[200], cy[200];
-  real xh[200];
-  integer nz, nm1, np1, ns2;
-  real arg, tfn;
-  real sum, arg1, arg2;
-  real sum1, sum2, dcfb;
-  integer modn;
-  real rftb, rftf;
-  real sqrt2;
-  real rftfb;
-  real costt, sintt, dcfftb, dcfftf, cosqfb, costfb;
-  real sinqfb;
-  real sintfb;
-  real cosqbt, cosqft, sinqbt, sinqft;
-
-
-
-  /*     * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
-  /*                       VERSION 4  APRIL 1985 */
-
-  /*                         A TEST DRIVER FOR */
-  /*          A PACKAGE OF FORTRAN SUBPROGRAMS FOR THE FAST FOURIER */
-  /*           TRANSFORM OF PERIODIC AND OTHER SYMMETRIC SEQUENCES */
-
-  /*                              BY */
-
-  /*                       PAUL N SWARZTRAUBER */
-
-  /*       NATIONAL CENTER FOR ATMOSPHERIC RESEARCH  BOULDER,COLORADO 80307 */
-
-  /*        WHICH IS SPONSORED BY THE NATIONAL SCIENCE FOUNDATION */
-
-  /*     * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
-
-  /*             THIS PROGRAM TESTS THE PACKAGE OF FAST FOURIER */
-  /*     TRANSFORMS FOR BOTH COMPLEX AND REAL PERIODIC SEQUENCES AND */
-  /*     CERTIAN OTHER SYMMETRIC SEQUENCES THAT ARE LISTED BELOW. */
-
-  /*     1.   RFFTI     INITIALIZE  RFFTF AND RFFTB */
-  /*     2.   RFFTF     FORWARD TRANSFORM OF A REAL PERIODIC SEQUENCE */
-  /*     3.   RFFTB     BACKWARD TRANSFORM OF A REAL COEFFICIENT ARRAY */
-
-  /*     4.   EZFFTI    INITIALIZE EZFFTF AND EZFFTB */
-  /*     5.   EZFFTF    A SIMPLIFIED REAL PERIODIC FORWARD TRANSFORM */
-  /*     6.   EZFFTB    A SIMPLIFIED REAL PERIODIC BACKWARD TRANSFORM */
-
-  /*     7.   SINTI     INITIALIZE SINT */
-  /*     8.   SINT      SINE TRANSFORM OF A REAL ODD SEQUENCE */
-
-  /*     9.   COSTI     INITIALIZE COST */
-  /*     10.  COST      COSINE TRANSFORM OF A REAL EVEN SEQUENCE */
-
-  /*     11.  SINQI     INITIALIZE SINQF AND SINQB */
-  /*     12.  SINQF     FORWARD SINE TRANSFORM WITH ODD WAVE NUMBERS */
-  /*     13.  SINQB     UNNORMALIZED INVERSE OF SINQF */
-
-  /*     14.  COSQI     INITIALIZE COSQF AND COSQB */
-  /*     15.  COSQF     FORWARD COSINE TRANSFORM WITH ODD WAVE NUMBERS */
-  /*     16.  COSQB     UNNORMALIZED INVERSE OF COSQF */
-
-  /*     17.  CFFTI     INITIALIZE CFFTF AND CFFTB */
-  /*     18.  CFFTF     FORWARD TRANSFORM OF A COMPLEX PERIODIC SEQUENCE */
-  /*     19.  CFFTB     UNNORMALIZED INVERSE OF CFFTF */
-
-
-  sqrt2 = sqrt(2.f);
-  int all_ok = 1;
-  for (nz = 1; nz <= (int)(sizeof nd/sizeof nd[0]); ++nz) {
-    n = nd[nz - 1];
-    modn = n % 2;
-    fn = (real) n;
-    tfn = fn + fn;
-    np1 = n + 1;
-    nm1 = n - 1;
-    for (j = 1; j <= np1; ++j) {
-      x[j - 1] = sin((real) j * sqrt2);
-      y[j - 1] = x[j - 1];
-      xh[j - 1] = x[j - 1];
-    }
-
-    /*     TEST SUBROUTINES RFFTI,RFFTF AND RFFTB */
-
-    rffti(n, w);
-    dt = (2*M_PI) / fn;
-    ns2 = (n + 1) / 2;
-    if (ns2 < 2) {
-      goto L104;
-    }
-    for (k = 2; k <= ns2; ++k) {
-      sum1 = 0.f;
-      sum2 = 0.f;
-      arg = (real) (k - 1) * dt;
-      for (i = 1; i <= n; ++i) {
-        arg1 = (real) (i - 1) * arg;
-        sum1 += x[i - 1] * cos(arg1);
-        sum2 += x[i - 1] * sin(arg1);
-      }
-      y[(k << 1) - 3] = sum1;
-      y[(k << 1) - 2] = -sum2;
-    }
-  L104:
-    sum1 = 0.f;
-    sum2 = 0.f;
-    for (i = 1; i <= nm1; i += 2) {
-      sum1 += x[i - 1];
-      sum2 += x[i];
-    }
-    if (modn == 1) {
-      sum1 += x[n - 1];
-    }
-    y[0] = sum1 + sum2;
-    if (modn == 0) {
-      y[n - 1] = sum1 - sum2;
-    }
-    rfftf(n, x, w);
-    rftf = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = rftf, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
-      rftf = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-    }
-    rftf /= fn;
-    for (i = 1; i <= n; ++i) {
-      sum = x[0] * .5f;
-      arg = (real) (i - 1) * dt;
-      if (ns2 < 2) {
-        goto L108;
-      }
-      for (k = 2; k <= ns2; ++k) {
-        arg1 = (real) (k - 1) * arg;
-        sum = sum + x[(k << 1) - 3] * cos(arg1) - x[(k << 1) - 2] * 
-          sin(arg1);
-      }
-    L108:
-      if (modn == 0) {
-        sum += (real)pow(-1, i-1) * .5f * x[n - 1];
-      }
-      y[i - 1] = sum + sum;
-    }
-    rfftb(n, x, w);
-    rftb = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = rftb, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
-      rftb = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-      y[i - 1] = xh[i - 1];
-    }
-    rfftb(n, y, w);
-    rfftf(n, y, w);
-    cf = 1.f / fn;
-    rftfb = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = rftfb, r3 = (r1 = cf * y[i - 1] - x[i - 1], fabs(
-                                                            r1));
-      rftfb = dmax(r2,r3);
-    }
-
-    /*     TEST SUBROUTINES SINTI AND SINT */
-
-    dt = M_PI / fn;
-    for (i = 1; i <= nm1; ++i) {
-      x[i - 1] = xh[i - 1];
-    }
-    for (i = 1; i <= nm1; ++i) {
-      y[i - 1] = 0.f;
-      arg1 = (real) i * dt;
-      for (k = 1; k <= nm1; ++k) {
-        y[i - 1] += x[k - 1] * sin((real) k * arg1);
-      }
-      y[i - 1] += y[i - 1];
-    }
-    sinti(nm1, w);
-    sint(nm1, x, w);
-    cf = .5f / fn;
-    sintt = 0.f;
-    for (i = 1; i <= nm1; ++i) {
-      /* Computing MAX */
-      r2 = sintt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
-      sintt = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-      y[i - 1] = x[i - 1];
-    }
-    sintt = cf * sintt;
-    sint(nm1, x, w);
-    sint(nm1, x, w);
-    sintfb = 0.f;
-    for (i = 1; i <= nm1; ++i) {
-      /* Computing MAX */
-      r2 = sintfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(
-                                                             r1));
-      sintfb = dmax(r2,r3);
-    }
-
-    /*     TEST SUBROUTINES COSTI AND COST */
-
-    for (i = 1; i <= np1; ++i) {
-      x[i - 1] = xh[i - 1];
-    }
-    for (i = 1; i <= np1; ++i) {
-      y[i - 1] = (x[0] + (real) pow(-1, i+1) * x[n]) * .5f;
-      arg = (real) (i - 1) * dt;
-      for (k = 2; k <= n; ++k) {
-        y[i - 1] += x[k - 1] * cos((real) (k - 1) * arg);
-      }
-      y[i - 1] += y[i - 1];
-    }
-    costi(np1, w);
-    cost(np1, x, w);
-    costt = 0.f;
-    for (i = 1; i <= np1; ++i) {
-      /* Computing MAX */
-      r2 = costt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1));
-      costt = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-      y[i - 1] = xh[i - 1];
-    }
-    costt = cf * costt;
-    cost(np1, x, w);
-    cost(np1, x, w);
-    costfb = 0.f;
-    for (i = 1; i <= np1; ++i) {
-      /* Computing MAX */
-      r2 = costfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(
-                                                             r1));
-      costfb = dmax(r2,r3);
-    }
-
-    /*     TEST SUBROUTINES SINQI,SINQF AND SINQB */
-
-    cf = .25f / fn;
-    for (i = 1; i <= n; ++i) {
-      y[i - 1] = xh[i - 1];
-    }
-    dt = M_PI / (fn + fn);
-    for (i = 1; i <= n; ++i) {
-      x[i - 1] = 0.f;
-      arg = dt * (real) i;
-      for (k = 1; k <= n; ++k) {
-        x[i - 1] += y[k - 1] * sin((real) (k + k - 1) * arg);
-      }
-      x[i - 1] *= 4.f;
-    }
-    sinqi(n, w);
-    sinqb(n, y, w);
-    sinqbt = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = sinqbt, r3 = (r1 = y[i - 1] - x[i - 1], fabs(r1))
-        ;
-      sinqbt = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-    }
-    sinqbt = cf * sinqbt;
-    for (i = 1; i <= n; ++i) {
-      arg = (real) (i + i - 1) * dt;
-      y[i - 1] = (real) pow(-1, i+1) * .5f * x[n - 1];
-      for (k = 1; k <= nm1; ++k) {
-        y[i - 1] += x[k - 1] * sin((real) k * arg);
-      }
-      y[i - 1] += y[i - 1];
-    }
-    sinqf(n, x, w);
-    sinqft = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = sinqft, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1))
-        ;
-      sinqft = dmax(r2,r3);
-      y[i - 1] = xh[i - 1];
-      x[i - 1] = xh[i - 1];
-    }
-    sinqf(n, y, w);
-    sinqb(n, y, w);
-    sinqfb = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = sinqfb, r3 = (r1 = cf * y[i - 1] - x[i - 1], fabs(
-                                                             r1));
-      sinqfb = dmax(r2,r3);
-    }
-
-    /*     TEST SUBROUTINES COSQI,COSQF AND COSQB */
-
-    for (i = 1; i <= n; ++i) {
-      y[i - 1] = xh[i - 1];
-    }
-    for (i = 1; i <= n; ++i) {
-      x[i - 1] = 0.f;
-      arg = (real) (i - 1) * dt;
-      for (k = 1; k <= n; ++k) {
-        x[i - 1] += y[k - 1] * cos((real) (k + k - 1) * arg);
-      }
-      x[i - 1] *= 4.f;
-    }
-    cosqi(n, w);
-    cosqb(n, y, w);
-    cosqbt = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = cosqbt, r3 = (r1 = x[i - 1] - y[i - 1], fabs(r1))
-        ;
-      cosqbt = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-    }
-    cosqbt = cf * cosqbt;
-    for (i = 1; i <= n; ++i) {
-      y[i - 1] = x[0] * .5f;
-      arg = (real) (i + i - 1) * dt;
-      for (k = 2; k <= n; ++k) {
-        y[i - 1] += x[k - 1] * cos((real) (k - 1) * arg);
-      }
-      y[i - 1] += y[i - 1];
-    }
-    cosqf(n, x, w);
-    cosqft = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = cosqft, r3 = (r1 = y[i - 1] - x[i - 1], fabs(r1))
-        ;
-      cosqft = dmax(r2,r3);
-      x[i - 1] = xh[i - 1];
-      y[i - 1] = xh[i - 1];
-    }
-    cosqft = cf * cosqft;
-    cosqb(n, x, w);
-    cosqf(n, x, w);
-    cosqfb = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      r2 = cosqfb, r3 = (r1 = cf * x[i - 1] - y[i - 1], fabs(r1));
-      cosqfb = dmax(r2,r3);
-    }
-
-    /*     TEST  CFFTI,CFFTF,CFFTB */
-
-    for (i = 1; i <= n; ++i) {
-      r1 = cos(sqrt2 * (real) i);
-      r2 = sin(sqrt2 * (real) (i * i));
-      q1.r = r1, q1.i = r2;
-      cx[i-1].r = q1.r, cx[i-1].i = q1.i;
-    }
-    dt = (2*M_PI) / fn;
-    for (i = 1; i <= n; ++i) {
-      arg1 = -((real) (i - 1)) * dt;
-      cy[i-1].r = 0.f, cy[i-1].i = 0.f;
-      for (k = 1; k <= n; ++k) {
-        arg2 = (real) (k - 1) * arg1;
-        r1 = cos(arg2);
-        r2 = sin(arg2);
-        q3.r = r1, q3.i = r2;
-        q2.r = q3.r * cx[k-1].r - q3.i * cx[k-1].i, q2.i = 
-          q3.r * cx[k-1].i + q3.i * cx[k-1].r;
-        q1.r = cy[i-1].r + q2.r, q1.i = cy[i-1].i + q2.i;
-        cy[i-1].r = q1.r, cy[i-1].i = q1.i;
-      }
-    }
-    cffti(n, w);
-    cfftf(n, (real*)cx, w);
-    dcfftf = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      q1.r = cx[i-1].r - cy[i-1].r, q1.i = cx[i-1].i - cy[i-1]
-        .i;
-      r1 = dcfftf, r2 = c_abs(&q1);
-      dcfftf = dmax(r1,r2);
-      q1.r = cx[i-1].r / fn, q1.i = cx[i-1].i / fn;
-      cx[i-1].r = q1.r, cx[i-1].i = q1.i;
-    }
-    dcfftf /= fn;
-    for (i = 1; i <= n; ++i) {
-      arg1 = (real) (i - 1) * dt;
-      cy[i-1].r = 0.f, cy[i-1].i = 0.f;
-      for (k = 1; k <= n; ++k) {
-        arg2 = (real) (k - 1) * arg1;
-        r1 = cos(arg2);
-        r2 = sin(arg2);
-        q3.r = r1, q3.i = r2;
-        q2.r = q3.r * cx[k-1].r - q3.i * cx[k-1].i, q2.i = 
-          q3.r * cx[k-1].i + q3.i * cx[k-1].r;
-        q1.r = cy[i-1].r + q2.r, q1.i = cy[i-1].i + q2.i;
-        cy[i-1].r = q1.r, cy[i-1].i = q1.i;
-      }
-    }
-    cfftb(n, (real*)cx, w);
-    dcfftb = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      q1.r = cx[i-1].r - cy[i-1].r, q1.i = cx[i-1].i - cy[i-1].i;
-      r1 = dcfftb, r2 = c_abs(&q1);
-      dcfftb = dmax(r1,r2);
-      cx[i-1].r = cy[i-1].r, cx[i-1].i = cy[i-1].i;
-    }
-    cf = 1.f / fn;
-    cfftf(n, (real*)cx, w);
-    cfftb(n, (real*)cx, w);
-    dcfb = 0.f;
-    for (i = 1; i <= n; ++i) {
-      /* Computing MAX */
-      q2.r = cf * cx[i-1].r, q2.i = cf * cx[i-1].i;
-      q1.r = q2.r - cy[i-1].r, q1.i = q2.i - cy[i-1].i;
-      r1 = dcfb, r2 = c_abs(&q1);
-      dcfb = dmax(r1,r2);
-    }
-    printf("%d\tRFFTF  %10.3g\tRFFTB  %10.ge\tRFFTFB %10.3g", n, rftf, rftb, rftfb);
-    printf(  "\tSINT   %10.3g\tSINTFB %10.ge\tCOST   %10.3g\n", sintt, sintfb, costt);
-    printf(  "\tCOSTFB %10.3g\tSINQF  %10.ge\tSINQB  %10.3g", costfb, sinqft, sinqbt);
-    printf(  "\tSINQFB %10.3g\tCOSQF  %10.ge\tCOSQB  %10.3g\n", sinqfb, cosqft, cosqbt);
-    printf(  "\tCOSQFB %10.3g\t", cosqfb);
-    printf(  "\tCFFTF  %10.ge\tCFFTB  %10.3g\n", dcfftf, dcfftb);
-    printf(  "\tCFFTFB %10.3g\n", dcfb);
-
-#define CHECK(x) if (x > 1e-3) { printf(#x " failed: %g\n", x); all_ok = 0; }
-    CHECK(rftf); CHECK(rftb); CHECK(rftfb); CHECK(sintt); CHECK(sintfb); CHECK(costt);
-    CHECK(costfb); CHECK(sinqft); CHECK(sinqbt); CHECK(sinqfb); CHECK(cosqft); CHECK(cosqbt);
-    CHECK(cosqfb); CHECK(dcfftf); CHECK(dcfftb);
-  }
-
-  if (all_ok) printf("Everything looks fine.\n"); 
-  else printf("ERRORS WERE DETECTED.\n");
-  /*
-    expected:
-    120     RFFTF   2.786e-06       RFFTB   6.847e-04       RFFTFB  2.795e-07       SINT    1.312e-06       SINTFB  1.237e-06       COST    1.319e-06
-    COSTFB  4.355e-06       SINQF   3.281e-04       SINQB   1.876e-06       SINQFB  2.198e-07       COSQF   6.199e-07       COSQB   2.193e-06
-    COSQFB  2.300e-07       DEZF    5.573e-06       DEZB    1.363e-05       DEZFB   1.371e-06       CFFTF   5.590e-06       CFFTB   4.751e-05
-    CFFTFB  4.215e-07
-    54      RFFTF   4.708e-07       RFFTB   3.052e-05       RFFTFB  3.439e-07       SINT    3.532e-07       SINTFB  4.145e-07       COST    3.002e-07
-    COSTFB  6.343e-07       SINQF   4.959e-05       SINQB   4.415e-07       SINQFB  2.882e-07       COSQF   2.826e-07       COSQB   2.472e-07
-    COSQFB  3.439e-07       DEZF    9.388e-07       DEZB    5.066e-06       DEZFB   5.960e-07       CFFTF   1.426e-06       CFFTB   9.482e-06
-    CFFTFB  2.980e-07
-    49      RFFTF   4.476e-07       RFFTB   5.341e-05       RFFTFB  2.574e-07       SINT    9.196e-07       SINTFB  9.401e-07       COST    8.174e-07
-    COSTFB  1.331e-06       SINQF   4.005e-05       SINQB   9.342e-07       SINQFB  3.057e-07       COSQF   2.530e-07       COSQB   6.228e-07
-    COSQFB  4.826e-07       DEZF    9.071e-07       DEZB    4.590e-06       DEZFB   5.960e-07       CFFTF   2.095e-06       CFFTB   1.414e-05
-    CFFTFB  7.398e-07
-    32      RFFTF   4.619e-07       RFFTB   2.861e-05       RFFTFB  1.192e-07       SINT    3.874e-07       SINTFB  4.172e-07       COST    4.172e-07
-    COSTFB  1.699e-06       SINQF   2.551e-05       SINQB   6.407e-07       SINQFB  2.980e-07       COSQF   1.639e-07       COSQB   1.714e-07
-    COSQFB  2.384e-07       DEZF    1.013e-06       DEZB    2.339e-06       DEZFB   7.749e-07       CFFTF   1.127e-06       CFFTB   6.744e-06
-    CFFTFB  2.666e-07
-    4       RFFTF   1.490e-08       RFFTB   1.490e-07       RFFTFB  5.960e-08       SINT    7.451e-09       SINTFB  0.000e+00       COST    2.980e-08
-    COSTFB  1.192e-07       SINQF   4.768e-07       SINQB   2.980e-08       SINQFB  5.960e-08       COSQF   2.608e-08       COSQB   5.960e-08
-    COSQFB  1.192e-07       DEZF    2.980e-08       DEZB    5.960e-08       DEZFB   0.000e+00       CFFTF   6.664e-08       CFFTB   5.960e-08
-    CFFTFB  6.144e-08
-    3       RFFTF   3.974e-08       RFFTB   1.192e-07       RFFTFB  3.303e-08       SINT    1.987e-08       SINTFB  1.069e-08       COST    4.967e-08
-    COSTFB  5.721e-08       SINQF   8.941e-08       SINQB   2.980e-08       SINQFB  1.259e-07       COSQF   7.451e-09       COSQB   4.967e-08
-    COSQFB  7.029e-08       DEZF    1.192e-07       DEZB    5.960e-08       DEZFB   5.960e-08       CFFTF   7.947e-08       CFFTB   8.429e-08
-    CFFTFB  9.064e-08
-    2       RFFTF   0.000e+00       RFFTB   0.000e+00       RFFTFB  0.000e+00       SINT    0.000e+00       SINTFB  0.000e+00       COST    0.000e+00
-    COSTFB  0.000e+00       SINQF   1.192e-07       SINQB   2.980e-08       SINQFB  5.960e-08       COSQF   7.451e-09       COSQB   1.490e-08
-    COSQFB  0.000e+00       DEZF    0.000e+00       DEZB    0.000e+00       DEZFB   0.000e+00       CFFTF   0.000e+00       CFFTB   5.960e-08
-    CFFTFB  5.960e-08
-    Everything looks fine.
-
-  */
-
-  return all_ok ? 0 : 1;
-}
-#endif //TESTING_FFTPACK
diff --git a/oss-internship-2020/pffft/fftpack.h b/oss-internship-2020/pffft/fftpack.h
deleted file mode 100644
index 5971b9f..0000000
--- a/oss-internship-2020/pffft/fftpack.h
+++ /dev/null
@@ -1,799 +0,0 @@
-/*
-  Interface for the f2c translation of fftpack as found on http://www.netlib.org/fftpack/
-  
-   FFTPACK license:
-
-   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
-
-   Copyright (c) 2004 the University Corporation for Atmospheric
-   Research ("UCAR"). All rights reserved. Developed by NCAR's
-   Computational and Information Systems Laboratory, UCAR,
-   www.cisl.ucar.edu.
-
-   Redistribution and use of the Software in source and binary forms,
-   with or without modification, is permitted provided that the
-   following conditions are met:
-
-   - Neither the names of NCAR's Computational and Information Systems
-   Laboratory, the University Corporation for Atmospheric Research,
-   nor the names of its sponsors or contributors may be used to
-   endorse or promote products derived from this Software without
-   specific prior written permission.  
-
-   - Redistributions of source code must retain the above copyright
-   notices, this list of conditions, and the disclaimer below.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions, and the disclaimer below in the
-   documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
-   SOFTWARE.
-
-   ChangeLog:
-   2011/10/02: this is my first release of this file.
-*/
-
-#ifndef FFTPACK_H
-#define FFTPACK_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// just define FFTPACK_DOUBLE_PRECISION if you want to build it as a double precision fft
-
-#ifndef FFTPACK_DOUBLE_PRECISION
-  typedef float fftpack_real;
-  typedef int   fftpack_int;
-#else
-  typedef double fftpack_real;
-  typedef int    fftpack_int;
-#endif
-
-  void cffti(fftpack_int n, fftpack_real *wsave);
-
-  void cfftf(fftpack_int n, fftpack_real *c, fftpack_real *wsave);
-
-  void cfftb(fftpack_int n, fftpack_real *c, fftpack_real *wsave); 
-
-  void rffti(fftpack_int n, fftpack_real *wsave);
-  void rfftf(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
-  void rfftb(fftpack_int n, fftpack_real *r, fftpack_real *wsave);
-
-  void cosqi(fftpack_int n, fftpack_real *wsave);
-  void cosqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
-  void cosqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
-
-  void costi(fftpack_int n, fftpack_real *wsave);
-  void cost(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
-
-  void sinqi(fftpack_int n, fftpack_real *wsave);
-  void sinqb(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
-  void sinqf(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
-
-  void sinti(fftpack_int n, fftpack_real *wsave);
-  void sint(fftpack_int n, fftpack_real *x, fftpack_real *wsave);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* FFTPACK_H */
-
-/*
-
-                      FFTPACK
-
-* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-
-                  version 4  april 1985
-
-     a package of fortran subprograms for the fast fourier
-      transform of periodic and other symmetric sequences
-
-                         by
-
-                  paul n swarztrauber
-
-  national center for atmospheric research  boulder,colorado 80307
-
-   which is sponsored by the national science foundation
-
-* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-
-
-this package consists of programs which perform fast fourier
-transforms for both complex and real periodic sequences and
-certain other symmetric sequences that are listed below.
-
-1.   rffti     initialize  rfftf and rfftb
-2.   rfftf     forward transform of a real periodic sequence
-3.   rfftb     backward transform of a real coefficient array
-
-4.   ezffti    initialize ezfftf and ezfftb
-5.   ezfftf    a simplified real periodic forward transform
-6.   ezfftb    a simplified real periodic backward transform
-
-7.   sinti     initialize sint
-8.   sint      sine transform of a real odd sequence
-
-9.   costi     initialize cost
-10.  cost      cosine transform of a real even sequence
-
-11.  sinqi     initialize sinqf and sinqb
-12.  sinqf     forward sine transform with odd wave numbers
-13.  sinqb     unnormalized inverse of sinqf
-
-14.  cosqi     initialize cosqf and cosqb
-15.  cosqf     forward cosine transform with odd wave numbers
-16.  cosqb     unnormalized inverse of cosqf
-
-17.  cffti     initialize cfftf and cfftb
-18.  cfftf     forward transform of a complex periodic sequence
-19.  cfftb     unnormalized inverse of cfftf
-
-
-******************************************************************
-
-subroutine rffti(n,wsave)
-
-  ****************************************************************
-
-subroutine rffti initializes the array wsave which is used in
-both rfftf and rfftb. the prime factorization of n together with
-a tabulation of the trigonometric functions are computed and
-stored in wsave.
-
-input parameter
-
-n       the length of the sequence to be transformed.
-
-output parameter
-
-wsave   a work array which must be dimensioned at least 2*n+15.
-        the same work array can be used for both rfftf and rfftb
-        as long as n remains unchanged. different wsave arrays
-        are required for different values of n. the contents of
-        wsave must not be changed between calls of rfftf or rfftb.
-
-******************************************************************
-
-subroutine rfftf(n,r,wsave)
-
-******************************************************************
-
-subroutine rfftf computes the fourier coefficients of a real
-perodic sequence (fourier analysis). the transform is defined
-below at output parameter r.
-
-input parameters
-
-n       the length of the array r to be transformed.  the method
-        is most efficient when n is a product of small primes.
-        n may change so long as different work arrays are provided
-
-r       a real array of length n which contains the sequence
-        to be transformed
-
-wsave   a work array which must be dimensioned at least 2*n+15.
-        in the program that calls rfftf. the wsave array must be
-        initialized by calling subroutine rffti(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-        the same wsave array can be used by rfftf and rfftb.
-
-
-output parameters
-
-r       r(1) = the sum from i=1 to i=n of r(i)
-
-        if n is even set l =n/2   , if n is odd set l = (n+1)/2
-
-          then for k = 2,...,l
-
-             r(2*k-2) = the sum from i = 1 to i = n of
-
-                  r(i)*cos((k-1)*(i-1)*2*pi/n)
-
-             r(2*k-1) = the sum from i = 1 to i = n of
-
-                 -r(i)*sin((k-1)*(i-1)*2*pi/n)
-
-        if n is even
-
-             r(n) = the sum from i = 1 to i = n of
-
-                  (-1)**(i-1)*r(i)
-
- *****  note
-             this transform is unnormalized since a call of rfftf
-             followed by a call of rfftb will multiply the input
-             sequence by n.
-
-wsave   contains results which must not be destroyed between
-        calls of rfftf or rfftb.
-
-
-******************************************************************
-
-subroutine rfftb(n,r,wsave)
-
-******************************************************************
-
-subroutine rfftb computes the real perodic sequence from its
-fourier coefficients (fourier synthesis). the transform is defined
-below at output parameter r.
-
-input parameters
-
-n       the length of the array r to be transformed.  the method
-        is most efficient when n is a product of small primes.
-        n may change so long as different work arrays are provided
-
-r       a real array of length n which contains the sequence
-        to be transformed
-
-wsave   a work array which must be dimensioned at least 2*n+15.
-        in the program that calls rfftb. the wsave array must be
-        initialized by calling subroutine rffti(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-        the same wsave array can be used by rfftf and rfftb.
-
-
-output parameters
-
-r       for n even and for i = 1,...,n
-
-             r(i) = r(1)+(-1)**(i-1)*r(n)
-
-                  plus the sum from k=2 to k=n/2 of
-
-                   2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
-
-                  -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
-
-        for n odd and for i = 1,...,n
-
-             r(i) = r(1) plus the sum from k=2 to k=(n+1)/2 of
-
-                  2.*r(2*k-2)*cos((k-1)*(i-1)*2*pi/n)
-
-                 -2.*r(2*k-1)*sin((k-1)*(i-1)*2*pi/n)
-
- *****  note
-             this transform is unnormalized since a call of rfftf
-             followed by a call of rfftb will multiply the input
-             sequence by n.
-
-wsave   contains results which must not be destroyed between
-        calls of rfftb or rfftf.
-
-******************************************************************
-
-subroutine sinti(n,wsave)
-
-******************************************************************
-
-subroutine sinti initializes the array wsave which is used in
-subroutine sint. the prime factorization of n together with
-a tabulation of the trigonometric functions are computed and
-stored in wsave.
-
-input parameter
-
-n       the length of the sequence to be transformed.  the method
-        is most efficient when n+1 is a product of small primes.
-
-output parameter
-
-wsave   a work array with at least int(2.5*n+15) locations.
-        different wsave arrays are required for different values
-        of n. the contents of wsave must not be changed between
-        calls of sint.
-
-******************************************************************
-
-subroutine sint(n,x,wsave)
-
-******************************************************************
-
-subroutine sint computes the discrete fourier sine transform
-of an odd sequence x(i). the transform is defined below at
-output parameter x.
-
-sint is the unnormalized inverse of itself since a call of sint
-followed by another call of sint will multiply the input sequence
-x by 2*(n+1).
-
-the array wsave which is used by subroutine sint must be
-initialized by calling subroutine sinti(n,wsave).
-
-input parameters
-
-n       the length of the sequence to be transformed.  the method
-        is most efficient when n+1 is the product of small primes.
-
-x       an array which contains the sequence to be transformed
-
-
-wsave   a work array with dimension at least int(2.5*n+15)
-        in the program that calls sint. the wsave array must be
-        initialized by calling subroutine sinti(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-
-output parameters
-
-x       for i=1,...,n
-
-             x(i)= the sum from k=1 to k=n
-
-                  2*x(k)*sin(k*i*pi/(n+1))
-
-             a call of sint followed by another call of
-             sint will multiply the sequence x by 2*(n+1).
-             hence sint is the unnormalized inverse
-             of itself.
-
-wsave   contains initialization calculations which must not be
-        destroyed between calls of sint.
-
-******************************************************************
-
-subroutine costi(n,wsave)
-
-******************************************************************
-
-subroutine costi initializes the array wsave which is used in
-subroutine cost. the prime factorization of n together with
-a tabulation of the trigonometric functions are computed and
-stored in wsave.
-
-input parameter
-
-n       the length of the sequence to be transformed.  the method
-        is most efficient when n-1 is a product of small primes.
-
-output parameter
-
-wsave   a work array which must be dimensioned at least 3*n+15.
-        different wsave arrays are required for different values
-        of n. the contents of wsave must not be changed between
-        calls of cost.
-
-******************************************************************
-
-subroutine cost(n,x,wsave)
-
-******************************************************************
-
-subroutine cost computes the discrete fourier cosine transform
-of an even sequence x(i). the transform is defined below at output
-parameter x.
-
-cost is the unnormalized inverse of itself since a call of cost
-followed by another call of cost will multiply the input sequence
-x by 2*(n-1). the transform is defined below at output parameter x
-
-the array wsave which is used by subroutine cost must be
-initialized by calling subroutine costi(n,wsave).
-
-input parameters
-
-n       the length of the sequence x. n must be greater than 1.
-        the method is most efficient when n-1 is a product of
-        small primes.
-
-x       an array which contains the sequence to be transformed
-
-wsave   a work array which must be dimensioned at least 3*n+15
-        in the program that calls cost. the wsave array must be
-        initialized by calling subroutine costi(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-
-output parameters
-
-x       for i=1,...,n
-
-            x(i) = x(1)+(-1)**(i-1)*x(n)
-
-             + the sum from k=2 to k=n-1
-
-                 2*x(k)*cos((k-1)*(i-1)*pi/(n-1))
-
-             a call of cost followed by another call of
-             cost will multiply the sequence x by 2*(n-1)
-             hence cost is the unnormalized inverse
-             of itself.
-
-wsave   contains initialization calculations which must not be
-        destroyed between calls of cost.
-
-******************************************************************
-
-subroutine sinqi(n,wsave)
-
-******************************************************************
-
-subroutine sinqi initializes the array wsave which is used in
-both sinqf and sinqb. the prime factorization of n together with
-a tabulation of the trigonometric functions are computed and
-stored in wsave.
-
-input parameter
-
-n       the length of the sequence to be transformed. the method
-        is most efficient when n is a product of small primes.
-
-output parameter
-
-wsave   a work array which must be dimensioned at least 3*n+15.
-        the same work array can be used for both sinqf and sinqb
-        as long as n remains unchanged. different wsave arrays
-        are required for different values of n. the contents of
-        wsave must not be changed between calls of sinqf or sinqb.
-
-******************************************************************
-
-subroutine sinqf(n,x,wsave)
-
-******************************************************************
-
-subroutine sinqf computes the fast fourier transform of quarter
-wave data. that is , sinqf computes the coefficients in a sine
-series representation with only odd wave numbers. the transform
-is defined below at output parameter x.
-
-sinqb is the unnormalized inverse of sinqf since a call of sinqf
-followed by a call of sinqb will multiply the input sequence x
-by 4*n.
-
-the array wsave which is used by subroutine sinqf must be
-initialized by calling subroutine sinqi(n,wsave).
-
-
-input parameters
-
-n       the length of the array x to be transformed.  the method
-        is most efficient when n is a product of small primes.
-
-x       an array which contains the sequence to be transformed
-
-wsave   a work array which must be dimensioned at least 3*n+15.
-        in the program that calls sinqf. the wsave array must be
-        initialized by calling subroutine sinqi(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-
-output parameters
-
-x       for i=1,...,n
-
-             x(i) = (-1)**(i-1)*x(n)
-
-                + the sum from k=1 to k=n-1 of
-
-                2*x(k)*sin((2*i-1)*k*pi/(2*n))
-
-             a call of sinqf followed by a call of
-             sinqb will multiply the sequence x by 4*n.
-             therefore sinqb is the unnormalized inverse
-             of sinqf.
-
-wsave   contains initialization calculations which must not
-        be destroyed between calls of sinqf or sinqb.
-
-******************************************************************
-
-subroutine sinqb(n,x,wsave)
-
-******************************************************************
-
-subroutine sinqb computes the fast fourier transform of quarter
-wave data. that is , sinqb computes a sequence from its
-representation in terms of a sine series with odd wave numbers.
-the transform is defined below at output parameter x.
-
-sinqf is the unnormalized inverse of sinqb since a call of sinqb
-followed by a call of sinqf will multiply the input sequence x
-by 4*n.
-
-the array wsave which is used by subroutine sinqb must be
-initialized by calling subroutine sinqi(n,wsave).
-
-
-input parameters
-
-n       the length of the array x to be transformed.  the method
-        is most efficient when n is a product of small primes.
-
-x       an array which contains the sequence to be transformed
-
-wsave   a work array which must be dimensioned at least 3*n+15.
-        in the program that calls sinqb. the wsave array must be
-        initialized by calling subroutine sinqi(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-
-output parameters
-
-x       for i=1,...,n
-
-             x(i)= the sum from k=1 to k=n of
-
-               4*x(k)*sin((2k-1)*i*pi/(2*n))
-
-             a call of sinqb followed by a call of
-             sinqf will multiply the sequence x by 4*n.
-             therefore sinqf is the unnormalized inverse
-             of sinqb.
-
-wsave   contains initialization calculations which must not
-        be destroyed between calls of sinqb or sinqf.
-
-******************************************************************
-
-subroutine cosqi(n,wsave)
-
-******************************************************************
-
-subroutine cosqi initializes the array wsave which is used in
-both cosqf and cosqb. the prime factorization of n together with
-a tabulation of the trigonometric functions are computed and
-stored in wsave.
-
-input parameter
-
-n       the length of the array to be transformed.  the method
-        is most efficient when n is a product of small primes.
-
-output parameter
-
-wsave   a work array which must be dimensioned at least 3*n+15.
-        the same work array can be used for both cosqf and cosqb
-        as long as n remains unchanged. different wsave arrays
-        are required for different values of n. the contents of
-        wsave must not be changed between calls of cosqf or cosqb.
-
-******************************************************************
-
-subroutine cosqf(n,x,wsave)
-
-******************************************************************
-
-subroutine cosqf computes the fast fourier transform of quarter
-wave data. that is , cosqf computes the coefficients in a cosine
-series representation with only odd wave numbers. the transform
-is defined below at output parameter x
-
-cosqf is the unnormalized inverse of cosqb since a call of cosqf
-followed by a call of cosqb will multiply the input sequence x
-by 4*n.
-
-the array wsave which is used by subroutine cosqf must be
-initialized by calling subroutine cosqi(n,wsave).
-
-
-input parameters
-
-n       the length of the array x to be transformed.  the method
-        is most efficient when n is a product of small primes.
-
-x       an array which contains the sequence to be transformed
-
-wsave   a work array which must be dimensioned at least 3*n+15
-        in the program that calls cosqf. the wsave array must be
-        initialized by calling subroutine cosqi(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-
-output parameters
-
-x       for i=1,...,n
-
-             x(i) = x(1) plus the sum from k=2 to k=n of
-
-                2*x(k)*cos((2*i-1)*(k-1)*pi/(2*n))
-
-             a call of cosqf followed by a call of
-             cosqb will multiply the sequence x by 4*n.
-             therefore cosqb is the unnormalized inverse
-             of cosqf.
-
-wsave   contains initialization calculations which must not
-        be destroyed between calls of cosqf or cosqb.
-
-******************************************************************
-
-subroutine cosqb(n,x,wsave)
-
-******************************************************************
-
-subroutine cosqb computes the fast fourier transform of quarter
-wave data. that is , cosqb computes a sequence from its
-representation in terms of a cosine series with odd wave numbers.
-the transform is defined below at output parameter x.
-
-cosqb is the unnormalized inverse of cosqf since a call of cosqb
-followed by a call of cosqf will multiply the input sequence x
-by 4*n.
-
-the array wsave which is used by subroutine cosqb must be
-initialized by calling subroutine cosqi(n,wsave).
-
-
-input parameters
-
-n       the length of the array x to be transformed.  the method
-        is most efficient when n is a product of small primes.
-
-x       an array which contains the sequence to be transformed
-
-wsave   a work array that must be dimensioned at least 3*n+15
-        in the program that calls cosqb. the wsave array must be
-        initialized by calling subroutine cosqi(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-
-output parameters
-
-x       for i=1,...,n
-
-             x(i)= the sum from k=1 to k=n of
-
-               4*x(k)*cos((2*k-1)*(i-1)*pi/(2*n))
-
-             a call of cosqb followed by a call of
-             cosqf will multiply the sequence x by 4*n.
-             therefore cosqf is the unnormalized inverse
-             of cosqb.
-
-wsave   contains initialization calculations which must not
-        be destroyed between calls of cosqb or cosqf.
-
-******************************************************************
-
-subroutine cffti(n,wsave)
-
-******************************************************************
-
-subroutine cffti initializes the array wsave which is used in
-both cfftf and cfftb. the prime factorization of n together with
-a tabulation of the trigonometric functions are computed and
-stored in wsave.
-
-input parameter
-
-n       the length of the sequence to be transformed
-
-output parameter
-
-wsave   a work array which must be dimensioned at least 4*n+15
-        the same work array can be used for both cfftf and cfftb
-        as long as n remains unchanged. different wsave arrays
-        are required for different values of n. the contents of
-        wsave must not be changed between calls of cfftf or cfftb.
-
-******************************************************************
-
-subroutine cfftf(n,c,wsave)
-
-******************************************************************
-
-subroutine cfftf computes the forward complex discrete fourier
-transform (the fourier analysis). equivalently , cfftf computes
-the fourier coefficients of a complex periodic sequence.
-the transform is defined below at output parameter c.
-
-the transform is not normalized. to obtain a normalized transform
-the output must be divided by n. otherwise a call of cfftf
-followed by a call of cfftb will multiply the sequence by n.
-
-the array wsave which is used by subroutine cfftf must be
-initialized by calling subroutine cffti(n,wsave).
-
-input parameters
-
-
-n      the length of the complex sequence c. the method is
-       more efficient when n is the product of small primes. n
-
-c      a complex array of length n which contains the sequence
-
-wsave   a real work array which must be dimensioned at least 4n+15
-        in the program that calls cfftf. the wsave array must be
-        initialized by calling subroutine cffti(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-        the same wsave array can be used by cfftf and cfftb.
-
-output parameters
-
-c      for j=1,...,n
-
-           c(j)=the sum from k=1,...,n of
-
-                 c(k)*exp(-i*(j-1)*(k-1)*2*pi/n)
-
-                       where i=sqrt(-1)
-
-wsave   contains initialization calculations which must not be
-        destroyed between calls of subroutine cfftf or cfftb
-
-******************************************************************
-
-subroutine cfftb(n,c,wsave)
-
-******************************************************************
-
-subroutine cfftb computes the backward complex discrete fourier
-transform (the fourier synthesis). equivalently , cfftb computes
-a complex periodic sequence from its fourier coefficients.
-the transform is defined below at output parameter c.
-
-a call of cfftf followed by a call of cfftb will multiply the
-sequence by n.
-
-the array wsave which is used by subroutine cfftb must be
-initialized by calling subroutine cffti(n,wsave).
-
-input parameters
-
-
-n      the length of the complex sequence c. the method is
-       more efficient when n is the product of small primes.
-
-c      a complex array of length n which contains the sequence
-
-wsave   a real work array which must be dimensioned at least 4n+15
-        in the program that calls cfftb. the wsave array must be
-        initialized by calling subroutine cffti(n,wsave) and a
-        different wsave array must be used for each different
-        value of n. this initialization does not have to be
-        repeated so long as n remains unchanged thus subsequent
-        transforms can be obtained faster than the first.
-        the same wsave array can be used by cfftf and cfftb.
-
-output parameters
-
-c      for j=1,...,n
-
-           c(j)=the sum from k=1,...,n of
-
-                 c(k)*exp(i*(j-1)*(k-1)*2*pi/n)
-
-                       where i=sqrt(-1)
-
-wsave   contains initialization calculations which must not be
-        destroyed between calls of subroutine cfftf or cfftb
-
-*/
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index 8b7e6d1..10ea802 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -23,7 +23,6 @@
 #include <cmath>
 #include <cstdio>
 
-#include "fftpack.h"
 #include "pffft_sapi.sapi.h"
 #include "sandboxed_api/util/flag.h"
 #include "sandboxed_api/vars.h"
diff --git a/oss-internship-2020/pffft/pffft.c b/oss-internship-2020/pffft/pffft.c
deleted file mode 100644
index 1686e15..0000000
--- a/oss-internship-2020/pffft/pffft.c
+++ /dev/null
@@ -1,1881 +0,0 @@
-/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
-
-   Based on original fortran 77 code from FFTPACKv4 from NETLIB
-   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
-   of NCAR, in 1985.
-
-   As confirmed by the NCAR fftpack software curators, the following
-   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
-   released under the same terms.
-
-   FFTPACK license:
-
-   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
-
-   Copyright (c) 2004 the University Corporation for Atmospheric
-   Research ("UCAR"). All rights reserved. Developed by NCAR's
-   Computational and Information Systems Laboratory, UCAR,
-   www.cisl.ucar.edu.
-
-   Redistribution and use of the Software in source and binary forms,
-   with or without modification, is permitted provided that the
-   following conditions are met:
-
-   - Neither the names of NCAR's Computational and Information Systems
-   Laboratory, the University Corporation for Atmospheric Research,
-   nor the names of its sponsors or contributors may be used to
-   endorse or promote products derived from this Software without
-   specific prior written permission.  
-
-   - Redistributions of source code must retain the above copyright
-   notices, this list of conditions, and the disclaimer below.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions, and the disclaimer below in the
-   documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
-   SOFTWARE.
-
-
-   PFFFT : a Pretty Fast FFT.
-
-   This file is largerly based on the original FFTPACK implementation, modified in
-   order to take advantage of SIMD instructions of modern CPUs.
-*/
-
-/*
-  ChangeLog: 
-  - 2011/10/02, version 1: This is the very first release of this file.
-*/
-
-#include "pffft.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <assert.h>
-
-/* detect compiler flavour */
-#if defined(_MSC_VER)
-#  define COMPILER_MSVC
-#elif defined(__GNUC__)
-#  define COMPILER_GCC
-#endif
-
-#if defined(COMPILER_GCC)
-#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
-#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
-#  define RESTRICT __restrict
-#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
-#elif defined(COMPILER_MSVC)
-#  define ALWAYS_INLINE(return_type) __forceinline return_type
-#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
-#  define RESTRICT __restrict
-#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
-#endif
-
-
-/* 
-   vector support macros: the rest of the code is independant of
-   SSE/Altivec/NEON -- adding support for other platforms with 4-element
-   vectors should be limited to these macros 
-*/
-
-
-// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
-//#define PFFFT_SIMD_DISABLE
-
-/*
-   Altivec support macros 
-*/
-#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
-typedef vector float v4sf;
-#  define SIMD_SZ 4
-#  define VZERO() ((vector float) vec_splat_u8(0))
-#  define VMUL(a,b) vec_madd(a,b, VZERO())
-#  define VADD(a,b) vec_add(a,b)
-#  define VMADD(a,b,c) vec_madd(a,b,c)
-#  define VSUB(a,b) vec_sub(a,b)
-inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
-#  define LD_PS1(p) ld_ps1(&p)
-#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
-#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
-    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
-    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
-    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
-  }
-#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
-    v4sf y0 = vec_mergeh(x0, x2);               \
-    v4sf y1 = vec_mergel(x0, x2);               \
-    v4sf y2 = vec_mergeh(x1, x3);               \
-    v4sf y3 = vec_mergel(x1, x3);               \
-    x0 = vec_mergeh(y0, y2);                    \
-    x1 = vec_mergel(y0, y2);                    \
-    x2 = vec_mergeh(y1, y3);                    \
-    x3 = vec_mergel(y1, y3);                    \
-  }
-#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
-#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
-
-/*
-  SSE1 support macros
-*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
-
-#include <xmmintrin.h>
-typedef __m128 v4sf;
-#  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
-#  define VZERO() _mm_setzero_ps()
-#  define VMUL(a,b) _mm_mul_ps(a,b)
-#  define VADD(a,b) _mm_add_ps(a,b)
-#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
-#  define VSUB(a,b) _mm_sub_ps(a,b)
-#  define LD_PS1(p) _mm_set1_ps(p)
-#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
-#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
-#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
-#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
-
-/*
-  ARM NEON support macros
-*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
-#  include <arm_neon.h>
-typedef float32x4_t v4sf;
-#  define SIMD_SZ 4
-#  define VZERO() vdupq_n_f32(0)
-#  define VMUL(a,b) vmulq_f32(a,b)
-#  define VADD(a,b) vaddq_f32(a,b)
-#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
-#  define VSUB(a,b) vsubq_f32(a,b)
-#  define LD_PS1(p) vld1q_dup_f32(&(p))
-#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
-    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
-    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
-    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
-    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
-    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
-  }
-// marginally faster version
-//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
-#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
-#else
-#  if !defined(PFFFT_SIMD_DISABLE)
-#    warning "building with simd disabled !\n";
-#    define PFFFT_SIMD_DISABLE // fallback to scalar code
-#  endif
-#endif
-
-// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
-#ifdef PFFFT_SIMD_DISABLE
-typedef float v4sf;
-#  define SIMD_SZ 1
-#  define VZERO() 0.f
-#  define VMUL(a,b) ((a)*(b))
-#  define VADD(a,b) ((a)+(b))
-#  define VMADD(a,b,c) ((a)*(b)+(c))
-#  define VSUB(a,b) ((a)-(b))
-#  define LD_PS1(p) (p)
-#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
-#endif
-
-// shortcuts for complex multiplcations
-#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
-#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
-#ifndef SVMUL
-// multiply a scalar with a vector
-#define SVMUL(f,v) VMUL(LD_PS1(f),v)
-#endif
-
-#if !defined(PFFFT_SIMD_DISABLE)
-typedef union v4sf_union {
-  v4sf  v;
-  float f[4];
-} v4sf_union;
-
-#include <string.h>
-
-#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
-
-/* detect bugs with the vector support macros */
-void validate_pffft_simd() {
-  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
-  v4sf_union a0, a1, a2, a3, t, u; 
-  memcpy(a0.f, f, 4*sizeof(float));
-  memcpy(a1.f, f+4, 4*sizeof(float));
-  memcpy(a2.f, f+8, 4*sizeof(float));
-  memcpy(a3.f, f+12, 4*sizeof(float));
-
-  t = a0; u = a1; t.v = VZERO();
-  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
-  t.v = VADD(a1.v, a2.v);
-  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
-  t.v = VMUL(a1.v, a2.v);
-  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
-  t.v = VMADD(a1.v, a2.v,a0.v);
-  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
-
-  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
-  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
-  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
-  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
-  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
-  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
-
-  t.v=LD_PS1(f[15]);
-  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
-  assertv4(t, 15, 15, 15, 15);
-  t.v = VSWAPHL(a1.v, a2.v);
-  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
-  assertv4(t, 8, 9, 6, 7);
-  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
-  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", 
-         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], 
-         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); 
-  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
-}
-#endif //!PFFFT_SIMD_DISABLE
-
-/* SSE and co like 16-bytes aligned pointers */
-#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
-void *pffft_aligned_malloc(size_t nb_bytes) {
-  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
-  if (!p0) return (void *) 0;
-  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
-  *((void **) p - 1) = p0;
-  return p;
-}
-
-void pffft_aligned_free(void *p) {
-  if (p) free(*((void **) p - 1));
-}
-
-int pffft_simd_size() { return SIMD_SZ; }
-
-/*
-  passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
-*/
-static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
-  int k, i;
-  int l1ido = l1*ido;
-  if (ido <= 2) {
-    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
-      ch[0]         = VADD(cc[0], cc[ido+0]);
-      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
-      ch[1]         = VADD(cc[1], cc[ido+1]);
-      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
-    }
-  } else {
-    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
-      for (i=0; i<ido-1; i+=2) {
-        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
-        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
-        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
-        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
-        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
-        VCPLXMUL(tr2, ti2, wr, wi);
-        ch[i+l1ido]   = tr2;
-        ch[i+l1ido+1] = ti2;
-      }
-    }
-  }
-}
-
-/*
-  passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
-*/
-static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
-                                    const float *wa1, const float *wa2, float fsign) {
-  static const float taur = -0.5f;
-  float taui = 0.866025403784439f*fsign;
-  int i, k;
-  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
-  int l1ido = l1*ido;
-  float wr1, wi1, wr2, wi2;
-  assert(ido > 2);
-  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
-    for (i=0; i<ido-1; i+=2) {
-      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
-      cr2 = VADD(cc[i], SVMUL(taur,tr2));
-      ch[i]    = VADD(cc[i], tr2);
-      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
-      ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
-      ch[i+1]  = VADD(cc[i+1], ti2);
-      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
-      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
-      dr2 = VSUB(cr2, ci3);
-      dr3 = VADD(cr2, ci3);
-      di2 = VADD(ci2, cr3);
-      di3 = VSUB(ci2, cr3);
-      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; 
-      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
-      ch[i+l1ido] = dr2; 
-      ch[i+l1ido + 1] = di2;
-      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
-      ch[i+2*l1ido] = dr3;
-      ch[i+2*l1ido+1] = di3;
-    }
-  }
-} /* passf3 */
-
-static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
-                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
-  /* isign == -1 for forward transform and +1 for backward transform */
-
-  int i, k;
-  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-  int l1ido = l1*ido;
-  if (ido == 2) {
-    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
-      tr1 = VSUB(cc[0], cc[2*ido + 0]);
-      tr2 = VADD(cc[0], cc[2*ido + 0]);
-      ti1 = VSUB(cc[1], cc[2*ido + 1]);
-      ti2 = VADD(cc[1], cc[2*ido + 1]);
-      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
-      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
-      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
-      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
-
-      ch[0*l1ido + 0] = VADD(tr2, tr3);
-      ch[0*l1ido + 1] = VADD(ti2, ti3);
-      ch[1*l1ido + 0] = VADD(tr1, tr4);
-      ch[1*l1ido + 1] = VADD(ti1, ti4);
-      ch[2*l1ido + 0] = VSUB(tr2, tr3);
-      ch[2*l1ido + 1] = VSUB(ti2, ti3);        
-      ch[3*l1ido + 0] = VSUB(tr1, tr4);
-      ch[3*l1ido + 1] = VSUB(ti1, ti4);
-    }
-  } else {
-    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
-      for (i=0; i<ido-1; i+=2) {
-        float wr1, wi1, wr2, wi2, wr3, wi3;
-        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
-        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
-        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
-        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
-        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
-        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
-        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
-        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
-
-        ch[i] = VADD(tr2, tr3);
-        cr3    = VSUB(tr2, tr3);
-        ch[i + 1] = VADD(ti2, ti3);
-        ci3 = VSUB(ti2, ti3);
-
-        cr2 = VADD(tr1, tr4);
-        cr4 = VSUB(tr1, tr4);
-        ci2 = VADD(ti1, ti4);
-        ci4 = VSUB(ti1, ti4);
-        wr1=wa1[i], wi1=fsign*wa1[i+1];
-        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
-        wr2=wa2[i], wi2=fsign*wa2[i+1]; 
-        ch[i + l1ido] = cr2;
-        ch[i + l1ido + 1] = ci2;
-
-        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
-        wr3=wa3[i], wi3=fsign*wa3[i+1]; 
-        ch[i + 2*l1ido] = cr3;
-        ch[i + 2*l1ido + 1] = ci3;
-
-        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
-        ch[i + 3*l1ido] = cr4;
-        ch[i + 3*l1ido + 1] = ci4;
-      }
-    }
-  }
-} /* passf4 */
-
-/*
-  passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5
-*/
-static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
-                                    const float *wa1, const float *wa2, 
-                                    const float *wa3, const float *wa4, float fsign) {  
-  static const float tr11 = .309016994374947f;
-  const float ti11 = .951056516295154f*fsign;
-  static const float tr12 = -.809016994374947f;
-  const float ti12 = .587785252292473f*fsign;
-
-  /* Local variables */
-  int i, k;
-  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-
-  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
-
-#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
-#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
-
-  assert(ido > 2);
-  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
-    for (i = 0; i < ido-1; i += 2) {
-      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
-      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
-      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
-      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
-      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
-      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
-      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
-      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
-      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
-      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
-      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
-      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
-      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
-      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
-      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
-      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
-      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
-      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
-      dr3 = VSUB(cr3, ci4);
-      dr4 = VADD(cr3, ci4);
-      di3 = VADD(ci3, cr4);
-      di4 = VSUB(ci3, cr4);
-      dr5 = VADD(cr2, ci5);
-      dr2 = VSUB(cr2, ci5);
-      di5 = VSUB(ci2, cr5);
-      di2 = VADD(ci2, cr5);
-      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; 
-      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; 
-      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
-      ch_ref(i - 1, 2) = dr2;
-      ch_ref(i, 2)     = di2;
-      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
-      ch_ref(i - 1, 3) = dr3;
-      ch_ref(i, 3)     = di3;
-      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
-      ch_ref(i - 1, 4) = dr4;
-      ch_ref(i, 4)     = di4;
-      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
-      ch_ref(i - 1, 5) = dr5;
-      ch_ref(i, 5)     = di5;
-    }
-  }
-#undef ch_ref
-#undef cc_ref
-}
-
-static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
-  static const float minus_one = -1.f;
-  int i, k, l1ido = l1*ido;
-  for (k=0; k < l1ido; k += ido) {
-    v4sf a = cc[k], b = cc[k + l1ido];
-    ch[2*k] = VADD(a, b);
-    ch[2*(k+ido)-1] = VSUB(a, b);
-  }
-  if (ido < 2) return;
-  if (ido != 2) {
-    for (k=0; k < l1ido; k += ido) {
-      for (i=2; i<ido; i+=2) {
-        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
-        v4sf br = cc[i - 1 + k], bi = cc[i + k];
-        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); 
-        ch[i + 2*k] = VADD(bi, ti2);
-        ch[2*(k+ido) - i] = VSUB(ti2, bi);
-        ch[i - 1 + 2*k] = VADD(br, tr2);
-        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
-      }
-    }
-    if (ido % 2 == 1) return;
-  }
-  for (k=0; k < l1ido; k += ido) {
-    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
-    ch[2*k + ido-1] = cc[k + ido-1];
-  }
-} /* radf2 */
-
-
-static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
-  static const float minus_two=-2;
-  int i, k, l1ido = l1*ido;
-  v4sf a,b,c,d, tr2, ti2;
-  for (k=0; k < l1ido; k += ido) {
-    a = cc[2*k]; b = cc[2*(k+ido) - 1];
-    ch[k] = VADD(a, b);
-    ch[k + l1ido] =VSUB(a, b);
-  }
-  if (ido < 2) return;
-  if (ido != 2) {
-    for (k = 0; k < l1ido; k += ido) {
-      for (i = 2; i < ido; i += 2) {
-        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
-        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
-        ch[i-1 + k] = VADD(a, b);
-        tr2 = VSUB(a, b);
-        ch[i+0 + k] = VSUB(c, d);
-        ti2 = VADD(c, d);
-        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
-        ch[i-1 + k + l1ido] = tr2;
-        ch[i+0 + k + l1ido] = ti2;
-      }
-    }
-    if (ido % 2 == 1) return;
-  }
-  for (k = 0; k < l1ido; k += ido) {
-    a = cc[2*k + ido-1]; b = cc[2*k + ido];
-    ch[k + ido-1] = VADD(a,a);
-    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
-  }
-} /* radb2 */
-
-static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
-                     const float *wa1, const float *wa2) {
-  static const float taur = -0.5f;
-  static const float taui = 0.866025403784439f;
-  int i, k, ic;
-  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
-  for (k=0; k<l1; k++) {
-    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
-    ch[3*k*ido] = VADD(cc[k*ido], cr2);
-    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
-    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
-  }
-  if (ido == 1) return;
-  for (k=0; k<l1; k++) {
-    for (i=2; i<ido; i+=2) {
-      ic = ido - i;
-      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
-      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
-      VCPLXMULCONJ(dr2, di2, wr1, wi1);
-
-      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
-      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
-      VCPLXMULCONJ(dr3, di3, wr2, wi2);
-        
-      cr2 = VADD(dr2, dr3);
-      ci2 = VADD(di2, di3);
-      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
-      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
-      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
-      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
-      tr3 = SVMUL(taui, VSUB(di2, di3));
-      ti3 = SVMUL(taui, VSUB(dr3, dr2));
-      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
-      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
-      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
-      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
-    }
-  }
-} /* radf3 */
-
-
-static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
-                     const float *wa1, const float *wa2)
-{
-  static const float taur = -0.5f;
-  static const float taui = 0.866025403784439f;
-  static const float taui_2 = 0.866025403784439f*2;
-  int i, k, ic;
-  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
-  for (k=0; k<l1; k++) {
-    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
-    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
-    ch[k*ido] = VADD(cc[3*k*ido], tr2);
-    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
-    ch[(k + l1)*ido] = VSUB(cr2, ci3);
-    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
-  }
-  if (ido == 1) return;
-  for (k=0; k<l1; k++) {
-    for (i=2; i<ido; i+=2) {
-      ic = ido - i;
-      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
-      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
-      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
-      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
-      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
-      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
-      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
-      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
-      dr2 = VSUB(cr2, ci3);
-      dr3 = VADD(cr2, ci3);
-      di2 = VADD(ci2, cr3);
-      di3 = VSUB(ci2, cr3);
-      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
-      ch[i - 1 + (k + l1)*ido] = dr2;
-      ch[i + (k + l1)*ido] = di2;
-      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
-      ch[i - 1 + (k + 2*l1)*ido] = dr3;
-      ch[i + (k + 2*l1)*ido] = di3;
-    }
-  }
-} /* radb3 */
-
-static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
-                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
-{
-  static const float minus_hsqt2 = (float)-0.7071067811865475;
-  int i, k, l1ido = l1*ido;
-  {
-    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido; 
-    v4sf * RESTRICT ch_ = ch;
-    while (cc < cc_end) {
-      // this loop represents between 25% and 40% of total radf4_ps cost !
-      v4sf a0 = cc[0], a1 = cc[l1ido];
-      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
-      v4sf tr1 = VADD(a1, a3);
-      v4sf tr2 = VADD(a0, a2);
-      ch[2*ido-1] = VSUB(a0, a2);
-      ch[2*ido  ] = VSUB(a3, a1);
-      ch[0      ] = VADD(tr1, tr2);
-      ch[4*ido-1] = VSUB(tr2, tr1);
-      cc += ido; ch += 4*ido;
-    }
-    cc = cc_; ch = ch_;
-  }
-  if (ido < 2) return;
-  if (ido != 2) {
-    for (k = 0; k < l1ido; k += ido) {
-      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
-      for (i=2; i<ido; i += 2, pc += 2) {
-        int ic = ido - i;
-        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
-        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
-
-        cr2 = pc[1*l1ido+0];
-        ci2 = pc[1*l1ido+1];
-        wr=LD_PS1(wa1[i - 2]);
-        wi=LD_PS1(wa1[i - 1]);
-        VCPLXMULCONJ(cr2,ci2,wr,wi);
-
-        cr3 = pc[2*l1ido+0];
-        ci3 = pc[2*l1ido+1];
-        wr = LD_PS1(wa2[i-2]); 
-        wi = LD_PS1(wa2[i-1]);
-        VCPLXMULCONJ(cr3, ci3, wr, wi);
-
-        cr4 = pc[3*l1ido];
-        ci4 = pc[3*l1ido+1];
-        wr = LD_PS1(wa3[i-2]); 
-        wi = LD_PS1(wa3[i-1]);
-        VCPLXMULCONJ(cr4, ci4, wr, wi);
-
-        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
-
-        tr1 = VADD(cr2,cr4);
-        tr4 = VSUB(cr4,cr2); 
-        tr2 = VADD(pc[0],cr3);
-        tr3 = VSUB(pc[0],cr3);
-        ch[i - 1 + 4*k] = VADD(tr1,tr2);
-        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
-        ti1 = VADD(ci2,ci4);
-        ti4 = VSUB(ci2,ci4);
-        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
-        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
-        ti2 = VADD(pc[1],ci3);
-        ti3 = VSUB(pc[1],ci3);
-        ch[i + 4*k] = VADD(ti1, ti2);
-        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
-        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
-        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
-      }
-    }
-    if (ido % 2 == 1) return;
-  }
-  for (k=0; k<l1ido; k += ido) {
-    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
-    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
-    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
-    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
-    ch[ido-1 + 4*k] = VADD(tr1, c);
-    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
-    ch[4*k + 1*ido] = VSUB(ti1, d); 
-    ch[4*k + 3*ido] = VADD(ti1, d); 
-  }
-} /* radf4 */
-
-
-static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
-                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
-{
-  static const float minus_sqrt2 = (float)-1.414213562373095;
-  static const float two = 2.f;
-  int i, k, l1ido = l1*ido;
-  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
-  {
-    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido; 
-    v4sf *ch_ = ch;
-    while (ch < ch_end) {
-      v4sf a = cc[0], b = cc[4*ido-1];
-      v4sf c = cc[2*ido], d = cc[2*ido-1];
-      tr3 = SVMUL(two,d);
-      tr2 = VADD(a,b);
-      tr1 = VSUB(a,b);
-      tr4 = SVMUL(two,c);
-      ch[0*l1ido] = VADD(tr2, tr3);
-      ch[2*l1ido] = VSUB(tr2, tr3);
-      ch[1*l1ido] = VSUB(tr1, tr4);
-      ch[3*l1ido] = VADD(tr1, tr4);
-      
-      cc += 4*ido; ch += ido;
-    }
-    cc = cc_; ch = ch_;
-  }
-  if (ido < 2) return;
-  if (ido != 2) {
-    for (k = 0; k < l1ido; k += ido) {
-      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
-      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
-      for (i = 2; i < ido; i += 2) {
-
-        tr1 = VSUB(pc[i], pc[4*ido - i]);
-        tr2 = VADD(pc[i], pc[4*ido - i]);
-        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
-        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
-        ph[0] = VADD(tr2, tr3);
-        cr3 = VSUB(tr2, tr3);
-
-        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
-        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
-        cr2 = VSUB(tr1, tr4);
-        cr4 = VADD(tr1, tr4);
-
-        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
-        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
-
-        ph[1] = VADD(ti2, ti3); ph += l1ido;
-        ci3 = VSUB(ti2, ti3);
-        ci2 = VADD(ti1, ti4);
-        ci4 = VSUB(ti1, ti4);
-        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
-        ph[0] = cr2;
-        ph[1] = ci2; ph += l1ido;
-        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
-        ph[0] = cr3;
-        ph[1] = ci3; ph += l1ido;
-        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
-        ph[0] = cr4;
-        ph[1] = ci4; ph = ph - 3*l1ido + 2;
-      }
-    }
-    if (ido % 2 == 1) return;
-  }
-  for (k=0; k < l1ido; k+=ido) {
-    int i0 = 4*k + ido;
-    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
-    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
-    tr1 = VSUB(c,d);
-    tr2 = VADD(c,d);
-    ti1 = VADD(b,a);
-    ti2 = VSUB(b,a);
-    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
-    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
-    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
-    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
-  }
-} /* radb4 */
-
-static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, 
-                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
-{
-  static const float tr11 = .309016994374947f;
-  static const float ti11 = .951056516295154f;
-  static const float tr12 = -.809016994374947f;
-  static const float ti12 = .587785252292473f;
-
-  /* System generated locals */
-  int cc_offset, ch_offset;
-
-  /* Local variables */
-  int i, k, ic;
-  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
-    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
-  int idp2;
-
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * 6;
-  ch -= ch_offset;
-  cc_offset = 1 + ido * (1 + l1);
-  cc -= cc_offset;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
-    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
-    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
-    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
-    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
-    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
-    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
-    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
-    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
-    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
-  }
-  if (ido == 1) {
-    return;
-  }
-  idp2 = ido + 2;
-  for (k = 1; k <= l1; ++k) {
-    for (i = 3; i <= ido; i += 2) {
-      ic = idp2 - i;
-      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
-      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
-      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
-      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
-      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
-      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
-      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
-      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
-      cr2 = VADD(dr2, dr5);
-      ci5 = VSUB(dr5, dr2);
-      cr5 = VSUB(di2, di5);
-      ci2 = VADD(di2, di5);
-      cr3 = VADD(dr3, dr4);
-      ci4 = VSUB(dr4, dr3);
-      cr4 = VSUB(di3, di4);
-      ci3 = VADD(di3, di4);
-      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
-      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
-      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
-      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
-      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
-      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
-      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
-      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
-      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
-      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
-      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
-      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
-      ch_ref(i, 3, k) = VADD(ti2, ti5);
-      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
-      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
-      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
-      ch_ref(i, 5, k) = VADD(ti3, ti4);
-      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
-    }
-  }
-#undef cc_ref
-#undef ch_ref
-} /* radf5 */
-
-static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, 
-                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
-{
-  static const float tr11 = .309016994374947f;
-  static const float ti11 = .951056516295154f;
-  static const float tr12 = -.809016994374947f;
-  static const float ti12 = .587785252292473f;
-
-  int cc_offset, ch_offset;
-
-  /* Local variables */
-  int i, k, ic;
-  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
-    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
-  int idp2;
-
-#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
-#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
-
-  /* Parameter adjustments */
-  ch_offset = 1 + ido * (1 + l1);
-  ch -= ch_offset;
-  cc_offset = 1 + ido * 6;
-  cc -= cc_offset;
-
-  /* Function Body */
-  for (k = 1; k <= l1; ++k) {
-    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
-    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
-    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
-    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
-    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
-    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
-    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
-    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
-    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
-    ch_ref(1, k, 2) = VSUB(cr2, ci5);
-    ch_ref(1, k, 3) = VSUB(cr3, ci4);
-    ch_ref(1, k, 4) = VADD(cr3, ci4);
-    ch_ref(1, k, 5) = VADD(cr2, ci5);
-  }
-  if (ido == 1) {
-    return;
-  }
-  idp2 = ido + 2;
-  for (k = 1; k <= l1; ++k) {
-    for (i = 3; i <= ido; i += 2) {
-      ic = idp2 - i;
-      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
-      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
-      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
-      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
-      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
-      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
-      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
-      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
-      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
-      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
-      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
-      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
-      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
-      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
-      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
-      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
-      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
-      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
-      dr3 = VSUB(cr3, ci4);
-      dr4 = VADD(cr3, ci4);
-      di3 = VADD(ci3, cr4);
-      di4 = VSUB(ci3, cr4);
-      dr5 = VADD(cr2, ci5);
-      dr2 = VSUB(cr2, ci5);
-      di5 = VSUB(ci2, cr5);
-      di2 = VADD(ci2, cr5);
-      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
-      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
-      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
-      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
-
-      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
-      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
-      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
-      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
-    }
-  }
-#undef cc_ref
-#undef ch_ref
-} /* radb5 */
-
-static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
-                                      const float *wa, const int *ifac) {  
-  v4sf *in  = (v4sf*)input_readonly;
-  v4sf *out = (in == work2 ? work1 : work2);
-  int nf = ifac[1], k1;
-  int l2 = n;
-  int iw = n-1;
-  assert(in != out && work1 != work2);
-  for (k1 = 1; k1 <= nf; ++k1) {
-    int kh = nf - k1;
-    int ip = ifac[kh + 2];
-    int l1 = l2 / ip;
-    int ido = n / l2;
-    iw -= (ip - 1)*ido;
-    switch (ip) {
-      case 5: {
-        int ix2 = iw + ido;
-        int ix3 = ix2 + ido;
-        int ix4 = ix3 + ido;
-        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-      } break;
-      case 4: {
-        int ix2 = iw + ido;
-        int ix3 = ix2 + ido;
-        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
-      } break;
-      case 3: {
-        int ix2 = iw + ido;
-        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
-      } break;
-      case 2:
-        radf2_ps(ido, l1, in, out, &wa[iw]);
-        break;
-      default:
-        assert(0);
-        break;
-    }
-    l2 = l1;
-    if (out == work2) {
-      out = work1; in = work2;
-    } else {
-      out = work2; in = work1;
-    }
-  }
-  return in; /* this is in fact the output .. */
-} /* rfftf1 */
-
-static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, 
-                                      const float *wa, const int *ifac) {  
-  v4sf *in  = (v4sf*)input_readonly;
-  v4sf *out = (in == work2 ? work1 : work2);
-  int nf = ifac[1], k1;
-  int l1 = 1;
-  int iw = 0;
-  assert(in != out);
-  for (k1=1; k1<=nf; k1++) {
-    int ip = ifac[k1 + 1];
-    int l2 = ip*l1;
-    int ido = n / l2;
-    switch (ip) {
-      case 5: {
-        int ix2 = iw + ido;
-        int ix3 = ix2 + ido;
-        int ix4 = ix3 + ido;
-        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
-      } break;
-      case 4: {
-        int ix2 = iw + ido;
-        int ix3 = ix2 + ido;
-        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
-      } break;
-      case 3: {
-        int ix2 = iw + ido;
-        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
-      } break;
-      case 2:
-        radb2_ps(ido, l1, in, out, &wa[iw]);
-        break;
-      default:
-        assert(0);
-        break;
-    }
-    l1 = l2;
-    iw += (ip - 1)*ido;
-
-    if (out == work2) {
-      out = work1; in = work2;
-    } else {
-      out = work2; in = work1;
-    }
-  }
-  return in; /* this is in fact the output .. */
-}
-
-static int decompose(int n, int *ifac, const int *ntryh) {
-  int nl = n, nf = 0, i, j = 0;
-  for (j=0; ntryh[j]; ++j) {
-    int ntry = ntryh[j];
-    while (nl != 1) {
-      int nq = nl / ntry;
-      int nr = nl - ntry * nq;
-      if (nr == 0) {
-        ifac[2+nf++] = ntry;
-        nl = nq;
-        if (ntry == 2 && nf != 1) {
-          for (i = 2; i <= nf; ++i) {
-            int ib = nf - i + 2;
-            ifac[ib + 1] = ifac[ib];
-          }
-          ifac[2] = 2;
-        }
-      } else break;
-    }
-  }
-  ifac[0] = n;
-  ifac[1] = nf;  
-  return nf;
-}
-
-
-
-static void rffti1_ps(int n, float *wa, int *ifac)
-{
-  static const int ntryh[] = { 4,2,3,5,0 };
-  int k1, j, ii;
-
-  int nf = decompose(n,ifac,ntryh);
-  float argh = (2*M_PI) / n;
-  int is = 0;
-  int nfm1 = nf - 1;
-  int l1 = 1;
-  for (k1 = 1; k1 <= nfm1; k1++) {
-    int ip = ifac[k1 + 1];
-    int ld = 0;
-    int l2 = l1*ip;
-    int ido = n / l2;
-    int ipm = ip - 1;
-    for (j = 1; j <= ipm; ++j) {
-      float argld;
-      int i = is, fi=0;
-      ld += l1;
-      argld = ld*argh;
-      for (ii = 3; ii <= ido; ii += 2) {
-        i += 2;
-        fi += 1;
-        wa[i - 2] = cos(fi*argld);
-        wa[i - 1] = sin(fi*argld);
-      }
-      is += ido;
-    }
-    l1 = l2;
-  }
-} /* rffti1 */
-
-void cffti1_ps(int n, float *wa, int *ifac)
-{
-  static const int ntryh[] = { 5,3,4,2,0 };
-  int k1, j, ii;
-
-  int nf = decompose(n,ifac,ntryh);
-  float argh = (2*M_PI)/(float)n;
-  int i = 1;
-  int l1 = 1;
-  for (k1=1; k1<=nf; k1++) {
-    int ip = ifac[k1+1];
-    int ld = 0;
-    int l2 = l1*ip;
-    int ido = n / l2;
-    int idot = ido + ido + 2;
-    int ipm = ip - 1;
-    for (j=1; j<=ipm; j++) {
-      float argld;
-      int i1 = i, fi = 0;
-      wa[i-1] = 1;
-      wa[i] = 0;
-      ld += l1;
-      argld = ld*argh;
-      for (ii = 4; ii <= idot; ii += 2) {
-        i += 2;
-        fi += 1;
-        wa[i-1] = cos(fi*argld);
-        wa[i] = sin(fi*argld);
-      }
-      if (ip > 5) {
-        wa[i1-1] = wa[i-1];
-        wa[i1] = wa[i];
-      }
-    }
-    l1 = l2;
-  }
-} /* cffti1 */
-
-
-v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
-  v4sf *in  = (v4sf*)input_readonly;
-  v4sf *out = (in == work2 ? work1 : work2); 
-  int nf = ifac[1], k1;
-  int l1 = 1;
-  int iw = 0;
-  assert(in != out && work1 != work2);
-  for (k1=2; k1<=nf+1; k1++) {
-    int ip = ifac[k1];
-    int l2 = ip*l1;
-    int ido = n / l2;
-    int idot = ido + ido;
-    switch (ip) {
-      case 5: {
-        int ix2 = iw + idot;
-        int ix3 = ix2 + idot;
-        int ix4 = ix3 + idot;
-        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
-      } break;
-      case 4: {
-        int ix2 = iw + idot;
-        int ix3 = ix2 + idot;
-        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
-      } break;
-      case 2: {
-        passf2_ps(idot, l1, in, out, &wa[iw], isign);
-      } break;
-      case 3: {
-        int ix2 = iw + idot;
-        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
-      } break;
-      default:
-        assert(0);
-    }
-    l1 = l2;
-    iw += (ip - 1)*idot;
-    if (out == work2) {
-      out = work1; in = work2;
-    } else {
-      out = work2; in = work1;
-    }
-  }
-
-  return in; /* this is in fact the output .. */
-}
-
-
-struct PFFFT_Setup {
-  int     N;
-  int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
-  int ifac[15];
-  pffft_transform_t transform;
-  v4sf *data; // allocated room for twiddle coefs
-  float *e;    // points into 'data' , N/4*3 elements
-  float *twiddle; // points into 'data', N/4 elements
-};
-
-PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
-  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
-  int k, m;
-  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs 
-     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
-     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
-  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
-  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
-  //assert((N % 32) == 0);
-  s->N = N;
-  s->transform = transform;  
-  /* nb of complex simd vectors */
-  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
-  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
-  s->e = (float*)s->data;
-  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);  
-
-  if (transform == PFFFT_REAL) {
-    for (k=0; k < s->Ncvec; ++k) {
-      int i = k/SIMD_SZ;
-      int j = k%SIMD_SZ;
-      for (m=0; m < SIMD_SZ-1; ++m) {
-        float A = -2*M_PI*(m+1)*k / N;
-        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
-        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
-      }
-    }
-    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
-  } else {
-    for (k=0; k < s->Ncvec; ++k) {
-      int i = k/SIMD_SZ;
-      int j = k%SIMD_SZ;
-      for (m=0; m < SIMD_SZ-1; ++m) {
-        float A = -2*M_PI*(m+1)*k / N;
-        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
-        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
-      }
-    }
-    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
-  }
-
-  /* check that N is decomposable with allowed prime factors */
-  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
-  if (m != N/SIMD_SZ) {
-    pffft_destroy_setup(s); s = 0;
-  }
-
-  return s;
-}
-
-
-void pffft_destroy_setup(PFFFT_Setup *s) {
-  pffft_aligned_free(s->data);
-  free(s);
-}
-
-#if !defined(PFFFT_SIMD_DISABLE)
-
-/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
-static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
-  v4sf g0, g1;
-  int k;
-  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
-  
-  *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
-  for (k=1; k < N; ++k) {
-    v4sf h0, h1;
-    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
-    *--out = VSWAPHL(g1, h0);
-    *--out = VSWAPHL(h0, h1);
-    g1 = h1;
-  }
-  *--out = VSWAPHL(g1, g0);
-}
-
-static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
-  v4sf g0, g1, h0, h1;
-  int k;
-  g0 = g1 = in[0]; ++in;
-  for (k=1; k < N; ++k) {
-    h0 = *in++; h1 = *in++;
-    g1 = VSWAPHL(g1, h0);
-    h0 = VSWAPHL(h0, h1);
-    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
-    g1 = h1;
-  }
-  h0 = *in++; h1 = g0;
-  g1 = VSWAPHL(g1, h0);
-  h0 = VSWAPHL(h0, h1);
-  UNINTERLEAVE2(h0, g1, out[0], out[1]);
-}
-
-void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
-  int k, N = setup->N, Ncvec = setup->Ncvec;
-  const v4sf *vin = (const v4sf*)in;
-  v4sf *vout = (v4sf*)out;
-  assert(in != out);
-  if (setup->transform == PFFFT_REAL) {
-    int k, dk = N/32;
-    if (direction == PFFFT_FORWARD) {
-      for (k=0; k < dk; ++k) {
-        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
-        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
-      }
-      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
-      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
-    } else {
-      for (k=0; k < dk; ++k) {
-        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
-        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
-      }
-      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
-      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
-    }
-  } else {
-    if (direction == PFFFT_FORWARD) {
-      for (k=0; k < Ncvec; ++k) { 
-        int kk = (k/4) + (k%4)*(Ncvec/4);
-        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
-      }
-    } else {
-      for (k=0; k < Ncvec; ++k) { 
-        int kk = (k/4) + (k%4)*(Ncvec/4);
-        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
-      }
-    }
-  }
-}
-
-void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
-  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
-  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
-  assert(in != out);
-  for (k=0; k < dk; ++k) {    
-    r0 = in[8*k+0]; i0 = in[8*k+1];
-    r1 = in[8*k+2]; i1 = in[8*k+3];
-    r2 = in[8*k+4]; i2 = in[8*k+5];
-    r3 = in[8*k+6]; i3 = in[8*k+7];
-    VTRANSPOSE4(r0,r1,r2,r3);
-    VTRANSPOSE4(i0,i1,i2,i3);
-    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
-    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
-    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
-
-    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
-    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
-    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
-    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
-
-    /*
-      transformation for each column is:
-      
-      [1   1   1   1   0   0   0   0]   [r0]
-      [1   0  -1   0   0  -1   0   1]   [r1]
-      [1  -1   1  -1   0   0   0   0]   [r2]
-      [1   0  -1   0   0   1   0  -1]   [r3]
-      [0   0   0   0   1   1   1   1] * [i0]
-      [0   1   0  -1   1   0  -1   0]   [i1]
-      [0   0   0   0   1  -1   1  -1]   [i2]
-      [0  -1   0   1   1   0  -1   0]   [i3]    
-    */
-    
-    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
-    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
-    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
-    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
-  
-    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
-    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
-  }
-}
-
-void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
-  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
-  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
-  assert(in != out);
-  for (k=0; k < dk; ++k) {    
-    r0 = in[8*k+0]; i0 = in[8*k+1];
-    r1 = in[8*k+2]; i1 = in[8*k+3];
-    r2 = in[8*k+4]; i2 = in[8*k+5];
-    r3 = in[8*k+6]; i3 = in[8*k+7];
-
-    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
-    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
-    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
-    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
-
-    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
-    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
-    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
-    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
-
-    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
-    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
-    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
-
-    VTRANSPOSE4(r0,r1,r2,r3);
-    VTRANSPOSE4(i0,i1,i2,i3);
-
-    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
-    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
-  }
-}
-
-
-static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
-                            const v4sf *e, v4sf *out) {
-  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
-  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
-  r0 = *in0; i0 = *in1;
-  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
-  VTRANSPOSE4(r0,r1,r2,r3);
-  VTRANSPOSE4(i0,i1,i2,i3);
- 
-  /*
-    transformation for each column is:
-
-    [1   1   1   1   0   0   0   0]   [r0]
-    [1   0  -1   0   0  -1   0   1]   [r1]
-    [1   0  -1   0   0   1   0  -1]   [r2]
-    [1  -1   1  -1   0   0   0   0]   [r3]
-    [0   0   0   0   1   1   1   1] * [i0]
-    [0  -1   0   1  -1   0   1   0]   [i1]
-    [0  -1   0   1   1   0  -1   0]   [i2]
-    [0   0   0   0  -1   1  -1   1]   [i3]    
-  */
-  
-  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
-  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
-
-  VCPLXMUL(r1,i1,e[0],e[1]);
-  VCPLXMUL(r2,i2,e[2],e[3]);
-  VCPLXMUL(r3,i3,e[4],e[5]);
-
-  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
-  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
-
-  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); 
-  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
-  si0 = VADD(i0,i2); di0 = VSUB(i0,i2); 
-  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
-
-  r0 = VADD(sr0, sr1);
-  r3 = VSUB(sr0, sr1);
-  i0 = VADD(si0, si1);
-  i3 = VSUB(si1, si0);
-  r1 = VADD(dr0, di1);
-  r2 = VSUB(dr0, di1);
-  i1 = VSUB(dr1, di0);
-  i2 = VADD(dr1, di0);
-
-  *out++ = r0;
-  *out++ = i0;
-  *out++ = r1;
-  *out++ = i1;
-  *out++ = r2;
-  *out++ = i2;
-  *out++ = r3;
-  *out++ = i3;
-
-}
-
-static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
-  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
-
-  v4sf_union cr, ci, *uout = (v4sf_union*)out;
-  v4sf save = in[7], zero=VZERO();
-  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
-  static const float s = M_SQRT2/2;
-
-  cr.v = in[0]; ci.v = in[Ncvec*2-1];
-  assert(in != out);
-  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
-
-  /*
-    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
-
-    [Xr(1)]  ] [1   1   1   1   0   0   0   0]
-    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
-    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
-    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
-    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
-    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
-    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
-    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
-  */
-
-  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
-  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
-  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
-  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
-  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
-  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
-  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
-  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3; 
-
-  for (k=1; k < dk; ++k) {
-    v4sf save_next = in[8*k+7];
-    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
-                           e + k*6, out + k*8);
-    save = save_next;
-  }
-
-}
-
-static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, 
-                                             const v4sf *e, v4sf *out, int first) {
-  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
-  /*
-    transformation for each column is:
-
-    [1   1   1   1   0   0   0   0]   [r0]
-    [1   0   0  -1   0  -1  -1   0]   [r1]
-    [1  -1  -1   1   0   0   0   0]   [r2]
-    [1   0   0  -1   0   1   1   0]   [r3]
-    [0   0   0   0   1  -1   1  -1] * [i0]
-    [0  -1   1   0   1   0   0   1]   [i1]
-    [0   0   0   0   1   1  -1  -1]   [i2]
-    [0   1  -1   0   1   0   0   1]   [i3]    
-  */
-
-  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); 
-  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
-  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); 
-  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
-
-  r0 = VADD(sr0, sr1);
-  r2 = VSUB(sr0, sr1);
-  r1 = VSUB(dr0, si1);
-  r3 = VADD(dr0, si1);
-  i0 = VSUB(di0, di1);
-  i2 = VADD(di0, di1);
-  i1 = VSUB(si0, dr1);
-  i3 = VADD(si0, dr1);
-
-  VCPLXMULCONJ(r1,i1,e[0],e[1]);
-  VCPLXMULCONJ(r2,i2,e[2],e[3]);
-  VCPLXMULCONJ(r3,i3,e[4],e[5]);
-
-  VTRANSPOSE4(r0,r1,r2,r3);
-  VTRANSPOSE4(i0,i1,i2,i3);
-
-  if (!first) {
-    *out++ = r0;
-    *out++ = i0;
-  }
-  *out++ = r1;
-  *out++ = i1;
-  *out++ = r2;
-  *out++ = i2;
-  *out++ = r3;
-  *out++ = i3;
-}
-
-static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
-  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
-
-  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
-  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
-  static const float s = M_SQRT2;
-  assert(in != out);
-  for (k=0; k < 4; ++k) {
-    Xr.f[k] = ((float*)in)[8*k];
-    Xi.f[k] = ((float*)in)[8*k+4];
-  }
-
-  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
-
-  /*
-    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
-
-    [cr0] [1   0   2   0   1   0   0   0]
-    [cr1] [1   0   0   0  -1   0  -2   0]
-    [cr2] [1   0  -2   0   1   0   0   0]
-    [cr3] [1   0   0   0  -1   0   2   0]
-    [ci0] [0   2   0   2   0   0   0   0]
-    [ci1] [0   s   0  -s   0  -s   0  -s]
-    [ci2] [0   0   0   0   0  -2   0   2]
-    [ci3] [0  -s   0   s   0  -s   0  -s]
-  */
-  for (k=1; k < dk; ++k) {    
-    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
-  }
-
-  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
-  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
-  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
-  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
-  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
-  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
-  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
-  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
-}
-
-
-void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
-                             pffft_direction_t direction, int ordered) {
-  int k, Ncvec   = setup->Ncvec;
-  int nf_odd = (setup->ifac[1] & 1);
-
-  // temporary buffer is allocated on the stack if the scratch pointer is NULL
-  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
-  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
-
-  const v4sf *vinput = (const v4sf*)finput;
-  v4sf *voutput      = (v4sf*)foutput;
-  v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
-  int ib = (nf_odd ^ ordered ? 1 : 0);
-
-  assert(VALIGNED(finput) && VALIGNED(foutput));
-
-  //assert(finput != foutput);
-  if (direction == PFFFT_FORWARD) {
-    ib = !ib;
-    if (setup->transform == PFFFT_REAL) { 
-      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
-                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
-      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
-    } else {
-      v4sf *tmp = buff[ib];
-      for (k=0; k < Ncvec; ++k) {
-        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
-      }
-      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], 
-                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
-      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
-    }
-    if (ordered) {
-      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);       
-    } else ib = !ib;
-  } else {
-    if (vinput == buff[ib]) { 
-      ib = !ib; // may happen when finput == foutput
-    }
-    if (ordered) {
-      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); 
-      vinput = buff[ib]; ib = !ib;
-    }
-    if (setup->transform == PFFFT_REAL) {
-      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
-      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], 
-                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
-    } else {
-      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
-      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], 
-                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
-      for (k=0; k < Ncvec; ++k) {
-        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
-      }
-    }
-  }
-  
-  if (buff[ib] != voutput) {
-    /* extra copy required -- this situation should only happen when finput == foutput */
-    assert(finput==foutput);
-    for (k=0; k < Ncvec; ++k) {
-      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
-      voutput[2*k] = a; voutput[2*k+1] = b;
-    }
-    ib = !ib;
-  }
-  assert(buff[ib] == voutput);
-}
-
-void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
-  int Ncvec = s->Ncvec;
-  const v4sf * RESTRICT va = (const v4sf*)a;
-  const v4sf * RESTRICT vb = (const v4sf*)b;
-  v4sf * RESTRICT vab = (v4sf*)ab;
-
-#ifdef __arm__
-  __builtin_prefetch(va);
-  __builtin_prefetch(vb);
-  __builtin_prefetch(vab);
-  __builtin_prefetch(va+2);
-  __builtin_prefetch(vb+2);
-  __builtin_prefetch(vab+2);
-  __builtin_prefetch(va+4);
-  __builtin_prefetch(vb+4);
-  __builtin_prefetch(vab+4);
-  __builtin_prefetch(va+6);
-  __builtin_prefetch(vb+6);
-  __builtin_prefetch(vab+6);
-# ifndef __clang__
-#   define ZCONVOLVE_USING_INLINE_NEON_ASM
-# endif
-#endif
-
-  float ar, ai, br, bi, abr, abi;
-#ifndef ZCONVOLVE_USING_INLINE_ASM
-  v4sf vscal = LD_PS1(scaling);
-  int i;
-#endif
-
-  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
-  ar = ((v4sf_union*)va)[0].f[0];
-  ai = ((v4sf_union*)va)[1].f[0];
-  br = ((v4sf_union*)vb)[0].f[0];
-  bi = ((v4sf_union*)vb)[1].f[0];
-  abr = ((v4sf_union*)vab)[0].f[0];
-  abi = ((v4sf_union*)vab)[1].f[0];
- 
-#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
-  const float *a_ = a, *b_ = b; float *ab_ = ab;
-  int N = Ncvec;
-  asm volatile("mov         r8, %2                  \n"
-               "vdup.f32    q15, %4                 \n"
-               "1:                                  \n"
-               "pld         [%0,#64]                \n"
-               "pld         [%1,#64]                \n"
-               "pld         [%2,#64]                \n"
-               "pld         [%0,#96]                \n"
-               "pld         [%1,#96]                \n"
-               "pld         [%2,#96]                \n"
-               "vld1.f32    {q0,q1},   [%0,:128]!         \n"
-               "vld1.f32    {q4,q5},   [%1,:128]!         \n"
-               "vld1.f32    {q2,q3},   [%0,:128]!         \n"
-               "vld1.f32    {q6,q7},   [%1,:128]!         \n"
-               "vld1.f32    {q8,q9},   [r8,:128]!          \n"
-               
-               "vmul.f32    q10, q0, q4             \n"
-               "vmul.f32    q11, q0, q5             \n"
-               "vmul.f32    q12, q2, q6             \n" 
-               "vmul.f32    q13, q2, q7             \n"                 
-               "vmls.f32    q10, q1, q5             \n"
-               "vmla.f32    q11, q1, q4             \n"
-               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
-               "vmls.f32    q12, q3, q7             \n"
-               "vmla.f32    q13, q3, q6             \n"
-               "vmla.f32    q8, q10, q15            \n"
-               "vmla.f32    q9, q11, q15            \n"
-               "vmla.f32    q0, q12, q15            \n"
-               "vmla.f32    q1, q13, q15            \n"
-               "vst1.f32    {q8,q9},[%2,:128]!    \n"
-               "vst1.f32    {q0,q1},[%2,:128]!    \n"
-               "subs        %3, #2                  \n"
-               "bne         1b                      \n"
-               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
-#else // default routine, works fine for non-arm cpus with current compilers
-  for (i=0; i < Ncvec; i += 2) {
-    v4sf ar, ai, br, bi;
-    ar = va[2*i+0]; ai = va[2*i+1];
-    br = vb[2*i+0]; bi = vb[2*i+1];
-    VCPLXMUL(ar, ai, br, bi);
-    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
-    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
-    ar = va[2*i+2]; ai = va[2*i+3];
-    br = vb[2*i+2]; bi = vb[2*i+3];
-    VCPLXMUL(ar, ai, br, bi);
-    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
-    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
-  }
-#endif
-  if (s->transform == PFFFT_REAL) {
-    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
-    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
-  }
-}
-
-
-#else // defined(PFFFT_SIMD_DISABLE)
-
-// standard routine using scalar floats, without SIMD stuff.
-
-#define pffft_zreorder_nosimd pffft_zreorder
-void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
-  int k, N = setup->N;
-  if (setup->transform == PFFFT_COMPLEX) {
-    for (k=0; k < 2*N; ++k) out[k] = in[k];
-    return;
-  }
-  else if (direction == PFFFT_FORWARD) {
-    float x_N = in[N-1];
-    for (k=N-1; k > 1; --k) out[k] = in[k-1]; 
-    out[0] = in[0];
-    out[1] = x_N;
-  } else {
-    float x_N = in[1];
-    for (k=1; k < N-1; ++k) out[k] = in[k+1]; 
-    out[0] = in[0];
-    out[N-1] = x_N;
-  }
-}
-
-#define pffft_transform_internal_nosimd pffft_transform_internal
-void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
-                                    pffft_direction_t direction, int ordered) {
-  int Ncvec   = setup->Ncvec;
-  int nf_odd = (setup->ifac[1] & 1);
-
-  // temporary buffer is allocated on the stack if the scratch pointer is NULL
-  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
-  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
-  float *buff[2];
-  int ib;
-  if (scratch == 0) scratch = scratch_on_stack;
-  buff[0] = output; buff[1] = scratch;
-
-  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
-  ib = (nf_odd ^ ordered ? 1 : 0);
-
-  if (direction == PFFFT_FORWARD) {
-    if (setup->transform == PFFFT_REAL) { 
-      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
-                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);      
-    } else {
-      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
-                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
-    }
-    if (ordered) {
-      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
-    }
-  } else {    
-    if (input == buff[ib]) { 
-      ib = !ib; // may happen when finput == foutput
-    }
-    if (ordered) {
-      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); 
-      input = buff[!ib];
-    }
-    if (setup->transform == PFFFT_REAL) {
-      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], 
-                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
-    } else {
-      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], 
-                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
-    }
-  }
-  if (buff[ib] != output) {
-    int k;
-    // extra copy required -- this situation should happens only when finput == foutput
-    assert(input==output);
-    for (k=0; k < Ncvec; ++k) {
-      float a = buff[ib][2*k], b = buff[ib][2*k+1];
-      output[2*k] = a; output[2*k+1] = b;
-    }
-    ib = !ib;
-  }
-  assert(buff[ib] == output);
-}
-
-#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
-                                       float *ab, float scaling) {
-  int i, Ncvec = s->Ncvec;
-
-  if (s->transform == PFFFT_REAL) {
-    // take care of the fftpack ordering
-    ab[0] += a[0]*b[0]*scaling;
-    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
-    ++ab; ++a; ++b; --Ncvec;
-  }
-  for (i=0; i < Ncvec; ++i) {
-    float ar, ai, br, bi;
-    ar = a[2*i+0]; ai = a[2*i+1];
-    br = b[2*i+0]; bi = b[2*i+1];
-    VCPLXMUL(ar, ai, br, bi);
-    ab[2*i+0] += ar*scaling;
-    ab[2*i+1] += ai*scaling;
-  }
-}
-
-#endif // defined(PFFFT_SIMD_DISABLE)
-
-void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
-  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
-}
-
-void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
-  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
-}
diff --git a/oss-internship-2020/pffft/pffft.h b/oss-internship-2020/pffft/pffft.h
deleted file mode 100644
index 2bfa7b3..0000000
--- a/oss-internship-2020/pffft/pffft.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) 
-
-   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
-   authored by Dr Paul Swarztrauber of NCAR, in 1985.
-
-   As confirmed by the NCAR fftpack software curators, the following
-   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
-   released under the same terms.
-
-   FFTPACK license:
-
-   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
-
-   Copyright (c) 2004 the University Corporation for Atmospheric
-   Research ("UCAR"). All rights reserved. Developed by NCAR's
-   Computational and Information Systems Laboratory, UCAR,
-   www.cisl.ucar.edu.
-
-   Redistribution and use of the Software in source and binary forms,
-   with or without modification, is permitted provided that the
-   following conditions are met:
-
-   - Neither the names of NCAR's Computational and Information Systems
-   Laboratory, the University Corporation for Atmospheric Research,
-   nor the names of its sponsors or contributors may be used to
-   endorse or promote products derived from this Software without
-   specific prior written permission.  
-
-   - Redistributions of source code must retain the above copyright
-   notices, this list of conditions, and the disclaimer below.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions, and the disclaimer below in the
-   documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
-   SOFTWARE.
-*/
-   
-/*
-   PFFFT : a Pretty Fast FFT.
-
-   This is basically an adaptation of the single precision fftpack
-   (v4) as found on netlib taking advantage of SIMD instruction found
-   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
-   
-   For architectures where no SIMD instruction is available, the code
-   falls back to a scalar version.  
-
-   Restrictions: 
-
-   - 1D transforms only, with 32-bit single precision.
-
-   - supports only transforms for inputs of length N of the form
-   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
-   144, 160, etc are all acceptable lengths). Performance is best for
-   128<=N<=8192.
-
-   - all (float*) pointers in the functions below are expected to
-   have an "simd-compatible" alignment, that is 16 bytes on x86 and
-   powerpc CPUs.
-  
-   You can allocate such buffers with the functions
-   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
-   posix_memalign..)
-
-*/
-
-#ifndef PFFFT_H
-#define PFFFT_H
-
-#include <stddef.h> // for size_t
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-  /* opaque struct holding internal stuff (precomputed twiddle factors)
-     this struct can be shared by many threads as it contains only
-     read-only data.  
-  */
-  typedef struct PFFFT_Setup PFFFT_Setup;
-
-  /* direction of the transform */
-  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
-  
-  /* type of transform */
-  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
-
-  /*
-    prepare for performing transforms of size N -- the returned
-    PFFFT_Setup structure is read-only so it can safely be shared by
-    multiple concurrent threads. 
-  */
-  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
-  void pffft_destroy_setup(PFFFT_Setup *);
-  /* 
-     Perform a Fourier transform , The z-domain data is stored in the
-     most efficient order for transforming it back, or using it for
-     convolution. If you need to have its content sorted in the
-     "usual" way, that is as an array of interleaved complex numbers,
-     either use pffft_transform_ordered , or call pffft_zreorder after
-     the forward fft, and before the backward fft.
-
-     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
-     Typically you will want to scale the backward transform by 1/N.
-     
-     The 'work' pointer should point to an area of N (2*N for complex
-     fft) floats, properly aligned. If 'work' is NULL, then stack will
-     be used instead (this is probably the best strategy for small
-     FFTs, say for N < 16384).
-
-     input and output may alias.
-  */
-  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
-
-  /* 
-     Similar to pffft_transform, but makes sure that the output is
-     ordered as expected (interleaved complex numbers).  This is
-     similar to calling pffft_transform and then pffft_zreorder.
-     
-     input and output may alias.
-  */
-  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
-
-  /* 
-     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
-     PFFFT_FORWARD) if you want to have the frequency components in
-     the correct "canonical" order, as interleaved complex numbers.
-     
-     (for real transforms, both 0-frequency and half frequency
-     components, which are real, are assembled in the first entry as
-     F(0)+i*F(n/2+1). Note that the original fftpack did place
-     F(n/2+1) at the end of the arrays).
-     
-     input and output should not alias.
-  */
-  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
-
-  /* 
-     Perform a multiplication of the frequency components of dft_a and
-     dft_b and accumulate them into dft_ab. The arrays should have
-     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
-     *not* have been reordered with pffft_zreorder (otherwise just
-     perform the operation yourself as the dft coefs are stored as
-     interleaved complex numbers).
-     
-     the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
-     
-     The dft_a, dft_b and dft_ab pointers may alias.
-  */
-  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
-
-  /*
-    the float buffers must have the correct alignment (16-byte boundary
-    on intel and powerpc). This function may be used to obtain such
-    correctly aligned buffers.  
-  */
-  void *pffft_aligned_malloc(size_t nb_bytes);
-  void pffft_aligned_free(void *);
-
-  /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
-  int pffft_simd_size();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // PFFFT_H
diff --git a/oss-internship-2020/pffft/test_pffft.c b/oss-internship-2020/pffft/test_pffft.c
deleted file mode 100644
index a5d20c2..0000000
--- a/oss-internship-2020/pffft/test_pffft.c
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
-  Copyright (c) 2013 Julien Pommier.
-
-  Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
-
-  How to build: 
-
-  on linux, with fftw3:
-  gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
-
-  on macos, without fftw3:
-  clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
-
-  on macos, with fftw3:
-  clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
-
-  on windows, with visual c++:
-  cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
-  
-  build without SIMD instructions:
-  gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
-
- */
-
-#include "pffft.h"
-#include "fftpack.h"
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <assert.h>
-#include <string.h>
-
-#ifdef HAVE_SYS_TIMES
-#  include <sys/times.h>
-#  include <unistd.h>
-#endif
-
-#ifdef HAVE_VECLIB
-#  include <Accelerate/Accelerate.h>
-#endif
-
-#ifdef HAVE_FFTW
-#  include <fftw3.h>
-#endif
-
-#define MAX(x,y) ((x)>(y)?(x):(y))
-
-double frand() {
-  return rand()/(double)RAND_MAX;
-}
-
-#if defined(HAVE_SYS_TIMES)
-  inline double uclock_sec(void) {
-    static double ttclk = 0.;
-    if (ttclk == 0.) ttclk = sysconf(_SC_CLK_TCK);
-    struct tms t; return ((double)times(&t)) / ttclk;
-  }
-# else
-  double uclock_sec(void)
-{ return (double)clock()/(double)CLOCKS_PER_SEC; }
-#endif
-
-
-/* compare results with the regular fftpack */
-void pffft_validate_N(int N, int cplx) {
-  int Nfloat = N*(cplx?2:1);
-  int Nbytes = Nfloat * sizeof(float);
-  float *ref, *in, *out, *tmp, *tmp2;
-  PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-  int pass;
-
-  if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
-  ref = pffft_aligned_malloc(Nbytes);
-  in = pffft_aligned_malloc(Nbytes);
-  out = pffft_aligned_malloc(Nbytes);
-  tmp = pffft_aligned_malloc(Nbytes);
-  tmp2 = pffft_aligned_malloc(Nbytes);
-
-  for (pass=0; pass < 2; ++pass) {
-    float ref_max = 0;
-    int k;
-    //printf("N=%d pass=%d cplx=%d\n", N, pass, cplx);
-    // compute reference solution with FFTPACK
-    if (pass == 0) {
-      float *wrk = malloc(2*Nbytes+15*sizeof(float));
-      for (k=0; k < Nfloat; ++k) {
-        ref[k] = in[k] = frand()*2-1; 
-        out[k] = 1e30;
-      }
-      if (!cplx) {
-        rffti(N, wrk);
-        rfftf(N, ref, wrk);
-        // use our ordering for real ffts instead of the one of fftpack
-        {
-          float refN=ref[N-1];
-          for (k=N-2; k >= 1; --k) ref[k+1] = ref[k]; 
-          ref[1] = refN;
-        }
-      } else {
-        cffti(N, wrk);
-        cfftf(N, ref, wrk);
-      }
-      free(wrk);
-    }
-
-    for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, fabs(ref[k]));
-
-      
-    // pass 0 : non canonical ordering of transform coefficients  
-    if (pass == 0) {
-      // test forward transform, with different input / output
-      pffft_transform(s, in, tmp, 0, PFFFT_FORWARD);
-      memcpy(tmp2, tmp, Nbytes);
-      memcpy(tmp, in, Nbytes);
-      pffft_transform(s, tmp, tmp, 0, PFFFT_FORWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == tmp[k]);
-      }
-
-      // test reordering
-      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
-      pffft_zreorder(s, out, tmp, PFFFT_BACKWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == tmp[k]);
-      }
-      pffft_zreorder(s, tmp, out, PFFFT_FORWARD);
-    } else {
-      // pass 1 : canonical ordering of transform coeffs.
-      pffft_transform_ordered(s, in, tmp, 0, PFFFT_FORWARD);
-      memcpy(tmp2, tmp, Nbytes);
-      memcpy(tmp, in, Nbytes);
-      pffft_transform_ordered(s, tmp, tmp, 0, PFFFT_FORWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == tmp[k]);
-      }
-      memcpy(out, tmp, Nbytes);
-    }
-
-    {
-      for (k=0; k < Nfloat; ++k) {
-        if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
-          printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
-          exit(1);
-        }
-      }
-        
-      if (pass == 0) pffft_transform(s, tmp, out, 0, PFFFT_BACKWARD);
-      else   pffft_transform_ordered(s, tmp, out, 0, PFFFT_BACKWARD);
-      memcpy(tmp2, out, Nbytes);
-      memcpy(out, tmp, Nbytes);
-      if (pass == 0) pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
-      else   pffft_transform_ordered(s, out, out, 0, PFFFT_BACKWARD);
-      for (k = 0; k < Nfloat; ++k) {
-        assert(tmp2[k] == out[k]);
-        out[k] *= 1.f/N;
-      }
-      for (k = 0; k < Nfloat; ++k) {
-        if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
-          printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
-          exit(1);
-        }
-      }
-    }
-
-    // quick test of the circular convolution in fft domain
-    {
-      float conv_err = 0, conv_max = 0;
-
-      pffft_zreorder(s, ref, tmp, PFFFT_FORWARD);
-      memset(out, 0, Nbytes);
-      pffft_zconvolve_accumulate(s, ref, ref, out, 1.0);
-      pffft_zreorder(s, out, tmp2, PFFFT_FORWARD);
-      
-      for (k=0; k < Nfloat; k += 2) {
-        float ar = tmp[k], ai=tmp[k+1];
-        if (cplx || k > 0) {
-          tmp[k] = ar*ar - ai*ai;
-          tmp[k+1] = 2*ar*ai;
-        } else {
-          tmp[0] = ar*ar;
-          tmp[1] = ai*ai;
-        }
-      }
-      
-      for (k=0; k < Nfloat; ++k) {
-        float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
-        if (d > conv_err) conv_err = d;
-        if (e > conv_max) conv_max = e;
-      }
-      if (conv_err > 1e-5*conv_max) {
-        printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
-      }
-    }
-
-  }
-
-  printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
-  
-  pffft_destroy_setup(s);
-  pffft_aligned_free(ref);
-  pffft_aligned_free(in);
-  pffft_aligned_free(out);
-  pffft_aligned_free(tmp);
-  pffft_aligned_free(tmp2);
-}
-
-void pffft_validate(int cplx) {
-  static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
-  int k;
-  for (k = 0; Ntest[k]; ++k) {
-    int N = Ntest[k];
-    if (N == 16 && !cplx) continue;
-    pffft_validate_N(N, cplx);
-  }
-}
-
-int array_output_format = 0;
-
-void show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter) {
-  float mflops = flops/1e6/(t1 - t0 + 1e-16);
-  if (array_output_format) {
-    if (flops != -1) {
-      printf("|%9.0f   ", mflops);
-    } else printf("|      n/a   ");
-  } else {
-    if (flops != -1) {
-      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
-    }
-  }
-  fflush(stdout);
-}
-
-void benchmark_ffts(int N, int cplx) {
-  int Nfloat = (cplx ? N*2 : N);
-  int Nbytes = Nfloat * sizeof(float);
-  float *X = pffft_aligned_malloc(Nbytes), *Y = pffft_aligned_malloc(Nbytes), *Z = pffft_aligned_malloc(Nbytes);
-
-  double t0, t1, flops;
-
-  int k;
-  int max_iter = 5120000/N*4;
-#ifdef __arm__
-  max_iter /= 4;
-#endif
-  int iter;
-
-  for (k = 0; k < Nfloat; ++k) {
-    X[k] = 0; //sqrtf(k+1);
-  }
-
-  // FFTPack benchmark
-  {
-    float *wrk = malloc(2*Nbytes + 15*sizeof(float));
-    int max_iter_ = max_iter/pffft_simd_size(); if (max_iter_ == 0) max_iter_ = 1;
-    if (cplx) cffti(N, wrk);
-    else      rffti(N, wrk);
-    t0 = uclock_sec();  
-    
-    for (iter = 0; iter < max_iter_; ++iter) {
-      if (cplx) {
-        cfftf(N, X, wrk);
-        cfftb(N, X, wrk);
-      } else {
-        rfftf(N, X, wrk);
-        rfftb(N, X, wrk);
-      }
-    }
-    t1 = uclock_sec();
-    free(wrk);
-    
-    flops = (max_iter_*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-    show_output("FFTPack", N, cplx, flops, t0, t1, max_iter_);
-  }
-
-#ifdef HAVE_VECLIB
-  int log2N = (int)(log(N)/log(2) + 0.5f);
-  if (N == (1<<log2N)) {
-    FFTSetup setup;
-
-    setup = vDSP_create_fftsetup(log2N, FFT_RADIX2);
-    DSPSplitComplex zsamples;
-    zsamples.realp = &X[0];
-    zsamples.imagp = &X[Nfloat/2];
-    t0 = uclock_sec();  
-    for (iter = 0; iter < max_iter; ++iter) {
-      if (cplx) {
-        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Forward);
-        vDSP_fft_zip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
-      } else {
-        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Forward); 
-        vDSP_fft_zrip(setup, &zsamples, 1, log2N, kFFTDirection_Inverse);
-      }
-    }
-    t1 = uclock_sec();
-    vDSP_destroy_fftsetup(setup);
-
-    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-    show_output("vDSP", N, cplx, flops, t0, t1, max_iter);
-  } else {
-    show_output("vDSP", N, cplx, -1, -1, -1, -1);
-  }
-#endif
-  
-#ifdef HAVE_FFTW
-  {
-    fftwf_plan planf, planb;
-    fftw_complex *in = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
-    fftw_complex *out = (fftw_complex*) fftwf_malloc(sizeof(fftw_complex) * N);
-    memset(in, 0, sizeof(fftw_complex) * N);
-    int flags = (N < 40000 ? FFTW_MEASURE : FFTW_ESTIMATE);  // measure takes a lot of time on largest ffts
-    //int flags = FFTW_ESTIMATE;
-    if (cplx) {
-      planf = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_FORWARD, flags);
-      planb = fftwf_plan_dft_1d(N, (fftwf_complex*)in, (fftwf_complex*)out, FFTW_BACKWARD, flags);
-    } else {
-      planf = fftwf_plan_dft_r2c_1d(N, (float*)in, (fftwf_complex*)out, flags);
-      planb = fftwf_plan_dft_c2r_1d(N, (fftwf_complex*)in, (float*)out, flags);
-    }
-
-    t0 = uclock_sec();  
-    for (iter = 0; iter < max_iter; ++iter) {
-      fftwf_execute(planf);
-      fftwf_execute(planb);
-    }
-    t1 = uclock_sec();
-
-    fftwf_destroy_plan(planf);
-    fftwf_destroy_plan(planb);
-    fftwf_free(in); fftwf_free(out);
-
-    flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-    show_output((flags == FFTW_MEASURE ? "FFTW (meas.)" : " FFTW (estim)"), N, cplx, flops, t0, t1, max_iter);
-  }
-#endif  
-
-  // PFFFT benchmark
-  {
-    PFFFT_Setup *s = pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
-    if (s) {
-      t0 = uclock_sec();  
-      for (iter = 0; iter < max_iter; ++iter) {
-        pffft_transform(s, X, Z, Y, PFFFT_FORWARD);
-        pffft_transform(s, X, Z, Y, PFFFT_BACKWARD);
-      }
-      t1 = uclock_sec();
-      pffft_destroy_setup(s);
-    
-      flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); // see http://www.fftw.org/speed/method.html
-      show_output("PFFFT", N, cplx, flops, t0, t1, max_iter);
-    }
-  }
-
-  if (!array_output_format) {
-    printf("--\n");
-  }
-
-  pffft_aligned_free(X);
-  pffft_aligned_free(Y);
-  pffft_aligned_free(Z);
-}
-
-#ifndef PFFFT_SIMD_DISABLE
-void validate_pffft_simd(); // a small function inside pffft.c that will detect compiler bugs with respect to simd instruction 
-#endif
-
-int main(int argc, char **argv) {
-  int Nvalues[] = { 64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128, 3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768, 256*1024, 1024*1024, -1 };
-  int i;
-
-  if (argc > 1 && strcmp(argv[1], "--array-format") == 0) {
-    array_output_format = 1;
-  }
-
-#ifndef PFFFT_SIMD_DISABLE
-  validate_pffft_simd();
-#endif
-  pffft_validate(1);
-  pffft_validate(0);
-  if (!array_output_format) {
-    for (i=0; Nvalues[i] > 0; ++i) {
-      benchmark_ffts(Nvalues[i], 0 /* real fft */);
-    }
-    for (i=0; Nvalues[i] > 0; ++i) {
-      benchmark_ffts(Nvalues[i], 1 /* cplx fft */);
-    }
-  } else {
-    printf("| input len ");
-    printf("|real FFTPack");
-#ifdef HAVE_VECLIB
-    printf("|  real vDSP ");
-#endif
-#ifdef HAVE_FFTW
-    printf("|  real FFTW ");
-#endif
-    printf("| real PFFFT | ");
-
-    printf("|cplx FFTPack");
-#ifdef HAVE_VECLIB
-    printf("|  cplx vDSP ");
-#endif
-#ifdef HAVE_FFTW
-    printf("|  cplx FFTW ");
-#endif
-    printf("| cplx PFFFT |\n");
-    for (i=0; Nvalues[i] > 0; ++i) {
-      printf("|%9d  ", Nvalues[i]);
-      benchmark_ffts(Nvalues[i], 0); 
-      printf("| ");
-      benchmark_ffts(Nvalues[i], 1);
-      printf("|\n");
-    }
-    printf(" (numbers are given in MFlops)\n");
-  }
-
-
-  return 0;
-}

From 5ae99706e20cbd579416661a28726754c5e77bfd Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Wed, 26 Aug 2020 15:03:25 +0000
Subject: [PATCH 21/29] Added PFFFT_ROOT_DIR

---
 oss-internship-2020/pffft/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index c2a7bcd..41d7520 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -19,6 +19,8 @@ project(pffft CXX C)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
+set(PFFFT_ROOT_DIR https://bitbucket.org/jpommier/pffft.git)
+
 add_library(pffft STATIC 
     master/pffft.c 
     master/pffft.h 

From bb623d4c0b25baa8ba2fa0d91a0110f045fd0399 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 27 Aug 2020 12:54:57 +0000
Subject: [PATCH 22/29] Required changes resolved

---
 oss-internship-2020/pffft/CMakeLists.txt      |   2 +-
 oss-internship-2020/pffft/README.md           |  36 +++---
 .../pffft/main_pffft_sandboxed.cc             | 108 ++++++++++--------
 3 files changed, 79 insertions(+), 67 deletions(-)

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index 41d7520..be687cd 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -88,7 +88,7 @@ add_sapi_library(pffft_sapi
 
     INPUTS master/pffft.h master/fftpack.h
     LIBRARY pffft
-    LIBRARY_NAME pffft
+    LIBRARY_NAME Pffft
 
     NAMESPACE ""
 )
diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index 2d2a9ca..eaadb20 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -4,7 +4,9 @@ Build System: CMake
 OS: Linux
 
 ### Check out the PFFFT library & CMake set up
-`git submodule add https://bitbucket.org/jpommier/pffft.git`
+`git clone https://github.com/doinachiroiu/sandboxed-api/tree/master/oss-internship-2020/pffft`
+
+`git submodule update --init --recursive`
 
 `mkdir -p build && cd build`
 
@@ -22,7 +24,7 @@ display custom info with
 ## ***About the project*** 
 *PFFFT library is concerned with 1D Fast-Fourier Transformations finding a
 compromise between accuracy and speed. It deals with real and complex
-vectors, both cases being illustrated in the testing part (`main_pffft.c` 
+vectors, both cases being illustrated in the testing part (`test_pffft.c` 
 for initially and original version, `main_pffft_sandboxed.cc` for our 
 currently implemented sandboxed version).
 The original files can be found at: https://bitbucket.org/jpommier/pffft/src.*
@@ -62,21 +64,23 @@ In the end, the performance of PFFFT library it is outlined by the output.*
     
 
 ### Bugs history
-    1. [Solved] pffft benchmark bug: "Sandbox not active"  
-    N = 64, status OK, pffft_transform generates error 
-    N > 64, status not OK
-    Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; the memory that stays 
-    for s is not the same with the address passed in pffft_transform function. 
-    (sapi::v::GenericPtr - to be changed)
+1. [Solved] pffft benchmark bug: "Sandbox not active"  
+        
+        N = 64, status OK, pffft_transform generates error 
+        N > 64, status not OK
+        Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; the memory that stays 
+        for s is not the same with the address passed in pffft_transform function. 
+        (sapi::v::GenericPtr - to be changed)
 
-    Temporary solution: change the generated files to accept 
-    uintptr_t instead of PFFFT_Setup
+        Temporary solution: change the generated files to accept 
+        uintptr_t instead of PFFFT_Setup
 
-    Solution: using "sapi::v::RemotePtr" instead of "sapi::v::GenericPtr" 
-    to access the memory of object s
+        Solution: using "sapi::v::RemotePtr" instead of "sapi::v::GenericPtr" 
+        to access the memory of object s
 
-    2. [Unresolved] compiling bug: "No space left on device"
-    The building process creates some `embed` files that use lots of 
-    memory, trying to write them on /tmp.
+2. [Unresolved] compiling bug: "No space left on device"
+    
+        The building process creates some `embed` files that use lots of 
+        memory, trying to write them on /tmp.
 
-    Temporary solution: clean /tmp directory by `sudo rm -rf /tmp/*`.
\ No newline at end of file
+        Temporary solution: clean /tmp directory by `sudo rm -rf /tmp/*`.
\ No newline at end of file
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index 10ea802..f6abf05 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -30,10 +30,10 @@
 ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all);
 ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
 
-class PffftSapiSandbox : public pffftSandbox {
+class PffftSapiSandbox : public PffftSandbox {
  public:
   std::unique_ptr<sandbox2::Policy> ModifyPolicy(
-      sandbox2::PolicyBuilder*) override {
+      sandbox2::PolicyBuilder*) {
     return sandbox2::PolicyBuilder()
         .AllowStaticStartup()
         .AllowOpen()
@@ -50,11 +50,11 @@ class PffftSapiSandbox : public pffftSandbox {
   }
 };
 
-double UclockSec(void) { return (double)clock() / (double)CLOCKS_PER_SEC; }
+double UclockSec() { return static_cast<double>(clock()) / CLOCKS_PER_SEC; }
 
 int array_output_format = 0;
 
-void ShowOutput(const char* name, int N, int cplx, float flops, float t0,
+void ShowOutput(const char* name, int n, int cplx, float flops, float t0,
                 float t1, int max_iter) {
   float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
   if (array_output_format) {
@@ -64,7 +64,7 @@ void ShowOutput(const char* name, int N, int cplx, float flops, float t0,
       printf("|      n/a   ");
   } else {
     if (flops != -1) {
-      printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N,
+      printf("n=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", n,
              (cplx ? "CPLX" : "REAL"), name, mflops,
              (t1 - t0) / 2 / max_iter * 1e9, max_iter);
     }
@@ -72,55 +72,64 @@ void ShowOutput(const char* name, int N, int cplx, float flops, float t0,
   fflush(stdout);
 }
 
+absl::Status PffftMain() {
+  PffftSapiSandbox sandbox;
+  SAPI_RETURN_IF_ERROR(sandbox.Init());
+
+  return absl::OkStatus();
+}
+
 int main(int argc, char* argv[]) {
-  
   // Initialize Google's logging library.
   google::InitGoogleLogging(argv[0]);
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  // Nvalues is a vector keeping the values by which iterates N, its value
-  // representing the input length. More concrete, N is the number of
+  // kTransformSizes is a vector keeping the values by which iterates n, its value
+  // representing the input length. More concrete, n is the number of
   // data points the caclulus is up to (determinating its accuracy).
   // To show the performance of Fast-Fourier Transformations the program is
-  // testing for various values of N.
-  int Nvalues[] = {64,    96,     128,        160,         192,     256,
+  // testing for various values of n.
+  constexpr int kTransformSizes[] = {64,    96,     128,        160,         192,     256,
                    384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
                    1024,  2048,   2400,       4096,        8192,    9 * 1024,
                    16384, 32768};
-  int i;
 
   LOG(INFO) << "Initializing sandbox...\n";
 
   PffftSapiSandbox sandbox;
   absl::Status init_status = sandbox.Init();
 
-  LOG(INFO) << "Initialization: " << init_status.ToString().c_str() << "\n";
+  if (absl::Status status = PffftMain(); !status.ok()) {
+    LOG(ERROR) << "Initialization failed: " << status.ToString();
+    return EXIT_FAILURE;
+  }
 
-  pffftApi api(&sandbox);
+  LOG(INFO) << "Initialization: " << init_status.ToString();
+
+  PffftApi api(&sandbox);
   int cplx = 0;
 
   do {
-    for (int N : Nvalues) {
-      const int Nfloat = N * (cplx ? 2 : 1);
-      int Nbytes = Nfloat * sizeof(float);
+    for (int n : kTransformSizes) {
+      const int n_float = n * (cplx ? 2 : 1);
+      int n_bytes = n_float * sizeof(float);
 
-      float wrk[2 * Nfloat + 15 * sizeof(float)];
-      sapi::v::Array<float> wrk_(wrk, 2 * Nfloat + 15 * sizeof(float));
+      std::vector<float> work(2 * n_float + 15, 0.0);
+      sapi::v::Array<float> work_array(&work[0], work.size());
 
-      float X[Nbytes], Y[Nbytes], Z[Nbytes];
-      sapi::v::Array<float> X_(X, Nbytes), Y_(Y, Nbytes), Z_(Z, Nbytes);
+      float x[n_bytes], y[n_bytes], z[n_bytes];
+      sapi::v::Array<float> x_array(x, n_bytes), y_array(y, n_bytes), z_array(z, n_bytes);
 
-      double t0, t1, flops;
+      double t0;
+      double t1;
+      double flops;
 
-      int max_iter = 5120000 / N * 4;
-#ifdef __arm__
-      max_iter /= 4;
-#endif
-      int iter, k;
+      int k;
+      int max_iter = 5120000 / n * 4;
 
-      for (k = 0; k < Nfloat; ++k) {
-        X[k] = 0;
+      for (k = 0; k < n_float; ++k) {
+        x[k] = 0;
       }
 
       // FFTPack benchmark
@@ -130,50 +139,49 @@ int main(int argc, char* argv[]) {
 
         if (max_iter_ == 0) max_iter_ = 1;
         if (cplx) {
-          api.cffti(N, wrk_.PtrBoth()).IgnoreError();
+          api.cffti(n, work_array.PtrBoth()).IgnoreError();
         } else {
-          api.rffti(N, wrk_.PtrBoth()).IgnoreError();
+          api.rffti(n, work_array.PtrBoth()).IgnoreError();
         }
         t0 = UclockSec();
 
-        for (iter = 0; iter < max_iter_; ++iter) {
+        for (int iter = 0; iter < max_iter_; ++iter) {
           if (cplx) {
-            api.cfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-            api.cfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+            api.cfftf(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
+            api.cfftb(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
           } else {
-            api.rfftf(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
-            api.rfftb(N, X_.PtrBoth(), wrk_.PtrBoth()).IgnoreError();
+            api.rfftf(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
+            api.rfftb(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
           }
         }
         t1 = UclockSec();
 
         flops =
-            (max_iter_ * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-        ShowOutput("FFTPack", N, cplx, flops, t0, t1, max_iter_);
+            (max_iter_ * 2) * ((cplx ? 5 : 2.5) * n * log((double)n) / M_LN2);
+        ShowOutput("FFTPack", n, cplx, flops, t0, t1, max_iter_);
       }
       
       // PFFFT benchmark
       {
         sapi::StatusOr<PFFFT_Setup*> s =
-            api.pffft_new_setup(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+            api.pffft_new_setup(n, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
 
-        LOG(INFO) << "Setup status is: " << s.status().ToString().c_str()
-                  << "\n";
+        LOG(INFO) << "Setup status is: " << s.status().ToString();
 
         if (!s.ok()) {
           printf("Sandbox failed.\n");
-          return 1;
+          return EXIT_FAILURE;
         }
 
         sapi::v::RemotePtr s_reg(s.value());
 
         t0 = UclockSec();
-        for (iter = 0; iter < max_iter; ++iter) {
-          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
-                              Y_.PtrBoth(), PFFFT_FORWARD)
+        for (int iter = 0; iter < max_iter; ++iter) {
+          api.pffft_transform(&s_reg, x_array.PtrBoth(), z_array.PtrBoth(),
+                              y_array.PtrBoth(), PFFFT_FORWARD)
               .IgnoreError();
-          api.pffft_transform(&s_reg, X_.PtrBoth(), Z_.PtrBoth(),
-                              Y_.PtrBoth(), PFFFT_FORWARD)
+          api.pffft_transform(&s_reg, x_array.PtrBoth(), z_array.PtrBoth(),
+                              y_array.PtrBoth(), PFFFT_FORWARD)
               .IgnoreError();
         }
 
@@ -181,15 +189,15 @@ int main(int argc, char* argv[]) {
         api.pffft_destroy_setup(&s_reg).IgnoreError();
 
         flops =
-            (max_iter * 2) * ((cplx ? 5 : 2.5) * N * log((double)N) / M_LN2);
-        ShowOutput("PFFFT", N, cplx, flops, t0, t1, max_iter);
+            (max_iter * 2) * ((cplx ? 5 : 2.5) * n * log((double)n) / M_LN2);
+        ShowOutput("PFFFT", n, cplx, flops, t0, t1, max_iter);
 
-        LOG(INFO) << "N = " << N << " SUCCESSFULLY\n\n";
+        LOG(INFO) << "n = " << n << " SUCCESSFULLY";
       }
     }
 
     cplx = !cplx;
   } while (cplx);
 
-  return 0;
+  return EXIT_SUCCESS;
 }
\ No newline at end of file

From 4336c005d3b6505db883a34da40ceb03843d0a6e Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 27 Aug 2020 16:46:59 +0000
Subject: [PATCH 23/29] Added output format flag and made other required
 changes

---
 oss-internship-2020/pffft/README.md           |   8 +-
 .../pffft/main_pffft_sandboxed.cc             | 118 ++++++++++--------
 2 files changed, 72 insertions(+), 54 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index eaadb20..cf04563 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -46,7 +46,11 @@ the series that calculate the result of transformation. It is also
 important to mention that the `cplx` variable stands for a boolean value 
 that tells the type of transformation (0 for REAL and 1 for COMPLEX) and 
 it is taken into account while testing.
-In the end, the performance of PFFFT library it is outlined by the output.*
+In the end, the performance of PFFFT library it is outlined by the output.
+There are two output formats available, from which you can choose through
+from which you can choose through `--output_format=` command-line flag.
+Without using this type of argument when running, the output format is set
+by default.*
 
 #### CMake observations resume:
 * linking pffft and fftpack (which contains necessary functions for pffft)
@@ -60,6 +64,8 @@ In the end, the performance of PFFFT library it is outlined by the output.*
     meaning the number of points to which it is set the calculus 
     (more details of mathematical purpose of N - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm). 
 * output shows speed depending on the input length
+* use `--output_format=0` or `--output_format=1` arguments to choose between output formats.
+    `0` is for a detailed output, while `1` is only displaying each transformation process speed.
 
     
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index f6abf05..fc14319 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -12,16 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <stdlib.h>
-#include <string.h>
 #include <sys/times.h>
 #include <syscall.h>
-#include <time.h>
 
 #include <cassert>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 
 #include "pffft_sapi.sapi.h"
 #include "sandboxed_api/util/flag.h"
@@ -32,8 +33,7 @@ ABSL_DECLARE_FLAG(string, sandbox2_danger_danger_permit_all_and_log);
 
 class PffftSapiSandbox : public PffftSandbox {
  public:
-  std::unique_ptr<sandbox2::Policy> ModifyPolicy(
-      sandbox2::PolicyBuilder*) {
+  std::unique_ptr<sandbox2::Policy> ModifyPolicy(sandbox2::PolicyBuilder*) {
     return sandbox2::PolicyBuilder()
         .AllowStaticStartup()
         .AllowOpen()
@@ -50,14 +50,27 @@ class PffftSapiSandbox : public PffftSandbox {
   }
 };
 
-double UclockSec() { return static_cast<double>(clock()) / CLOCKS_PER_SEC; }
+// output_format flag determines whether the output shows information in detail
+// or not. By default, the flag is set as 0, meaning an elaborate display
+// (see ShowOutput method).
+static bool ValidateFlag(const char* flagname, int32_t value) {
+  if (value >= 0 && value < 32768) {
+    return true;
+  }
 
-int array_output_format = 0;
+  LOG(ERROR) << "Invalid value for --" << flagname << ".";
+  return false;
+}
+
+DEFINE_int32(output_format, 0, "Value to specific the output format.");
+DEFINE_validator(output_format, &ValidateFlag);
+
+double UclockSec() { return static_cast<double>(clock()) / CLOCKS_PER_SEC; }
 
 void ShowOutput(const char* name, int n, int cplx, float flops, float t0,
                 float t1, int max_iter) {
   float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
-  if (array_output_format) {
+  if (FLAGS_output_format) {
     if (flops != -1) {
       printf("|%9.0f   ", mflops);
     } else
@@ -76,40 +89,18 @@ absl::Status PffftMain() {
   PffftSapiSandbox sandbox;
   SAPI_RETURN_IF_ERROR(sandbox.Init());
 
-  return absl::OkStatus();
-}
-
-int main(int argc, char* argv[]) {
-  // Initialize Google's logging library.
-  google::InitGoogleLogging(argv[0]);
-
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-
-  // kTransformSizes is a vector keeping the values by which iterates n, its value
-  // representing the input length. More concrete, n is the number of
-  // data points the caclulus is up to (determinating its accuracy).
-  // To show the performance of Fast-Fourier Transformations the program is
-  // testing for various values of n.
-  constexpr int kTransformSizes[] = {64,    96,     128,        160,         192,     256,
-                   384,   5 * 96, 512,        5 * 128,     3 * 256, 800,
-                   1024,  2048,   2400,       4096,        8192,    9 * 1024,
-                   16384, 32768};
-
-  LOG(INFO) << "Initializing sandbox...\n";
-
-  PffftSapiSandbox sandbox;
-  absl::Status init_status = sandbox.Init();
-
-  if (absl::Status status = PffftMain(); !status.ok()) {
-    LOG(ERROR) << "Initialization failed: " << status.ToString();
-    return EXIT_FAILURE;
-  }
-
-  LOG(INFO) << "Initialization: " << init_status.ToString();
-
   PffftApi api(&sandbox);
   int cplx = 0;
 
+  // kTransformSizes is a vector keeping the values by which iterates n, its
+  // value representing the input length. More concrete, n is the number of data
+  // points the caclulus is up to (determinating its accuracy). To show the
+  // performance of Fast-Fourier Transformations the program is testing for
+  // various values of n.
+  constexpr int kTransformSizes[] = {
+      64,      96,  128,  160,  192,  256,  384,  5 * 96,   512,   5 * 128,
+      3 * 256, 800, 1024, 2048, 2400, 4096, 8192, 9 * 1024, 16384, 32768};
+
   do {
     for (int n : kTransformSizes) {
       const int n_float = n * (cplx ? 2 : 1);
@@ -118,26 +109,31 @@ int main(int argc, char* argv[]) {
       std::vector<float> work(2 * n_float + 15, 0.0);
       sapi::v::Array<float> work_array(&work[0], work.size());
 
-      float x[n_bytes], y[n_bytes], z[n_bytes];
-      sapi::v::Array<float> x_array(x, n_bytes), y_array(y, n_bytes), z_array(z, n_bytes);
+      std::vector<float> x(n_bytes, 0.0);
+      sapi::v::Array<float> x_array(&x[0], x.size());
+
+      std::vector<float> y(n_bytes, 0.0);
+      sapi::v::Array<float> y_array(&y[0], y.size());
+
+      std::vector<float> z(n_bytes, 0.0);
+      sapi::v::Array<float> z_array(&z[0], z.size());
 
       double t0;
       double t1;
       double flops;
 
-      int k;
       int max_iter = 5120000 / n * 4;
 
-      for (k = 0; k < n_float; ++k) {
+      for (int k = 0; k < n_float; ++k) {
         x[k] = 0;
       }
 
       // FFTPack benchmark
       {
         // SIMD_SZ == 4 (returning value of pffft_simd_size())
-        int max_iter_ = max_iter / 4;
+        int simd_size_iter = max_iter / 4;
 
-        if (max_iter_ == 0) max_iter_ = 1;
+        if (simd_size_iter == 0) simd_size_iter = 1;
         if (cplx) {
           api.cffti(n, work_array.PtrBoth()).IgnoreError();
         } else {
@@ -145,7 +141,7 @@ int main(int argc, char* argv[]) {
         }
         t0 = UclockSec();
 
-        for (int iter = 0; iter < max_iter_; ++iter) {
+        for (int iter = 0; iter < simd_size_iter; ++iter) {
           if (cplx) {
             api.cfftf(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
             api.cfftb(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
@@ -156,11 +152,11 @@ int main(int argc, char* argv[]) {
         }
         t1 = UclockSec();
 
-        flops =
-            (max_iter_ * 2) * ((cplx ? 5 : 2.5) * n * log((double)n) / M_LN2);
-        ShowOutput("FFTPack", n, cplx, flops, t0, t1, max_iter_);
+        flops = (simd_size_iter * 2) *
+                ((cplx ? 5 : 2.5) * n * log((double)n) / M_LN2);
+        ShowOutput("FFTPack", n, cplx, flops, t0, t1, simd_size_iter);
       }
-      
+
       // PFFFT benchmark
       {
         sapi::StatusOr<PFFFT_Setup*> s =
@@ -170,7 +166,7 @@ int main(int argc, char* argv[]) {
 
         if (!s.ok()) {
           printf("Sandbox failed.\n");
-          return EXIT_FAILURE;
+          return s.status();
         }
 
         sapi::v::RemotePtr s_reg(s.value());
@@ -188,8 +184,8 @@ int main(int argc, char* argv[]) {
         t1 = UclockSec();
         api.pffft_destroy_setup(&s_reg).IgnoreError();
 
-        flops =
-            (max_iter * 2) * ((cplx ? 5 : 2.5) * n * log((double)n) / M_LN2);
+        flops = (max_iter * 2) * ((cplx ? 5 : 2.5) * static_cast<double>(n) *
+                                  log((double)n) / M_LN2);
         ShowOutput("PFFFT", n, cplx, flops, t0, t1, max_iter);
 
         LOG(INFO) << "n = " << n << " SUCCESSFULLY";
@@ -199,5 +195,21 @@ int main(int argc, char* argv[]) {
     cplx = !cplx;
   } while (cplx);
 
+  return absl::OkStatus();
+}
+
+int main(int argc, char* argv[]) {
+  // Initialize Google's logging library.
+  google::InitGoogleLogging(argv[0]);
+
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  LOG(INFO) << "Initializing sandbox...\n";
+
+  if (absl::Status status = PffftMain(); !status.ok()) {
+    LOG(ERROR) << "Initialization failed: " << status.ToString();
+    return EXIT_FAILURE;
+  }
+
   return EXIT_SUCCESS;
 }
\ No newline at end of file

From 60b3b5057ce5c32eebd9c47600c71f8a31245c8e Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 27 Aug 2020 16:49:09 +0000
Subject: [PATCH 24/29] Small README correction

---
 oss-internship-2020/pffft/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index cf04563..4f2338a 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -47,8 +47,8 @@ important to mention that the `cplx` variable stands for a boolean value
 that tells the type of transformation (0 for REAL and 1 for COMPLEX) and 
 it is taken into account while testing.
 In the end, the performance of PFFFT library it is outlined by the output.
-There are two output formats available, from which you can choose through
-from which you can choose through `--output_format=` command-line flag.
+There are two output formats available, from which you can choose through 
+`--output_format=` command-line flag.
 Without using this type of argument when running, the output format is set
 by default.*
 

From b2351ec6395c7fc0ebe6a8b90d6becc0ed12bd8b Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 27 Aug 2020 16:55:55 +0000
Subject: [PATCH 25/29] Small coding style corrections

---
 oss-internship-2020/pffft/README.md           |  2 +-
 .../pffft/main_pffft_sandboxed.cc             | 27 +++++++++----------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index 4f2338a..d94a083 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -43,7 +43,7 @@ transformations and print the speed for each value and type of
 transformation. More specifically, the input length is the target for 
 accuracy (named as `N`) and it stands for the number of data points from 
 the series that calculate the result of transformation. It is also 
-important to mention that the `cplx` variable stands for a boolean value 
+important to mention that the `complex` variable stands for a boolean value 
 that tells the type of transformation (0 for REAL and 1 for COMPLEX) and 
 it is taken into account while testing.
 In the end, the performance of PFFFT library it is outlined by the output.
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index fc14319..33116ed 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -67,7 +67,7 @@ DEFINE_validator(output_format, &ValidateFlag);
 
 double UclockSec() { return static_cast<double>(clock()) / CLOCKS_PER_SEC; }
 
-void ShowOutput(const char* name, int n, int cplx, float flops, float t0,
+void ShowOutput(const char* name, int n, int complex, float flops, float t0,
                 float t1, int max_iter) {
   float mflops = flops / 1e6 / (t1 - t0 + 1e-16);
   if (FLAGS_output_format) {
@@ -78,7 +78,7 @@ void ShowOutput(const char* name, int n, int cplx, float flops, float t0,
   } else {
     if (flops != -1) {
       printf("n=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", n,
-             (cplx ? "CPLX" : "REAL"), name, mflops,
+             (complex ? "CPLX" : "REAL"), name, mflops,
              (t1 - t0) / 2 / max_iter * 1e9, max_iter);
     }
   }
@@ -90,7 +90,6 @@ absl::Status PffftMain() {
   SAPI_RETURN_IF_ERROR(sandbox.Init());
 
   PffftApi api(&sandbox);
-  int cplx = 0;
 
   // kTransformSizes is a vector keeping the values by which iterates n, its
   // value representing the input length. More concrete, n is the number of data
@@ -101,9 +100,9 @@ absl::Status PffftMain() {
       64,      96,  128,  160,  192,  256,  384,  5 * 96,   512,   5 * 128,
       3 * 256, 800, 1024, 2048, 2400, 4096, 8192, 9 * 1024, 16384, 32768};
 
-  do {
+  for (int complex : {0, 1}) {
     for (int n : kTransformSizes) {
-      const int n_float = n * (cplx ? 2 : 1);
+      const int n_float = n * (complex ? 2 : 1);
       int n_bytes = n_float * sizeof(float);
 
       std::vector<float> work(2 * n_float + 15, 0.0);
@@ -134,7 +133,7 @@ absl::Status PffftMain() {
         int simd_size_iter = max_iter / 4;
 
         if (simd_size_iter == 0) simd_size_iter = 1;
-        if (cplx) {
+        if (complex) {
           api.cffti(n, work_array.PtrBoth()).IgnoreError();
         } else {
           api.rffti(n, work_array.PtrBoth()).IgnoreError();
@@ -142,7 +141,7 @@ absl::Status PffftMain() {
         t0 = UclockSec();
 
         for (int iter = 0; iter < simd_size_iter; ++iter) {
-          if (cplx) {
+          if (complex) {
             api.cfftf(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
             api.cfftb(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
           } else {
@@ -153,14 +152,14 @@ absl::Status PffftMain() {
         t1 = UclockSec();
 
         flops = (simd_size_iter * 2) *
-                ((cplx ? 5 : 2.5) * n * log((double)n) / M_LN2);
-        ShowOutput("FFTPack", n, cplx, flops, t0, t1, simd_size_iter);
+                ((complex ? 5 : 2.5) * n * log((double)n) / M_LN2);
+        ShowOutput("FFTPack", n, complex, flops, t0, t1, simd_size_iter);
       }
 
       // PFFFT benchmark
       {
         sapi::StatusOr<PFFFT_Setup*> s =
-            api.pffft_new_setup(n, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
+            api.pffft_new_setup(n, complex ? PFFFT_COMPLEX : PFFFT_REAL);
 
         LOG(INFO) << "Setup status is: " << s.status().ToString();
 
@@ -184,16 +183,14 @@ absl::Status PffftMain() {
         t1 = UclockSec();
         api.pffft_destroy_setup(&s_reg).IgnoreError();
 
-        flops = (max_iter * 2) * ((cplx ? 5 : 2.5) * static_cast<double>(n) *
+        flops = (max_iter * 2) * ((complex ? 5 : 2.5) * static_cast<double>(n) *
                                   log((double)n) / M_LN2);
-        ShowOutput("PFFFT", n, cplx, flops, t0, t1, max_iter);
+        ShowOutput("PFFFT", n, complex, flops, t0, t1, max_iter);
 
         LOG(INFO) << "n = " << n << " SUCCESSFULLY";
       }
     }
-
-    cplx = !cplx;
-  } while (cplx);
+  } 
 
   return absl::OkStatus();
 }

From 905f86a0ab64a78ca05e038b7bd553ca5fd1b85e Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 27 Aug 2020 16:59:58 +0000
Subject: [PATCH 26/29] Small README update regarding variables names

---
 oss-internship-2020/pffft/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index d94a083..a380ff4 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -41,7 +41,7 @@ Regarding the testing of the methods, one main is doing this job by
 iterating through a set of values, that represents the accuracy of 
 transformations and print the speed for each value and type of 
 transformation. More specifically, the input length is the target for 
-accuracy (named as `N`) and it stands for the number of data points from 
+accuracy (named as `n`) and it stands for the number of data points from 
 the series that calculate the result of transformation. It is also 
 important to mention that the `complex` variable stands for a boolean value 
 that tells the type of transformation (0 for REAL and 1 for COMPLEX) and 
@@ -60,9 +60,9 @@ by default.*
 * containing two testing parts (fft / pffft benchmarks)
 * showing the performance of the transformations implies 
     testing them through various FFT dimenstions. 
-    Variable N, the input length, will take specific values 
+    Variable n, the input length, will take specific values 
     meaning the number of points to which it is set the calculus 
-    (more details of mathematical purpose of N - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm). 
+    (more details of mathematical purpose of n - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm). 
 * output shows speed depending on the input length
 * use `--output_format=0` or `--output_format=1` arguments to choose between output formats.
     `0` is for a detailed output, while `1` is only displaying each transformation process speed.
@@ -72,8 +72,8 @@ by default.*
 ### Bugs history
 1. [Solved] pffft benchmark bug: "Sandbox not active"  
         
-        N = 64, status OK, pffft_transform generates error 
-        N > 64, status not OK
+        n = 64, status OK, pffft_transform generates error 
+        n > 64, status not OK
         Problem on initialising sapi::StatusOr<PFFFT_Setup *> s; the memory that stays 
         for s is not the same with the address passed in pffft_transform function. 
         (sapi::v::GenericPtr - to be changed)

From 531850afc219947ccc4fd3d5208b0049b55a42c6 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 31 Aug 2020 11:17:15 +0000
Subject: [PATCH 27/29] Required changes resolved

---
 oss-internship-2020/pffft/README.md               | 2 +-
 oss-internship-2020/pffft/main_pffft_sandboxed.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index a380ff4..194c6b6 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -4,7 +4,7 @@ Build System: CMake
 OS: Linux
 
 ### Check out the PFFFT library & CMake set up
-`git clone https://github.com/doinachiroiu/sandboxed-api/tree/master/oss-internship-2020/pffft`
+`git checkout -b master`
 
 `git submodule update --init --recursive`
 
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index 33116ed..ea42a26 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -86,6 +86,8 @@ void ShowOutput(const char* name, int n, int complex, float flops, float t0,
 }
 
 absl::Status PffftMain() {
+  LOG(INFO) << "Initializing sandbox...\n";
+
   PffftSapiSandbox sandbox;
   SAPI_RETURN_IF_ERROR(sandbox.Init());
 
@@ -201,8 +203,6 @@ int main(int argc, char* argv[]) {
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  LOG(INFO) << "Initializing sandbox...\n";
-
   if (absl::Status status = PffftMain(); !status.ok()) {
     LOG(ERROR) << "Initialization failed: " << status.ToString();
     return EXIT_FAILURE;

From 7e22952c424536dd950ccf32e42a9abc6ce0a6f7 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Mon, 31 Aug 2020 11:48:19 +0000
Subject: [PATCH 28/29] Setup error handling modified

---
 .../pffft/main_pffft_sandboxed.cc                | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index ea42a26..51e271d 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -160,17 +160,11 @@ absl::Status PffftMain() {
 
       // PFFFT benchmark
       {
-        sapi::StatusOr<PFFFT_Setup*> s =
-            api.pffft_new_setup(n, complex ? PFFFT_COMPLEX : PFFFT_REAL);
+        SAPI_ASSIGN_OR_RETURN(
+            PFFFT_Setup *s,
+            api.pffft_new_setup(n, complex ? PFFFT_COMPLEX : PFFFT_REAL));
 
-        LOG(INFO) << "Setup status is: " << s.status().ToString();
-
-        if (!s.ok()) {
-          printf("Sandbox failed.\n");
-          return s.status();
-        }
-
-        sapi::v::RemotePtr s_reg(s.value());
+        sapi::v::RemotePtr s_reg(s);
 
         t0 = UclockSec();
         for (int iter = 0; iter < max_iter; ++iter) {
@@ -192,7 +186,7 @@ absl::Status PffftMain() {
         LOG(INFO) << "n = " << n << " SUCCESSFULLY";
       }
     }
-  } 
+  }
 
   return absl::OkStatus();
 }

From 1869fe515ff110f5b3b12f72ff0e5d9afd04a314 Mon Sep 17 00:00:00 2001
From: doinachiroiu <inach@google.com>
Date: Thu, 3 Sep 2020 14:59:54 +0000
Subject: [PATCH 29/29] Required changes resolved

---
 oss-internship-2020/pffft/CMakeLists.txt      |  3 --
 oss-internship-2020/pffft/README.md           | 15 ++++----
 .../pffft/main_pffft_sandboxed.cc             | 36 ++++++++++---------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/oss-internship-2020/pffft/CMakeLists.txt b/oss-internship-2020/pffft/CMakeLists.txt
index be687cd..b172009 100644
--- a/oss-internship-2020/pffft/CMakeLists.txt
+++ b/oss-internship-2020/pffft/CMakeLists.txt
@@ -19,8 +19,6 @@ project(pffft CXX C)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
 
-set(PFFFT_ROOT_DIR https://bitbucket.org/jpommier/pffft.git)
-
 add_library(pffft STATIC 
     master/pffft.c 
     master/pffft.h 
@@ -45,7 +43,6 @@ endif()
 
 target_link_libraries(pffft PUBLIC ${MATH_LIBS})
 
-
 # Adding dependencies
 set(SAPI_ROOT "../.." CACHE PATH "Path to the Sandboxed API source tree")
 # Then configure:
diff --git a/oss-internship-2020/pffft/README.md b/oss-internship-2020/pffft/README.md
index 194c6b6..2e023c2 100644
--- a/oss-internship-2020/pffft/README.md
+++ b/oss-internship-2020/pffft/README.md
@@ -4,16 +4,13 @@ Build System: CMake
 OS: Linux
 
 ### Check out the PFFFT library & CMake set up
-`git checkout -b master`
-
-`git submodule update --init --recursive`
-
-`mkdir -p build && cd build`
-
-`cmake .. -G Ninja -DPFFFT_ROOT_DIR=$PWD`
-
-`ninja`
+```
+git submodule update --init --recursive
 
+mkdir -p build && cd build
+cmake .. -G Ninja -DPFFFT_ROOT_DIR=$PWD
+ninjas
+```
 ### For testing: 
 `cd build`, then `./pffft_sandboxed`
 
diff --git a/oss-internship-2020/pffft/main_pffft_sandboxed.cc b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
index 51e271d..f922275 100644
--- a/oss-internship-2020/pffft/main_pffft_sandboxed.cc
+++ b/oss-internship-2020/pffft/main_pffft_sandboxed.cc
@@ -136,51 +136,55 @@ absl::Status PffftMain() {
 
         if (simd_size_iter == 0) simd_size_iter = 1;
         if (complex) {
-          api.cffti(n, work_array.PtrBoth()).IgnoreError();
+          SAPI_RETURN_IF_ERROR(api.cffti(n, work_array.PtrBoth()))
         } else {
-          api.rffti(n, work_array.PtrBoth()).IgnoreError();
+          SAPI_RETURN_IF_ERROR(api.rffti(n, work_array.PtrBoth()));
         }
         t0 = UclockSec();
 
         for (int iter = 0; iter < simd_size_iter; ++iter) {
           if (complex) {
-            api.cfftf(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
-            api.cfftb(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
+            SAPI_RETURN_IF_ERROR(
+                api.cfftf(n, x_array.PtrBoth(), work_array.PtrBoth()));
+            SAPI_RETURN_IF_ERROR(
+                api.cfftb(n, x_array.PtrBoth(), work_array.PtrBoth()));
           } else {
-            api.rfftf(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
-            api.rfftb(n, x_array.PtrBoth(), work_array.PtrBoth()).IgnoreError();
+            SAPI_RETURN_IF_ERROR(
+                api.rfftf(n, x_array.PtrBoth(), work_array.PtrBoth()));
+            SAPI_RETURN_IF_ERROR(
+                api.rfftb(n, x_array.PtrBoth(), work_array.PtrBoth()));
           }
         }
         t1 = UclockSec();
 
         flops = (simd_size_iter * 2) *
-                ((complex ? 5 : 2.5) * n * log((double)n) / M_LN2);
+                ((complex ? 5 : 2.5) * static_cast<double>(n) * log(static_cast<double>(n)) / M_LN2);
         ShowOutput("FFTPack", n, complex, flops, t0, t1, simd_size_iter);
       }
 
       // PFFFT benchmark
       {
         SAPI_ASSIGN_OR_RETURN(
-            PFFFT_Setup *s,
+            PFFFT_Setup * s,
             api.pffft_new_setup(n, complex ? PFFFT_COMPLEX : PFFFT_REAL));
 
         sapi::v::RemotePtr s_reg(s);
 
         t0 = UclockSec();
         for (int iter = 0; iter < max_iter; ++iter) {
-          api.pffft_transform(&s_reg, x_array.PtrBoth(), z_array.PtrBoth(),
-                              y_array.PtrBoth(), PFFFT_FORWARD)
-              .IgnoreError();
-          api.pffft_transform(&s_reg, x_array.PtrBoth(), z_array.PtrBoth(),
-                              y_array.PtrBoth(), PFFFT_FORWARD)
-              .IgnoreError();
+          SAPI_RETURN_IF_ERROR(
+              api.pffft_transform(&s_reg, x_array.PtrBoth(), z_array.PtrBoth(),
+                                  y_array.PtrBoth(), PFFFT_FORWARD));
+          SAPI_RETURN_IF_ERROR(
+              api.pffft_transform(&s_reg, x_array.PtrBoth(), z_array.PtrBoth(),
+                                  y_array.PtrBoth(), PFFFT_FORWARD));
         }
 
         t1 = UclockSec();
-        api.pffft_destroy_setup(&s_reg).IgnoreError();
+        SAPI_RETURN_IF_ERROR(api.pffft_destroy_setup(&s_reg));
 
         flops = (max_iter * 2) * ((complex ? 5 : 2.5) * static_cast<double>(n) *
-                                  log((double)n) / M_LN2);
+                                  log(static_cast<double>(n)) / M_LN2);
         ShowOutput("PFFFT", n, complex, flops, t0, t1, max_iter);
 
         LOG(INFO) << "n = " << n << " SUCCESSFULLY";