/* Header file for super parallel fft library */
#ifndef _spfft_h
#define _spfft_h

/* 
OPTIONS:
-------
Edit the following parameters to change precision, the sign of the
exponential function, and the scaling of forwards and inverse transforms.
Remember to recompile everything after changing parameters! */

/* PRECISION:
#define float_type 1 
for single precision
#define float_type 2
for double precision 
*/
#define float_type 1

/* EXPONENTIAL SIGN:
#define exp_sign 1 
for exp(2*pi*i...) in forwards transform and exp(-2*pi*i...) in the inverse.
#define exp_sign -1
for exp(-2*pi*i...) in forwards transform and exp(2*pi*i...) in the inverse.
*/
#define exp_sign 1

/* SCALING:
#define fft_scaling 0
for no scaling in forwards transform and no scaling in inverse transform.
#define fft_scaling 1
for no scaling in forwards transform and scaling by 1/N in inverse transform.
#define fft_scaling 2
for scaling by 1/N in forwards transform and no scaling in inverse transform.
#define fft_scaling 3
for scaling by 1/sqrt(N) in both forwards and inverse transform.
*/
#define fft_scaling 1

/* --------
END OPTIONS */

/* For later convenience, we label the float type as FLTYP */
#if (float_type == 1)
#define FLTYP float
#else
#define FLTYP double
#endif

/* c-include files */
#include <stdlib.h>
#include <stdio.h>
#include <reduce.h>
#include <mpl.h>
#include <sys/time.h>
#include <math.h>

#include "p3pack.h"
#include "genutil.h"

/* Define type PLCPLX (plural complex) as array of two plural FLTYP */
typedef plural FLTYP PLCPLX[2];

/* Define control structure for the FFT computations */
struct FFTCTL {
   PLCPLX *twdfac;              /* list of twiddle factors */
   plural unsigned char *twdpt; /* pointer to tw-factor list */
   short  lsttwp;               /* length of list twdpt */
   short *actgr,nactgr;         /* list of and no. of active groups */
   int mb[3], mbits,actcnt; /* mem-config, no. of mem-bits, no. of act bts. */
   /* bit-masks for: memory bits, x- and y-proc, active bits, active groups  */
   unsigned membits,xbits,ybits,actbits,*agrbits;
   FLTYP scale;             /* scaling of inverse transform */
   /* arrays to find identities of f-coeffs (see spfft.doc) */
   plural unsigned *fc,*ifc; 
   /* Bit linear permuts: unscrambling, scrambl.,  post permutation, post*unscr, 
   memory_interlaced->memory_bits left, inverse of that */
   BL_PERMUT   unscr_blp,scr_blp,post_blp,final_blp,mem2l_blp,l2mem_blp;
   PERMUT_ROUT final_rout; /* routing of final permutation */
   char symmetry; /* 0 for none, 1 for real data */
   /* routing of real->complex unpacking */
   plural unsigned short *sPackRout1, *sPackRout2; 
};

/* Function declarations */

struct FFTCTL *fft_init();
struct FFTCTL *fft_ainit();
void smb_fft();
void sp_fft();
void fft_srt();
void fft_free();
void fft_permut();

/* in case M_PI is not defined in math.h */
#ifndef M_PI
#define M_PI  3.14159265358979323846
#endif

/* Macros */
/* for easier optimization we define the crucial parts of the code in
the preprocessor (inline expansion instead of funciton call): */
 
/* define macro to compute the memory and proc. address for a data bit
   given the long address */
#define mp_adr(lad,mad,pad,mb)\
{  unsigned _tmp;\
   _tmp = lad;\
   mad = _tmp & ((1<<((mb)[0]))-1);                 /* m0 bits */ \
   _tmp >>= (mb)[0];\
   pad = _tmp & (nxproc-1);                         /* nx bits */ \
   _tmp >>= lnxproc;\
   mad |= (_tmp & ((1<<((mb)[1]))-1)) << ((mb)[0]); /* m1 bits */ \
   _tmp >>= (mb)[1];\
   pad |= (_tmp &(nyproc-1)) << lnxproc;            /* ny bits */ \
   _tmp >>= lnyproc;\
   mad |= _tmp << (((mb)[0])+((mb)[1]));            /* m2 bits */ \
}
 
 
/* define macro to compute the long address for a data bit on each proc */
#define long_adr(lad,mad,mb) \
{  unsigned _tmp;\
   _tmp = (mad) >> ((mb)[0]+(mb)[1]);          /* find m2 bits */ \
   lad = (_tmp << lnyproc) | iyproc;           /* lad  = (m2,iy) */ \
   if ((mb)[1]){\
      lad  <<= (mb)[1];                        /* lad  = (m2,iy,0) */ \
      _tmp = ((mad) >> ((mb)[0])) & ((1<<((mb)[1])) -1); /* find m1 bits */ \
      lad  |= _tmp;                            /* lad  = (m2,iy,m1) */ \
   }\
   lad  <<= lnxproc;                           /* lad  = (m2,iy,m1,0) */ \
   lad  |= ixproc;                             /* lad  = (m2,iy,m1,ix) */ \
   if ((mb)[0]){\
      lad  <<= (mb)[0];                        /* lad  = (m2,iy,m1,ix,0) */ \
      _tmp = (mad) & ((1<<((mb)[0])) -1);      /* find m0 bits */ \
      lad  |= _tmp;                            /* lad  = (m2,iy,m1,ix,m0) */ \
   }\
}
 

#endif