!{\src2tex{textfont=tt}}
!!****m* ABINIT/m_fftw3
!! NAME
!! m_fftw3
!!
!! FUNCTION
!!  This module provides wrappers for the FFTW3 routines: in-place and out-of-place version.
!!
!! COPYRIGHT
!! Copyright (C) 2009-2012 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!!
!! NOTES
!!  1) MPI parallelism is not supported 
!!  2) For better performance the FFT divisions should contain small factors  (/2, 3, 5, 7, 11/).
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

! It seems that MKL wrappers do not like the advanced interfaces for 
! r2c and c2r transforms although they work fine if the true FFTW3 library is used.
#define DEV_RC_BUG

MODULE m_fftw3

 use defs_basis
 use m_profiling
 use m_errors
 use m_timer
 use m_xomp
 use m_blas
#ifdef HAVE_FC_ISO_C_BINDING
 use iso_c_binding
#else
 use m_iso_c_binding
#endif

 use m_numeric_tools,   only : imax_loc

 implicit none

 private

 ! Public procedures exported by the module.
 public :: fftw3_cleanup        ! Reset FFTW to the pristine state it was in when the program started.
 public :: fftw3_init_threads   ! One-time initialization required to use FFTW3 threads.
 public :: fftw3_set_nthreads   ! Set the number of threads you want FFTW3 to use when HAVE_FFT_FFTW3_THREADS is defined.
 public :: fftw3_r2c_op         ! Real to complex transform (out-of-place version).
 public :: fftw3_c2r_op         ! Complex to real transform (out-of-place version).
 public :: fftw3_c2c_op         ! Complex to complex transform (out-of-place version).
 public :: fftw3_c2c_ip         ! Complex to complex transform (in-place version).
 public :: fftw3_many_dft_op    ! Driver routine for many out-of-place 3D complex-to-complex FFTs.
 public :: fftw3_many_dft_ip    ! Driver routine for many in-place 3D complex-to-complex FFTs.
 public :: fftw3_fftpad         ! Driver routine for zero-padded FFT of wavefunctions.
 public :: fftw3_fftpad_cplx    ! Driver routine for zero-padded FFT of wavefunctions.

 public :: fftw3_fftpad_tr      ! Still under development.

! Flags copied from fftw3.f. The numerical values are dictated by that file:
! do not change them here.
 integer,public,parameter :: FFTW_R2HC=0
 integer,public,parameter :: FFTW_HC2R=1
 integer,public,parameter :: FFTW_DHT=2
 integer,public,parameter :: FFTW_REDFT00=3
 integer,public,parameter :: FFTW_REDFT01=4
 integer,public,parameter :: FFTW_REDFT10=5
 integer,public,parameter :: FFTW_REDFT11=6
 integer,public,parameter :: FFTW_RODFT00=7
 integer,public,parameter :: FFTW_RODFT01=8
 integer,public,parameter :: FFTW_RODFT10=9
 integer,public,parameter :: FFTW_RODFT11=10
 integer,public,parameter :: FFTW_FORWARD=-1
 integer,public,parameter :: FFTW_BACKWARD=+1
 integer,public,parameter :: FFTW_MEASURE=0
 integer,public,parameter :: FFTW_DESTROY_INPUT=1
 integer,public,parameter :: FFTW_UNALIGNED=2
 integer,public,parameter :: FFTW_CONSERVE_MEMORY=4
 integer,public,parameter :: FFTW_EXHAUSTIVE=8
 integer,public,parameter :: FFTW_PRESERVE_INPUT=16
 integer,public,parameter :: FFTW_PATIENT=32
 integer,public,parameter :: FFTW_ESTIMATE=64
 integer,public,parameter :: FFTW_ESTIMATE_PATIENT=128
 integer,public,parameter :: FFTW_BELIEVE_PCOST=256
 integer,public,parameter :: FFTW_NO_DFT_R2HC=512
 integer,public,parameter :: FFTW_NO_NONTHREADED=1024
 integer,public,parameter :: FFTW_NO_BUFFERING=2048
 integer,public,parameter :: FFTW_NO_INDIRECT_OP=4096
 integer,public,parameter :: FFTW_ALLOW_LARGE_GENERIC=8192
 integer,public,parameter :: FFTW_NO_RANK_SPLITS=16384
 integer,public,parameter :: FFTW_NO_VRANK_SPLITS=32768
 integer,public,parameter :: FFTW_NO_VRECURSE=65536
 integer,public,parameter :: FFTW_NO_SIMD=131072
 integer,public,parameter :: FFTW_NO_SLOW=262144
 integer,public,parameter :: FFTW_NO_FIXED_RADIX_LARGE_N=524288
 integer,public,parameter :: FFTW_ALLOW_PRUNING=1048576
 integer,public,parameter :: FFTW_WISDOM_ONLY=2097152
! End of the flags copied from fftw3.f

! ==========================================================================================
! ==== Variables introduced for the FFTW3 interface in abinit. Not belonging to fftw3.f ====
! ==========================================================================================

 integer,public,parameter :: NULL_PLAN = 0
 ! Value of a plan handle that does not refer to a valid plan.
 ! MKL wrappers might return NULL_PLAN if a particular FFTW3 feature is not available.

 integer,public,parameter :: KIND_FFTW_PLAN = 8
 ! Integer kind used to store FFTW plan handles.
 ! It should be at least integer*@SIZEOF_INT_P@
 ! MKL wrappers require it to be integer*8, so do _not_ use C_INTPTR_T.

 integer,private,save :: THREADS_INITED = 0
 ! 1 if threads have been initialized, 0 otherwise.
 ! Set by fftw3_init_threads and reset by fftw3_cleanup.

!!***

!----------------------------------------------------------------------

!!****t* m_fftw3/zpad_t
!! NAME
!!  zpad_t
!! 
!! FUNCTION
!! 
!! SOURCE

 ! Index tables (z-planes and 1D x-lines) for a zero-padded FFT.
 ! NOTE(review): the routines that fill and consume this type are not visible
 ! in this section of the file; the descriptions below come from the
 ! component comments only.
 type,public :: zpad_t

   integer :: nlinex
   ! Total number of 1D transforms.

   integer :: n_zplanes
   ! Number of z-planes intersecting the sphere.

   integer,pointer :: zplane(:,:) => null()
   ! zplane(3,n_zplanes)
   ! zplane(1,zpl) : mapping z-plane index -> FFT index_z
   ! zplane(2,zpl) : mapping z-plane index -> igb index in array gboud 
   ! NOTE(review): the first dimension is documented as 3 but only rows 1 and 2
   ! are described -- confirm the actual extent and the meaning of row 3.
   ! ("gboud" presumably refers to the gbound array -- confirm.)

   integer,pointer :: linex2ifft_yz(:,:) => null()
   ! linex2ifft_yz(2,nlinex)
   ! mapping 1D-FFT -> (FFT_index_y, FFT index_z)

 end type zpad_t
!!***

!----------------------------------------------------------------------

!!****t* m_fftw3/fftw3_plan3_t
!! NAME
!! fftw3_plan3_t     
!! 
!! FUNCTION
!!  Structure storing the pointer to the FFTW plan as well as the options used to generate it. 
!! 
!! SOURCE

 ! Private record holding an FFTW plan handle together with the parameters
 ! used to build it. Every component has a default value, so a freshly
 ! declared variable starts with plan=NULL_PLAN.
 ! NOTE(review): the component names idist/odist/istride/ostride/inembed/onembed
 ! mirror the arguments of the FFTW advanced planner interface; presumably they
 ! store the values passed to it -- confirm against the code that fills the type.
 type,private :: fftw3_plan3_t
   integer :: isign=0                           ! Sign of the exponential in the FFT
   integer :: ndat=-1                           ! Number of FFTs associated to the plan
   integer :: flags=-HUGE(0)                    ! FFTW3 flags used to construct the plan.
   integer(KIND_FFTW_PLAN) :: plan=NULL_PLAN    ! FFTW3 plan.
   integer :: nthreads=1                        ! The number of threads associated to the plan.
   integer :: idist=-1                          ! Presumably the distance between consecutive input transforms.
   integer :: odist=-1                          ! Presumably the distance between consecutive output transforms.
   integer :: istride=-1                        ! Presumably the stride of the input data.
   integer :: ostride=-1                        ! Presumably the stride of the output data.
   integer :: n(3)=-1                           ! The number of FFT divisions.
   integer :: inembed(3)=-1                     ! Presumably the physical dimensions of the input array.
   integer :: onembed(3)=-1                     ! Presumably the physical dimensions of the output array.
   !integer(C_INT) :: alignment(2)              ! The alignment of the arrays used to construct the plan.
 end type fftw3_plan3_t
!!***

! private Variables

CONTAINS  !===========================================================

!!****f* m_fftw3/fftw3_c2c_ip
!! NAME
!!  fftw3_c2c_ip
!!
!! FUNCTION
!! Driver routine for in-place 3D complex-complex FFT.
!!
!! INPUTS
!! nx,ny,nz=Number of points along the three directions.
!! ldx,ldy,ldz=Physical dimensions of the array.
!! ndat=Number of FFTs to be done.
!! isign= +1 : ff(G) => ff(R); -1 : ff(R) => ff(G)
!! [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. 
!!   Defaults to FFTW_ESTIMATE.
!!
!! SIDE EFFECTS
!!  ff(ldx*ldy*ldz*ndat)=
!!    In input: the complex array to be transformed.
!!    In output: the Fourier transformed in the space specified by isign.
!!
!! PARENTS
!!      fftw3_fourdp
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_c2c_ip(nx,ny,nz,ldx,ldy,ldz,ndat,isign,ff,fftw_flags)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_c2c_ip'
!End of the abilint section

 implicit none 

! In-place 3D complex-to-complex FFT of ndat arrays stored one after the other in ff.
!   nx,ny,nz    : logical FFT dimensions.
!   ldx,ldy,ldz : physical (leading) dimensions of each array in ff.
!   ndat        : number of transforms.
!   isign       : FFTW_BACKWARD (+1) or FFTW_FORWARD (-1). Forward results are
!                 divided by nx*ny*nz, backward results are left unscaled.
!   ff          : data, overwritten with the transform.
!   fftw_flags  : optional planner flags (default FFTW_ESTIMATE).
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign
 integer,optional,intent(in) :: fftw_flags
!arrays
 complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 ! nt_all is forwarded as the last argument of zplan_many_dft.
 ! NOTE(review): presumably -1 means "use all available threads" -- confirm in zplan_many_dft.
 integer,parameter :: rank=3,nt_all=-1
 integer :: my_flags,dist,stride
 integer(KIND_FFTW_PLAN) :: my_plan 
!arrays
 integer :: embed(rank),n(rank)

! *************************************************************************

 my_flags=FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags=fftw_flags

 ! Layout of the data: contiguous elements, one full physical box per transform.
 stride = 1
 dist   = ldx*ldy*ldz
 embed  = (/ldx,ldy,ldz/)
 n      = (/nx ,ny ,nz /) 

 ! Same array and same layout for input and output: in-place plan.
 my_plan = zplan_many_dft(rank, n, ndat, ff, embed, stride, dist, ff, embed, stride, dist, isign, my_flags, nt_all)

 ! Now perform the 3D FFT via FFTW.
 call dfftw_execute_dft(my_plan, ff, ff)

 ! The plan is used only once.
 call fftw3_destroy_plan(my_plan)

 if (isign==FFTW_FORWARD) then ! -1, FFTW returns not normalized FTs
   call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), ff, 1) 
 end if

#else 
 MSG_ERROR("FFTW3 support not activated")
 ! Reference every dummy argument to avoid unused-argument warnings.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
 ABI_UNUSED(ff)
 if (PRESENT(fftw_flags)) then
   ABI_UNUSED(fftw_flags)
 end if
#endif

end subroutine fftw3_c2c_ip
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_c2c_op
!! NAME
!!  fftw3_c2c_op
!!
!! FUNCTION
!! Driver routine for out-of-place 3D complex-complex FFT of lengths nx, ny, nz.
!!
!! INPUTS
!! nx,ny,nz=Number of points along the three directions.
!! ldx,ldy,ldz=Physical dimensions of the array.
!! ndat=Number of FFTs to be done.
!! isign= +1 : ff(G) => gg(R); -1 : ff(R) => gg(G)
!! ff(ldx*ldy*ldz*ndat)=The array to be transformed.
!! [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. 
!!   Defaults to FFTW_ESTIMATE.
!!
!! OUTPUT 
!! gg(ldx*ldy*ldz*ndat)=The FFT of ff.
!!
!! PARENTS
!!      fftw3_fourdp
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_c2c_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,ff,gg,fftw_flags)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_c2c_op'
!End of the abilint section

 implicit none 

! Out-of-place 3D complex-to-complex FFT of ndat arrays: gg = FFT(ff).
!   nx,ny,nz    : logical FFT dimensions.
!   ldx,ldy,ldz : physical (leading) dimensions, shared by ff and gg.
!   ndat        : number of transforms.
!   isign       : FFTW_BACKWARD (+1) or FFTW_FORWARD (-1). Forward results are
!                 divided by nx*ny*nz, backward results are left unscaled.
!   ff          : input data.
!   gg          : output data.
!   fftw_flags  : optional planner flags (default FFTW_ESTIMATE).
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,isign,ndat
 integer,optional,intent(in) :: fftw_flags
!arrays
 complex(dpc),intent(in) :: ff(ldx*ldy*ldz*ndat)
 complex(dpc),intent(out) :: gg(ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3 
!Local variables-------------------------------
!scalars
 ! nt_all is forwarded as the last argument of zplan_many_dft.
 ! NOTE(review): presumably -1 means "use all available threads" -- confirm in zplan_many_dft.
 integer,parameter :: rank=3,nt_all=-1
 integer :: my_flags,dist,stride
 integer(KIND_FFTW_PLAN) :: my_plan
!arrays
 integer :: embed(rank),n(rank)

! *************************************************************************

 my_flags=FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags

 ! Input and output share the same layout: contiguous elements,
 ! one full physical box per transform.
 stride = 1
 dist   = ldx*ldy*ldz
 embed  = (/ldx,ldy,ldz/) 
 n      = (/nx ,ny ,nz/) 

 my_plan = zplan_many_dft(rank, n, ndat, ff, embed, stride, dist, gg, embed, stride, dist, isign, my_flags, nt_all)

 ! Now perform the 3D FFT via FFTW.
 call dfftw_execute_dft(my_plan, ff, gg)

 ! The plan is used only once.
 call fftw3_destroy_plan(my_plan)

 if (isign==FFTW_FORWARD) then ! -1, FFTW returns not normalized FTs
   call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), gg, 1)  
 end if

#else 
 MSG_ERROR("FFTW3 support not activated")
 ! Reference every dummy argument to avoid unused-argument warnings.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
 ABI_UNUSED(ff)
 ABI_UNUSED(gg)
 if (PRESENT(fftw_flags)) then
   ABI_UNUSED(fftw_flags)
 end if
#endif

end subroutine fftw3_c2c_op
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_r2c_op
!! NAME
!!  fftw3_r2c_op
!!
!! FUNCTION
!! Driver routine for out-of-place 3D real-to-complex FFT of lengths nx, ny, nz. 
!!
!! INPUTS
!! nx,ny,nz=Number of points along the three directions.
!! ldx,ldy,ldz=Physical dimensions of the f array (to avoid cache conflicts).
!! ff(ldx*ldy*ldz*ndat)=The real array to be transformed.
!! ndat=Number of FFTs to be done.
!! [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. 
!!   Defaults to FFTW_ESTIMATE.
!!
!! OUTPUT 
!! gg(2,ldx*ldy*ldz*ndat)=The forward FFT of ff.
!!
!! NOTES
!!  FIXME For the time-being. No augmentation of the mesh to reduce memory conflicts, as MKL crashes 
!!  if the advanced interface is used.
!!
!! PARENTS
!!      fftw3_fourdp
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_r2c_op(nx,ny,nz,ldx,ldy,ldz,ndat,ff,gg,fftw_flags)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_r2c_op'
!End of the abilint section

 implicit none 

! Out-of-place 3D real-to-complex (forward) FFT.
!   nx,ny,nz    : logical FFT dimensions.
!   ldx,ldy,ldz : physical (leading) dimensions of ff and gg.
!   ndat        : number of transforms.
!   ff          : real input data.
!   gg          : full complex transform, stored as (re,im) pairs and divided by nx*ny*nz.
!   fftw_flags  : optional planner flags (default FFTW_ESTIMATE).
! FFTW only returns the non-redundant half of the spectrum ((nx/2+1)*ny*nz points);
! the other half is rebuilt here from the Hermitian symmetry.
! When DEV_RC_BUG is defined (see the top of the file) the basic FFTW interface is used:
! in that case ndat must be 1 and the physical dimensions must equal the logical ones.
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat
 integer,optional,intent(in) :: fftw_flags
!arrays
 real(dp),intent(in) :: ff(ldx*ldy*ldz*ndat)
 real(dp),intent(out) :: gg(2,ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3 
!Local variables-------------------------------
!scalars
 integer,parameter :: rank=3,nt_all=-1
 integer :: nhp,my_flags,idist,odist,padx,i1,i2,i3,igp,igf,imgf,stride
 integer :: i1inv,i2inv,i3inv,idat,padatf     
 integer(KIND_FFTW_PLAN) :: my_plan
!arrays
 integer :: inembed(rank),onembed(rank),n(rank)
 integer,allocatable :: i1inver(:),i2inver(:),i3inver(:)
 real(dp),allocatable :: gg_hp(:,:)

! *************************************************************************

 my_flags=FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags

 ! Consecutive input arrays are ldx*ldy*ldz elements apart: this is the size
 ! declared for ff and the offset (padatf) used below, and it matches inembed.
 idist = ldx*ldy*ldz
 ! Number of points in the half-spectrum returned by FFTW for one transform.
 nhp = (nx/2+1)*ny*nz
 odist = nhp
                            
 stride = 1
 n      = (/nx,ny,nz/) 
 inembed= (/ldx,ldy,ldz/) 
 onembed= (/(nx/2+1),ny,nz/)

 ! Work array receiving the packed half-spectrum of all the transforms.
 ABI_MALLOC(gg_hp,(2,nhp*ndat))

!#if 0
#ifdef DEV_RC_BUG
 ! Basic interface: a single transform, no augmentation of the mesh.
 if (ndat/=1) MSG_ERROR("ndat/=1 + MKL not coded")

 if (ANY( (/nx,ny,nz/) /= (/ldx,ldy,ldz/) )) then
   MSG_ERROR("Augmentation not supported")
 end if

 call dfftw_plan_dft_r2c_3d(my_plan, nx, ny, nz, ff, gg_hp, my_flags)
 if (my_plan==NULL_PLAN) then 
   MSG_ERROR("dfftw_plan_dft_r2c_3d returned NULL_PLAN")
 end if

 !fftw_plan fftw_plan_many_dft_r2c(int rank, const int *n, int howmany,
 !  double *in, const int *inembed, int istride, int idist,
 !  fftw_complex *out, const int *onembed, int ostride, int odist, unsigned flags);
#else
 my_plan = dplan_many_dft_r2c(rank, n, ndat, ff, inembed, stride, idist, gg_hp, onembed, stride, odist, my_flags, nt_all)
#endif

 ! Now perform the 3D FFT via FFTW. r2c are always FFTW_FORWARD
 call dfftw_execute_dft_r2c(my_plan, ff, gg_hp)

 call fftw3_destroy_plan(my_plan)

 ! gg_hp holds (re,im) pairs, so it is scaled as an array of nhp*ndat complex numbers.
 call ZDSCAL(nhp*ndat, one/(nx*ny*nz), gg_hp, 1)  ! FFTW returns not normalized FTs
 
 ! Reconstruct full FFT: Hermitian redundancy: out[i] is the conjugate of out[n-i]
 padx = (nx/2+1)

 ! iXinver(i) is the (1-based) index of the point -G along direction X;
 ! index 1 (G=0) is mapped onto itself.
 ABI_MALLOC(i1inver,(padx))
 ABI_MALLOC(i2inver,(ny))
 ABI_MALLOC(i3inver,(nz))

 i1inver(1)=1
 do i1=2,padx
   i1inver(i1)=nx+2-i1
 end do

 i2inver(1)=1
 do i2=2,ny
   i2inver(i2)=ny+2-i2
 end do

 i3inver(1)=1
 do i3=2,nz
   i3inver(i3)=nz+2-i3
 end do

 ! igp runs over the packed half-spectrum, igf/imgf over the full output array.
 igp=0
 do idat=1,ndat
   padatf = (idat-1)*ldx*ldy*ldz
   do i3=1,nz
     i3inv = i3inver(i3)
     do i2=1,ny
       i2inv = i2inver(i2)
       do i1=1,padx
         igp = igp+1
         igf = i1 + (i3-1)*ldx*ldy + (i2-1)*ldx + padatf
         ! Point computed by FFTW.
         gg(:,igf) =  gg_hp(:,igp)
         i1inv = i1inver(i1)
         ! Mirror point: complex conjugate. Skipped when the x index is its own
         ! mirror, since those points already belong to the half-spectrum.
         if (i1inv/=i1) then
           imgf = i1inv + (i3inv-1)*ldx*ldy + (i2inv-1)*ldx + padatf
           gg(1,imgf) =  gg_hp(1,igp)
           gg(2,imgf) = -gg_hp(2,igp)
         end if
       end do
     end do
   end do
 end do

 ABI_FREE(i1inver)
 ABI_FREE(i2inver)
 ABI_FREE(i3inver)

 ABI_FREE(gg_hp)

#else
 MSG_ERROR("FFTW3 support not activated")
 ! Reference every dummy argument to avoid unused-argument warnings.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat/))
 ABI_UNUSED(ff)
 ABI_UNUSED(gg(1,1))
 if (PRESENT(fftw_flags)) then
   ABI_UNUSED(fftw_flags)
 end if
#endif

end subroutine fftw3_r2c_op
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_c2r_op
!! NAME
!!  fftw3_c2r_op
!!
!! FUNCTION
!! Driver routine for out-of-place 3D complex-to-real FFT of lengths nx, ny, nz. 
!!
!! INPUTS
!! nx,ny,nz=Number of points along the three directions.
!! ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
!! ndat=Number of FFTs to be done.
!! ff(2,ldx*ldy*ldz*ndat)=The complex array to be transformed.
!! [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. 
!!   Defaults to FFTW_ESTIMATE.
!!
!! OUTPUT 
!! gg(ldx*ldy*ldz*ndat)=The backwards real FFT of ff.
!!
!! NOTES
!!  FIXME For the time-being. No augmentation of the mesh to reduce memory conflicts, as MKL crashes 
!!  if the advanced interface is used.
!!
!! PARENTS
!!      fftw3_fourdp
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_c2r_op(nx,ny,nz,ldx,ldy,ldz,ndat,ff,gg,fftw_flags)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_c2r_op'
!End of the abilint section

 implicit none 

! Out-of-place 3D complex-to-real (backward) FFT.
!   nx,ny,nz    : logical FFT dimensions.
!   ldx,ldy,ldz : physical (leading) dimensions of ff and gg; they must be equal
!                 to nx,ny,nz (augmentation is not supported).
!   ndat        : number of transforms (must be 1 when DEV_RC_BUG is defined).
!   ff          : full complex input, stored as (re,im) pairs. Only the points with
!                 x index <= nx/2+1 are read.
!   gg          : real output. No normalization is applied.
!   fftw_flags  : optional planner flags (default FFTW_ESTIMATE).
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat
 integer,optional,intent(in) :: fftw_flags
!arrays
 real(dp),intent(in) :: ff(2,ldx*ldy*ldz*ndat)
 real(dp),intent(out) :: gg(ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 integer,parameter :: rank=3,nt_all=-1
 integer :: nhp,my_flags,padx,i2,i3,igp,igf,idat,padatf,padatp,idist,odist,stride
 integer(KIND_FFTW_PLAN) :: my_plan
!arrays
 integer :: inembed(rank),onembed(rank),n(rank)
 real(dp),allocatable :: ff_hp(:,:)

! *************************************************************************

 if (ANY( (/nx,ny,nz/) /= (/ldx,ldy,ldz/) )) then
   MSG_ERROR("Augmentation not supported")
 end if

 my_flags=FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags

 ! Layout of the packed half-spectrum (input) and of the real array (output).
 stride  = 1
 nhp     = (nx/2+1)*ny*nz
 idist   = nhp
 odist   = ldx*ldy*ldz
 n       = (/nx,ny,nz/)
 inembed = (/(nx/2+1),ny,nz/)
 onembed = (/ldx,ldy,ldz/) 

 ! Fill the Hermitian part: Hermitian redundancy: out[i] is the conjugate of out[n-i]
 ! ff_hp is a private copy: it receives the first nx/2+1 points of every x-line of ff.
 ABI_MALLOC(ff_hp,(2,nhp*ndat))

 padx = (nx/2+1)
 do idat=1,ndat
   padatf=(idat-1)*ldx*ldy*ldz   ! Offset of this transform in ff.
   padatp=(idat-1)*padx*ny*nz    ! Offset of this transform in ff_hp.
!$omp parallel do private(igf,igp)
   do i3=1,nz
     do i2=1,ny
       igf = (i3-1)*ldx*ldy + (i2-1)*ldx   + padatf
       igp = (i3-1)*padx*ny + (i2-1)*padx  + padatp
       ff_hp(:,igp+1:igp+padx) = ff(:,igf+1:igf+padx)
     end do
   end do
 end do

 ! NOTE: The c2r transform destroys its input array even for out-of-place transforms.
 ! This is why the transform is applied to the copy ff_hp and not to ff.
!#if 0
#ifdef DEV_RC_BUG 
 ! Basic interface: a single transform only.
 if (ndat/=1) MSG_ERROR("ndat/=1 + MKL not coded")
 call dfftw_plan_dft_c2r_3d(my_plan, nx, ny, nz, ff_hp, gg, my_flags)
 if (my_plan==NULL_PLAN) then 
   MSG_ERROR("dfftw_plan_dft_c2r_3d returned NULL_PLAN")
 end if
#else
 my_plan = dplan_many_dft_c2r(rank, n, ndat, ff_hp, inembed, stride, idist, gg, onembed, stride, odist, my_flags, nt_all)
#endif

 ! Now perform the 3D FFT via FFTW. c2r are always FFTW_BACKWARD
 call dfftw_execute_dft_c2r(my_plan, ff_hp, gg)

 call fftw3_destroy_plan(my_plan)

 ABI_FREE(ff_hp)

#else 
 MSG_ERROR("FFTW3 support not activated")
 ! Reference every dummy argument to avoid unused-argument warnings.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat/))
 ABI_UNUSED(ff(1,1))
 ABI_UNUSED(gg(1))
 if (PRESENT(fftw_flags)) then
   ABI_UNUSED(fftw_flags)
 end if
#endif

end subroutine fftw3_c2r_op
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_many_dft_op
!! NAME
!!  fftw3_many_dft_op
!!
!! FUNCTION
!! Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz. 
!!
!! INPUTS
!! nx,ny,nz=Number of points along the three directions.
!! ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts).
!! ndat=Number of FFTs to be done.
!! fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.
!! isign=sign of Fourier transform exponent: current convention uses
!!   +1 for transforming from G to r, 
!!   -1 for transforming from r to G.
!! [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. 
!!   Defaults to FFTW_ESTIMATE.
!!
!! OUTPUT 
!! fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.
!!
!! PARENTS
!!      fftw3_fourdp
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fin,fout,fftw_flags)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_many_dft_op'
!End of the abilint section

 implicit none 

! Out-of-place 3D complex-to-complex FFT of ndat arrays whose complex data are
! stored in real arrays (two reals per complex number): fout = FFT(fin).
!   nx,ny,nz    : logical FFT dimensions.
!   ldx,ldy,ldz : physical (leading) dimensions, shared by fin and fout.
!   ndat        : number of transforms.
!   isign       : FFTW_BACKWARD (+1) or FFTW_FORWARD (-1). Forward results are
!                 divided by nx*ny*nz, backward results are left unscaled.
!   fin, fout   : input and output data.
!   fftw_flags  : optional planner flags (default FFTW_ESTIMATE).
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign
 integer,optional,intent(in) :: fftw_flags
!arrays
 real(dp),intent(in) :: fin(2*ldx*ldy*ldz*ndat)
 real(dp),intent(out) :: fout(2*ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 ! nt_all is forwarded as the last argument of dplan_many_dft.
 ! NOTE(review): presumably -1 means "use all available threads" -- confirm in dplan_many_dft.
 integer,parameter :: rank=3,nt_all=-1
 integer :: my_flags,dist,stride
 integer(KIND_FFTW_PLAN) :: my_plan
!arrays
 integer :: embed(rank),n(rank)

! *************************************************************************

 my_flags=FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags

 ! Strides and distances are given in units of complex numbers.
 stride = 1
 dist   = ldx*ldy*ldz
 embed  = (/ldx,ldy,ldz/)
 n      = (/nx ,ny ,nz /)

 my_plan = dplan_many_dft(rank, n, ndat, fin, embed, stride, dist, fout, embed, stride, dist, isign, my_flags, nt_all)

 ! Now perform the 3D FFT via FFTW.
 call dfftw_execute_dft(my_plan, fin, fout)

 ! The plan is used only once.
 call fftw3_destroy_plan(my_plan)

 if (isign==FFTW_FORWARD) then ! -1, FFTW returns not normalized FTs
   ! fout is scaled as an array of ldx*ldy*ldz*ndat complex numbers.
   call ZDSCAL(ldx*ldy*ldz*ndat, one/DBLE(nx*ny*nz), fout, 1) 
 end if

#else 
 MSG_ERROR("FFTW3 support not activated")
 ! Reference every dummy argument to avoid unused-argument warnings.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
 if (PRESENT(fftw_flags)) then
   ABI_UNUSED(fftw_flags)
 end if
 ABI_UNUSED(fin(1))
 ABI_UNUSED(fout(1))
#endif

end subroutine fftw3_many_dft_op
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_many_dft_ip
!! NAME
!!  fftw3_many_dft_ip
!!
!! FUNCTION
!! Driver routine for many in-place 3D complex-to-complex FFTs of lengths nx, ny, nz. 
!!
!! INPUTS
!! nx,ny,nz=Number of points along the three directions.
!! ldx,ldy,ldz=Physical dimension of the finout array (to avoid cache conflicts).
!! ndat=Number of FFTs to be done.
!! isign=sign of Fourier transform exponent: current convention uses
!!   +1 for transforming from G to r, 
!!   -1 for transforming from r to G.
!! [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. 
!!   Defaults to FFTW_ESTIMATE.
!!
!! OUTPUT 
!! finout(2,ldx*ldy*ldz*ndat)=
!!   In input: The complex array to be transformed.
!!   In output: The FFT results.
!!
!! PARENTS
!!      fftw3_fourwf
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,isign,finout,fftw_flags)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_many_dft_ip'
!End of the abilint section

 implicit none 

! In-place 3D complex-to-complex FFT of ndat arrays whose complex data are
! stored in a real array (two reals per complex number).
!   nx,ny,nz    : logical FFT dimensions.
!   ldx,ldy,ldz : physical (leading) dimensions of each array in finout.
!   ndat        : number of transforms.
!   isign       : FFTW_BACKWARD (+1) or FFTW_FORWARD (-1). Forward results are
!                 divided by nx*ny*nz, backward results are left unscaled.
!   finout      : data, overwritten with the transform.
!   fftw_flags  : optional planner flags (default FFTW_ESTIMATE).
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign
 integer,optional,intent(in) :: fftw_flags
!arrays
 real(dp),intent(inout) :: finout(2*ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 ! nt_all is forwarded as the last argument of dplan_many_dft.
 ! NOTE(review): presumably -1 means "use all available threads" -- confirm in dplan_many_dft.
 integer,parameter :: rank=3,nt_all=-1
 integer :: my_flags,dist,stride
 integer(KIND_FFTW_PLAN) :: my_plan
!arrays
 integer :: embed(rank),n(rank)

! *************************************************************************

 my_flags=FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags

 ! Strides and distances are given in units of complex numbers.
 stride = 1
 dist   = ldx*ldy*ldz
 embed  = (/ldx,ldy,ldz/)
 n      = (/nx ,ny ,nz /)

 ! Same array and same layout for input and output: in-place plan.
 my_plan = dplan_many_dft(rank, n, ndat, finout, embed, stride, dist, finout,embed, stride, dist, isign, my_flags, nt_all)

 ! Now perform the 3D FFT via FFTW.
 call dfftw_execute_dft(my_plan, finout, finout)

 ! The plan is used only once.
 call fftw3_destroy_plan(my_plan)

 if (isign==FFTW_FORWARD) then ! -1, FFTW returns not normalized FTs
   ! finout is scaled as an array of ldx*ldy*ldz*ndat complex numbers.
   call ZDSCAL(ldx*ldy*ldz*ndat, one/DBLE(nx*ny*nz), finout, 1) 
 end if

#else 
 MSG_ERROR("FFTW3 support not activated")
 ! Reference every dummy argument to avoid unused-argument warnings.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
 if (PRESENT(fftw_flags)) then
   ABI_UNUSED(fftw_flags)
 end if
 ABI_UNUSED(finout(1))
#endif

end subroutine fftw3_many_dft_ip
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_cleanup
!! NAME
!!  fftw3_cleanup
!!
!! FUNCTION
!!  Reset FFTW to the pristine state it was in when you started your program, 
!!  All existing plans become undefined. 
!!
!! NOTES
!!  FFTW planner saves some other persistent data, such as the accumulated wisdom and a list of 
!!  algorithms available in the current configuration. If you want to deallocate all of that and reset 
!!  FFTW to the pristine state it was in when you started your program, you can call fftw3_cleanup();
!!  After calling fftw3_cleanup, all existing plans become undefined, and you should not attempt to 
!!  execute them nor to destroy them. You can however create and execute/destroy new plans, in which case 
!!  FFTW starts accumulating wisdom information again. 
!!  fftw3_cleanup does not deallocate your plans, however. To prevent memory leaks, you must still call 
!!  fftw_destroy_plan before executing fftw3_cleanup
!!
!! PARENTS
!!      driver
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_cleanup()


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_cleanup'
!End of the abilint section

 implicit none

! Release the persistent data kept by the FFTW planner.
! After this call all existing plans are undefined.
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars

! *************************************************************************

#ifdef HAVE_FFT_FFTW3_THREADS 
 if (THREADS_INITED==1) then
   ! Threaded cleanup; the module flag is reset so that
   ! fftw3_init_threads can initialize the threads again.
   call dfftw_cleanup_threads()
   THREADS_INITED = 0
 else
   ! Threads were never initialized (or their initialization failed):
   ! the planner data must still be released with the serial cleanup.
   call dfftw_cleanup()
 end if
#elif defined HAVE_FFT_FFTW3
 call dfftw_cleanup()
#else
 MSG_ERROR("FFTW3 support not activated")
#endif

end subroutine fftw3_cleanup
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_destroy_plan
!! NAME
!!  fftw3_destroy_plan
!!
!! FUNCTION
!!  Release the memory allocated for the plan.
!!
!! INPUTS 
!!
!! PARENTS
!!      m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_destroy_plan(plan)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_destroy_plan'
!End of the abilint section

 implicit none

! Thin wrapper around dfftw_destroy_plan.
!   plan : handle of the FFTW plan to be destroyed.
! The routine does nothing when the code is compiled without FFTW3 support.

!Arguments ------------------------------------
 integer(KIND_FFTW_PLAN),intent(in) :: plan

! *************************************************************************

#ifndef HAVE_FFT_FFTW3
 ! Never executed: only references the dummy argument.
 if (.FALSE.) then
   write(std_out,*)plan
 end if
#else
 ! An OpenMP critical section (OMPC_fftw3_destroy_plan) around this call
 ! is present in the history of the file but is currently disabled.
 call dfftw_destroy_plan(plan)
#endif

end subroutine fftw3_destroy_plan
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_init_threads
!! NAME
!!  fftw3_init_threads
!!
!! FUNCTION
!!  This function performs the one-time initialization required to use FFTW3 threads.
!!  It does nothing if HAVE_FFT_FFTW3_THREADS is not defined.
!!
!! INPUTS 
!!
!! SIDE EFFECTS
!!  The one-time initialization required to use FFTW3 threads is performed when the routine 
!!  is called for the first time.
!!
!! PARENTS
!!      driver,fftprof
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_init_threads()


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_init_threads'
!End of the abilint section

 implicit none

! Initialize the FFTW3 thread support once. The module flag THREADS_INITED
! records a successful initialization, so later calls skip dfftw_init_threads.
! Nothing is done if HAVE_FFT_FFTW3_THREADS is not defined.

!Local variables ------------------------------
 integer :: iret   ! Status returned by dfftw_init_threads (0 signals a failure).

! *************************************************************************

#ifdef HAVE_FFT_FFTW3_THREADS 
 if (THREADS_INITED==0) then
   call dfftw_init_threads(iret)
   if (iret/=0) then
     ! Success: remember it.
     THREADS_INITED=1
   else
     ! Failure: the flag stays at 0, so the next call will try again.
     MSG_WARNING(" dfftw_init_threads returned 0; threaded FFTW3 is not being used!")
   end if
 end if

#ifndef HAVE_OPENMP
 ! Issued at every call, whatever the state of the threads.
 MSG_WARNING("Using FFTW3 with threads but HAVE_OPENMP is not defined!")
#endif
#endif
 
 return
 ! Never reached: avoids an unused-variable warning when threads are disabled.
 ABI_UNUSED(iret)

end subroutine fftw3_init_threads
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_set_nthreads
!! NAME
!!  fftw3_set_nthreads
!!
!! FUNCTION
!!  This function sets the number of threads you want FFTW3 to use (or actually, the maximum number). 
!!  It also performs any one-time initialization required to use FFTW3 threads.
!!  All plans subsequently created with any planner routine will use nthreads threads. 
!!  If you pass an nthreads argument of 1 (the default), threads are disabled for subsequent plans.
!!  It does nothing if HAVE_FFT_FFTW3_THREADS is not defined.
!!
!! INPUTS 
!!  [nthreads]=The number of threads you want FFTW3 to use.  Default xomp_get_max_threads()
!!
!! PARENTS
!!      m_fft_prof,m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_set_nthreads(nthreads)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_set_nthreads'
!End of the abilint section

 implicit none

! Set the number of threads used by the FFTW3 plans created from now on.
!   nthreads : optional number of threads. If it is absent, zero or negative,
!              xomp_get_max_threads() is used instead.
! Nothing is done if HAVE_FFT_FFTW3_THREADS is not defined.
! NOTE: the routine does not check that fftw3_init_threads has been called.

!Arguments ------------------------------------
!scalars
 integer,optional,intent(in) :: nthreads

!Local variables ------------------------------
!scalars
#ifdef HAVE_FFT_FFTW3_THREADS 
 integer :: nt
#endif

! *************************************************************************

#ifdef HAVE_FFT_FFTW3_THREADS 
 ! Requested value, if any; non-positive values select the default.
 nt = 0
 if (PRESENT(nthreads)) nt = nthreads
 if (nt<=0) nt = xomp_get_max_threads()

 call dfftw_plan_with_nthreads(nt)

#ifndef HAVE_OPENMP
 MSG_WARNING("Using FFTW3 with threads but HAVE_OPENMP is not defined!")
#endif
 
#else
 ! Avoid an unused-argument warning.
 if (PRESENT(nthreads)) then
   ABI_UNUSED(nthreads) 
 end if
#endif

end subroutine fftw3_set_nthreads
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/get_nt
!! NAME
!!  get_nt
!!
!! FUNCTION
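!!  Return the number of threads to be used for the loop over the ndat transforms (nt_here)
!!  and the number of threads passed down to the routine called inside that loop (nt_nest).
!!  At present nt_here is always 1 and nt_nest is always xomp_get_max_threads().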
!! INPUTS
!!
!! CHILDREN
!!
!! PARENTS
!!      m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine get_nt(ndat,nt_here,nt_nest)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'get_nt'
!End of the abilint section

 implicit none

! Select how the available threads are shared between the loop over the
! transforms and the routine called for each transform.
!   ndat    : number of transforms (not used by the current strategy).
!   nt_here : number of threads for the loop over the transforms; always 1.
!   nt_nest : number of threads passed down to the callee; always xomp_get_max_threads().

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: ndat
 integer,intent(out) :: nt_here,nt_nest

! *************************************************************************

 ! All the threads are given to the nested level.
 ! An ndat-dependent splitting (nt_here=ndat, nt_nest=nthreads/ndat) used to be
 ! computed here, but its result was unconditionally overwritten by the two
 ! assignments below, so it has been removed.
 nt_here = 1
 nt_nest = xomp_get_max_threads()

 ! Avoid an unused-argument warning.
 ABI_UNUSED(ndat)

end subroutine get_nt
!!***
!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_fftpad
!! NAME
!!  fftw3_fftpad
!!
!! FUNCTION
!!  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3. 
!!  The 3D ffts are computed only on lines and planes which have non zero elements. 
!!  These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz) 
!!  FFT transform is in-place.
!!  
!! INPUTS
!!   nx,ny,nz=Logical dimensions of the FFT mesh.
!!   ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
!!   ndat=Number of FFT transforms.
!!   mgfft=MAX(nx,ny,nz), only used to dimension gbound
!!   isign=The sign of the transform.
!!   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
!!     See sphereboundary for more info.
!!
!! SIDE EFFECTS
!!   ff(2*ldx*ldy*ldz*ndat)=
!!     input: The array with the data to be transformed.
!!     output: The results of the FFT.
!!
!! PARENTS
!!      fftw3_fourwf,m_wfs
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_fftpad(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_fftpad'
!End of the abilint section

 implicit none

! Driver for the in-place zero-padded FFT of ndat arrays: the work is delegated
! to fftw3_fftpad_nt, using the thread splitting returned by get_nt.
! Stops with an error if the code was compiled without FFTW3 support.

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
!arrays
 integer,intent(in) :: gbound(2*mgfft+8,2)
 real(dp),intent(inout) :: ff(2*ldx*ldy*ldz*ndat)

!Local variables-------------------------------
!scalars
#ifdef HAVE_FFT_FFTW3
 integer :: idat,first,nt_here,nt_nest

! *************************************************************************

 ! nt_here: threads for the loop over the arrays; nt_nest: threads given to the callee.
 call get_nt(ndat,nt_here, nt_nest)

 if (nt_here <= 1) then
   ! Single call treating the ndat arrays.
   call fftw3_fftpad_nt(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound,nt_nest)
 else
   ! One call per array. The OpenMP directive that distributed this loop
   ! over nt_here threads is disabled, so the loop is executed serially.
   do idat=0,ndat-1
     ! Position in ff of the first real number of array idat+1 (two reals per complex).
     first = 1 + 2*idat * ldx*ldy*ldz
     call fftw3_fftpad_nt(ff(first),nx,ny,nz,ldx,ldy,ldz,1,mgfft,isign,gbound,nt_nest)
   end do
 end if

#else
 MSG_ERROR("FFTW3 support not activated")
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,mgfft,isign/))
 ABI_UNUSED(gbound(1,1))
 ABI_UNUSED(ff(1))
#endif

end subroutine fftw3_fftpad     
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_fftpad_nt
!! NAME
!!  fftw3_fftpad_nt
!!
!! FUNCTION
!!  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3. 
!!  The 3D FFT is done as three sweeps of 1D transforms. The sweeps along x and y
!!  are computed only on the lines and the z-planes listed in the zpad_t object
!!  built by zpad_init from gbound, the sweep along z visits the full (ldx,ldy) plane.
!!  FFT transform is in-place. Target: complex data stored as (real, imaginary)
!!  pairs in a real(dp) array.
!!  
!! INPUTS
!!   nx,ny,nz=Logical dimensions of the FFT mesh.
!!   ldx,ldy,ldz=Physical dimensions of the ff array (to avoid cache conflicts).
!!   ndat=Number of FFT transforms.
!!   mgfft=MAX(nx,ny,nz), only used to dimension gbound
!!   isign=The sign of the transform: FFTW_BACKWARD (G --> R) or FFTW_FORWARD (R --> G).
!!     Any other value triggers a bug message.
!!   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
!!     See sphereboundary for more info.
!!   nthreads=Number of threads passed to the FFTW3 planner for the sweep along z.
!!     The plans for the sweeps along x and y are always created with one thread.
!!
!! SIDE EFFECTS
!!   ff(2*ldx*ldy*ldz*ndat)=
!!     input: The array with the data to be transformed.
!!     output: The results of the FFT. The forward transform is scaled by 1/(nx*ny*nz).
!!
!! NOTES
!!  The OpenMP directives in this routine are commented out (double exclamation mark),
!!  hence all the loops are executed serially.
!!
!! PARENTS
!!      m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_fftpad_nt(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound,nthreads)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_fftpad_nt'
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,nthreads
!arrays
 integer,intent(in) :: gbound(2*mgfft+8,2)
 real(dp),intent(inout) :: ff(2*ldx*ldy*ldz*ndat)

!Local variables-------------------------------
!scalars
#ifdef HAVE_FFT_FFTW3
 integer :: jj,kk,sidx,dat,cnt,line,zplane
 integer(KIND_FFTW_PLAN) :: bw_plan1,bw_plan2,bw_plan3
 integer(KIND_FFTW_PLAN) :: fw_plan1,fw_plan2,fw_plan3
 character(len=500) :: msg
 type(zpad_t) :: zpad

! *************************************************************************

 ! Build the tables with the x-lines and the z-planes that have to be transformed.
 call zpad_init(zpad,nx,ny,nz,ldx,ldy,ldz,mgfft,gbound)

 SELECT CASE (isign)

 CASE (FFTW_BACKWARD) ! G --> R
   !
   ! The prototype for dfftw_plan_many_dft is:
   ! dfftw_plan_many_dft(rank, n, howmany, 
   !   fin,  iembed, istride, idist, 
   !   fout, oembed, ostride, odist, isign, my_flags)
   !
   ! 1) Transform along x.
   write(msg,'(a,i3)')"FFT_X: ",nthreads
   DEV_TIMER_START(msg)

   bw_plan1 = dplan_many_dft(1, (/nx/), 1, &   ! Single 1D transform of f(Gx,Gy,Gz) along Gx.
&         ff, (/ldx, ldy, ldz/), 1, ldx,   &
&         ff, (/ldx, ldy, ldz/), 1, ldx, FFTW_BACKWARD, FFTW_ESTIMATE, 1)

   ! cnt runs over all the (line, dat) pairs so that a single loop covers the ndat transforms.
!!$omp parallel do private(dat,line,jj,kk,sidx) NUM_THREADS(nthreads)
   do cnt=0,(zpad%nlinex*ndat)-1 
     dat  = 1 + cnt / zpad%nlinex
     line = 1 + MOD(cnt, zpad%nlinex)
     jj   = zpad%linex2ifft_yz(1,line)
     kk   = zpad%linex2ifft_yz(2,line)
     sidx = 1 + 2*(jj-1)*ldx + 2*(kk-1)*ldx*ldy + 2*(dat-1)* ldx*ldy*ldz ! Pass the pointer, 2 to account for the imag part.
     call dfftw_execute_dft(bw_plan1, ff(sidx), ff(sidx))
   end do

   call fftw3_destroy_plan(bw_plan1)

   DEV_TIMER_STOP(msg)
   !
   ! 2) Transform along y.
   ! Start a timer for this sweep so that the stop below has a matching start.
   write(msg,'(a,i3)')"FFT_Y: ",nthreads
   DEV_TIMER_START(msg)

   bw_plan2 = dplan_many_dft(1, (/ny/), nx,  &   ! nx 1D transforms of f(x,Gy,Gz) along Gy.
&         ff, (/ldx, ldy, ldz/), ldx, 1,     &
&         ff, (/ldx, ldy, ldz/), ldx, 1, FFTW_BACKWARD, FFTW_ESTIMATE, 1)

   ! cnt runs over all the (z-plane, dat) pairs.
!!$omp parallel do private(dat,zplane,kk,sidx) NUM_THREADS(nthreads)
   do cnt=0,(zpad%n_zplanes*ndat)-1 
     dat    = 1 + cnt / zpad%n_zplanes
     zplane = 1 + MOD(cnt, zpad%n_zplanes)
     kk     = zpad%zplane(1,zplane)
     sidx   = 1 + 2*(kk-1)*ldx*ldy + 2*(dat-1) * ldx*ldy*ldz
     call dfftw_execute_dft(bw_plan2, ff(sidx), ff(sidx))
   end do

   call fftw3_destroy_plan(bw_plan2)

   DEV_TIMER_STOP(msg)
   !
   ! 3) Transform along z.
   write(msg,'(a,i3)')"A_Z: ",nthreads
   DEV_TIMER_START(msg)

   ! Here I can use the guru interface that however is not supported
   ! by the mkl wrappers.
   if (ndat==1 .or. nthreads==1 .or. MOD(ndat,nthreads)/=0) then
     !
     ! Use FFTW3 internal threading if single FFT or ndat is not divisible by nthreads
     bw_plan3  = dplan_many_dft(1, (/nz/), ldx*ldy, & ! ldx*ldy 1D transforms of f(x,y,Gz) along Gz.
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1,      & ! Note that we have to visit the entire augmented x-y plane!
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_BACKWARD, FFTW_ESTIMATE, nthreads)

     ! The plan covers one block of ldx*ldy*ldz points only,
     ! therefore it has to be executed once for each of the ndat transforms.
     do dat=1,ndat
       sidx = 1 + 2*(dat-1) *ldx*ldy*ldz
       call dfftw_execute_dft(bw_plan3, ff(sidx), ff(sidx))
     end do

     call fftw3_destroy_plan(bw_plan3)
   else
     ! Split ndat transforms among the threads.
     bw_plan3  = dplan_many_dft(1, (/nz/), ldx*ldy, & ! ldx*ldy 1D transforms of f(x,y,Gz) along Gz.
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1,      & ! Note that we have to visit the entire augmented x-y plane!
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_BACKWARD, FFTW_ESTIMATE, 1)

!!$omp parallel do private(sidx) NUM_THREADS(nthreads)
     do dat=1,ndat
       sidx = 1 + 2*(dat-1) *ldx*ldy*ldz
       call dfftw_execute_dft(bw_plan3, ff(sidx), ff(sidx))
     end do

     call fftw3_destroy_plan(bw_plan3)
   end if

   DEV_TIMER_STOP(msg)
 
 CASE (FFTW_FORWARD) ! R --> G
   !
   ! The prototype for dfftw_plan_many_dft is:
   ! dfftw_plan_many_dft(rank, n, howmany, 
   !   fin,  iembed, istride, idist, 
   !   fout, oembed, ostride, odist, isign, my_flags)
   !
   ! The sweeps are done in the reverse order with respect to the backward transform.
   !
   ! 1) Transform along z.
   if (ndat==1 .or. nthreads==1 .or. MOD(ndat,nthreads)/=0) then
     !
     ! Use FFTW3 internal threading if single FFT or ndat is not divisible by nthreads
     fw_plan3 = dplan_many_dft(1, (/nz/), ldx*ldy, & ! We have to visit the entire augmented x-y plane!
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1,     &
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_FORWARD, FFTW_ESTIMATE, nthreads)

     do dat=1,ndat
       sidx = 1 + 2*(dat-1) *ldx*ldy*ldz
       call dfftw_execute_dft(fw_plan3, ff(sidx), ff(sidx))
     end do

     call fftw3_destroy_plan(fw_plan3)
   else
     ! Split ndat transforms among the threads.
     fw_plan3 = dplan_many_dft(1, (/nz/), ldx*ldy, & ! We have to visit the entire augmented x-y plane!
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1,     &
&           ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_FORWARD, FFTW_ESTIMATE, 1)

!!$omp parallel do private(sidx) NUM_THREADS(nthreads)
     do dat=1,ndat
       sidx = 1 + 2*(dat-1) *ldx*ldy*ldz
       call dfftw_execute_dft(fw_plan3, ff(sidx), ff(sidx))
     end do

     call fftw3_destroy_plan(fw_plan3)
   end if
   !
   ! 2) Transform along y.
   fw_plan2 = dplan_many_dft(1, (/ny/), nx, &
&         ff, (/ldx, ldy, ldz/), ldx, 1,    &
&         ff, (/ldx, ldy, ldz/), ldx, 1, FFTW_FORWARD, FFTW_ESTIMATE, 1)

!!$omp parallel do private(dat,zplane,kk,sidx) NUM_THREADS(nthreads)
   do cnt=0,(zpad%n_zplanes*ndat)-1 
     dat    = 1 + cnt / zpad%n_zplanes
     zplane = 1 + MOD(cnt, zpad%n_zplanes)
     kk     = zpad%zplane(1,zplane)
     sidx   = 1 + 2*ldx*ldy*(kk-1) + 2*(dat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(fw_plan2, ff(sidx), ff(sidx))
   end do

   call fftw3_destroy_plan(fw_plan2)
   !
   ! 3) Transform along x. 
   fw_plan1 = dplan_many_dft(1, (/nx/), 1,  &
&         ff, (/ldx, ldy, ldz/), 1, ldx,    &
&         ff, (/ldx, ldy, ldz/), 1, ldx, FFTW_FORWARD, FFTW_ESTIMATE, 1)

!!$omp parallel do private(dat,line,jj,kk,sidx) NUM_THREADS(nthreads)
   do cnt=0,(zpad%nlinex*ndat)-1 
     dat  = 1 + cnt / zpad%nlinex
     line = 1 + MOD(cnt, zpad%nlinex)
     jj = zpad%linex2ifft_yz(1,line)
     kk = zpad%linex2ifft_yz(2,line)
     sidx = 1 + 2*(jj-1)*ldx + 2*(kk-1)*ldx*ldy + 2*(dat-1)*ldx*ldy*ldz ! Pass the pointer, 2 to account for the imag part.
     call dfftw_execute_dft(fw_plan1, ff(sidx), ff(sidx))
   end do

   call fftw3_destroy_plan(fw_plan1)
   !
   ! 4) Normalize the transform.
   ! ff is handed to ZDSCAL as an array of ldx*ldy*ldz*ndat complex numbers.
   call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), ff, 1)
 
 CASE DEFAULT 
   MSG_BUG("Wrong isign")
 END SELECT

 call zpad_free(zpad)

#else
 MSG_ERROR("FFTW3 support not activated")
 ! ndat is listed as well so that every dummy argument is referenced in this branch.
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,nthreads/))
 ABI_UNUSED(gbound(1,1))
 ABI_UNUSED(ff(1))
#endif

end subroutine fftw3_fftpad_nt   
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_fftpad_cplx
!! NAME
!!  fftw3_fftpad_cplx
!!
!! FUNCTION
!!  Driver for the in-place zero-padded 3D FFT of ndat wavefunctions with FFTW3.
!!  Target: complex arrays.
!!  The routine asks get_nt how the threads should be split between the loop
!!  over the ndat transforms and the FFT kernel, then delegates the actual
!!  work to fftw3_fftpad_cplx_nt.
!!  
!! INPUTS
!!   nx,ny,nz=Logical dimensions of the FFT mesh.
!!   ldx,ldy,ldz=Physical dimensions of the ff array (to avoid cache conflicts).
!!   ndat=Number of FFT transforms.
!!   mgfft=MAX(nx,ny,nz), only used to dimension gbound.
!!   isign=The sign of the transform (forwarded to fftw3_fftpad_cplx_nt).
!!   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
!!     See sphereboundary for more info.
!!
!! SIDE EFFECTS
!!  ff(ldx*ldy*ldz*ndat)=
!!    input: The array with the data to be transformed.
!!    output: The results of the FFT.
!!
!! PARENTS
!!      fftw3_fourwf,m_wfs
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_fftpad_cplx(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_fftpad_cplx'
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
!arrays
 integer,intent(in) :: gbound(2*mgfft+8,2)
 complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 integer :: idat,first,nt_outer,nt_inner

! *************************************************************************

 ! nt_outer = threads for the loop over the transforms (used only as a switch here).
 ! nt_inner = threads handed down to the FFT kernel.
 call get_nt(ndat,nt_outer,nt_inner)

 if (nt_outer <= 1) then
   ! A single kernel call treats all the ndat transforms.
   call fftw3_fftpad_cplx_nt(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound,nt_inner)
 else
   ! One kernel call per transform.
   ! The OpenMP directive below is commented out, the loop is serial.
!!$OMP PARALLEL DO PRIVATE(first) NUM_THREADS(nt_outer)
   do idat=1,ndat
     ! First element of transform number idat.
     first = 1 + (idat-1) * ldx*ldy*ldz
     call fftw3_fftpad_cplx_nt(ff(first),nx,ny,nz,ldx,ldy,ldz,1,mgfft,isign,gbound,nt_inner)
   end do
 end if

#else
 MSG_ERROR("FFTW3 support not activated")
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign/))
 ABI_UNUSED(gbound(1,1))
 ABI_UNUSED(ff(1))
#endif

end subroutine fftw3_fftpad_cplx
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_fftpad_cplx_nt        
!! NAME
!!  fftw3_fftpad_cplx_nt
!!
!! FUNCTION
!!  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3. 
!!  The 3D FFT is done as three sweeps of 1D transforms. The sweeps along x and y
!!  are computed only on the lines and the z-planes listed in the zpad_t object
!!  built by zpad_init from gbound, the sweep along z visits the full (ldx,ldy) plane.
!!  FFT transform is in-place. Target: complex arrays.
!!  
!! INPUTS
!!   nx,ny,nz=Logical dimensions of the FFT mesh.
!!   ldx,ldy,ldz=Physical dimensions of the ff array (to avoid cache conflicts).
!!   ndat=Number of FFT transforms.
!!   mgfft=MAX(nx,ny,nz), only used to dimension gbound.
!!   isign=The sign of the transform: FFTW_BACKWARD (G --> R) or FFTW_FORWARD (R --> G).
!!     Any other value triggers a bug message.
!!   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
!!     See sphereboundary for more info.
!!   nthreads=Number of threads passed to the FFTW3 planner for the sweep along z.
!!     The plans for the sweeps along x and y are always created with one thread.
!!
!! SIDE EFFECTS
!!  ff(ldx*ldy*ldz*ndat)=
!!    input: The array with the data to be transformed.
!!    output: The results of the FFT. The forward transform is scaled by 1/(nx*ny*nz).
!!
!! NOTES
!!  The prototype for dfftw_plan_many_dft is:
!!  dfftw_plan_many_dft(rank, n, howmany, 
!!    fin,  iembed, istride, idist, 
!!    fout, oembed, ostride, odist, isign, my_flags)
!!
!!  The OpenMP directives in this routine are commented out (double exclamation mark),
!!  hence all the loops are executed serially.
!!
!! PARENTS
!!      m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_fftpad_cplx_nt(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound,nthreads)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_fftpad_cplx_nt'
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,nthreads
!arrays
 integer,intent(in) :: gbound(2*mgfft+8,2)
 complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 integer :: itask,idat,iline,iplane,iy,iz,first,nt_z
 integer(KIND_FFTW_PLAN) :: plan
 type(zpad_t) :: zpad

! *************************************************************************

 ! Build the tables with the x-lines and the z-planes that have to be transformed.
 call zpad_init(zpad,nx,ny,nz,ldx,ldy,ldz,mgfft,gbound)

 SELECT CASE (isign)

 CASE (FFTW_BACKWARD) ! G --> R
   !
   ! Sweep 1: lines along x. The plan describes a single 1D transform of length nx.
   plan = zplan_many_dft(1, (/nx/), 1,  &
&    ff, (/ldx, ldy, ldz/), 1, ldx,     &
&    ff, (/ldx, ldy, ldz/), 1, ldx, FFTW_BACKWARD, FFTW_ESTIMATE, 1)

   ! itask runs over all the (line, dat) pairs.
!!$omp parallel do private(idat,iline,iy,iz,first) NUM_THREADS(nthreads)
   do itask=0,(zpad%nlinex*ndat)-1 
     idat  = 1 + itask / zpad%nlinex
     iline = 1 + MOD(itask, zpad%nlinex)
     iy    = zpad%linex2ifft_yz(1,iline)
     iz    = zpad%linex2ifft_yz(2,iline)
     first = 1 + (iy-1)*ldx + (iz-1)*ldx*ldy + (idat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(plan, ff(first), ff(first)) ! Pass the pointer
   end do
   call fftw3_destroy_plan(plan)
   !
   ! Sweep 2: nx transforms of length ny for each z-plane in the table.
   plan = zplan_many_dft(1, (/ny/), nx, &
&    ff, (/ldx, ldy, ldz/), ldx, 1,     &
&    ff, (/ldx, ldy, ldz/), ldx, 1, FFTW_BACKWARD, FFTW_ESTIMATE, 1)

   ! itask runs over all the (z-plane, dat) pairs.
!!$omp parallel do private(idat,iplane,iz,first) NUM_THREADS(nthreads)
   do itask=0,(zpad%n_zplanes*ndat)-1 
     idat   = 1 + itask / zpad%n_zplanes
     iplane = 1 + MOD(itask, zpad%n_zplanes)
     iz     = zpad%zplane(1,iplane)
     first  = 1 + ldx*ldy*(iz-1) + (idat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(plan, ff(first), ff(first))
   end do
   call fftw3_destroy_plan(plan)
   !
   ! Sweep 3: ldx*ldy transforms of length nz, once for each dat.
   ! We have to visit the entire augmented x-y plane!
   ! The planner gets nthreads threads if there is a single FFT, a single thread or if
   ! ndat is not divisible by nthreads. Otherwise it gets one thread (this is the case
   ! in which the loop over dat would be split among the threads with OpenMP).
   if (ndat==1 .or. nthreads==1 .or. MOD(ndat,nthreads)/=0) then
     nt_z = nthreads
   else
     nt_z = 1
   end if

   plan = zplan_many_dft(1, (/nz/), ldx*ldy, &
&    ff, (/ldx, ldy, ldz/), ldx*ldy, 1,      &
&    ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_BACKWARD, FFTW_ESTIMATE, nt_z)

   do idat=1,ndat
     first = 1 + (idat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(plan, ff(first), ff(first))
   end do
   call fftw3_destroy_plan(plan)
 
 CASE (FFTW_FORWARD) ! R --> G
   !
   ! The sweeps are done in the reverse order with respect to the backward transform.
   !
   ! Sweep 1: ldx*ldy transforms of length nz, once for each dat.
   ! Same choice for the number of threads of the planner as in the backward transform.
   if (ndat==1 .or. nthreads==1 .or. MOD(ndat,nthreads)/=0) then
     nt_z = nthreads
   else
     nt_z = 1
   end if

   plan = zplan_many_dft(1, (/nz/), ldx*ldy, &
&    ff, (/ldx, ldy, ldz/), ldx*ldy, 1,      &
&    ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_FORWARD, FFTW_ESTIMATE, nt_z)

   do idat=1,ndat
     first = 1 + (idat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(plan, ff(first), ff(first))
   end do
   call fftw3_destroy_plan(plan)
   !
   ! Sweep 2: nx transforms of length ny for each z-plane in the table.
   plan = zplan_many_dft(1, (/ny/), nx, &
&    ff, (/ldx, ldy, ldz/), ldx, 1,     &
&    ff, (/ldx, ldy, ldz/), ldx, 1, FFTW_FORWARD, FFTW_ESTIMATE, 1)

!!$omp parallel do private(idat,iplane,iz,first) NUM_THREADS(nthreads)
   do itask=0,(zpad%n_zplanes*ndat)-1 
     idat   = 1 + itask / zpad%n_zplanes
     iplane = 1 + MOD(itask, zpad%n_zplanes)
     iz     = zpad%zplane(1,iplane)
     first  = 1 + ldx*ldy*(iz-1) + (idat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(plan, ff(first), ff(first))
   end do
   call fftw3_destroy_plan(plan)
   !
   ! Sweep 3: lines along x.
   plan = zplan_many_dft(1, (/nx/), 1,  &
&    ff, (/ldx, ldy, ldz/), 1, ldx,     &
&    ff, (/ldx, ldy, ldz/), 1, ldx, FFTW_FORWARD, FFTW_ESTIMATE, 1)

!!$omp parallel do private(idat,iline,iy,iz,first) NUM_THREADS(nthreads)
   do itask=0,(zpad%nlinex*ndat)-1 
     idat  = 1 + itask / zpad%nlinex
     iline = 1 + MOD(itask, zpad%nlinex)
     iy    = zpad%linex2ifft_yz(1,iline)
     iz    = zpad%linex2ifft_yz(2,iline)
     first = 1 + (iy-1)*ldx + (iz-1)*ldx*ldy + (idat-1)*ldx*ldy*ldz
     call dfftw_execute_dft(plan, ff(first), ff(first))
   end do
   call fftw3_destroy_plan(plan)
   !
   ! Finally normalize the transform.
   call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), ff, 1)
 
 CASE DEFAULT 
   MSG_BUG("Wrong isign")
 END SELECT

 call zpad_free(zpad)

#else
 MSG_ERROR("FFTW3 support not activated")
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,nthreads/))
 ABI_UNUSED(gbound(1,1))
 ABI_UNUSED(ff(1))
#endif

end subroutine fftw3_fftpad_cplx_nt
!!***

#ifdef HAVE_FFT_FFTW3 

!----------------------------------------------------------------------

!!****f* m_fftw3/dplan_many_dft
!! NAME
!!  dplan_many_dft
!!
!! FUNCTION
!!  Wrapper around dfftw_plan_many_dft for arrays declared as real(dp).
!!  It sets the number of threads with fftw3_set_nthreads, creates the plan
!!  and stops with an error message if the planner returns NULL_PLAN.
!!  
!! INPUTS
!!  rank=Rank of the transform, it also dimensions n, inembed and onembed.
!!  n(rank),howmany,inembed(rank),istride,idist,onembed(rank),ostride,odist,sign,flags=
!!    Arguments forwarded without changes to dfftw_plan_many_dft.
!!  nthreads=Number of threads passed to fftw3_set_nthreads before the plan is created.
!!
!! OUTPUT
!!  plan=The plan returned by dfftw_plan_many_dft.
!!
!! SIDE EFFECTS
!!  fin(*),fout(*)=Input and output arrays handed to the planner.
!!
!! NOTES
!!  The OpenMP critical section around the planner is commented out
!!  (double exclamation mark).
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

function dplan_many_dft(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'dplan_many_dft'
 use interfaces_14_hidewrite
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads
 integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
 integer(KIND_FFTW_PLAN) :: plan
!arrays
 real(dp) :: fin(*),fout(*)

!Local variables-------------------------------
 character(len=500) :: msg,frmt

! *************************************************************************
 
!!$OMP CRITICAL (OMPC_dfftw_plan_many_dft)
 call fftw3_set_nthreads(nthreads)

 call dfftw_plan_many_dft(plan, rank, n, howmany, &
&  fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags)
!!$OMP END CRITICAL (OMPC_dfftw_plan_many_dft)

 if (plan==NULL_PLAN) then
   call wrtout(std_out,"dfftw_plan_many_dft returned NULL_PLAN!","COLL")
   ! The format is built at run time because the number of entries in n, inembed and onembed is rank.
   write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
   ! Report odist (and not idist) for the output layout.
   write(msg,frmt)&
&    " n= ",n," howmany= ",howmany," sign= ",sign," flags= ",flags,ch10,&
&    " inembed= ",inembed," istride= ",istride," idist=",idist,ch10,    &
&    " onembed= ",onembed," ostride= ",ostride," odist=",odist,ch10
   call wrtout(std_out,msg,"COLL")
   MSG_ERROR("Check FFTW library and/or abinit code")
 end if

end function dplan_many_dft
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/zplan_many_dft
!! NAME
!!  zplan_many_dft
!!
!! FUNCTION
!!  Wrapper around dfftw_plan_many_dft for arrays declared as complex(dpc).
!!  It sets the number of threads with fftw3_set_nthreads, creates the plan
!!  and stops with an error message if the planner returns NULL_PLAN.
!!  
!! INPUTS
!!  rank=Rank of the transform, it also dimensions n, inembed and onembed.
!!  n(rank),howmany,inembed(rank),istride,idist,onembed(rank),ostride,odist,sign,flags=
!!    Arguments forwarded without changes to dfftw_plan_many_dft.
!!  nthreads=Number of threads passed to fftw3_set_nthreads before the plan is created.
!!
!! OUTPUT
!!  plan=The plan returned by dfftw_plan_many_dft.
!!
!! SIDE EFFECTS
!!  fin(*),fout(*)=Input and output arrays handed to the planner.
!!
!! NOTES
!!  FIXME  technically it should be intent(inout) since FFTW3 can destroy the input for particular flags.
!!
!!  The OpenMP critical section around the planner is commented out
!!  (double exclamation mark).
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

function zplan_many_dft(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'zplan_many_dft'
 use interfaces_14_hidewrite
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads
 integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
 integer(KIND_FFTW_PLAN) :: plan
!arrays
 complex(dpc) :: fin(*),fout(*) 

!Local variables-------------------------------
 character(len=500) :: msg,frmt

! *************************************************************************
 
!!$OMP CRITICAL (OMPC_zplan_many_dft)
 call fftw3_set_nthreads(nthreads)

 call dfftw_plan_many_dft(plan, rank, n, howmany, &
&  fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags)
!!$OMP END CRITICAL (OMPC_zplan_many_dft)

 if (plan==NULL_PLAN) then ! handle the error
   call wrtout(std_out,"dfftw_plan_many_dft returned NULL_PLAN (complex version)","COLL")
   ! The format is built at run time because the number of entries in n, inembed and onembed is rank.
   write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
   ! Report odist (and not idist) for the output layout.
   write(msg,frmt)&
&    " n = ",n," howmany = ",howmany," sign = ",sign," flags = ",flags,ch10,&
&    " inembed = ",inembed," istride = ",istride," idist =",idist,ch10,     &
&    " onembed = ",onembed," ostride = ",ostride," odist =",odist,ch10
   call wrtout(std_out,msg,"COLL")
   MSG_ERROR("Check FFTW library and/or abinit code")
 end if

end function zplan_many_dft
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/dplan_many_dft_r2c
!! NAME
!!  dplan_many_dft_r2c
!!
!! FUNCTION
!!  Wrapper around dfftw_plan_many_dft_r2c.
!!  It sets the number of threads with fftw3_set_nthreads, creates the plan
!!  and stops with an error message if the planner returns NULL_PLAN.
!!  
!! INPUTS
!!  rank=Rank of the transform, it also dimensions n, inembed and onembed.
!!  n(rank),howmany,inembed(rank),istride,idist,onembed(rank),ostride,odist,flags=
!!    Arguments forwarded without changes to dfftw_plan_many_dft_r2c.
!!  nthreads=Number of threads passed to fftw3_set_nthreads before the plan is created.
!!
!! OUTPUT
!!  plan=The plan returned by dfftw_plan_many_dft_r2c.
!!
!! SIDE EFFECTS
!!  fin(*),fout(*)=Input and output arrays handed to the planner (both declared as real(dp) here).
!!
!! NOTES
!!  FIXME  technically it should be intent(inout) since FFTW3 can destroy the input 
!!  for particular flags.
!!
!!  The OpenMP critical section around the planner is commented out
!!  (double exclamation mark).
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

function dplan_many_dft_r2c(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,flags,nthreads) result(plan)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'dplan_many_dft_r2c'
 use interfaces_14_hidewrite
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: rank,howmany,istride,ostride,flags,idist,odist,nthreads
 integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
 integer(KIND_FFTW_PLAN) :: plan
!arrays
 real(dp) :: fin(*),fout(*)

!Local variables-------------------------------
 character(len=500) :: msg,frmt

! *************************************************************************

!!$OMP CRITICAL (OMPC_dplan_many_dft_r2c)
 call fftw3_set_nthreads(nthreads)

 call dfftw_plan_many_dft_r2c(plan, rank, n, howmany, &
&  fin, inembed, istride, idist, fout, onembed, ostride, odist, flags)
!!$OMP END CRITICAL (OMPC_dplan_many_dft_r2c)

 if (plan==NULL_PLAN) then ! handle the error.
   call wrtout(std_out,"dfftw_plan_many_dft_r2c returned NULL_PLAN","COLL")
   ! The format is built at run time because the number of entries in n, inembed and onembed is rank.
   write(frmt,*)"(a,",rank,"(1x,i0),2(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
   ! Report odist (and not idist) for the output layout.
   write(msg,frmt)&
&    " n = ",n," howmany = ",howmany," flags = ",flags,ch10,&
&    " inembed = ",inembed," istride = ",istride," idist = ",idist,ch10,&
&    " onembed = ",onembed," ostride = ",ostride," odist = ",odist,ch10
   call wrtout(std_out,msg,"COLL")
   MSG_ERROR("Check FFTW library and/or abinit code")
 end if

end function dplan_many_dft_r2c
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/dplan_many_dft_c2r
!! NAME
!!  dplan_many_dft_c2r
!!
!! FUNCTION
!!  Wrapper around dfftw_plan_many_dft_c2r.
!!  It sets the number of threads with fftw3_set_nthreads, creates the plan
!!  and stops with an error message if the planner returns NULL_PLAN.
!!  
!! INPUTS
!!  rank=Rank of the transform, it also dimensions n, inembed and onembed.
!!  n(rank),howmany,inembed(rank),istride,idist,onembed(rank),ostride,odist,flags=
!!    Arguments forwarded without changes to dfftw_plan_many_dft_c2r.
!!  nthreads=Number of threads passed to fftw3_set_nthreads before the plan is created.
!!
!! OUTPUT
!!  plan=The plan returned by dfftw_plan_many_dft_c2r.
!!
!! SIDE EFFECTS
!!  fin(*),fout(*)=Input and output arrays handed to the planner (both declared as real(dp) here).
!!
!! NOTES
!!  The OpenMP critical section around the planner is commented out
!!  (double exclamation mark).
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

function dplan_many_dft_c2r(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,flags, nthreads) result(plan)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'dplan_many_dft_c2r'
 use interfaces_14_hidewrite
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: rank,howmany,istride,ostride,flags,idist,odist,nthreads
 integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
 integer(KIND_FFTW_PLAN) :: plan
!arrays
 real(dp) :: fin(*),fout(*)

!Local variables-------------------------------
 character(len=500) :: msg,frmt

! *************************************************************************

!!$OMP CRITICAL (OMPC_dplan_many_dft_c2r)
 call fftw3_set_nthreads(nthreads)

 call dfftw_plan_many_dft_c2r(plan, rank, n, howmany, &
&  fin, inembed, istride, idist, fout, onembed, ostride, odist, flags)
!!$OMP END CRITICAL (OMPC_dplan_many_dft_c2r)

 if (plan==NULL_PLAN) then ! handle the error.
   call wrtout(std_out,"dfftw_plan_many_dft_c2r returned NULL_PLAN","COLL")
   ! The format is built at run time because the number of entries in n, inembed and onembed is rank.
   write(frmt,*)"(a,",rank,"(1x,i0),2(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
   ! Report odist (and not idist) for the output layout.
   write(msg,frmt)&
&    " n = ",n," howmany = ",howmany," flags = ",flags,ch10,&
&    " inembed = ",inembed," istride = ",istride," idist = ",idist,ch10,&
&    " onembed = ",onembed," ostride = ",ostride," odist = ",odist,ch10
   call wrtout(std_out,msg,"COLL")
   MSG_ERROR("Check FFTW library and/or abinit code")
 end if

end function dplan_many_dft_c2r
!!***

#endif

!----------------------------------------------------------------------

!!****f* m_fftw3/fftw3_fftpad_tr
!! NAME
!!  fftw3_fftpad_tr
!!
!! FUNCTION
!!  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3 taking advantage
!!  of time reversal symmetry.
!!  The 3D ffts are computed only on lines and planes which have non zero elements. 
!!  These lines and planes are defined by the two vectors (see zpad_t)
!!  FFT transform is in-place.
!!  
!! INPUTS
!!   logical dimensions of the fft physical dimensions of the f array sign of the transformation
!!   nx,ny,nz=Logical dimensions of the FFT mesh.
!!   ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
!!   ndat=Number of FFTs
!!   mgfft=MAX(nx,ny,nz), only used to dimension gbound
!!   isign=The sign of the transform.
!!   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
!!     See sphereboundary for more info.
!!
!! SIDE EFFECTS
!!   f(2*ldx*ldy*ldz*ndat)=
!!     input: The array with the data to be transformed.
!!     output: The results of the FFT.
!!
!! PARENTS
!!
!! CHILDREN
!!
!! SOURCE

subroutine fftw3_fftpad_tr(ff,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign,gbound)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'fftw3_fftpad_tr'
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
!arrays
 integer,intent(in) :: gbound(2*mgfft+8,2)
 real(dp),intent(inout) :: ff(2*ldx*ldy*ldz*ndat)

#ifdef HAVE_FFT_FFTW3
!Local variables-------------------------------
!scalars
 integer,parameter :: rank1=1,nt_all=-1
 integer :: nffts,istride,ostride,idist,odist
 integer :: kk,ii,jj,sidx,nhp,padz,igf,igp,ifft
 integer :: id_yz,g3_max,g3_min,len3,gg2,gg3,g2,ifft_g3,igb,g2min,g2max
 integer(KIND_FFTW_PLAN) :: bw_plan1,bw_plan2,c2r_plan,bw_plan3
 integer(KIND_FFTW_PLAN) :: fw_plan1,fw_plan2,fw_plan3
 character(len=500) :: msg
!arrays
 integer :: n(1),inembed(1),onembed(1)
 real(dp),allocatable :: gg(:) !ldx*ldy*ldz) ! logical or physical dims?
 real(dp),allocatable :: ff_hp(:)
 integer,allocatable :: do_fft_x(:,:),do_fft_y(:)

! *************************************************************************

 !TODO
 !write(std_out,*)ABI_FUNC
 !MSG_ERROR("FFTW3 with istwf_k>1 is still under development!")
 ABI_CHECK(ndat==1,"ndata != 1")

 ABI_MALLOC(do_fft_x,(ldy,ldz))
 ABI_MALLOC(do_fft_y,(nz))
 do_fft_y = 0                             ! we have to recalculate them at each call.
 do_fft_x = 0

 g3_min=gbound(3,2)
 g3_max=gbound(4,2)
 len3=g3_max-g3_min+1

 do gg3=1,g3_max+1 ! Loop over the z-planes intersecting the G-sphere.
   ifft_g3=gg3
   ! Select the set of y for this z-plane.
   igb=2*gg3+3
   do_fft_y(ifft_g3) = igb
   g2min = gbound(igb  ,2) 
   g2max = gbound(igb+1,2)
   do_fft_x(1:g2max+1    ,ifft_g3) = 1 ! Positive g_y.
   do_fft_x(g2min+ny+1:ny,ifft_g3) = 1 ! Negative g_y.
 end do

 SELECT CASE (isign)

 CASE (FFTW_BACKWARD) ! G --> R
   !
   ! The prototype for dfftw_plan_many_dft is:
   ! dfftw_plan_many_dft(rank, n, howmany, 
   !   fin,  iembed, istride, idist, 
   !   fout, oembed, ostride, odist, isign, my_flags)
   !     

#if 0
!    Go from wk1d_b to wk1d_a, using 1D FFTs on the z direction
   bw_plan3 = dplan_many_dft(1, (/nz/), ldx*(ldy/2+1), & ! ldx*ldy 1D transforms of f(Gx,Gy,Gz) along Gz.
   !bw_plan3 = dplan_many_dft(1, (/nz/), ldx*ldy, & ! ldx*ldy 1D transforms of f(Gx,Gy,Gz) along Gz.
   &       ff, (/ldx, ldy, ldz/), ldx*ldy, 1,          & ! Note that we have to visit the entire augmented x-y plane!
   &       ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_BACKWARD, FFTW_ESTIMATE, nt_all)

   call dfftw_execute_dft(bw_plan3, ff, ff)

   call fftw3_destroy_plan(bw_plan3)

!  Do-loop on the planes stacked in the z direction
!  Perform x transform, taking into account arrays of zeros
   ! 1) Transform along x.

   bw_plan1 = dplan_many_dft(1, (/nx/), 1,&   ! Single 1D transform of f(Gx,Gy,Gz) along Gx.
&       ff, (/ldx, ldy, ldz/), 1, ldx,    &
&       ff, (/ldx, ldy, ldz/), 1, ldx, FFTW_BACKWARD, FFTW_ESTIMATE,1)

   !do kk=1,g3_max+1
   do kk=1,nz
     do jj=1,ny
       !if (do_fft_x(jj,kk) == 1) then 
       !if (gy => 0)
         sidx = 1 + 2*(jj-1)*ldx + 2*(kk-1)*ldx*ldy ! Pass the pointer, 2 to account for the imag part.
         call dfftw_execute_dft(bw_plan1, ff(sidx), ff(sidx))
       !end if
     end do
   end do

   call fftw3_destroy_plan(bw_plan1)

   ! Now ff contains f(x,Gy,z) for Gy >= 0
   ! Note that f(x,Gy,z) = f(x,-Gy,z)* hence here we can perform the c2r FFT.

   call dfftw_plan_many_dft_c2r(bw_plan2, 1, (/ny/),   nx,         &
                                ff, (/  ldx*ldy*ldz/), ldx, 1, &
                                ff, (/2*ldx*ldy*ldz/), 2*ldx, 2, FFTW_ESTIMATE)
                                                                                    
   if (bw_plan2==NULL_PLAN) then ! handle the error.
     MSG_ERROR("Check FFTW library and/or abinit code")
   end if
                                                                                    
   ! 2) Transform along y.
!   bw_plan2 = dplan_many_dft(1, (/ny/), nx, &   ! nx 1D transforms of f(x,Gy,Gz) along Gy.
!&       ff, (/ldx, ldy, ldz/), ldx, 1,                          &
!&       ff, (/ldx, ldy, ldz/), ldx, 1, FFTW_BACKWARD, FFTW_ESTIMATE,1)

   do kk=1,nz
     sidx = 1 + 2*(kk-1)*ldx*ldy ! Pass the pointer, 2 to account for the imag part.
     call dfftw_execute_dft(bw_plan2, ff(sidx), ff(sidx))
   end do

   call fftw3_destroy_plan(bw_plan2)

#else
   
   bw_plan1 = dplan_many_dft(1, (/nx/), 1,&   ! Single 1D transform of f(Gx,Gy,Gz) along Gx.
&       ff, (/ldx, ldy, ldz/), 1, ldx,    &
&       ff, (/ldx, ldy, ldz/), 1, ldx, FFTW_BACKWARD, FFTW_ESTIMATE,1)
   !
   ! 1) Transform along x (only the region G_z >= 0).
   !do kk=1,nz
   do kk=1,g3_max+1
     if (do_fft_y(kk) /= 0) then
       do jj=1,ny
         if (do_fft_x(jj,kk) == 1) then 
           sidx = 1 + 2*(jj-1)*ldx + 2*(kk-1)*ldx*ldy ! Pass the pointer, 2 to account for the imag part.
           call dfftw_execute_dft(bw_plan1, ff(sidx), ff(sidx))
         end if
       end do
     end if
   end do

   call fftw3_destroy_plan(bw_plan1)
   !
   ! 2) Transform along y (only the region G_z >= 0).
   bw_plan2 = dplan_many_dft(1, (/ny/), nx, &   ! nx 1D transforms of f(x,Gy,Gz) along Gy.
&       ff, (/ldx, ldy, ldz/), ldx, 1,                          &
&       ff, (/ldx, ldy, ldz/), ldx, 1, FFTW_BACKWARD, FFTW_ESTIMATE, 1)

   !do kk=1,nz
   do kk=1,g3_max+1
     if (do_fft_y(kk) /= 0) then
       sidx = 1 + 2*(kk-1)*ldx*ldy
       call dfftw_execute_dft(bw_plan2, ff(sidx), ff(sidx))
     end if
   end do

   call fftw3_destroy_plan(bw_plan2)
   !
   ! Now ff contains f(x,y,Gz) for Gz >= 0
   ! Note that f(x,y,-Gz) = f(x,y,Gz)* hence here we can perform the c2r FFT.

   !bw_plan3 = dplan_many_dft(1, (/nz/), ldx*ldy, & ! ldx*ldy 1D transforms of f(x,y,Gz) along Gz.
   !&       ff, (/ldx, ldy, ldz/), ldx*ldy, 1,      & ! Note that we have to visit the entire augmented x-y plane!
   !&       ff, (/ldx, ldy, ldz/), ldx*ldy, 1, FFTW_BACKWARD, FFTW_ESTIMATE)


   if (.FALSE.) then
   !if (.TRUE.) then
     call dfftw_plan_many_dft_c2r(bw_plan3, 1, (/nz/),     ldx*ldy,    &
                                  ff, (/  ldx*ldy*ldz/),   ldx*ldy, 1, &
                                  ff, (/2*ldx*ldy*ldz/), 2*ldx*ldy, 2, FFTW_ESTIMATE, 1)

     if (bw_plan3==NULL_PLAN) then ! handle the error.
       MSG_ERROR("Check FFTW library and/or abinit code")
     end if

     call dfftw_execute_dft(bw_plan3, ff, ff)
     call fftw3_destroy_plan(bw_plan3)
   else
     ! Fill the Hermitian part: Hermitian redundancy: out[i] is the conjugate of out[n-i]
     padz = (nz/2+1)

     n = (/nz/)
     !n = (/padz/)
     nffts = ldx*ldy
     inembed  = (/ldx*ldy*ldz/)
     !istride = 1
     !idist   = nx*ny
     istride = ldx*ldy
     idist   = 1

     onembed  = (/ldx*ldy*ldz/)
     ostride = ldx*ldy
     odist   = 1

     ABI_MALLOC(gg,(ldx*ldy*ldz))

     c2r_plan = dplan_many_dft_c2r(rank1, n, nffts, ff, inembed, istride, idist, gg, &
&                                  onembed, ostride, odist, FFTW_ESTIMATE, nt_all)

     ! Now perform the 3D FFT via FFTW. c2r are always FFTW_BACKWARD
     call dfftw_execute_dft_c2r(c2r_plan, ff, gg)

     call fftw3_destroy_plan(c2r_plan)

     !call xcopy(ldx*ldy*ldz*ndat,gg,1,ff,2)

     do ifft=1,ldx*ldy*ldz
       ff(2*ifft-1) = gg(ifft)
       ff(2*ifft) = zero
     end do
     ABI_FREE(gg)
   end if
#endif
 
 CASE (FFTW_FORWARD) ! R --> G
   MSG_ERROR("R-->G not coded")
 
 CASE DEFAULT 
   MSG_BUG("Wrong isign")
 END SELECT

 ABI_FREE(do_fft_x)
 ABI_FREE(do_fft_y)

#else
 MSG_ERROR("FFTW3 support not activated")
 ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign/))
 ABI_UNUSED(gbound(1,1))
 ABI_UNUSED(ff(1))
#endif

end subroutine fftw3_fftpad_tr     
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/zpad_init
!! NAME
!!  zpad_init
!!
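!! FUNCTION
!!  Initialize the tables stored in a zpad_t object: the list of z-planes selected by
!!  gbound(:,2) and, for each plane, the (y,z) FFT indices of the lines along x that
!!  fall inside the y-limits read from gbound(:,2).
!!
!! INPUTS
!!   nx,ny,nz=Number of FFT divisions along x, y and z.
!!   ldx,ldy,ldz=Not referenced by this routine.
!!   mgfft=MAX(nx,ny,nz), only used to dimension gbound
!!   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
!!     See sphereboundary for more info.
!!
!! OUTPUT
!!  zpad<type(zpad_t)>=Object whose entries n_zplanes, zplane, nlinex and linex2ifft_yz
!!    are allocated and filled here. The memory can be released with zpad_free.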
!!
!! PARENTS
!!      m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine zpad_init(zpad,nx,ny,nz,ldx,ldy,ldz,mgfft,gbound)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'zpad_init'
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,mgfft
 type(zpad_t),intent(out) :: zpad
!arrays
 integer,intent(in) :: gbound(2*mgfft+8,2)

!Local variables-------------------------------
!scalars
 integer :: iy,iz_fft,iplane,nplanes,gz_first,gz_last,gb_row,gy_first,gy_last
 integer :: line_count,wrap_shift

! *************************************************************************

 ! Extrema of g_z, taken from rows 3 and 4 of the second column of gbound.
 gz_first = gbound(3,2)
 gz_last  = gbound(4,2)

 ! Number of z-planes between the two extrema (both included).
 nplanes = gz_last - gz_first + 1
 zpad%n_zplanes = nplanes

 ! zplane(:,i)        --> (z-index in the FFT box, row of gbound(:,2) with the y-limits) for plane i.
 ! linex2ifft_yz(:,l) --> (y-index, z-index) in the FFT box of the l-th line along x.
 ABI_MALLOC(zpad%zplane,      (2,nz))
 ABI_MALLOC(zpad%linex2ifft_yz, (2,nx*ny*nz))

 ! Offset added to the plane counter to wrap the planes with negative g_z to the end of the z-axis.
 wrap_shift = nz - nplanes
 line_count = 0

 ! Loop over the z-planes selected by gbound.
 do iplane=1,nplanes
   !
   ! The first gz_last+1 planes keep their index, the others are wrapped around.
   iz_fft = merge(iplane, iplane + wrap_shift, iplane <= gz_last+1)
   !
   ! Rows gb_row and gb_row+1 of gbound(:,2) give the y-limits for this plane.
   gb_row   = 2*iplane + 3
   gy_first = gbound(gb_row  ,2)
   gy_last  = gbound(gb_row+1,2)

   zpad%zplane(:,iplane) = (/iz_fft, gb_row/)

   ! Positive g_y: y-indices 1, ..., gy_last+1
   do iy=1,gy_last+1
     line_count = line_count + 1
     zpad%linex2ifft_yz(:,line_count) = (/iy, iz_fft/)
   end do

   ! Negative g_y: y-indices gy_first+ny+1, ..., ny
   do iy=gy_first+ny+1,ny
     line_count = line_count + 1
     zpad%linex2ifft_yz(:,line_count) = (/iy, iz_fft/)
   end do
 end do

 ! Total number of lines along x registered in linex2ifft_yz.
 zpad%nlinex = line_count

 return
 ! Never executed: only marks the arguments that are not referenced above.
 ABI_UNUSED((/ldx,ldy,ldz/))

end subroutine zpad_init
!!***

!----------------------------------------------------------------------

!!****f* m_fftw3/zpad_free
!! NAME
!!  zpad_free
!!
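!! FUNCTION
!!  Release the memory pointed to by the zplane and linex2ifft_yz entries of a zpad_t object.
!!  Entries that are not associated are left untouched.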
!! INPUTS
!!
!! OUTPUT
!!
!! PARENTS
!!      m_fftw3
!!
!! CHILDREN
!!
!! SOURCE

subroutine zpad_free(zpad)


!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'zpad_free'
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 type(zpad_t),intent(inout) :: zpad

!Local variables-------------------------------

! *************************************************************************

 ! Each pointer component is released only if it is associated.
 ! The block-if form is kept on purpose: ABI_FREE is a preprocessor macro.

 ! Table with the (y,z) indices of the lines along x.
 if (associated(zpad%linex2ifft_yz)) then
   ABI_FREE(zpad%linex2ifft_yz)
 end if

 ! Table with the z-planes.
 if (associated(zpad%zplane)) then
   ABI_FREE(zpad%zplane)
 end if

end subroutine zpad_free
!!***

END MODULE m_fftw3
!!***
