!{\src2tex{textfont=tt}}
!!****f* ABINIT/mpi_setup
!! NAME
!! mpi_setup
!!
!! FUNCTION
!! Big loop on the datasets :
!! - compute mgfft,mpw,nfft,... for this data set ;
!! - fill mpi_enreg
!!  *** At the output of this routine, all the dtsets input variables are known ***
!! The content of dtsets should not be modified anymore afterwards.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2012 ABINIT group (FJ,MT)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
!!
!! INPUTS
!!  ndtset= number of datasets to be read; if 0, no multi-dataset mode
!!  ndtset_alloc=number of datasets, corrected for allocation of at least
!!      one data set.
!!
!! OUTPUT
!!  dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables,
!!   some of which are initialized here, while other were already
!!   initialized previously.
!!
!! SIDE EFFECTS
!!   mpi_enregs=informations about MPI parallelization
!!
!! NOTES
!!
!! PARENTS
!!      abinit
!!
!! CHILDREN
!!      abi_io_redirect,distrb2,finddistrproc,getmpw,getng,init_mpi_enreg
!!      initmpi_atom,initmpi_grid,initmpi_img,initmpi_pert,intagm,leave_new
!!      metric,mkrdim,wrtout
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine mpi_setup(dtsets,lenstr,mpi_enregs,ndtset,ndtset_alloc,string)

 use m_profiling

 use defs_basis
 use defs_abitypes
 use defs_parameters
 use m_xmpi
 use m_errors

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'mpi_setup'
 use interfaces_14_hidewrite
 use interfaces_16_hideleave
 use interfaces_32_util
 use interfaces_41_geometry
 use interfaces_42_parser
 use interfaces_51_manage_mpi
 use interfaces_56_recipspace
 use interfaces_57_iovars, except_this_one => mpi_setup
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: lenstr,ndtset,ndtset_alloc
 type(MPI_type),intent(inout) :: mpi_enregs(0:ndtset_alloc)
 character(len=*),intent(inout) :: string
!arrays
 type(dataset_type),intent(inout) :: dtsets(0:ndtset_alloc)

!Local variables -------------------------------
!scalars
 integer :: blocksize,exchn2n3d,iband,idtset,ierr,ii,iikpt,iikpt_modulo,isppol,jdtset,marr,mband_upper
 integer :: me_fft,mgfft,mgfftdg,mkmem,mpw,mpw_k
 integer :: nfft,nfftdg,ngfft7_default,nkpt,nkpt_me,nproc,nproc_fft,nqpt
 integer :: nspink,nsppol,nsym,paral_fft,response,tnband,tread,tread_kgb,usepaw,vectsize
 logical :: fftalg_read,ortalg_read,wfoptalg_read
 real(dp) :: dilatmx,ecut,ecut_eff,ecutdg_eff,ucvol
 character(len=500) :: message
!arrays
 integer :: ngfft(18),ngfftdg(18),ngfftc(3)
 integer,allocatable :: intarr(:),istwfk(:),symrel(:,:,:)
 real(dp) :: gmet(3,3),gprimd(3,3),qphon(3),rmet(3,3),rprimd(3,3)
 real(dp),allocatable :: dprarr(:),kpt_with_shift(:,:)
 character(len=30) :: token
 character(len=6) :: nm_mkmem(3)

!*************************************************************************

!DEBUG   XG120526 Please do not remove the following write.
!It makes the test paral#AH4 execute correctly with buda_gcc43, there is likely a compiler bug ...
 write(std_out,*)' mpi_setup : enter '
!ENDDEBUG

 mpw_k=0
 ierr=0
!Special treatment of ngfft(7)=fftalg, which is machine-dependent,
!and also depends on some input variables ...
 ngfft7_default=112
#if defined FC_FUJITSU
!For the vpp fujitsu, it is better to have the last digit at 1.
 ngfft7_default=111
#elif defined HAVE_FFT_ASL
!For the NEC computer, the library FFT routine is (presently) faster than the routines from Stefan
 ngfft7_default=200
#elif defined HAVE_FFT_FFTW3
!There is still a problem with the algorithm 312 in case of DMFT
 if(sum(dtsets(1:ndtset_alloc)%usedmft)==0)then
   ngfft7_default=312
 end if
#endif

 call init_mpi_enreg(mpi_enregs(0))
 call initmpi_img(dtsets(0),mpi_enregs(0),-1)

 do idtset=1,ndtset_alloc

   call init_mpi_enreg(mpi_enregs(idtset))

   if(xmpi_paral==0.and.dtsets(idtset)%paral_kgb==1)then
     dtsets(idtset)%paral_kgb=0
     write(message, '(8a)' )ch10,&
&     ' abinit : WARNING -',ch10,&
&     '  When ABINIT is compiled without MPI flag,',ch10,&
&     '  setting paral_kgb/=0 is useless. paral_kgb has been reset to 0.',ch10,&
&     '  Action : modify compilation option or paral_kgb in the input file.'
     call wrtout(std_out,  message,'COLL')
   end if

!  Read parameters and parallel input data
   marr=dtsets(idtset)%npsp;if (dtsets(idtset)%npsp<3) marr=3
   marr=max(marr,dtsets(idtset)%nimage)
   ABI_ALLOCATE(intarr,(marr))
   ABI_ALLOCATE(dprarr,(marr))
   nkpt  =dtsets(idtset)%nkpt
   nsppol=dtsets(idtset)%nsppol
   jdtset=dtsets(idtset)%jdtset ; if(ndtset==0)jdtset=0
   usepaw=dtsets(idtset)%usepaw
   mband_upper=maxval(dtsets(idtset)%nband(1:nkpt*nsppol))

   token = 'npimage'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%npimage=intarr(1)
   else
     dtsets(idtset)%npimage=mpi_enregs(idtset)%nproc
     ii=dtsets(idtset)%ndynimage;if (dtsets(idtset)%ntimimage<=1) ii=dtsets(idtset)%nimage
     if (ii<mpi_enregs(idtset)%nproc) dtsets(idtset)%npimage=ii
   end if

   token = 'npkpt'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) dtsets(idtset)%npkpt=intarr(1)

   dtsets(idtset)%npspinor=1
   token = 'npspinor'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) dtsets(idtset)%npspinor=intarr(1)

   token = 'npfft'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) dtsets(idtset)%npfft=intarr(1)

   token = 'npband'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) dtsets(idtset)%npband=intarr(1)

   token = 'bandpp'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) dtsets(idtset)%bandpp=intarr(1)

   token = 'nbdblock'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%nbdblock=intarr(1)
   else
     dtsets(idtset)%nbdblock=1
   end if

   tnband=0
   token = 'nband'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tnband,'INT')

   token = 'paral_kgb'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread_kgb,'INT')

   if ((dtsets(idtset)%npkpt/=1.or.dtsets(idtset)%npband/=1.or.dtsets(idtset)%npfft/=1.or. &
&   dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1).and.(dtsets(idtset)%paral_kgb>=0)) then
     if (xmpi_paral==0) dtsets(idtset)%paral_kgb=0
     if (xmpi_paral==1) dtsets(idtset)%paral_kgb=1
   end if

!  From total number of procs, compute all possible distributions
   call finddistrproc(dtsets(idtset),idtset,ierr,mband_upper,mpi_enregs(idtset),tread_kgb)

   if(((dtsets(idtset)%optdriver/=RUNL_GSTATE).and.(dtsets(idtset)%npkpt/=1.or.dtsets(idtset)%npband/=1&
&   .or.dtsets(idtset)%npfft/=1.or.dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1))&
&   .or.(dtsets(idtset)%iscf<0)) then
     dtsets(idtset)%npfft=1
     dtsets(idtset)%npband=1
     dtsets(idtset)%npkpt=1
     dtsets(idtset)%npspinor=1
     dtsets(idtset)%bandpp=1
     write(message,'(4a)') ch10,&
&     ' mpi_setup : WARNING -',ch10,&
&     ' For non ground state calculation, set bandpp, npfft, npband, nspinor and npkpt to 1'
     call wrtout(std_out,message,'COLL')
   end if
   if ((dtsets(idtset)%optdriver/=RUNL_GSTATE).or.(dtsets(idtset)%iscf<0)) dtsets(idtset)%paral_kgb=0

!  Read again some input data to take into account a possible change of paral_kgb
   token = 'wfoptalg' ; wfoptalg_read=.false.
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%wfoptalg=intarr(1)
     wfoptalg_read=.true.
   else
     if (dtsets(idtset)%usepaw==0)    dtsets(idtset)%wfoptalg=0
     if (dtsets(idtset)%usepaw/=0)    dtsets(idtset)%wfoptalg=10
     if (dtsets(idtset)%optdriver==RUNL_GSTATE) then
       if (dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=14
!      MT sept 2012: following default for wfoptalg has been displaced at the end of the routine
!      now depends on gpu_linalg_limit
!      if (dtsets(idtset)%use_gpu_cuda==1.or.dtsets(idtset)%use_gpu_cuda==-1) dtsets(idtset)%wfoptalg=14
     end if
   end if

   dtsets(idtset)%ngfft(7)=ngfft7_default
   token = 'fftalg' ; fftalg_read=.false.
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%ngfft(7)=intarr(1)
     if (usepaw==1) dtsets(idtset)%ngfftdg(7)=intarr(1)
     fftalg_read=.true.
   else
     if (dtsets(idtset)%optdriver==RUNL_GSTATE) then
       if(dtsets(idtset)%paral_kgb==1.or.dtsets(idtset)%use_gpu_cuda/=0.or. &
&       mod(dtsets(idtset)%wfoptalg,10)==4) then
         dtsets(idtset)%ngfft(7)=401
         if (usepaw==1) dtsets(idtset)%ngfftdg(7)=401
       end if
     end if
   end if

   token = 'fft_opt_lob'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%fft_opt_lob=intarr(1)
   else
     if(dtsets(idtset)%paral_kgb==1) dtsets(idtset)%fft_opt_lob=2
   end if

   dtsets(idtset)%iprcch=2
   token = 'iprcch'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%iprcch=intarr(1)
   else
     if (dtsets(idtset)%paral_kgb==1) dtsets(idtset)%iprcch=6
   end if
   if((dtsets(idtset)%iscf==5.or.dtsets(idtset)%iscf==6) &
&   .and. dtsets(idtset)%ionmov==4 .and. dtsets(idtset)%iprcch/=3 )then
     dtsets(idtset)%iprcch=3
     write(message, '(a,a,a,a,a,a)' ) ch10,&
&     ' mpi_setup: COMMENT -',ch10,&
&     '  When ionmov==4 and iscf==5 or 6, iprcch must be 3.',ch10,&
&     '  Set iprcch to 3.'
     call wrtout(std_out,message,'COLL')
   end if

   if (usepaw==0) then
     dtsets(idtset)%ortalg=2
   else
     dtsets(idtset)%ortalg=-2
   end if
   token = 'ortalg' ; ortalg_read=.false.
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%ortalg=intarr(1)
     ortalg_read=.true.
   else if (dtsets(idtset)%wfoptalg>=10 .and. dtsets(idtset)%ortalg>0) then
     dtsets(idtset)%ortalg=-dtsets(idtset)%ortalg
   end if

   token = 'accesswff'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%accesswff=intarr(1)
   else
     if ((xmpi_mpiio==1).and.(dtsets(idtset)%paral_kgb==1)) &
&     dtsets(idtset)%accesswff=IO_MODE_MPI
   end if

   token = 'pawmixdg'
   call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
   if(tread==1) then
     dtsets(idtset)%pawmixdg=intarr(1)
   else if (dtsets(idtset)%npfft>1.and.usepaw==1) then
     dtsets(idtset)%pawmixdg=1
   end if

   if (dtsets(idtset)%paral_rf/=0) dtsets(idtset)%paral_atom=0
   if (dtsets(idtset)%prtden>1.and.dtsets(idtset)%paral_kgb<=0) dtsets(idtset)%paral_atom=0

   mpi_enregs(idtset)%paral_kgb=dtsets(idtset)%paral_kgb
   call initmpi_img(dtsets(idtset),mpi_enregs(idtset),-1)
!  Cycle if the processor is not used
   if (mpi_enregs(idtset)%me<0) then
     ABI_DEALLOCATE(intarr)
     ABI_DEALLOCATE(dprarr)
     cycle
   end if
   response=0
   if(dtsets(idtset)%rfddk/=0 .or. dtsets(idtset)%rfelfd/=0 .or. &
&   dtsets(idtset)%rfphon/=0 .or. &
&   dtsets(idtset)%rfstrs/=0 .or.  dtsets(idtset)%rfuser/=0    ) response=1
   nproc=mpi_enregs(idtset)%nproc_cell
!  --IF CUDA AND RECURSION:ONLY BAND PARALLELISATION
   if(dtsets(idtset)%tfkinfunc==2 .and. nproc/=1)then
     dtsets(idtset)%npband = dtsets(idtset)%npband*dtsets(idtset)%npkpt*dtsets(idtset)%npspinor*dtsets(idtset)%npfft
     dtsets(idtset)%npkpt = 1
     dtsets(idtset)%npfft = 1
     dtsets(idtset)%npspinor = 1
     write(message, '(8a,i6,a)' ) ch10,&
&     ' mpi_setup1: WARNING -',ch10,&
&     '  If HAVE_GPU_CUDA and recursion are used ',ch10,&
&     '  only the band parallelisation is active, we set:',ch10,&
&     '  npfft= 1, npkpt= 1, npband=',dtsets(idtset)%npband,' .'
     call wrtout(std_out,message,'COLL')
   end if

   if (dtsets(idtset)%npspinor>=2.and.dtsets(idtset)%nspinor==1) then
     dtsets(idtset)%npspinor=1
     dtsets(idtset)%npfft=2*dtsets(idtset)%npfft
     write(message,'(6a)') ch10,&
&     ' abinit : WARNING -',ch10,&
&     '  npspinor is bigger than nspinor !',ch10,&
&     '  We set npspinor to 1 ; we set npfft to 2*npfft'
     call wrtout(std_out,message,'COLL')
   end if

!  Some checks on parallelization data
   if(dtsets(idtset)%paral_kgb/=0.and.(dtsets(idtset)%bandpp/=1.or.dtsets(idtset)%npband/=1.or.&
&   dtsets(idtset)%npfft/=1.or.dtsets(idtset)%npkpt/=1.or.dtsets(idtset)%npspinor/=1))then
     if(dtsets(idtset)%npkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor > nproc )then
       write(message,'(10a)') ch10,&
&       ' abinit : WARNING -',ch10,&
&       '  The product of npkpt, npfft, npband and npspinor is bigger than the number of processors.',ch10,&
&       '  The user-defined values of npkpt, npfft, npband or npspinor will be modified,',ch10,&
&       '  in order to bring this product below nproc .',ch10,&
&       '  At present, only a very simple algorithm is used ...'
       call wrtout(std_out,message,'COLL')

       if(dtsets(idtset)%npkpt*dtsets(idtset)%npband*dtsets(idtset)%npspinor <= nproc) then
         dtsets(idtset)%npfft=1
         write(message,'(4a)') ch10,&
&         ' abinit : WARNING -',ch10,&
&         '  Set npfft to 1'
         call wrtout(std_out,message,'COLL')
       else if(dtsets(idtset)%npkpt*dtsets(idtset)%npspinor <= nproc)then
         dtsets(idtset)%npfft=1
         dtsets(idtset)%npband=1
         write(message,'(4a)') ch10,&
&         ' abinit : WARNING -',ch10,&
&         '  Set npfft and npband to 1'
         call wrtout(std_out,message,'COLL')
       else if(dtsets(idtset)%npkpt <= nproc)then
         dtsets(idtset)%npfft=1
         dtsets(idtset)%npband=1
         dtsets(idtset)%npspinor=1
         write(message,'(4a)') ch10,&
&         ' abinit : WARNING -',ch10,&
&         '  Set npfft ,npband and npspinor to 1'
         call wrtout(std_out,message,'COLL')
       else
         dtsets(idtset)%npfft=1
         dtsets(idtset)%npband=1
         dtsets(idtset)%npkpt=1
         dtsets(idtset)%npspinor=1
         write(message,'(4a)') ch10,&
&         ' abinit : WARNING -',ch10,&
&         '  Set npfft, npband, nspinor and npkpt to 1'
         call wrtout(std_out,message,'COLL')
       end if
     else if(dtsets(idtset)%npkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor < nproc)then
       write(message,'(5a)') ch10,&
&       ' abinit : STOP -',ch10,&
&       ' The number of processor must not be greater than npfft*npband*npkpt*npsinor ',&
&       ' when npfft or npkpt or npband or nspinor are chosen manually in the input file.'
       call wrtout(std_out,message,'COLL')
       call leave_new('COLL')
     end if
   end if

!  Set mpi_enreg
   if(dtsets(idtset)%paral_kgb/=0)then
     mpi_enregs(idtset)%nproc_kpt=dtsets(idtset)%npkpt
     mpi_enregs(idtset)%nproc_fft=dtsets(idtset)%npfft
     mpi_enregs(idtset)%nproc_band=dtsets(idtset)%npband
     mpi_enregs(idtset)%nproc_spinor=min(dtsets(idtset)%npspinor,dtsets(idtset)%nspinor)
     mpi_enregs(idtset)%bandpp=dtsets(idtset)%bandpp
     if (dtsets(idtset)%fft_opt_lob == 0) dtsets(idtset)%fft_opt_lob=1
   else
     mpi_enregs(idtset)%nproc_kpt = mpi_enregs(idtset)%nproc_cell
   end if


   if(dtsets(idtset)%paral_kgb>=0) then
!    Compute processor distribution over kpt (and eventually band-fft)
     mpi_enregs(idtset)%paral_pert=dtsets(idtset)%paral_rf
     if(mpi_enregs(idtset)%paral_pert==1) then
       token = 'nppert'
       call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
       if(dtsets(idtset)%nppert>mpi_enregs(idtset)%nproc) then
         write(message,'(4a)') ch10,&
&         ' abinit : STOP -',ch10,&
&         ' The number of processor must not be smaller than nppert '
         call wrtout(std_out,message,'COLL')
         call leave_new('COLL')
       end if
       call initmpi_pert(dtsets(idtset),mpi_enregs(idtset),tread)
       mpi_enregs(idtset)%nproc_kpt = mpi_enregs(idtset)%nproc_cell
       nproc=mpi_enregs(idtset)%nproc_cell
     end if
!    Cycle if the processor is not used
     if (mpi_enregs(idtset)%me<0) then
       ABI_DEALLOCATE(intarr)
       ABI_DEALLOCATE(dprarr)
       cycle
     end if

     call initmpi_grid(mpi_enregs(idtset))

!    Compute the maximal number of k points treated per processor
     nkpt_me=nkpt
     ABI_ALLOCATE(mpi_enregs(idtset)%proc_distrb,(nkpt,mband_upper,nsppol))
     ABI_ALLOCATE(mpi_enregs(idtset)%my_kpttab,(nkpt))
     mpi_enregs(idtset)%proc_distrb=0
     if(xmpi_paral==1 .and. dtsets(idtset)%usewvl == 0) then
!      Determine who I am
!      Define k-points distribution
!      Note that nkpt_me may differ from processor to processor
!      This fact will NOT be taken into account when
!      the memory needs will be evaluated in the subroutine memory.
!      Also, the reduction of k points due to symmetry in RF calculations
!      is NOT taken into account. This should be changed later ...

       nkpt_me=0
       if(response==0)then
         mpi_enregs(idtset)%paralbd=0
         call distrb2(mband_upper,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
         do iikpt=1,nkpt
           if(.not.(proc_distrb_cycle(mpi_enregs(idtset)%proc_distrb,iikpt,1,1,-1,mpi_enregs(idtset)%me_kpt)))&
&           nkpt_me=nkpt_me+1
         end do ! ikpt=1,nkpt

       else ! response==1
!        Wrongly assumes that the number of elements of the
!        k-point sets of the two spin polarizations is the maximal
!        value of one of these k-point sets ...
!        This is to be corrected when RF is implemented
!        for spin-polarized case.
         mpi_enregs(idtset)%paralbd=1
!        nproc=mpi_enregs(idtset)%nproc_cell*mpi_enregs(idtset)%nproc_pert
         call distrb2(mband_upper,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
         do isppol=1,nsppol
           nspink=0
           do iikpt=1,nkpt
             do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt)
               if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_cell)then
                 nspink=nspink+1
                 exit
               end if
             end do ! iband
           end do ! iikpt
           if(nspink>nkpt_me)nkpt_me=nspink
         end do ! isppol
!        If the number of bands was estimated, there might be a side effect
!        when the definitive number of bands is known. k points
!        might be attributed to different processors than the present
!        proc_distrb describes. At most, the number of k points could
!        increase by 1 ...
         if(tnband==0)nkpt_me=nkpt_me+1
!        In any case, the maximal number of k points is nkpt
         if(nkpt_me>nkpt)nkpt_me=nkpt
       end if
     end if
   end if

!  Take care of mkmems. Use the generic name -mkmem- for mkmem as well as mkqmem
!  and mk1mem.
   nm_mkmem(1)='mkmem '
   nm_mkmem(2)='mkqmem'
   nm_mkmem(3)='mk1mem'

   do ii=1,3

!    Read in mkmem here if it is in the input file
     if(ii==1)then
       token = 'mkmem'
       call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
     else if(ii==2)then
       token = 'mkqmem'
       call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
     else if(ii==3)then
       token = 'mk1mem'
       call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),token,tread,'INT')
     end if

!    Note that mkmem is used as a dummy variable, representing mkmem as well
!    as mkqmem, and mk1mem.
     if(tread==1) then
       mkmem=intarr(1)
       if (mkmem<0) then
!        mkmem is unreasonable; must be zero or positive
         write(message, '(a,a,a,a,a,a,a,a,i6,a,a,a,a)' ) ch10,&
&         ' mpi_setup1: WARNING -',ch10,&
&         '  ',nm_mkmem(ii),' must be positive or nul but ',&
&         nm_mkmem(ii),' =',mkmem,ch10,&
&         '  Use default ',nm_mkmem(ii),' = nkpt .'
         call wrtout(std_out,message,'COLL')
         mkmem=nkpt
       end if

     else

!      mkmem was not set in the input file so default to incore solution
       write(message, '(a,a,a,a,a,a)' ) &
&       ' mpi_setup1: ',nm_mkmem(ii),' undefined in the input file.',&
&       ' Use default ',nm_mkmem(ii),' = nkpt'
       call wrtout(std_out,message,'COLL')
       mkmem=nkpt
     end if

!    Check whether nkpt distributed on the processors <= mkmem;
!    if so then may run entirely in core,
!    avoiding i/o to disk for wavefunctions and kg data.
!    mkmem/=0 to avoid i/o; mkmem==0 to use disk i/o for nkpt>=1.
     if (nkpt_me<=mkmem .and. mkmem/=0 ) then
       write(message, '(a,i5,a,a,a,i5,a)' ) &
&       ' mpi_setup1: With nkpt_me=',nkpt_me,&
&       ' and ',nm_mkmem(ii),' = ',mkmem,', ground state wf handled in core.'
       call wrtout(std_out,message,'COLL')
       if(nkpt_me<mkmem .and. nkpt_me/=0)then
         write(message, '(a,a,a)' ) &
&         ' Resetting ',nm_mkmem(ii),' to nkpt_me to save memory space.'
         mkmem=nkpt_me
         call wrtout(std_out,message,'COLL')
       end if
     else if(mkmem/=0)then
       write(message, '(a,i5,a,a,a,i5,a,a,a,a,a)' ) &
&       ' mpi_setup1: With nkpt_me=',nkpt_me,&
&       ' and ',nm_mkmem(ii),' = ',mkmem,&
&       ' ground state wf require disk i/o.',ch10,&
&       ' Resetting ',nm_mkmem(ii),' to zero to save memory space.'
       mkmem=0
       call wrtout(std_out,message,'COLL')
     end if
     if(dtsets(idtset)%usewvl == 0)then
       if(ii==1)dtsets(idtset)%mkmem=mkmem
     end if
     if(ii==2)dtsets(idtset)%mkqmem=mkmem
     if(ii==3)dtsets(idtset)%mk1mem=mkmem

!    check condition for no empty processors
!    mkmem_eff = ceiling(dtset%nkpt/mpi_enreg%nproc_kpt)
     if (dtsets(idtset)%usewvl == 0)then
       if (mpi_enregs(idtset)%paralbd==0.or.mkmem>1) then
         if (mpi_enregs(idtset)%nproc_kpt - floor(nsppol*dtsets(idtset)%nkpt*one/mkmem) >= mkmem) then
           write (message,'(4a,I6,3a,I6,5a,I6,a)') ch10, &
&           ' mpi_setup1 : WARNING -',ch10, &
&           '   Your number of k-points (',dtsets(idtset)%nkpt,') will not distribute correctly',ch10, &
&           '   with the current number of processors (',mpi_enregs(idtset)%nproc_kpt,').',ch10,&
&           '   You will leave some empty.',ch10, &
&           ' ACTION: you can reduce number of processors to ', floor(nsppol*dtsets(idtset)%nkpt*one/mkmem), &
&           ' without losing speed.'
           call wrtout(std_out,message,'COLL')
         end if
       else
         if (mod(mband_upper,max(1,mpi_enregs(idtset)%nproc_kpt/(nsppol*dtsets(idtset)%nkpt)))/=0) then
           write (message,'(4a,I6,a,I6,3a,I6,3a)') ch10, &
&           ' mpi_setup1 : WARNING -',ch10, &
&           '   Your number of k-points (',dtsets(idtset)%nkpt,') and bands (',mband_upper,&
&           ') will not distribute correctly',ch10, &
&           '   with the current number of processors (',mpi_enregs(idtset)%nproc_kpt,').',ch10,&
&           '   You will leave some empty.'
           call wrtout(std_out,message,'COLL')
         end if
       end if
     end if

!    End the loop on the three possiblities mkmem, mkqmem, mk1mem.
   end do
   if(dtsets(idtset)%paral_kgb==1) mpi_enregs(idtset)%paralbd=0


!  call mpi_setup1(dtsets(idtset),jdtset,lenstr,mband_upper,mpi_enregs(idtset),string)
!  Printing of processor distribution
!  MPIWF : here, set up the complete ngfft, containing the information
!  for the parallelisation of the FFT
   call abi_io_redirect(new_io_comm=mpi_enregs(idtset)%comm_world,new_leave_comm=mpi_enregs(idtset)%comm_world)

!  Default values for sequentiel case
   paral_fft=0
   nproc_fft=1
   me_fft=0

   if(dtsets(idtset)%usewvl == 0)then
     if(dtsets(idtset)%optdriver==RUNL_GSTATE.or.dtsets(idtset)%optdriver==RUNL_RESPFN) then
       paral_fft=1           ! parallelisation over FFT
       if (mpi_enregs(idtset)%nproc_cell>0) then
         if(mpi_enregs(idtset)%paral_kgb == 1) then

           if((dtsets(idtset)%use_gpu_cuda==1).and.(mpi_enregs(idtset)%nproc_fft/=1))then
             write(message,'(3a,i5)') &
&             '  When use_gpu_cuda is on, the number of FFT processors, npfft, must be 1',ch10,&
&             '  However, npfft=',mpi_enregs(idtset)%nproc_fft
             MSG_ERROR(message)
           end if

           if(modulo(dtsets(idtset)%ngfft(2),mpi_enregs(idtset)%nproc_fft)/=0)then
             write(message,'(5a,i5,a,i5)') &
&             '  The number of FFT processors, npfft, should be',ch10,&
&             '  a multiple of the number of ngfft(2).',ch10,&
&             '  However, npfft=',mpi_enregs(idtset)%nproc_fft,' and ngfft(2)=',dtsets(idtset)%ngfft(2)
             MSG_BUG(message)
           end if

           do iikpt=1,nkpt*nsppol
             iikpt_modulo = modulo(iikpt,nkpt)+1
             if ((dtsets(idtset)%istwfk(iikpt_modulo)==2).and.(dtsets(idtset)%ngfft(7)==401)) then
               if ((mpi_enregs(idtset)%bandpp==0).or. &
               ((mpi_enregs(idtset)%bandpp/=1).and.(modulo(mpi_enregs(idtset)%bandpp,2)/=0))) then
                 write(message,'(3a,i5)') &
&                 '  The number bandpp should be 1 or a multiple of 2',ch10,&
&                 '  However, bandpp=',mpi_enregs(idtset)%bandpp
                 MSG_BUG(message)
               end if
               if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
                 write(message,'(5a,i5,a,i5)') &
&                 '  The number of band for the k-point, nband_k, should be',ch10,&
&                 '  a multiple of the number nproc_band*bandpp.',ch10,&
&                 '  However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
&                 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
                 MSG_BUG(message)
               end if
             elseif ((dtsets(idtset)%istwfk(iikpt_modulo)==2) .and. (dtsets(idtset)%ngfft(7)==400)) then
               message='  The fftalg=400 with istwfk=2 is not valid'
               MSG_BUG(message)
             else
               if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
                 write(message,'(5a,i5,a,i5)') &
&                 '  The number of band for the k-point, nband_k, should be',ch10,&
&                 '  a multiple of the number nproc_band*bandpp.',ch10,&
&                 '  However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
&                 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
                 MSG_BUG(message)
               end if
               if ((mpi_enregs(idtset)%bandpp==0)) then
                 write(message,'(a,i5,2a,i5,2a,i5)')&
&                 '  The number bandpp should not be 0 with fftalg=',dtsets(idtset)%ngfft(7),ch10,&
&                 ' and istwfk=',dtsets(idtset)%istwfk(iikpt_modulo),ch10,&
&                 '  However, bandpp=',mpi_enregs(idtset)%bandpp
                 MSG_BUG(message)
               end if
             end if
           end do

           if (xmpi_paral==1) then
             if(modulo(nkpt*nsppol,mpi_enregs(idtset)%nproc_kpt)/=0)then
               write(message,'(5a,i5,a,i5)') &
&               '  The number of KPT processors, npkpt, should be',ch10,&
&               '  a multiple of the number of nkpt*nsppol.',ch10,&
&               '  However, npkpt=',mpi_enregs(idtset)%nproc_kpt,' and nkpt*nsppol=',nkpt*nsppol
               MSG_WARNING(message)
             end if
           end if
         end if
       end if
       nproc_fft=mpi_enregs(idtset)%nproc_fft
       me_fft=mpi_enregs(idtset)%me_fft
     end if
   end if

!  Compute mgfft,mpw,nfft for this data set ( it is dependent of mpi_enreg)
   call mkrdim(dtsets(idtset)%acell_orig(1:3,1),dtsets(idtset)%rprim_orig(1:3,1:3,1),rprimd)
   call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)

   ABI_ALLOCATE(istwfk,(nkpt))
   ABI_ALLOCATE(kpt_with_shift,(3,nkpt))

   ecut     =dtsets(idtset)%ecut
   dilatmx  =dtsets(idtset)%dilatmx
   ngfft(:) =dtsets(idtset)%ngfft(:)
   istwfk(:)=dtsets(idtset)%istwfk(1:nkpt)
   nsym     =dtsets(idtset)%nsym

   ABI_ALLOCATE(symrel,(3,3,nsym))
   symrel(:,:,1:nsym)=dtsets(idtset)%symrel(:,:,1:nsym)
   ecut_eff=ecut*dilatmx**2

   if (usepaw==1) then
     write(message,'(2a)') ch10,' getng is called for the coarse grid:'
     call wrtout(std_out,message,'COLL')
   end if
   call getng(dtsets(idtset)%boxcutmin,ecut_eff,gmet,me_fft,mgfft,nfft,&
&   ngfft,nproc_fft,nsym,dtsets(idtset)%fft_opt_lob,paral_fft,symrel,&
&   use_gpu_cuda=dtsets(idtset)%use_gpu_cuda)
   dtsets(idtset)%ngfft(:)=ngfft(:)
   dtsets(idtset)%mgfft=mgfft
   dtsets(idtset)%nfft=nfft
   kpt_with_shift(:,:)=dtsets(idtset)%kpt(:,1:nkpt)/dtsets(idtset)%kptnrm
   nqpt=dtsets(idtset)%nqpt

   exchn2n3d=dtsets(idtset)%exchn2n3d
   nproc_fft=ngfft(10) ; me_fft=ngfft(11)
   if(response/=0)then
!    This value of mpw is used in the first part of respfn.f
     call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw_k,nkpt)
   end if
   qphon(:)=zero
   if(nqpt/=0)then
     qphon(:)=dtsets(idtset)%qptn(:)
     kpt_with_shift(1,:)=kpt_with_shift(1,:)+qphon(1)
     kpt_with_shift(2,:)=kpt_with_shift(2,:)+qphon(2)
     kpt_with_shift(3,:)=kpt_with_shift(3,:)+qphon(3)
   end if
   if (dtsets(idtset)%usewvl == 0) then
     call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw,nkpt)
     if((xmpi_mpiio==1).and.(dtsets(idtset)%accesswff==IO_MODE_MPI).and.(mpi_enregs(idtset)%paral_kgb == 1)) then
       ABI_ALLOCATE(mpi_enregs(idtset)%my_kgtab,(mpw,dtsets(idtset)%mkmem))
     end if
   else
     mpw = 0
   end if

!  The dimensioning, in the RF case, should be done only with mpw,
!  but mpw is used in the first part of respfn.f, and should at least
!  be equal to mpw_k . The chosen way to code is not optimal, only convenient :
!  it leads to a small waste of memory.
   if(response/=0 .and. mpw_k>mpw)mpw=mpw_k
   dtsets(idtset)%ngfft(:)=ngfft(:)

!  Initialize ngfftc to the initial guess for the coarse mesh
   ngfftc(:) = 2
!  In case of PAW, compute fine FFT parameters
   if (usepaw==1) then
     ecutdg_eff=dtsets(idtset)%pawecutdg*dtsets(idtset)%dilatmx**2
     ngfftdg(:)=dtsets(idtset)%ngfftdg(:)
     write(message,'(2a)') ch10,' getng is called for the fine grid:'
     call wrtout(std_out,message,'COLL')
!    Start with the coarse mesh as an initial guess for the fine mesh
!    This ensures that the fine mesh will not be any coarser than the coarse mesh in each dimension
     ngfftc(:) = ngfft(1:3)
     call getng(dtsets(idtset)%bxctmindg,ecutdg_eff,gmet,me_fft,mgfftdg,&
&     nfftdg,ngfftdg,nproc_fft,nsym,dtsets(idtset)%fft_opt_lob,paral_fft,symrel,ngfftc,&
&     use_gpu_cuda=dtsets(idtset)%use_gpu_cuda)
     dtsets(idtset)%ngfftdg(:)=ngfftdg(:)
     dtsets(idtset)%mgfftdg=mgfftdg
     dtsets(idtset)%nfftdg=nfftdg
   end if
   dtsets(idtset)%mpw=mpw
   ABI_DEALLOCATE(symrel)
   ABI_DEALLOCATE(istwfk)
   ABI_DEALLOCATE(kpt_with_shift)
   ABI_DEALLOCATE(intarr)
   ABI_DEALLOCATE(dprarr)

!  Initialize data for the parallelization over atomic sites (PAW)
   call initmpi_atom(dtsets(idtset),mpi_enregs(idtset))

!  In case of the use of a GPU (Cuda), some defaults can change
!  according to a threshold on matrix sizes
   if (dtsets(idtset)%use_gpu_cuda==1.or.dtsets(idtset)%use_gpu_cuda==-1) then
     if (dtsets(idtset)%optdriver==RUNL_GSTATE) then
       vectsize=dtsets(idtset)%mpw*dtsets(idtset)%nspinor/dtsets(idtset)%npspinor
       if (all(dtsets(idtset)%istwfk(:)==2)) vectsize=2*vectsize
       blocksize=dtsets(idtset)%npband*dtsets(idtset)%bandpp
       if (dtsets(idtset)%paral_kgb==0) blocksize=dtsets(idtset)%npfft
       if ((vectsize*blocksize**2)>=dtsets(idtset)%gpu_linalg_limit) then
         if (.not.wfoptalg_read) then
           dtsets(idtset)%wfoptalg=14
           if (.not.fftalg_read) then
             dtsets(idtset)%ngfft(7)=401;if (usepaw==1) dtsets(idtset)%ngfftdg(7)=401
           end if
           if (.not.ortalg_read) dtsets(idtset)%ortalg=-abs(dtsets(idtset)%ortalg)
         end if
       end if
     end if
   end if

 end do

 if(ierr/=0)then
   call leave_new('COLL',print_config=.false.)
 end if

!DEBUG
!write(std_out,*)' mpi_setup : exit ', mpw
!ENDDEBUG

end subroutine mpi_setup
!!***
