!{\src2tex{textfont=tt}}
!!****f* ABINIT/finddistrproc
!! NAME
!! finddistrproc
!!
!! FUNCTION
!! Find a distribution of processor to fill npkpt, npband, npfft and bandpp, knowing nproc
!!
!! COPYRIGHT
!! Copyright (C) 1999-2012 ABINIT group (FJ,MT)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
!!
!! INPUTS
!!  mpi_enreg=informations about MPI parallelization
!!  mband=maximum number of bands.
!!
!! OUTPUT
!!
!! SIDE EFFECTS
!!  mpi_enreg%keywp(6,250)= transient storage to print nproc,npkpt,npspinor,npband,npfft, and bandpp.
!!  mpi_enreg%trialproc(3)= transient storage for the number of processors and the error flag.
!!  dtset%npkpt= number of processors for parallelisation on k points
!!  dtset%npband=number of processors for parallelisation on bands
!!  dtset%npfft=number of processors for parallelisation on fft grid
!!  dtset%bandpp= internal parameter for lobpcg parallelisation algorithm
!!
!! PARENTS
!!      mpi_setup
!!
!! CHILDREN
!!      initmpi_world,sort_int,wrtout
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine finddistrproc(dtset,idtset,ierr,mband,mpi_enreg,tread_kgb)

 use defs_basis
 use defs_abitypes
 use m_profiling
 use m_errors
 use m_xmpi

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'finddistrproc'
 use interfaces_14_hidewrite
 use interfaces_28_numeric_noabirule
 use interfaces_51_manage_mpi
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!scalars
 type(MPI_type),intent(inout) :: mpi_enreg
 integer,intent(in) :: idtset,mband,tread_kgb
 integer,intent(inout) :: ierr
 type(dataset_type),intent(inout) :: dtset
!arrays

!Local variables-------------------------------
!scalars
 integer :: icount,i1,i3,i4,i5,ii,in,na,ncount,ncount1,ncount2,ncount3,ncount4,nleft
 integer :: npband,npfft,npkpt,nproc,nproc1,nprocmin,nspin
 real(dp):: weight0
 character(len=500) :: message
!arrays
 integer :: keywp(6,250),trialproc(2)
 integer,allocatable :: bandppndiv(:),ndiv(:),distp(:,:),distp1(:,:),distp2(:),iperm(:),work(:),work1(:),work2(:)
 real(dp),allocatable :: weight(:),rwork(:)

!******************************************************************

 if((dtset%optdriver/=RUNL_GSTATE).or.(dtset%iscf<0).or.(dtset%usewvl==1)) return
 if((mpi_enreg%nproc==1.and.dtset%paral_kgb==1).or.(dtset%paral_kgb==0.and.tread_kgb==1)) return
 if (((dtset%npkpt/=1.or.dtset%npband/=1.or.dtset%npfft/=1.or. &
& dtset%npspinor/=1.or.dtset%bandpp/=1).and.dtset%paral_kgb>=0)) then
   nproc=dtset%npkpt*dtset%npband*dtset%npfft*dtset%npspinor*dtset%npimage
   if(nproc>mpi_enreg%nproc)then
     write(message,'(a,a)' )ch10,&
&     ' finddistrproc: nproc must be equal to npkpt*npband*npfft*npspinor*npimage'
     MSG_ERROR(message)
   else
     call initmpi_world(mpi_enreg,nproc)
     return
   end if
 end if


 keywp=0;trialproc=0
 nprocmin=2
 if(xmpi_paral==1)then
   if(dtset%paral_kgb <0) then
     nproc=-dtset%paral_kgb;if (dtset%nimage>1) nproc=nproc/dtset%npimage
!    nprocmin=max(2,nproc-100)
   else
     nproc=mpi_enreg%nproc;if (dtset%nimage>1) nproc=nproc/dtset%npimage
     nprocmin=max(2,nproc-100)
!    nprocmin=nproc
   end if
 else
   if (dtset%paral_kgb>=0) return
   nproc=-dtset%paral_kgb;if (dtset%nimage>1) nproc=nproc/dtset%npimage
!  nprocmin=max(2,nproc-100)
 end if
 trialproc(1)=nproc

 if(nproc==1) return


!Spin treatment (could be improved)
 nspin=max(dtset%nsppol,dtset%nspinor)

!Compute bandpp and the list if divisors of mband
 ABI_ALLOCATE(ndiv,((mband/2)+1))
 ABI_ALLOCATE(bandppndiv,((mband/2)+1))
 bandppndiv=1
 ndiv=1
 icount=0
 do ii=1,mband/2
   na=mband/ii
   if(na*ii==mband)then
     icount=icount+1
     ndiv(icount)=na
     if(mband/(2*na)*(2*na)==mband)bandppndiv(icount)=2
     if(mband/(4*na)*(4*na)==mband)bandppndiv(icount)=4
   end if
 end do
 ncount=icount+1

!calculate all the possible npband and npfft
 ABI_ALLOCATE(distp,(3,ncount*(nproc-nprocmin+1)))
 distp=0
 icount=0
 do in=nprocmin,nproc
   npband=1;npfft=1;npkpt=1
!  if(2*dtset%nkpt*nspin>=in) then
   if(dtset%nkpt*nspin>=in) then
     npkpt=in
   else
     npkpt=dtset%nkpt*nspin
!    if((dtset%paral_kgb==1).and.(in/npkpt*npkpt/=in)) npkpt=1
     nleft=in/npkpt
     i1=0
     do ii=1,ncount
!      if((mband/(bandpp*ndiv(ii))*(bandpp*ndiv(ii)))/=mband) cycle
       if(ndiv(ii)>nleft)cycle
       npband=ndiv(ii)
       if (npband==1) cycle
       i1=1
       if(dtset%use_gpu_cuda==1) then
         npfft=1
       else
         npfft=nleft/npband
       end if
       if (npfft>npband) cycle
       nproc1=npkpt*npband*npfft
       if (nproc1==0) cycle
!      if((dtset%paral_kgb==1).and.(npband*npfft*npkpt/=nproc)) cycle
       icount=icount+1
       distp(1,icount)=npband;distp(2,icount)=npfft;distp(3,icount)=bandppndiv(ii)
     end do
   end if
 end do
 ncount1=icount
 if(icount==0) then
   distp(1,1)=npband;distp(2,1)=npfft;distp(3,1)=1
   ncount1=1
 end if

!Sort the possible npband by ascending order
 ABI_ALLOCATE(distp1,(3,ncount1))
 ABI_ALLOCATE(iperm,(ncount1))
 ABI_ALLOCATE(work,(ncount1))
 ABI_ALLOCATE(work1,(ncount1))
 do ii=1,ncount1
!  write(std_out,*)'coucou',distp(1,ii),distp(2,ii),distp(3,ii)
   iperm(ii)=ii
 end do
 call sort_int(ncount1,distp(1,1:ncount1),iperm)
 work(1:ncount1)=distp(2,1:ncount1)
 work1(1:ncount1)=distp(3,1:ncount1)
 do ii=1,ncount1
   distp(2,ii)=work(iperm(ii))
   distp(3,ii)=work1(iperm(ii))
 end do

!Eliminate the same distributions
 if (ncount1>=2) then
   i3=1;i4=1;work1(1)=0
   do icount=2,ncount1
     if ((distp(1,icount)==distp(1,icount-1))) then
       i4=i4+1
     else
       i3=i3+1
       work1(i3)=i4
       i4=1
     end if
   end do
   if ((distp(1,ncount1)/=distp(1,ncount1-1))) then
     i3=i3+1
     work1(i3)=1
   else
     i3=i3+1
     work1(i3)=i4
   end if
   ncount2=i3
   icount=0
   do ii=2,ncount2
     do i1=1,work1(ii)
       iperm(i1)=i1
     end do
     call sort_int(work1(ii),distp(2,icount+1:icount+work1(ii)),iperm(1:work1(ii)))
     work(1:work1(ii))=distp(3,1+icount:icount+work1(ii))
     do i5=1,work1(ii)
       distp(3,1+icount:icount+work1(ii))=work(iperm(i5))
     end do
     icount=icount+work1(ii)
   end do
   distp1(:,1)=distp(:,1)
   i3=1
   do icount=2,ncount1
     if ((distp(1,icount)/=distp1(1,i3)).or.(distp(2,icount)/=distp1(2,i3))) then
       i3=i3+1
       distp1(:,i3)=distp(:,icount)
     end if
   end do
   ncount4=i3
 else
   ncount4=1
   distp1(:,1)=distp(:,1)
 end if

 ABI_ALLOCATE(distp2,(ncount4))
 do icount=1,ncount4
   distp2(icount)=npkpt*distp1(1,icount)*distp1(2,icount)
 end do
 do i1=1,ncount4
   iperm(i1)=i1
 end do
 call sort_int(ncount4,distp2,iperm(1:ncount4))
 ABI_ALLOCATE(work2,(ncount4))
 work(1:ncount4)=distp1(1,1:ncount4)
 work1(1:ncount4)=distp1(2,1:ncount4)
 work2(1:ncount4)=distp1(3,1:ncount4)
 do i1=1,ncount4
   distp1(1,i1)=work(iperm(i1))
   distp1(2,i1)=work1(iperm(i1))
   distp1(3,i1)=work2(iperm(i1))
 end do
 ABI_DEALLOCATE(work)
 ABI_DEALLOCATE(work1)
 ABI_DEALLOCATE(work2)

 ABI_ALLOCATE(weight,(ncount4))
 do icount=1,ncount4
   weight(icount)=distp1(1,icount)/distp1(2,icount)/4.d0
   if ((distp1(1,icount)<=50).and.(distp1(2,icount)==1)) weight(icount)=1
!  write(std_out,*),distp1(1,icount),distp1(2,icount),distp1(3,icount)
 end do

!Store and print the results
 if(ncount4>250) then
   write(message,'(a,a,a,a)' )ch10,&
&   'WARNING in finddistrproc: more than 250 possible choices for nproc',ch10,&
&   'Only the first 250 ones are printed'
 end if
 if(dtset%paral_kgb<0)then
   dtset%npkpt=mpi_enreg%nproc
   ncount3=min(ncount4,250)
   trialproc(2)=1
   do icount=ncount3,1,-1
     keywp(1,ncount3-icount+1)=npkpt*distp1(1,icount)*distp1(2,icount)
     if (dtset%nspinor==2) then
       if (mod(npkpt,2)==0) then
         keywp(2,ncount3-icount+1)=npkpt/2
         keywp(3,ncount3-icount+1)=2
       else
         keywp(1,ncount3-icount+1)=0
       end if
     else
       keywp(2,ncount3-icount+1)=npkpt
       keywp(3,ncount3-icount+1)=1
     end if
     keywp(4,ncount3-icount+1)=distp1(1,icount)
     keywp(5,ncount3-icount+1)=distp1(2,icount)
     keywp(6,ncount3-icount+1)=distp1(3,icount)
   end do
 else
   if (ncount4>1) then
     ABI_ALLOCATE(rwork,(ncount4))
     do ii=1,ncount4
       rwork(ii)=abs(weight(ii)-1.d0)
     end do
     do ii=1,ncount4
       if (abs(rwork(ii)-minval(rwork))<1d-8) i1=ii
     end do
     ABI_DEALLOCATE(rwork)
   else
     i1=1
   end if
   if (distp1(1,i1)*distp1(2,i1)/=distp1(1,ncount4)*distp1(2,ncount4)) i1=ncount4
   if (dtset%nspinor==2.and.npkpt>dtset%nkpt) then
     dtset%npkpt=npkpt/2
     dtset%npspinor=2-mod(npkpt,2)
   else
     dtset%npkpt=npkpt
     dtset%npspinor=1
   end if
   dtset%npband=distp1(1,i1)
   dtset%npfft=distp1(2,i1)
   dtset%bandpp=distp1(3,i1)

!  if(distp1(1,i1)*distp1(2,i1)*npkpt/=mpi_enreg%nproc) dtset%npkpt=mpi_enreg%nproc/distp1(1,i1)/distp1(2,i1)+1
   keywp(1,1)=npkpt*distp1(1,i1)*distp1(2,i1)
   keywp(2,1)=npkpt
   keywp(3,1)=dtset%npspinor
   keywp(4,1)=distp1(1,i1)
   keywp(5,1)=distp1(2,i1)
   keywp(6,1)=distp1(3,i1)

   if(keywp(1,1)>mpi_enreg%nproc)then
     write(message,'(a,a)' )ch10,'finddistrproc: nproc must be equal to npkpt*npband*npfft'
     MSG_ERROR(message)
   else
     call initmpi_world(mpi_enreg,keywp(1,1)*dtset%npimage)
   end if
   if ((dtset%paral_kgb==0).and.(distp1(1,i1)*distp1(2,i1)/=1)) dtset%paral_kgb=1
 end if

 if (dtset%paral_kgb>0) dtset%wfoptalg=14

 ABI_DEALLOCATE(bandppndiv)
 ABI_DEALLOCATE(ndiv)
 ABI_DEALLOCATE(distp)
 ABI_DEALLOCATE(distp1)
 ABI_DEALLOCATE(distp2)
 ABI_DEALLOCATE(iperm)
 ABI_DEALLOCATE(weight)

!write(std_out,*)'finddistrproc: paral_kgb, npkpt, npband, npfft =',dtset%paral_kgb,npkpt,dtset%npband,dtset%npfft!!
!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!!  Printing of results

 if(dtset%paral_kgb/=0) then
   if(mpi_enreg%me==0)then
     if (any(keywp(1,:)/=0)) then
       write(message, '(4a,i4,a,i4,a,a,a)' ) ch10,&
&       ' mpi_setup : COMMENT -',ch10,&
&       ' For dataset=', idtset,&
&       '  a possible choice for less than ',trialproc(1),' processors is:',ch10,&
&       '  nproc     npkpt  npspinor    npband     npfft    bandpp    weight'
       call wrtout(std_out,  message,'COLL')
       if(trialproc(2)==1) call wrtout(ab_out,  message,'COLL')
       do ii=1,250
         if(keywp(1,ii)/=0) then
           weight0=keywp(4,ii)/keywp(5,ii)/4.d0
           if ((keywp(4,ii)<=50).and.(keywp(5,ii)==1)) weight0=1
           write(message, '(2x,i4,1x,5(3x,i4,3x),1x,f8.2)' ) keywp(1:6,ii),weight0
           call wrtout(std_out,message,'COLL')
           if(trialproc(2)==1) call wrtout(ab_out,  message,'COLL')
         end if
       end do
     end if
     if(trialproc(2)==1) then
       write(message,'(6a)' )ch10,&
&       ' Launch a parallel version of ABINIT with a number of processors among the above list,',ch10,&
&       ' and the associated input variables npkpt, npband, npfft and bandpp. ',ch10,&
&       ' The optimal weight is close to 1.'
       call wrtout(std_out,message,'COLL')
       call wrtout(ab_out,  message,'COLL')
       if (dtset%nimage>1) then
         write(message,'(2a)' )ch10,&
&         ' You are using a formalism involving several replicas of the cell (nimage>1 in input file):'
         call wrtout(std_out,message,'COLL')
         if (dtset%npimage==1) then
           write(message,'(7a)' )&
&           ' You could benefit by using the parallelization over the replicas',ch10,&
&           ' (putting the npimage>1 keyword in the input file).',ch10,&
&           ' Note that the numbers of processors listed above do not',ch10,&
&           ' take into account this level of parallelization.'
           call wrtout(std_out,message,'COLL')
         else
           write(message,'(a,i4,6a)' )&
&           ' npimage=',dtset%npimage,' processors have been attributed to ',&
&           'the parallelization over replicas.',ch10,&
&           ' The numbers given above have been computed taking into account',ch10,&
&           ' this first level of parallelization.'
           call wrtout(std_out,message,'COLL')
           if (mod(dtset%nimage,dtset%npimage)/=0) then
             write(message,'(3a,i4,a)' )&
&             ' Note that npimage does not divide nimage:',ch10,&
&             ' ',mod(dtset%nimage,dtset%npimage)*trialproc(1),&
&             ' processors are not allocated ; this is unefficient.'
             call wrtout(std_out,message,'COLL')
           end if
         end if
       end if
       call wrtout(std_out,ch10,'COLL')
     end if
   end if
   ierr=ierr+trialproc(2)
 end if



end subroutine finddistrproc
!!***
