!{\src2tex{textfont=tt}}
!!****p* ABINIT/lapackprof
!! NAME
!! lapackprof
!!
!! FUNCTION
!!  Utility for profiling the (Sca)Lapack libraries supported by abinit.
!!
!! COPYRIGHT
!! Copyright (C) 2004-2012 ABINIT group (MG)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!!
!! INPUTS
!!  (main program)
!!
!! OUTPUT
!!  Timing analysis of the different libraries and algorithms.
!!
!! NOTES
!!
!! PARENTS
!!
!! CHILDREN
!!      abi_io_redirect,blas_cholesky_ortho,cg_zaxpy,cg_zcopy,cg_zgemm,cg_zgemv
!!      cwtime,destroy_mpi_enreg,herald,init_mpi_enreg,pw_orthon,random_number
!!      sqmat_itranspose,test_xginv,wrtout,xcast_mpi,xgerc,xheevx,xhpev
!!      xmpi_end,xmpi_init,xomp_show_info
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

program lapackprof

 use defs_basis
 use defs_abitypes
 use m_build_info
 use m_xmpi
 use m_xomp
 use m_errors
 use m_blas
 use m_cgtools
 use m_abilasi
#if defined HAVE_MPI2
 use mpi
#endif

 use m_fstrings,      only : lower
 use m_io_tools,      only : prompt
 use m_numeric_tools, only : arth

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'lapackprof'
 use interfaces_14_hidewrite
 use interfaces_18_timing
 use interfaces_51_manage_mpi
 use interfaces_66_wfs
!End of the abilint section

 implicit none

#if defined HAVE_MPI1
 include 'mpif.h'
#endif

!Local variables-------------------------------
!scalars
 integer,parameter :: master=0,NMEAS=100
 integer :: comm,npw,ierr,my_rank,nsizes,ii,isz,jj,it,step,meas,nfound
 integer :: nband,istwfk=1,ortalgo,useoverlap,mcg,mgsc,band,g0,idx
 logical :: do_check
 character(len=24) :: skinds
 !character(len=500) :: header
 type(MPI_type) :: MPI_enreg
!arrays
 integer :: size_arth(2)
 integer,allocatable :: sizes(:)
 real(dp) :: ctime(2),wtime(2),alpha(2),beta(2),dot(2)
 real(dp),allocatable :: cg(:,:),gsc(:,:),ortho_check(:,:,:)
 real(dp),allocatable :: cg1(:,:),cg2(:,:),cg3(:,:),ene(:)
 complex(dpc),allocatable :: zvec(:),zmat(:,:),wmat(:,:),zpmat(:),evec(:,:)
 complex(spc),allocatable :: vec(:),mat(:,:)
 type(latime_t) :: Tres

! *************************************************************************

 call xmpi_init()

!Change communicator for I/O (mandatory!)
 call abi_io_redirect(new_io_comm=xmpi_world,new_leave_comm=xmpi_world)

 call init_mpi_enreg(mpi_enreg)

 comm    = xmpi_world
 my_rank = xcomm_rank(comm)

 call herald("LAPACKPROF",abinit_version,std_out)

 if (my_rank == master) then
   write(std_out,'(a)')" Tool for profiling and testing the (Sca)Lapack libraries used in ABINIT."

!  write(std_out,'(a)')" Allowed options are: "
!  write(std_out,'(a)')" "
!  call prompt("Enter size0 and size_step:",size_arth)
!  call prompt("Enter nsizes:",nsizes)

   size_arth = (/1000,2000/) 
!  nsizes = 2
   nsizes = 60
 end if

 call xomp_show_info(std_out,"COLL")

 call xcast_mpi(size_arth,master,comm,ierr)
 call xcast_mpi(nsizes,master,comm,ierr)

 ABI_MALLOC(sizes,(nsizes))
 sizes = arth(size_arth(1),size_arth(2),nsizes)

 if (.FALSE.) then
!  do ortalgo=1,3
   do ortalgo=3,3
     do isz=1,nsizes
       npw = sizes(isz)
       nband = 200
       istwfk =1
       useoverlap = 0
       mcg  = npw * nband
       mgsc = mcg * useoverlap
       ABI_MALLOC(cg,(2,mcg))
       call random_number(cg)

       ABI_MALLOC(gsc,(2,mgsc))

       if (istwfk/=1) then 
         do band=1,nband
           g0 = 1 + (band-1)*npw
           cg(2,g0) = zero
         end do
       end if

       call cwtime(ctime(1),wtime(1),"start")

       call pw_orthon(0,0,istwfk,mcg,mgsc,mpi_enreg,npw,nband,ortalgo,gsc,useoverlap,cg)

       call cwtime(ctime(1),wtime(1),"stop")

       write(std_out,'(3(a,i0),2(a,f9.6))')&
&       " ortalgo: ",ortalgo," npw = ",npw,", nband = ",nband,", ctime ",ctime(1),", wtime ",wtime(1)

       ABI_MALLOC(ortho_check,(2,nband,nband))
       if (istwfk/=1) then
         do band=1,nband
           g0 = 1 + (band-1)*npw
           cg(:,g0) = half * cg(:,g0) 
         end do
       end if

       call cg_zgemm("C","N",npw,nband,nband,cg,cg,ortho_check)

       if (istwfk/=1) ortho_check = two * ortho_check

       do band=1,nband
         ortho_check(1,band,band) = ortho_check(1,band,band) - one
       end do

       write(std_out,*)"MAX ABS error",MAXVAL( ABS(RESHAPE(ortho_check,(/2*nband*nband/)) ))
       ABI_FREE(ortho_check)

       ABI_FREE(cg)
       ABI_FREE(gsc)
     end do
   end do
 end if


!Cholesky
 if (.FALSE.) then
!  if (.TRUE.) then
   do isz=1,nsizes
     npw = sizes(isz)

     do step=1,2
       ABI_MALLOC(zmat,(npw,npw))
       ABI_MALLOC(wmat,(npw,npw))
       do jj=1,npw
!        do ii=1,npw
         zmat(jj,jj) = DCMPLX(jj,jj)
!        end do
       end do

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
         do meas=1,NMEAS
           call blas_cholesky_ortho(npw,npw,zmat,wmat,use_gemm=.TRUE.)
!          if (meas==NMEAS) write(777,*)zmat
         end do
       else
         do meas=1,NMEAS
           call blas_cholesky_ortho(npw,npw,zmat,wmat,use_gemm=.FALSE.)
!          if (meas==NMEAS) write(778,*)zmat
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")
       ABI_FREE(zmat)
       ABI_FREE(wmat)
     end do

     write(std_out,'(a,i0,2(a,f9.6))')"CHOLESKY: size = ",npw,", ctime ratio ",ctime(1)/ctime(2),", wtime ratio ",wtime(1)/wtime(2)
   end do
 end if

!copy
 if (.TRUE.) then
!  if (.FALSE.) then
   do isz=1,nsizes
     npw  = sizes(isz)
     do step=1,2
       ABI_MALLOC(cg1,(2,npw))
       ABI_MALLOC(cg2,(2,npw))
       cg1 = zero

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
         do ii=1,NMEAS
           do jj=1,npw
             cg2(1,jj) = cg1(1,jj)
             cg2(2,jj) = cg1(2,jj)
           end do
         end do
       else 
         do ii=1,NMEAS
           call cg_zcopy(npw,cg1,cg2)
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")

       ABI_FREE(cg1)
       ABI_FREE(cg2)
     end do
     write(std_out,'(a,i0,2(a,f9.6))')"COPY: size = ",npw,", ctime ratio ",ctime(1)/ctime(2),", wtime ratio ",wtime(1)/wtime(2)
   end do
 end if

!zdotc
 if (.TRUE.) then
!  if (.FALSE.) then
   do isz=1,nsizes
     npw  = sizes(isz)
     do step=1,2
       ABI_MALLOC(cg1,(2,npw))
       ABI_MALLOC(cg2,(2,npw))
       call random_number(cg1)
       call random_number(cg2)
!      cg1 = zero
!      cg2 = zero

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
         do jj=1,NMEAS
           dot = zero
!$OMP PARALLEL DO REDUCTION(+:dot)
           do ii=1,npw
             dot(1) = dot(1) + cg1(1,ii)*cg2(1,ii) + cg1(2,ii)*cg2(2,ii)
             dot(2) = dot(2) + cg1(1,ii)*cg2(2,ii) - cg1(2,ii)*cg2(1,ii)
           end do
         end do
       else 
         do ii=1,NMEAS
           dot = cg_zdotc(npw,cg1,cg2)
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")

       ABI_FREE(cg1)
       ABI_FREE(cg2)
     end do
     write(std_out,'(a,i0,2(a,f9.6))')"ZDOTC: size = ",npw,", ctime ratio ",ctime(1)/ctime(2),", wtime ratio ",wtime(1)/wtime(2)
   end do
 end if

!axpy
!if (.FALSE.) then
 if (.TRUE.) then
!  alpha = (/one,two/)
   alpha = (/one,zero/)
   do isz=1,nsizes
     npw = sizes(isz)
     do step=1,2
       ABI_MALLOC(cg1,(2,npw))
       ABI_MALLOC(cg2,(2,npw))
       cg2 = zero

       do jj=1,npw
         cg1(:,jj) = jj 
       end do

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
         jj = 0
         do meas=1,NMEAS
!          call random_number(cg1)
           call random_number(cg2(:,1:1))
!          cg2 = zero
           do ii=1,npw
             jj = jj+1
             cg2(1,ii) = alpha(1)*cg1(1,ii) - alpha(2)*cg1(2,ii) + cg2(1,ii)
             cg2(2,ii) = alpha(1)*cg1(2,ii) + alpha(2)*cg1(1,ii) + cg2(2,ii)
           end do
         end do
       else 
         do meas=1,NMEAS
!          call random_number(cg1)
!          call random_number(cg2)
!          cg2 = zero
           call random_number(cg2(:,1:1))
           call cg_zaxpy(npw,alpha,cg1,cg2)
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")

       ABI_FREE(cg1)
       ABI_FREE(cg2)
     end do
     write(std_out,'(a,i0,2(a,f9.6))')"AXPY: size = ",npw,", ctime ratio ",ctime(1)/ctime(2),", wtime ratio ",wtime(1)/wtime(2)
   end do
 end if

!cg_zgemv
!if (.TRUE.) then
 if (.FALSE.) then
   alpha = (/one,two/)
   beta = (/zero,zero/)
   do isz=1,nsizes
     npw = sizes(isz)
     do step=1,2
       ABI_MALLOC(cg1,(2,npw*npw))
       ABI_MALLOC(cg2,(2,npw))
       ABI_MALLOC(cg3,(2,npw))

       do jj=1,npw*npw
         cg1(:,jj) = jj 
       end do
       cg2 = one
       cg3 = zero

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
!        do meas=1,NMEAS
!        do jj=1,npw
!        ar=scprod(1,iband);ai=scprod(2,iband)
!        do ipw=1,npw_sp
!        cg_re=cg(1,index1+ipw)
!        cg_im=cg(2,index1+ipw)
!        direc(1,ipw)=direc(1,ipw)-ar*cg_re+ai*cg_im
!        direc(2,ipw)=direc(2,ipw)-ar*cg_im-ai*cg_re
!        end do
!        end do 
!        end do
!        !cg3(1,ii) = alpha(1)*cg1(1,ii) - alpha(2)*cg1(2,ii) + cg2(1,ii)
!        !cg3(2,ii) = alpha(1)*cg1(2,ii) + alpha(2)*cg1(1,ii) + cg2(2,ii)
!        end do
!        end do
       else 
         do meas=1,NMEAS
           call cg_zgemv("N",npw,npw,cg1,cg2,cg3)
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")

       ABI_FREE(cg1)
       ABI_FREE(cg2)
       ABI_FREE(cg3)
     end do
     write(std_out,'(a,i0,2(a,f9.6))')"ZGEMV: size = ",npw,", ctime ",ctime(2),", wtime ",wtime(2)
   end do
 end if

!itranspose
 if (.FALSE.) then
!  if (.TRUE.) then
   do isz=1,nsizes
     npw = sizes(isz)

     do step=1,2
       ABI_MALLOC(zmat,(npw,npw))
       ABI_MALLOC(wmat,(npw,npw))
       do jj=1,npw
         do ii=1,npw
           zmat(ii,jj) = DCMPLX(ii,jj)
         end do
       end do

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
         do meas=1,NMEAS
!          wmat = TRANSPOSE(zmat)
           zmat = TRANSPOSE(zmat)
         end do
       else
         do meas=1,NMEAS
!          call sqmat_otranspose(npw,zmat,wmat)
           call sqmat_itranspose(npw,zmat)
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")
       ABI_FREE(zmat)
       ABI_FREE(wmat)
     end do

     write(std_out,'(a,i0,2(a,f9.6))')"ITRANS: size = ",npw,", ctime ratio ",ctime(1)/ctime(2),", wtime ratio ",wtime(1)/wtime(2)
   end do
 end if

!zgerc
 if (.FALSE.) then
   do isz=1,nsizes
     npw  = sizes(isz)

     do step=1,2
       ABI_MALLOC(zvec,(npw))
       ABI_MALLOC(zmat,(npw,npw))
       zvec = cone; zmat = czero

       call cwtime(ctime(step),wtime(step),"start")
       if (step==1) then
!        Home made zgerc
         do it=1,NMEAS
           zmat = czero
!$omp parallel do
           do jj=1,npw
             do ii=1,npw
               zmat(ii,jj) = zmat(ii,jj) + CONJG(zvec(ii)) * zvec(jj)
             end do
           end do
         end do
       else 
         do jj=1,NMEAS
           zmat = czero
           call XGERC(npw,npw,(1._dp,0._dp),zvec,1,zvec,1,zmat,npw)
         end do
       end if
       call cwtime(ctime(step),wtime(step),"stop")
     end do

     ABI_FREE(zvec)
     ABI_FREE(zmat)
     write(std_out,'(a,i0,2(a,f9.6))')" size = ",npw,", cpu_time ratio ",ctime(1)/ctime(2),", wall_time ratio ",wtime(1)/wtime(2)
   end do

   do isz=1,nsizes
     npw  = sizes(isz)
     ABI_MALLOC(vec,(npw))
     ABI_MALLOC(mat,(npw,npw))
     vec = cone; mat = czero

     call cwtime(ctime(1),wtime(1),"start")

     do jj=1,NMEAS
       call XGERC(npw,npw,(1._sp,0._sp),vec,1,vec,1,mat,npw)
     end do

     call cwtime(ctime(1),wtime(1),"stop")

     write(std_out,'(a,i0,2f9.3)')" CGERG size, cpu_time, wall_time, max_abserr ",npw,ctime(1),wtime(1)

     ABI_FREE(vec)
     ABI_FREE(mat)
   end do
 end if

!xginv
 if (.FALSE.) then
   do_check = .FALSE.
!  do_check = .TRUE.
   do ii=1,nsizes
     npw  = sizes(ii)
     call test_xginv(npw,skinds,do_check,Tres,comm) 

     if (my_rank==master) then
       write(std_out,'(a,i0,3f9.3)')&
&       " size, cpu_time, wall_time, max_abserr ",Tres%msize,Tres%ctime,Tres%wtime,Tres%max_abserr 
     end if
   end do
 end if

!xhpev vs xheev
 if (.FALSE.) then
!  if (.TRUE.) then
   do isz=1,nsizes
     npw = sizes(isz)
     ABI_MALLOC(ene,(npw))
     ABI_MALLOC(evec,(npw,npw))

     ABI_MALLOC(zpmat,(npw*(npw+1)/2))
     zpmat = czero
     idx = 0
     do jj=1,npw
       do ii=1,jj
         idx = idx + 1
         zpmat(idx) = cone
       end do
     end do

     do step=1,2
       if (step==1) then
         call cwtime(ctime(step),wtime(step),"start")
         call xhpev("V","U",npw,zpmat,ene,evec,npw)
       else 
         ABI_MALLOC(zmat,(npw,npw))
!        idx = 0
!        !$omp parallel do private(idx)
         do jj=1,npw
           do ii=1,jj
!            idx = idx + 1
             idx = ii + jj*(jj-1)/2
             zmat(ii,jj) = zpmat(idx)
           end do
         end do
         call cwtime(ctime(step),wtime(step),"start")
!        call xheev("V","U",npw,zmat,ene)
         call xheevx("V","A","U",npw,zmat,zero,zero,1,1,zero,nfound,ene,evec,npw)
         ABI_FREE(zmat)
       end if
       call cwtime(ctime(step),wtime(step),"stop")
     end do

     ABI_FREE(ene)
     ABI_FREE(evec)
     ABI_FREE(zpmat)
     write(std_out,'(a,i0,2(a,f9.6))')&
&     "EIG PROBLEM: size = ",npw,", ctime ratio ",ctime(1)/ctime(2),", wtime ratio ",wtime(1)/wtime(2)
   end do
 end if

!
!===============================
!=== End of run, free memory ===
!===============================
 call wrtout(std_out,ch10//" Analysis completed.","COLL")

 ABI_FREE(sizes)

 call destroy_mpi_enreg(MPI_enreg)
 call xmpi_end()

 end program lapackprof
!!***
