!{\src2tex{textfont=tt}}
!!****f* ABINIT/nonlop_pl
!! nonlop_pl
!!
!! NAME
!! nonlop_pl
!!
!! FUNCTION
!! * Compute application of a nonlocal operator Vnl in order to get:
!!    - contracted elements (energy, forces, stresses, ...), if signs=1
!!    - a function in reciprocal space (|out> = Vnl|in>), if signs=2
!!   Operator Vnl, as the following form:
!!    $Vnl=sum_{R,lmn,l''m''n''} {|P_{Rlmn}> Enl^{R}_{lmn,l''m''n''} <P_{Rl''m''n''}|}$
!!   Operator Vnl is -- in the typical case -- the nonlocal potential.
!!   - With norm-conserving pseudopots, $Enl^{R}_{lmn,l''m''n''}$ is the
!!     Kleinmann-Bylander energy $Ekb^{R}_{ln}$.
!!   - The |P_{Rlmn}> are the projector functions.
!! * This routine uses Legendre polynomials Pl to express Vnl.
!!
!! COPYRIGHT
!! Copyright (C) 1998-2009 ABINIT group (DCA, XG, GMR, GZ, MT, FF, DRH)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
!!
!! INPUTS
!!  choice: chooses possible output:
!!    choice=1 => a non-local energy contribution
!!          =2 => a gradient with respect to atomic position(s)
!!          =3 => a gradient with respect to strain(s)
!!          =23=> a gradient with respect to atm. pos. and strain(s)
!!          =4 => a gradient and 2nd derivative with respect to atomic pos.
!!          =5 => a gradient with respect to k wavevector
!!          =6 => 2nd derivatives with respect to strain
!!  dimekb1,dimekb2=dimensions of ekb (see ekb)
!!  dimffnlin=second dimension of ffnlin (1+number of derivatives)
!!  dimffnlout=second dimension of ffnlout (1+number of derivatives)
!!  ekb(dimekb1,dimekb2,nspinor**2)= (Real) Kleinman-Bylander energies (hartree)
!!                                   dimekb1=lmnmax  -  dimekp2=ntypat
!!  ffnlin(npwin,dimffnlin,lmnmax,ntypat)=nonlocal form factors to be used
!!          for the application of the nonlocal operator to the |in> vector
!!  ffnlout(npwout,dimffnlout,lmnmax,ntypat)=nonlocal form factors to be used
!!          for the application of the nonlocal operator to the |out> vector
!!  gmet(3,3)=metric tensor for G vecs (in bohr**-2)
!!  gprimd(3,3)=dimensional reciprocal space primitive translations
!!   (bohr^-1)
!!  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
!!                        - k point direction in the case (choice=5,signs=2)
!!                        - strain component (1:6) in the case (choice=3,signs=2) or (choice=6,signs=1)
!!  indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=ln
!!  istwf_k=option parameter that describes the storage of wfs
!!  kgin(3,npwin)=integer coords of planewaves in basis sphere, for the |in> vector
!!  kgout(3,npwout)=integer coords of planewaves in basis sphere, for the |out> vector
!!  kpgin(npw,npkgin)= (k+G) components and related data, for the |in> vector
!!  kpgout(npw,nkpgout)=(k+G) components and related data, for the |out> vector
!!  kptin(3)=k point in terms of recip. translations, for the |in> vector
!!  kptout(3)=k point in terms of recip. translations, for the |out> vector
!!  lmnmax=max. number of (l,m,n) components over all types of atoms
!!  matblk=dimension of the arrays ph3din and ph3dout
!!  mgfft=maximum size of 1D FFTs
!!  mpi_enreg=informations about MPI parallelization
!!  natom=number of atoms in cell
!!  nattyp(ntypat)=number of atoms of each type
!!  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/input_variables/vargs.htm#ngfft
!!  nkpgin,nkpgout=second sizes of arrays kpgin/kpgout
!!  nloalg(5)=governs the choice of the algorithm for nonlocal operator
!!  nnlout=dimension of enlout: choice=1=>nnlout=1   choice=2=>nnlout=3*natom
!!                              choice=3=>nnlout=6   choice=4=>nnlout=6*natom
!!                              choice=5=>nnlout=1   choice=6=>nnlout=6*(3*natom+6)
!!                              choice=23=>nnlout=6+3*natom
!!  npwin=number of planewaves for given k point, for the |in> vector
!!  npwout=number of planewaves for given k point, for the |out> vector
!!  nspinor=number of spinorial components of the wavefunctions
!!  ntypat=number of types of atoms in cell
!!  only_SO=flag to calculate only the SO part in nonlop
!!  phkxredin(2,natom)=phase factors exp(2 pi kptin.xred)
!!  phkxredout(2,natom)=phase factors exp(2 pi kptout.xred)
!!  ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
!!  ph3din(2,npwin,matblk)=3D structure factors, for each atom and plane wave (in)
!!  ph3dout(2,npwout,matblk)=3-dim structure factors, for each atom and plane wave (out)
!!  pspso(ntypat)=spin-orbit characteristic for each atom type
!!  signs= if 1, get contracted elements (energy, forces, stress, ...)
!!         if 2, applies the non-local operator to a function in reciprocal space
!!  ucvol=unit cell volume (bohr^3)
!!  vectin(2,nspinor*npwin)=input cmplx wavefunction coefficients <G|Cnk>
!!
!! OUTPUT
!!  ==== if (signs==1) ====
!!     enlout(nnlout)= contribution of this state to the nl part
!!                     of the following properties:
!!       if choice=1 : enlout(1)               -> the energy
!!       if choice=2 : enlout(1:3*natom)       -> the forces
!!       if choice=3 : enlout(1:6)             -> the stresses
!!       if choice=23: enlout(1:6+3*natom)     -> the forces and the stresses
!!       if choice=4 : enlout(1:6*natom)       -> the frozen wf part of dynam. matrix
!!       if choice=6 : enlout(1:6*(3*natom+6)) -> the frozen wf part of elastic tensor
!!  ==== if (signs==2) ====
!!     vectout(2,nspinor*npwout)= result of the aplication of the nl operator
!!                                or one of its derivative to the input vect.:
!!       if choice=1 : Vnl |vectin>
!!       if choice=2 : dVnl/d(xred(idir,iatom) |vectin> (xred=reduced atm. pos.)
!!       if choice=3 : dVnl/d(strain(idir)) |vectin>    (symmetric strain =>idir=1...6)
!!       if choice=5 : dVnl/dk(idir) |vectin>           (k wavevector)
!!
!! NOTES
!! In the case signs=1, the array vectout is not used, nor modified
!! so that the same array as vectin can be used as a dummy argument;
!! the same is true for the pairs npwin-npwout, ffnlin-ffnlout,
!! kgin-kgout, ph3din-ph3dout, phkredin-phkxredout).
!!
!! Calculation includes contributions to grads of Etot wrt coord and
!! wrt strains for l=0,1,2,3.
!!
!! WARNINGS
!!  - Warning 1: This routine is in a transient state, during the
!!    time of the implementation of the spin-orbit coupling...
!!    In particular, the OMP parallelisation is still missing,
!!    but it matters here only when nspinor==2.
!!  - Warning 2: the order of atoms is governed by atindx
!!
!! PARENTS
!!      nonlop
!!
!! CHILDREN
!!      cont13,cont22,cont22cso,cont22so,cont24,cont3,cont33cso,cont33so,cont35
!!      contistr01,contistr03,contistr12,contstr21,contstr23,contstr25
!!      contstr25a,contstr26,ddkten,leave_new,metcon,metcon_so,metric_so,metstr
!!      opernl2,opernl3,opernl4a,opernl4b,ph1d3d,scalewf_nonlop,strconv,strsocv
!!      trace2,wrtout,xcomm_init,xsum_mpi
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine nonlop_pl(choice,dimekb1,dimekb2,dimffnlin,dimffnlout,ekb,enlout,&
&                     ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,kgin,kgout,kpgin,kpgout,&
&                     kptin,kptout,lmnmax,matblk,mgfft,mpi_enreg,mpsang,mpssoang,&
&                     natom,nattyp,ngfft,nkpgin,nkpgout,nloalg,nnlout,npwin,npwout,nspinor,ntypat,&
&                     only_SO,phkxredin,phkxredout,ph1d,ph3din,ph3dout,pspso,signs,&
&                     ucvol,vectin,vectout)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
 use interfaces_01manage_mpi
 use interfaces_12geometry
 use interfaces_12nlstrain
 use interfaces_15nonlocal, except_this_one => nonlop_pl
 use interfaces_lib01hidempi
!End of the abilint section

 implicit none

!Arguments ------------------------------------
!This type is defined in defs_mpi
!The (inout) classification below is misleading; mpi_enreg is temporarily
! changed but reset to its initial condition before exiting.
!scalars
 integer,intent(in) :: choice,dimekb1,dimekb2,dimffnlin,dimffnlout,idir,istwf_k
 integer,intent(in) :: lmnmax,matblk,mgfft,mpsang,mpssoang,natom,nkpgin,nkpgout
 integer,intent(in) :: nnlout,npwin,npwout,nspinor,ntypat,only_SO,signs
 real(dp),intent(in) :: ucvol
 type(MPI_type),intent(inout) :: mpi_enreg
!arrays
 integer,intent(in) :: indlmn(6,lmnmax,ntypat),kgin(3,npwin),kgout(3,npwout)
 integer,intent(in) :: nattyp(ntypat),ngfft(18),nloalg(5),pspso(ntypat)
 real(dp),intent(in) :: ekb(dimekb1,dimekb2,nspinor**2)
 real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat)
 real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3)
 real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin),kpgout(npwout,nkpgout)
 real(dp),intent(in) :: kptin(3),kptout(3),ph1d(2,3*(2*mgfft+1)*natom)
 real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom)
 real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk)
 real(dp),intent(inout) :: vectin(2,nspinor*npwin)
 real(dp),intent(out) :: enlout(nnlout),vectout(2,nspinor*npwout)

!Local variables-------------------------------
!mlang is the maximum number of different angular momenta
!(mlang=4 means s,p,d,f)
! Note : in a future version, one should adjust mlang to mpsang.
!mlang2 is the maximum number of unique tensor components for a tensor
!of rank (mlang-1) with index range 1-3
!mlang3 is the maximum number of unique tensor components summed over
!all tensors of rank 0 through mlang-1.
!mlang4 is the total number of additional unique tensor components
!related to strain gradients, ranks 2 through mlang+1.
!mlang6 is the total number of additional unique tensor components
!related to strain 2nd derivaives, ranks 4 through mlang+3.
!mlang1 is the total number of certain additional unique tensor components
!related to internal strain, ranks 1 through mlang
!mlang5 is the total number of other additional unique tensor components
!related to internal strain, ranks 1 through mlang
!scalars
 integer,parameter :: mlang=4
 integer,save :: mlang1=((mlang+1)*(mlang+2)*(mlang+3))/6-1
 integer,save :: mlang2=(mlang*(mlang+1))/2
 integer,save :: mlang3=(mlang*(mlang+1)*(mlang+2))/6
 integer,save :: mlang4=((mlang+2)*(mlang+3)*(mlang+4))/6-4
 integer,save :: mlang5=((mlang+3)*(mlang+4)*(mlang+5))/6-10
 integer,save :: mlang6=((mlang+4)*(mlang+5)*(mlang+6))/6-20
 integer :: compact,ia,ia1,ia2,ia3,ia4,ia5,ierr,iest,ig,ii,ilang,ilang2,ilmn
 integer :: iln,iln0,indx,iproj,ipsang,ipssoang,ishift,ispin,ispinor,ispinp
 integer :: istr,istr1,istr2,iterm,itypat,jj,jjk,jjs,jjs1,jjs2,jjs3,jjs4,jjstr
 integer :: lpsang,mincat,mproj,mu,mumax,n1,n2,n3,ndgxdt,ndgxdtfac,nincat,nlang
 integer :: nlangso,nproj,nspinso,old_paral_level,rank,shift1,shift2,shift3
 integer :: sign,spaceComm
 real(dp) :: e2nl,e2nldd,enlk,ph12i,ph12r,ph1i,ph1r,ph2i,ph2r,ph3i,ph3r
 character(len=500) :: message
!arrays
 integer,allocatable :: indlmn_s(:,:,:),jproj(:)
 real(dp) :: amet(2,3,3,2,2),amet_lo(3,3),e2nl_tmp(6),eisnl(3),rank2(6)
 real(dp) :: rank2c(2,6),strsnl(6),strsso(6,3),strssoc(6),trace(2),tsec(2)
 real(dp),allocatable :: d2gxdis(:,:,:,:,:),d2gxdis_s(:,:,:,:)
 real(dp),allocatable :: d2gxds2(:,:,:,:,:),d2gxds2_s(:,:,:,:)
 real(dp),allocatable :: dgxdis(:,:,:,:,:),dgxdis_s(:,:,:,:),dgxds(:,:,:,:,:)
 real(dp),allocatable :: dgxds_s(:,:,:,:),dgxdsfac(:,:,:,:,:)
 real(dp),allocatable :: dgxdt(:,:,:,:,:,:),dgxdt_s(:,:,:,:,:)
 real(dp),allocatable :: dgxdtfac(:,:,:,:,:),ekb_s(:,:),gxa(:,:,:,:,:)
 real(dp),allocatable :: gxa_s(:,:,:,:),gxafac(:,:,:,:),pauli(:,:,:,:)
 real(dp),allocatable :: temp(:,:),tmpfac(:,:),vectin_s(:,:),vectout_s(:,:)
 real(dp),allocatable :: wt(:,:)

! **********************************************************************

!DEBUG
!write(6,*)' nonlop_pl: enter '
!ENDDEBUG

!Test: spin orbit not allowed for choice=5,6
 if (nspinor==2 .and. choice==6 ) then
  write(message, '(4a)' ) ch10,&
&  ' nonlop_pl : BUG -',ch10,&
&  '  For nspinor=2, choice=6 is not yet allowed.'
  call wrtout(06,message,'PERS')
  call leave_new('PERS')
 end if

!Test: choice 51 and 52 only allowed with nonlop_ylm 
!JWZ, 01-Sep-08
 if (choice==51 .or. choice==52) then
  write(message, '(4a)' ) ch10,&
&  ' nonlop_pl : BUG -',ch10,&
&  '  choice 51 or 52 is not yet allowed.'
  call wrtout(06,message,'PERS')
  call leave_new('PERS')
 end if

!Define dimension of work arrays.
 mincat=min(nloalg(4),maxval(nattyp))
 mproj=maxval(indlmn(3,:,:))
 allocate(temp(2,mlang4),tmpfac(2,mlang4))
 allocate(wt(mlang,mproj),jproj(mlang))
 n1=ngfft(1) ; n2=ngfft(2) ; n3=ngfft(3)
 allocate(ekb_s(mlang,mproj))
 allocate(indlmn_s(6,lmnmax,ntypat))

!Eventually compute the spin-orbit metric tensor:
 if (mpssoang>mpsang) then
  allocate(pauli(2,2,2,3))
  call metric_so(amet,gprimd,pauli)
 end if

!Allocate array gxa (contains projected scalars).
 allocate(gxa(2,mlang3,mincat,mproj,nspinor))
 if(nspinor==2)allocate(gxa_s(2,mlang3,mincat,mproj))
 allocate(gxafac(2,mlang3,mincat,mproj))
 gxa(:,:,:,:,:)=zero

!Allocate dgxdt (contains derivatives of gxa with respect to atomic
!displacements or ddk).
!If choice==2 : first-order atomic displacements
!If signs==2, only one direction is considered
!If signs==1, the three directions are considered
!If choice==4 and signs==1 : second-order atomic displacements,
!the nine components are considered
!If choice==5 and signs==2 : ddk
!component 1 -> from ffnl(:,2,...)
!component 2 -> from ffnl(:,1,...) (too much space is booked for this
!case, since the number of angular momenta is smaller
!than mlang3, but it is easier)
 ndgxdt=0
 if(signs==2 .and. choice==2) ndgxdt=1
 if(signs==1 .and. (choice==2.or.choice==23)) ndgxdt=3
 if(choice==4) ndgxdt=9
 if(choice==5) ndgxdt=2
 allocate(dgxdt(2,ndgxdt,mlang3,mincat,mproj,nspinor))
 if(nspinor==2)allocate(dgxdt_s(2,ndgxdt,mlang3,mincat,mproj))
 dgxdt(:,:,:,:,:,:)=zero
 ndgxdtfac=0
 if(signs==2 .and. choice==2) ndgxdtfac=1
 if(choice==4) ndgxdtfac=3
 if(choice==5) ndgxdtfac=2
 allocate(dgxdtfac(2,ndgxdtfac,mlang3,mincat,mproj))

!Allocate dgxds (contains derivatives of gxa with respect to strains).
 allocate(dgxds(2,mlang4,mincat,mproj,nspinor))
 dgxds(:,:,:,:,:)=zero
 allocate(dgxdsfac(2,mlang4,mincat,mproj,nspinor))
 if(choice==6) then
  allocate(dgxdis(2,mlang1,mincat,mproj,nspinor))
  allocate(d2gxdis(2,mlang5,mincat,mproj,nspinor))
  allocate(d2gxds2(2,mlang6,mincat,mproj,nspinor))
 end if
 if(nspinor==2)allocate(dgxds_s(2,mlang4,mincat,mproj))
 if(nspinor==2 .and. choice==6) then
  allocate(dgxdis_s(2,mlang1,mincat,mproj))
  allocate(d2gxdis_s(2,mlang5,mincat,mproj))
  allocate(d2gxds2_s(2,mlang6,mincat,mproj))
 end if

!Zero out some arrays
 if(choice==2 .or. choice==4 .or. choice==5 .or. choice==6 .or. choice==23) enlout(:)=0.0d0

#if defined FC_NEC
 if(signs==2)then
  vectout(1,:)=0.0d0 ; vectout(2,:)=0.0d0
 end if
#else
 if(signs==2) vectout(:,:)=0.0d0
#endif

 if(choice==3.or.choice==23) then
  strsnl(:)=0.0d0
  if(mpssoang>mpsang) strsso(:,:)=0.0d0
 end if
 enlk=0.0d0

!In the case vectin is a spinor, split its second part.
!Also, eventually take into account the storage format of the wavefunction
!(the original vector will be restored before leaving the routine,
!except for the vectin(2,1) component with istwf_k==2,
!that should vanish)
!Treat the second spinor part first
 if(nspinor==2)then
  allocate(vectin_s(2,npwin),vectout_s(2,npwout))

! Initialize it
! $OMP PARALLEL DO PRIVATE(ig) SHARED(npwin,vectin,vectin_s)
#if defined FC_NEC
! CDIR NODEP
#endif
  do ig=1,npwin
   vectin_s(1,ig)=vectin(1,ig+npwin)
   vectin_s(2,ig)=vectin(2,ig+npwin)
  end do

! $OMP END PARALLEL DO
! Take into account the storage
  if(istwf_k/=1)then
   call scalewf_nonlop(istwf_k,mpi_enreg,npwin,1,vectin_s)
  end if
 end if ! nspinor==2

!Treat the first spinor part now
 if(istwf_k/=1)then
  call scalewf_nonlop(istwf_k,mpi_enreg,npwin,1,vectin)
 end if


!Big loop on atom types.
 ia1=1
 do itypat=1,ntypat

! Get atom loop indices for different types:
  ia2=ia1+nattyp(itypat)-1

! Cut the sum on different atoms in blocks, to allow memory saving.
! Inner summations on atoms will be done from ia3 to ia4.
! Note that the maximum range from ia3 to ia4 is mincat (maximum
! increment of atoms).
  do ia3=ia1,ia2,mincat
   ia4=min(ia2,ia3+mincat-1)
!  Give the increment of number of atoms in this subset.
   nincat=ia4-ia3+1

!  Prepare the phase factors for the atoms between ia3 and ia4 :
!  For nloalg(1)==0, they were not prepared previously,it is needed to
!  compute them again.
   if(nloalg(1)<=0)then
!   For nloalg(1)==0, it is needed to compute the phase factors.
    if(mincat>matblk)then
     write(message, '(a,a,a,a,a,a,i4,a,i4,a)' ) ch10,&
&     ' nonlop : BUG -',ch10,&
&     '  With nloalg<=0, mincat must be less than matblk.',ch10,&
&     '  Their value is ',mincat,' and ',matblk,'.'
     call wrtout(6,message,'PERS')
     call leave_new('PERS')
    end if
    call ph1d3d(ia3,ia4,kgin,kptin,matblk,natom,npwin,&
&    n1,n2,n3,phkxredin,ph1d,ph3din)
   end if

!  Here begins the different treatment for the scalar-relativistic
!  part(s) and the spin-orbit part.
!  Loop on ispinor : 1 for scalar-Relativistic, 2 for spin-orbit
   nspinso=1;if (mpssoang>mpsang) nspinso=2
   do ispinor=1,nspinso

!   
!   mjv 25 6 2008: if only_SO == 1 or 2 skip the scalar relativistic terms
!   only output the spin orbit ones
!   
    if (ispinor==1 .and. only_SO>0) cycle

!   Select scalar-relativistic or spin-orbit KB-energies:
    ekb_s(:,:)=zero ; wt(:,:)=zero
!   Loop over (l,n) values (loop over l,m,n and test on l,n)
    iln0=0 ; jproj(:)=0 ; nlang=0
    indlmn_s(:,:,itypat)=0
    do ilmn=1,lmnmax
     if(ispinor/=indlmn(6,ilmn,itypat))cycle
     indlmn_s(:,ilmn,itypat)=indlmn(:,ilmn,itypat)
     iln =indlmn(5,ilmn,itypat)
     if (iln>iln0) then
      iln0=iln
      ipsang=indlmn(1,ilmn,itypat)+1
!     DEBUG
!     write(6,*)' nonlop : ipsang,ilmn,itypat,ispinor=',ipsang,ilmn,itypat,ispinor
!     ENDDEBUG
      iproj=indlmn(3,ilmn,itypat)
!     This shift is not needed anymore
!     if (ispinor==2) ipsang=indlmn(1,ilmn,itypat)-mpsang+2
      ekb_s(ipsang,iproj)=ekb(iln,itypat,ispinor)
      wt(ipsang,iproj)=4.d0*pi/ucvol*dble(2*ipsang-1)*ekb_s(ipsang,iproj)
!     
!     mjv 27 6 2008: if only_SO == 2 remove the factor of l in the operator
!     
      if (only_SO == 2) then
       wt(ipsang,iproj)=4.d0*pi/ucvol*ekb_s(ipsang,iproj)
      end if
      jproj(ipsang)=max(jproj(ipsang),iproj)
      if(iproj>0)nlang=max(nlang,ipsang)
     end if
    end do ! ilmn


!   If nlang is not 0, then some non-local part is to be applied for
!   that type of atom.
    if (nlang/=0) then
!    Operate with the non-local potential on the wavefunction, in order
!    to get projected quantities. Call different routines opernl2,
!    opernl3, opernl4x which corresponds to different writings of the
!    same numerical operations. There is still optimisation left for
!    the case istwf_k/=1 (up to a factor 2 in speed).
!    call timab(74+choice,1,tsec)
     sign=1
!    Only the first spinorial component of vectin is taken into account first
     ispin=1
     if(nloalg(1)==2 .or. nloalg(1)==-2) then
      call opernl2(choice,dgxdis,dgxds,d2gxdis,d2gxds2,dgxdt,&
&      ffnlin,gmet,gxa,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&      jproj,kgin,kpgin,kptin,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
      mlang5,mlang6,mproj,ndgxdt,dimffnlin,nincat,nkpgin,nlang,nloalg,npwin,&
&      ntypat,ph3din,sign,vectin)
     else if(nloalg(1)==3 .or. nloalg(1)==-3) then
      call opernl3(choice,dgxdis,dgxds,d2gxdis,d2gxds2,dgxdt,&
&      ffnlin,gmet,gxa,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&      jproj,kgin,kpgin,kptin,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
      mlang5,mlang6,mproj,ndgxdt,dimffnlin,nincat,nkpgin,nlang,nloalg,npwin,&
&      ntypat,ph3din,sign,vectin)
     else if(nloalg(1)==4 .or. nloalg(1)==-4) then
      call opernl4a(choice,dgxdis,dgxds,d2gxdis,d2gxds2,dgxdt,&
&      ffnlin,gmet,gxa,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&      jproj,kgin,kpgin,kptin,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
      mlang5,mlang6,mproj,ndgxdt,dimffnlin,nincat,nkpgin,nlang,nloalg,npwin,&
&      ntypat,ph3din,vectin)
     end if
!    This duplication of the opernl calls is needed to avoid copying
!    vectin, with a detrimental effect on speed.
     if(nspinor==2)then
      ispin=2
      if(nloalg(1)==2 .or. nloalg(1)==-2) then
       call opernl2(choice,dgxdis_s,dgxds_s,d2gxdis_s,d2gxds2_s,dgxdt_s,&
&       ffnlin,gmet,gxa_s,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&       jproj,kgin,kpgin,kptin,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&       mlang5,mlang6,mproj,ndgxdt,dimffnlin,nincat,nkpgin,nlang,nloalg,npwin,&
&       ntypat,ph3din,sign,vectin_s)
      else if(nloalg(1)==3 .or. nloalg(1)==-3) then
       call opernl3(choice,dgxdis_s,dgxds_s,d2gxdis_s,d2gxds2_s,dgxdt_s,&
&       ffnlin,gmet,gxa_s,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&       jproj,kgin,kpgin,kptin,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&       mlang5,mlang6,mproj,ndgxdt,dimffnlin,nincat,nkpgin,nlang,nloalg,npwin,&
&       ntypat,ph3din,sign,vectin_s)
      else if(nloalg(1)==4 .or. nloalg(1)==-4) then
       call opernl4a(choice,dgxdis_s,dgxds_s,d2gxdis_s,d2gxds2_s,dgxdt_s,&
&       ffnlin,gmet,gxa_s,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&       jproj,kgin,kpgin,kptin,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&       mlang5,mlang6,mproj,ndgxdt,dimffnlin,nincat,nkpgin,nlang,nloalg,npwin,&
&       ntypat,ph3din,vectin_s)
      end if
      dgxds(:,:,:,:,ispin)=dgxds_s(:,:,:,:)
      dgxdt(:,:,:,:,:,ispin)=dgxdt_s(:,:,:,:,:)
      gxa(:,:,:,:,ispin)=gxa_s(:,:,:,:)
     end if
     old_paral_level=mpi_enreg%paral_level
     mpi_enreg%paral_level=3
     call xcomm_init(mpi_enreg,spaceComm)
     if(mpi_enreg%mode_para=='b') spaceComm=mpi_enreg%comm_fft
!    call timab(48,1,tsec)
     call xsum_mpi(dgxds,spaceComm,ierr)

     if (choice /=1 .and. choice /=3)then
      call xsum_mpi(dgxdt,spaceComm,ierr)
     end if
     call xsum_mpi(gxa,spaceComm,ierr)
!    call timab(48,2,tsec)
     mpi_enreg%paral_level=old_paral_level
!    XG030513 : MPIWF, at this place, one should perform the reduction
!    and spread of data of gxa, dgxdt and dgxds

!    call timab(74+choice,2,tsec)


!    Loop over spins:
     do ispin=1,nspinor

!     Perform contractions for the various tensors (d)gx?, producing the
!     contracted tensors (d)gx?fac to be passed back to opernl:
      do ia=1,nincat
       do ilang=1,nlang
        nproj=jproj(ilang)
        if(nproj/=0) then
         ilang2=(ilang*(ilang+1))/2
         do iproj=1,nproj

!         The rank of the tensor gxa equals l:
          rank=ilang-1
!         jjs gives the starting address of the relevant components
          jjs=1+((ilang-1)*ilang*(ilang+1))/6
          if (ilang>4) then
           write(message, '(a,a,a,a,i8,a)' )ch10,&
&           ' nonlop: BUG -',ch10,&
&           '  ilang must fall in range [1..4] but value is ',ilang,'.'
           call wrtout(06,message,'PERS')
           call leave_new('PERS')
          end if

!         Metric & spinorial contraction from gxa to gxafac. The treatment
!         is different for the scalar-relativistic and spin-orbit parts.
          if(ispinor==1) then
!          ------ Scalar-Relativistic ------
           temp(:,1:((rank+1)*(rank+2))/2)= &
&           gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
           call metcon(rank,gmet,temp,tmpfac)
           gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= &
&           wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
          else
!          ------ Spin-orbit ------
           gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)=0.d0
!          Contraction over spins:
           do ispinp=1,nspinor
!           => Imaginary part (multiplying by i, then by the Im of amet):
            temp(1,1:((rank+1)*(rank+2))/2)= &
&            -gxa(2,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispinp)
            temp(2,1:((rank+1)*(rank+2))/2)= &
&            gxa(1,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispinp)
            amet_lo(:,:)=amet(2,:,:,ispin,ispinp)
            call metcon_so(rank,gmet,amet_lo,temp,tmpfac)
            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= &
&            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)+ &
&            wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
!           => Real part:
            temp(:,1:((rank+1)*(rank+2))/2)= &
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispinp)
            amet_lo(:,:)=amet(1,:,:,ispin,ispinp)
            call metcon_so(rank,gmet,amet_lo,temp,tmpfac)
            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= &
&            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)+ &
&            wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
           end do
          end if

!         Eventual tensorial compaction/decompaction for ddk
!         perturbation:
          if(choice==5 .and. ilang>=2) then
           jjk=1+((ilang-2)*(ilang-1)*ilang)/6
           compact=-1
           temp(:,1:(rank*(rank+1))/2)= &
&           dgxdt(:,2,jjk:jjk-1+(rank*(rank+1))/2,ia,iproj,ispin)
           call ddkten(compact,idir,rank,temp,tmpfac)
           dgxdt(:,1,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)= &
&           dgxdt(:,1,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)&
&           +tmpfac(:,1:((rank+1)*(rank+2))/2)
           compact=1
           tmpfac(:,1:((rank+1)*(rank+2))/2)= &
&           gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)
           call ddkten(compact,idir,rank,temp,tmpfac)
           dgxdtfac(:,2,jjk:jjk-1+(rank*(rank+1))/2,ia,iproj)= &
&           temp(:,1:(rank*(rank+1))/2)
          end if

!         Section for strain perturbation
!         no spin-orbit yet

          if(choice==3 .and. signs==2) then
           istr=idir
           if(ispinor==1) then
!           ------ Scalar-Relativistic ------
!           jjstr is the starting address for dgxds and dgxdsfac
            jjstr=-3+((ilang+1)*(ilang+2)*(ilang+3))/6
!           diagonal enlk contribution
!           note sign change (12/05/02)
            if(istr>3) then
             gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= 0.d0
            else
             gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)=&
&             -gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)
            end if
            dgxdsfac(:,jjstr:jjstr-1+((rank+3)*(rank+4))/2,ia,iproj,ispin)= &
&            0.d0
            iterm=1
            temp(:,1:((rank+1)*(rank+2))/2)= &
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
            call metstr(istr,rank,iterm,gmet,gprimd,temp,tmpfac)
            dgxdsfac(:,jjstr:jjstr-1+((rank+3)*(rank+4))/2,ia,iproj,ispin)= &
&            wt(ilang,iproj)*tmpfac(:,1:((rank+3)*(rank+4))/2)
            iterm=2
            temp(:,1:((rank+1)*(rank+2))/2)= &
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
            call metstr(istr,rank,iterm,gmet,gprimd,temp,tmpfac)
            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= &
&            +gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)+ &
&            wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
            iterm=3
            temp(:,1:((rank+3)*(rank+4))/2)= &
            dgxds(:,jjstr:jjstr-1+((rank+3)*(rank+4))/2,ia,iproj,ispin)
            call metstr(istr,rank,iterm,gmet,gprimd,temp,tmpfac)
            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)= &
&            gxafac(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)+ &
&            wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
           end if
!          end section for strain perturbation
          end if

!         Eventual metric & spinorial contraction from dgxdt to dgxdtfac.
!         either for the dynamical matrix, or for the application of the
!         gradient of the operator. The treatment is different for the
!         scalar-relativistic and spin-orbit parts.
          if ((choice==2.and.signs==2).or. &
&          (choice==5.and.signs==2).or. &
&          (choice==4)) then
           mumax=ndgxdtfac;if (choice==5) mumax=1
           do mu=1,mumax
            if(ispinor==1) then
!            ------ Scalar-Relativistic ------
             temp(:,1:((rank+1)*(rank+2))/2)= &
&             dgxdt(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
             call metcon(rank,gmet,temp,tmpfac)
             dgxdtfac(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)=&
&             wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
            else
!            ------ Spin-orbit ------
             dgxdtfac(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)=0.d0
!            Contraction over spins:
             do ispinp=1,nspinor
!             => Imaginary part (multiplying by i, then by the Im of amet):
              temp(1,1:((rank+1)*(rank+2))/2)= &
&              -dgxdt(2,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispinp)
              temp(2,1:((rank+1)*(rank+2))/2)= &
&              dgxdt(1,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispinp)
              amet_lo(:,:)=amet(2,:,:,ispin,ispinp)
              call metcon_so(rank,gmet,amet_lo,temp,tmpfac)
              dgxdtfac(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)=&
&              dgxdtfac(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)+&
&              wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
!             => Real part:
              temp(:,1:((rank+1)*(rank+2))/2)= &
&              dgxdt(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispinp)
              amet_lo(:,:)=amet(1,:,:,ispin,ispinp)
              call metcon_so(rank,gmet,amet_lo,temp,tmpfac)
              dgxdtfac(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)=&
&              dgxdtfac(:,mu,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj)+&
&              wt(ilang,iproj)*tmpfac(:,1:((rank+1)*(rank+2))/2)
             end do
            end if
           end do
          end if


!         ----  Accumulate the nonlocal energy.
          do ii=1,ilang2
           jj=ii-1+jjs
           enlk=enlk+(gxafac(1,jj,ia,iproj)*gxa(1,jj,ia,iproj,ispin)&
&           +gxafac(2,jj,ia,iproj)*gxa(2,jj,ia,iproj,ispin) )
          end do

!         ----  Accumulate force contributions if requested.
!         Note that the contraction of gxa with dgxdt can be done like
!         a cartesian dot product now because the symmetrically-related
!         terms are accounted for with weights in gxa.
          if ((choice==2.or.choice==23) .and. signs==1) then
           ishift=0;if (choice==23) ishift=6
           ia5=ia+ia3-1
           do ii=1,ilang2
            jj=ii-1+jjs
            do mu=1,3
!            (includes also factor of 2 from "2*Re[stuff]")
             indx=mu+3*(ia5-1)+ishift
             enlout(indx)=enlout(indx)+two*ucvol*&
&             ( gxafac(1,jj,ia,iproj)*dgxdt(1,mu,jj,ia,iproj,ispin)&
&             +gxafac(2,jj,ia,iproj)*dgxdt(2,mu,jj,ia,iproj,ispin))
            end do
           end do
          end if

!         ----  Accumulate stress tensor contributions if requested.
          if ((choice==3.or.choice==23).and.signs==1) then
!          1- Compute contractions involving gxa and dgxds:
!          --- Same formula for Scalar-relativistic and Spin-orbit ---
           if (ilang==1) then
            do ii=1,6
             rank2(ii)=2.d0*&
&             (gxafac(1,1,ia,iproj)*dgxds(1,ii,ia,iproj,ispin)+ &
&             gxafac(2,1,ia,iproj)*dgxds(2,ii,ia,iproj,ispin) )
            end do
           else if (ilang==2) then
            call cont13(gxafac(:,jjs:jjs+2,ia,iproj),&
&            dgxds(:, 7:16,ia,iproj,ispin),rank2)
           else if (ilang==3) then
            call cont24(gxafac(:,jjs:jjs+5,ia,iproj),&
&            dgxds(:,17:31,ia,iproj,ispin),rank2)
           else if (ilang==4) then
            call cont35(gxafac(:,jjs:jjs+9,ia,iproj),&
&            dgxds(:,32:52,ia,iproj,ispin),rank2)
           end if
!          In all cases add rank2 term into stress tensor
           strsnl(:)=strsnl(:)-rank2(:)
!          2- Compute contractions involving gxa and gxa:
           if(ispinor==1) then
!           2a ------ Scalar-Relativistic ------
            if (ilang==2) then
             strsnl(1)=strsnl(1)-wt(ilang,iproj)*2.d0*&
&             (gxa(1,jjs  ,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispin)+&
&             gxa(2,jjs  ,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispin))
             strsnl(2)=strsnl(2)-wt(ilang,iproj)*2.d0*&
&             (gxa(1,jjs+1,ia,iproj,ispin)*gxa(1,jjs+1,ia,iproj,ispin)+&
&             gxa(2,jjs+1,ia,iproj,ispin)*gxa(2,jjs+1,ia,iproj,ispin))
             strsnl(3)=strsnl(3)-wt(ilang,iproj)*2.d0*&
&             (gxa(1,jjs+2,ia,iproj,ispin)*gxa(1,jjs+2,ia,iproj,ispin)+&
&             gxa(2,jjs+2,ia,iproj,ispin)*gxa(2,jjs+2,ia,iproj,ispin))
             strsnl(4)=strsnl(4)-wt(ilang,iproj)*2.d0*&
&             (gxa(1,jjs+2,ia,iproj,ispin)*gxa(1,jjs+1,ia,iproj,ispin)+&
&             gxa(2,jjs+2,ia,iproj,ispin)*gxa(2,jjs+1,ia,iproj,ispin))
             strsnl(5)=strsnl(5)-wt(ilang,iproj)*2.d0*&
&             (gxa(1,jjs+2,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispin)+&
&             gxa(2,jjs+2,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispin))
             strsnl(6)=strsnl(6)-wt(ilang,iproj)*2.d0*&
&             (gxa(1,jjs+1,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispin)+&
&             gxa(2,jjs+1,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispin))
            else if (ilang==3) then
             call trace2(gxa(:,jjs:jjs+5,ia,iproj,ispin),gmet,trace)
             call cont22(gxa(:,jjs:jjs+5,ia,iproj,ispin),gmet,rank2)
             do ii=1,6
              strsnl(ii)=strsnl(ii)+wt(ilang,iproj)*&
&              (2.d0*(trace(1)*gxa(1,jjs+ii-1,ia,iproj,ispin)+&
&              trace(2)*gxa(2,jjs+ii-1,ia,iproj,ispin))-3.d0*rank2(ii))
             end do
            else if (ilang==4) then
             call cont3(gxa(:,jjs:jjs+9,ia,iproj,ispin),gmet,rank2)
             strsnl(:)=strsnl(:)-wt(ilang,iproj)*rank2(:)
            end if
           else
!           2b ------ Spin-orbit ------
            do ispinp=1,nspinor
             if (ilang==3) then
              call cont22so(gxa(:,jjs:jjs+5,ia,iproj,ispin),&
&              gxa(:,jjs:jjs+5,ia,iproj,ispinp),&
&              amet(:,:,:,ispin,ispinp),rank2)
              strsnl(:)=strsnl(:)-wt(ilang,iproj)*3.d0*rank2(:)
             else if (ilang==4) then
              call cont33so(gxa(:,jjs:jjs+9,ia,iproj,ispin),&
&              gxa(:,jjs:jjs+9,ia,iproj,ispinp),&
&              gmet,amet(:,:,:,ispin,ispinp),rank2)
              strsnl(:)=strsnl(:)-wt(ilang,iproj)*rank2(:)
             end if
            end do
           end if
!          3- Compute contractions involving gxa and gxa due to
!          gradients of antisymmetric tensor (amet):
!          --- Only in case of Spin-orbit ---
           if(ispinor==2) then
            do ispinp=1,nspinor
!            Be carefull: no need to compute rank2c(:,1:3) !
             if (ilang==2) then
              rank2c(1,4)=gxa(1,jjs+2,ia,iproj,ispin)*gxa(1,jjs+1,ia,iproj,ispinp)&
&              +gxa(2,jjs+2,ia,iproj,ispin)*gxa(2,jjs+1,ia,iproj,ispinp)
              rank2c(2,4)=gxa(1,jjs+2,ia,iproj,ispin)*gxa(2,jjs+1,ia,iproj,ispinp)&
&              -gxa(2,jjs+2,ia,iproj,ispin)*gxa(1,jjs+1,ia,iproj,ispinp)
              rank2c(1,5)=gxa(1,jjs+2,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispinp)&
&              +gxa(2,jjs+2,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispinp)
              rank2c(2,5)=gxa(1,jjs+2,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispinp)&
&              -gxa(2,jjs+2,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispinp)
              rank2c(1,6)=gxa(1,jjs+1,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispinp)&
&              +gxa(2,jjs+1,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispinp)
              rank2c(2,6)=gxa(1,jjs+1,ia,iproj,ispin)*gxa(2,jjs  ,ia,iproj,ispinp)&
&              -gxa(2,jjs+1,ia,iproj,ispin)*gxa(1,jjs  ,ia,iproj,ispinp)
             else if (ilang==3) then
              call cont22cso(gxa(:,jjs:jjs+5,ia,iproj,ispin),&
&              gxa(:,jjs:jjs+5,ia,iproj,ispinp),&
&              gmet,rank2c)
             else if (ilang==4) then
              call cont33cso(gxa(:,jjs:jjs+9,ia,iproj,ispin),&
&              gxa(:,jjs:jjs+9,ia,iproj,ispinp),&
&              gmet,rank2c)
             end if
             if (ilang>1) then
              do jj=1,3
               do ii=4,6
                strsso(ii,jj)=strsso(ii,jj)-2.d0*wt(ilang,iproj)*&
&                (pauli(1,ispin,ispinp,jj)*rank2c(2,ii)+&
&                pauli(2,ispin,ispinp,jj)*rank2c(1,ii))
               end do
              end do
             end if
            end do
           end if
!          Enf if (choice==3)
          end if

!         ----  Accumulate dynamical matrix contributions if requested.
          if (choice==4) then
           ia5=ia+ia3-1
           do ii=1,ilang2
            jj=ii-1+jjs
            do mu=1,6
!            (includes also factor of 2 from "2*Re[stuff]")
             enlout(mu+6*(ia5-1))=enlout(mu+6*(ia5-1))+ucvol*2.0d0*&
&             (gxafac(1,jj,ia,iproj)*dgxdt(1,mu+3,jj,ia,iproj,ispin)&
&             +gxafac(2,jj,ia,iproj)*dgxdt(2,mu+3,jj,ia,iproj,ispin))
            end do
            do mu=1,3
             enlout(mu+6*(ia5-1))=enlout(mu+6*(ia5-1))+ucvol*2.0d0*&
&             (dgxdtfac(1,mu,jj,ia,iproj)*dgxdt(1,mu,jj,ia,iproj,ispin)&
&             +dgxdtfac(2,mu,jj,ia,iproj)*dgxdt(2,mu,jj,ia,iproj,ispin))
            end do
            enlout(4+6*(ia5-1))=enlout(4+6*(ia5-1)) +ucvol*2.d0*&
&            (dgxdtfac(1,2,jj,ia,iproj)*dgxdt(1,3,jj,ia,iproj,ispin)&
&            +dgxdtfac(2,2,jj,ia,iproj)*dgxdt(2,3,jj,ia,iproj,ispin))
            enlout(5+6*(ia5-1))=enlout(5+6*(ia5-1)) +ucvol*2.d0*&
&            (dgxdtfac(1,1,jj,ia,iproj)*dgxdt(1,3,jj,ia,iproj,ispin)&
&            +dgxdtfac(2,1,jj,ia,iproj)*dgxdt(2,3,jj,ia,iproj,ispin))
            enlout(6+6*(ia5-1))=enlout(6+6*(ia5-1)) +ucvol*2.d0*&
&            (dgxdtfac(1,1,jj,ia,iproj)*dgxdt(1,2,jj,ia,iproj,ispin)&
&            +dgxdtfac(2,1,jj,ia,iproj)*dgxdt(2,2,jj,ia,iproj,ispin))
           end do
          end if

!         ----  Accumulate elastic tensor contributions if requested.

          if(choice==6) then
!          XG 081121 : Message to the person who has introduced this CPP option (sorry, I did not have to time to locate who did this ...) 
!          This section of ABINIT should be allowed by default to the user. I have found that on the contrary, the build
!          system defaults are such that this section is forbidden by default. You might restore this flag if at the same time,
!          you modify the build system in such a way that by default this section is included, and if the user wants, it can disable it.
!          #if defined USE_NLSTRAIN_LEGENDRE
           ia5=ia+ia3-1
           jjs1=((ilang)*(ilang+1)*(ilang+2))/6
           jjs2=-3+((ilang+1)*(ilang+2)*(ilang+3))/6
           jjs3=-9+((ilang+2)*(ilang+3)*(ilang+4))/6
           jjs4=-19+((ilang+3)*(ilang+4)*(ilang+5))/6

!          Contribution for two diagonal strains (basically, the nonlocal
!          enlk)
           temp(:,1:((rank+1)*(rank+2))/2)= &
&           gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
           call metcon(rank,gmet,temp,tmpfac)
           e2nldd=0.d0
           do ii=1,((rank+1)*(rank+2))/2
            e2nldd=e2nldd+&
&            (gxa(1,jjs-1+ii,ia,iproj,ispin)*tmpfac(1,ii)+&
&            gxa(2,jjs-1+ii,ia,iproj,ispin)*tmpfac(2,ii))
           end do

!          Terms involving one ucvol derivative (digonal strain only)
!          and one derivative of nonlocal operator
!          Loop over strain index
           do istr2=1,6

!           rank->rank+2
            iterm=1
            temp(:,1:((rank+1)*(rank+2))/2)= &
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
            call metstr(istr2,rank,iterm,gmet,gprimd,temp,tmpfac)
            e2nl_tmp(istr2)=0.d0
            do ii=1,((rank+3)*(rank+4))/2
             e2nl_tmp(istr2)=e2nl_tmp(istr2)-2.d0*&
&             (dgxds(1,jjs2-1+ii,ia,iproj,ispin)*tmpfac(1,ii)+&
&             dgxds(2,jjs2-1+ii,ia,iproj,ispin)*tmpfac(2,ii))
            end do
!           rank->rank

            iterm=2
            temp(:,1:((rank+1)*(rank+2))/2)= &
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin)
            call metstr(istr2,rank,iterm,gmet,gprimd,temp,tmpfac)
            do ii=1,((rank+1)*(rank+2))/2
             e2nl_tmp(istr2)=e2nl_tmp(istr2)-&
&             (gxa(1,jjs-1+ii,ia,iproj,ispin)*tmpfac(1,ii)+&
&             gxa(2,jjs-1+ii,ia,iproj,ispin)*tmpfac(2,ii))
            end do
!           DEBUG
!           This and subsequent similar debug sections evaluate the
!           hermitial conjugate of the contraction immeditely above
!           and the comparison was useful for development purposes.
!           rank+2->rank
!           iterm=3
!           temp(:,1:((rank+3)*(rank+4))/2)= &
!           dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin)
!           call metstr(istr2,rank,iterm,gmet,gprimd,temp,tmpfac)
!           e2nl_tmp(istr2)=0.d0
!           do ii=1,((rank+1)*(rank+2))/2
!           e2nl_tmp(istr2)=e2nl_tmp(istr2)-&
!           &             (gxa(1,jjs-1+ii,ia,iproj,ispin)*tmpfac(1,ii)+&
!           &              gxa(2,jjs-1+ii,ia,iproj,ispin)*tmpfac(2,ii))
!           end do
!           ENDDEBUG
           end do

!          Terms involving two derivatives of the nonlocal operator
!          Loop over 2nd strain index
           do istr2=1,6
!           Loop over 1st strain index, upper triangle only
            do istr1=1,6
             iest=istr1+(3*natom+6)*(istr2-1)

!            Accumulate terms corresponding to two derivatives of ucvol
!            (simply the nonlocal energy contributin) for both indices
!            corresponding to diagonal strains

             if(istr1<=3 .and. istr2<=3) then
              enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nldd
             end if

!            Accumulate terms computed above from 1st derivatives
!            when one or more indices corresponds to diagonal strain
             if(istr2<=3) then
              enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl_tmp(istr1)
             end if
             if(istr1<=3) then
              enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl_tmp(istr2)
             end if

!            rank->rank+4
             call contstr21(istr2,istr1,rank,gmet,gprimd,e2nl,&
&             gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin),&
&             d2gxds2(:,jjs4:jjs4-1+((rank+5)*(rank+6))/2,ia,iproj,ispin))
             enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl

!            DEBUG
!            rank+4->rank
!            call contstr22(istr2,istr1,rank,gmet,gprimd,e2nl,&
!            &             d2gxds2(:,jjs4:jjs4-1+((rank+5)*(rank+6))/2,ia,iproj,ispin),&
!            &             gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin))
!            enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl
!            ENDDEBUG

!            rank->rank+2
             call contstr23(istr2,istr1,rank,gmet,gprimd,e2nl,&
&             gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin),&
&             dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin))
             enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl
!            DEBUG

!            rank+2->rank
!            call contstr24(istr2,istr1,rank,gmet,gprimd,e2nl,&
!            &             dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin),&
!            &             gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin))
!            enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl
!            ENDDEBUG

!            rank+2->rank+2
             if(rank<=2) then
              call contstr25(istr2,istr1,rank,gmet,gprimd,e2nl,&
&              dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin),&
&              dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin))
             else
              call contstr25a(istr2,istr1,rank,gmet,gprimd,e2nl,&
&              dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin),&
&              dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin))
             end if
             enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl

!            rank->rank
             call contstr26(istr2,istr1,rank,gmet,gprimd,e2nl,&
&             gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin),&
&             gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin))
             enlout(iest)= enlout(iest)+wt(ilang,iproj)*e2nl

            end do !istr1

!           Contributions to internal strain (one cartesian strain and one
!           reduced-coordinate atomic displacement derivative).
            iest=7+3*(ia5-1)+(3*natom+6)*(istr2-1)

!           rank->rank+3
            call contistr03(istr2,rank,gmet,gprimd,eisnl,&
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin),&
&            d2gxdis(:,jjs3:jjs3-1+((rank+4)*(rank+5))/2,ia,iproj,ispin))
            enlout(iest:iest+2)= enlout(iest:iest+2)&
&            +wt(ilang,iproj)*eisnl(1:3)

!           DEBUG
!           rank+3->rank
!           call contistr30(istr2,rank,gmet,gprimd,eisnl,&
!           &            d2gxdis(:,jjs3:jjs3-1+((rank+4)*(rank+5))/2,ia,iproj,ispin),&
!           &            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin))
!           enlout(iest:iest+2)= enlout(iest:iest+2)&
!           &            +wt(ilang,iproj)*eisnl(1:3)
!           ENDDEBUG

!           rank+1->rank+2
            call contistr12(istr2,rank,gmet,gprimd,eisnl,&
&            dgxdis(:,jjs1:jjs1-1+((rank+2)*(rank+3))/2,ia,iproj,ispin),&
&            dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin))
            enlout(iest:iest+2)= enlout(iest:iest+2)&
&            +wt(ilang,iproj)*eisnl(1:3)

!           DEBUG
!           rank+2->rank+1
!           call contistr21(istr2,rank,gmet,gprimd,eisnl,&
!           &            dgxds(:,jjs2:jjs2-1+((rank+3)*(rank+4))/2,ia,iproj,ispin),&
!           &            dgxdis(:,jjs1:jjs1-1+((rank+2)*(rank+3))/2,ia,iproj,ispin))
!           enlout(iest:iest+2)= enlout(iest:iest+2)&
!           &            +wt(ilang,iproj)*eisnl(1:3)
!           ENDDEBUG

!           rank->rank+1
            call contistr01(istr2,rank,gmet,gprimd,eisnl,&
&            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin),&
&            dgxdis(:,jjs1:jjs1-1+((rank+2)*(rank+3))/2,ia,iproj,ispin))
            enlout(iest:iest+2)= enlout(iest:iest+2)&
&            +wt(ilang,iproj)*eisnl(1:3)
!           
!           DEBUG
!           rank+1->rank
!           call contistr10(istr2,rank,gmet,gprimd,eisnl,&
!           &            dgxdis(:,jjs1:jjs1-1+((rank+2)*(rank+3))/2,ia,iproj,ispin),&
!           &            gxa(:,jjs:jjs-1+((rank+1)*(rank+2))/2,ia,iproj,ispin))
!           enlout(iest:iest+2)= enlout(iest:iest+2)&
!           &            +wt(ilang,iproj)*eisnl(1:3)
!           ENDDEBUG

           end do !istr2
!          #else
!          write(message, '(a,a,a,a)' )ch10,&
!          &            ' nonlop_pl : ERROR -',ch10,&
!          &            '  ABINIT has been compiled without Legendre polynomial support.'
!          call wrtout(std_out,message,'PERS')
!          call leave_new('PERS')
!          #endif
          end if !choice==6

!         End loop over iproj:
         end do
!        End condition of existence of a reference state:
        end if

!       End loop over ilang:
       end do

!      End loop over ia:
      end do

!     Operate with the non-local potential on the projected scalars,
!     in order to get matrix element [NOT needed if only force or stress
!     or dynamical matrix is being computed].

      if (signs==2) then
       if(nloalg(1)<=0 .and. choice==2)then
!       Prepare the phase k+q factors for the atoms between ia3 and ia4:
!       they were not prepared previously for nloalg<=0 and choice==2.
        call ph1d3d(ia3,ia4,kgout,kptout,matblk,natom,npwout,&
&        n1,n2,n3,phkxredout,ph1d,ph3dout)
       end if

!      call timab(74+choice,1,tsec)
       sign=-1
!      The duplication of the opernl calls has the purpose to avoid
!      copying vectout/vectout_s
       if(ispin==1)then
        if( nloalg(1)==2 .or. nloalg(1)==-2 ) then
         call opernl2(choice,dgxdis,dgxdsfac,d2gxdis,d2gxds2,dgxdtfac,&
&         ffnlout,gmet,gxafac,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&         jproj,kgout,kpgout,kptout,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&         mlang5,mlang6,mproj,ndgxdt,dimffnlout,nincat,nkpgout,nlang,nloalg,npwout,&
&         ntypat,ph3dout,sign,vectout)
        else if( nloalg(1)==3 .or. nloalg(1)==-3 ) then
         call opernl3(choice,dgxdis,dgxdsfac,d2gxdis,d2gxds2,dgxdtfac,&
&         ffnlout,gmet,gxafac,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&         jproj,kgout,kpgout,kptout,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&         mlang5,mlang6,mproj,ndgxdt,dimffnlout,nincat,nkpgout,nlang,nloalg,npwout,&
&         ntypat,ph3dout,sign,vectout)
        else if( nloalg(1)==4 .or. nloalg(1)==-4 ) then
         call opernl4b(choice,dgxdsfac,d2gxds2,dgxdtfac,ffnlout,gmet,gxafac,&
&         ia3,idir,indlmn_s,ispinor,istwf_k,itypat,jproj,kgout,kpgout,kptout,&
&         lmnmax,matblk,mincat,mlang3,mlang4,mlang6,mproj,ndgxdt,&
&         dimffnlout,nincat,nkpgout,nlang,nloalg,npwout,ntypat,ph3dout,vectout)
        end if
       else  ! if ispin == 2
#if defined FC_NEC
        vectout_s(1,:)=0.d0
        vectout_s(2,:)=0.d0
#else
        vectout_s(:,:)=0.d0
#endif
        if( nloalg(1)==2 .or. nloalg(1)==-2 ) then
         call opernl2(choice,dgxdis,dgxdsfac,d2gxdis,d2gxds2,dgxdtfac,&
&         ffnlout,gmet,gxafac,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&         jproj,kgout,kpgout,kptout,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&         mlang5,mlang6,mproj,ndgxdt,dimffnlout,nincat,nkpgout,nlang,nloalg,npwout,&
&         ntypat,ph3dout,sign,vectout_s)
        else if( nloalg(1)==3 .or. nloalg(1)==-3 ) then
         call opernl3(choice,dgxdis,dgxdsfac,d2gxdis,d2gxds2,dgxdtfac,&
&         ffnlout,gmet,gxafac,ia3,idir,indlmn_s,ispinor,istwf_k,itypat,&
&         jproj,kgout,kpgout,kptout,lmnmax,matblk,mincat,mlang1,mlang3,mlang4,&
&         mlang5,mlang6,mproj,ndgxdt,dimffnlout,nincat,nkpgout,nlang,nloalg,npwout,&
&         ntypat,ph3dout,sign,vectout_s)
        else if( nloalg(1)==4 .or. nloalg(1)==-4 ) then
         call opernl4b(choice,dgxds,d2gxds2,dgxdtfac,ffnlout,gmet,gxafac,&
&         ia3,idir,indlmn_s,ispinor,istwf_k,itypat,jproj,kgout,kpgout,kptout,&
&         lmnmax,matblk,mincat,mlang3,mlang4,mlang6,mproj,ndgxdt,&
&         dimffnlout,nincat,nkpgout,nlang,nloalg,npwout,ntypat,ph3dout,vectout_s)
        end if
        vectout(1,1+npwout:2*npwout)=&
&        vectout(1,1+npwout:2*npwout)+vectout_s(1,1:npwout)
        vectout(2,1+npwout:2*npwout)=&
&        vectout(2,1+npwout:2*npwout)+vectout_s(2,1:npwout)

       end if ! end ispin if
!      call timab(74+choice,2,tsec)
      end if ! end signs==2 if

!     End loops over spins:
     end do

!    End condition of existence of a non-local part for that type of atom:
    end if

!   End loop over ispinor:
   end do

!  End sum on atom subset loop, over ia3:
  end do

! End atom type loop, over itypat:
  ia1=ia2+1
 end do


!De-allocate temporary space.
 deallocate(ekb_s)
 deallocate(gxa)
 deallocate(gxafac)
 deallocate(dgxds)
 deallocate(dgxdt)
 deallocate(dgxdtfac)
 deallocate(wt)
 deallocate(jproj)
 deallocate(temp)
 deallocate(tmpfac)
 deallocate(dgxdsfac)
 deallocate(indlmn_s)
 if(choice==6)deallocate(dgxdis,d2gxdis,d2gxds2)
 if(nspinor==2)deallocate(dgxds_s,dgxdt_s,gxa_s)
 if(nspinor==2 .and. choice==6)deallocate(dgxdis_s,d2gxdis_s,d2gxds2_s)
 if (mpssoang>mpsang) deallocate(pauli)

!Restore the original content of the vectin array.
!Note that only the first part was modified
 if(istwf_k/=1)then
  call scalewf_nonlop(istwf_k,mpi_enreg,npwin,2,vectin)
 end if

!DEBUG
!write(6,*)'  vectout='
!do ii=1,nspinor*npwout
!write(6, '(i3,2es16.6)' )ii,vectout(1:2,ii)
!end do
!ENDDEBUG

 if(nspinor==2)then
  deallocate(vectin_s,vectout_s)
 end if

!Save non-local energy
 if((choice==1).and.size(enlout)>0) enlout(1)=enlk ! on test v4/93 size(enlout) can be zero (PMA)

!Do final manipulations to produce strain gradients for
!stress tensor, in cartesian coordinates
 if ((choice==3.or.choice==23) .and. signs==1) then
! Convert strsnl from reduced to cartesian coordinates
  call strconv(strsnl,gprimd,strsnl)
! Add diagonal part and normalize by additional ucvol
! (fill up first 6 components of enlout with these gradients--
! elements 7,8,9 of enlout are not used)
  enlout(1)=(strsnl(1)-enlk)/ucvol
  enlout(2)=(strsnl(2)-enlk)/ucvol
  enlout(3)=(strsnl(3)-enlk)/ucvol
  enlout(4)=strsnl(4)/ucvol
  enlout(5)=strsnl(5)/ucvol
  enlout(6)=strsnl(6)/ucvol
! Eventually, add spin-orbit part due to gradients of
! antisymmetric tensor
  if (mpssoang>mpsang) then
   call strsocv(strsso,gprimd,strssoc)
   enlout(:)=enlout(:)+strssoc(:)/ucvol
  end if
 end if

 if ((choice<1 .or. choice>6) .and. choice/=23 ) then
  write(message, '(a,a,a,a,i4,a)' )ch10,&
&  ' nonlop : BUG -',ch10,&
&  '  Does not presently support this choice=',choice,'.'
  call wrtout(06,message,'PERS')
  call leave_new('PERS')
 end if

!DEBUG
!write(6,*)' nonlop_pl: exit '
!ENDDEBUG

end subroutine nonlop_pl
!!***
