!{\src2tex{textfont=tt}}
!!****f* ABINIT/exc_build_block
!! NAME
!!  exc_build_block
!!
!! FUNCTION
!!  Calculate and write the excitonic Hamiltonian on an external binary file (Fortran file open
!!  in random mode) for subsequent treatment in the Bethe-Salpeter code.
!!
!! COPYRIGHT
!! Copyright (C) 1992-2009 EXC group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida)
!! Copyright (C) 2009-2012 ABINIT group (L.Reining, V.Olevano, F.Sottile, S.Albrecht, G.Onida, M.Giantomassi)
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  BSp<excparam>=The parameters for the Bethe-Salpeter calculation. 
!!  Cryst<crystal_structure>=Info on the crystalline structure.
!!  Kmesh<BZ_mesh_type>=The list of k-points in the BZ, IBZ and symmetry tables.
!!  Qmesh<BZ_mesh_type>=The list of q-points for epsilon^{-1} and related symmetry tables. 
!!  ktabr(nfftot_osc,BSp%nkbz)=The FFT index of $(R^{-1}(r-\tau))$ where R is symmetry needed to obtains 
!!    the k-points from the irreducible image.  Used to symmetrize u_Sk where S = \transpose R^{-1}
!!  Gsph_x<gvectors_type>=Info on the G-sphere used to describe wavefunctions and W (the largest one is actually stored).  
!!  Gsph_c<gvectors_type>=Info on the G-sphere used to describe the correlation part.
!!  Vcp<vcoul_t>=The Coulomb interaction in reciprocal space. A cutoff can be used
!!  W<screen_t>=Data type gathering info and data for W.
!!  nfftot_osc=Total Number of FFT points used for the oscillator matrix elements.
!!  ngfft_osc(18)=Info on the FFT algorithm used to calculate the oscillator matrix elements.
!!  Psps<Pseudopotential_type>=Variables related to pseudopotentials
!!  Pawtab(Psps%ntypat)<pawtab_type>=PAW tabulated starting data.
!!  Pawang<pawang_type>=PAW angular mesh and related data.
!!  Paw_pwff(Cryst%ntypat*Wfd%usepaw)<Paw_pwff_type>=Form factor used to calculate the onsite mat. elements of a plane wave.
!!  Wfd<wfs_descriptor>=Handler for the wavefunctions.
!!    prtvol=Verbosity level.
!!  rhxtwg_q0
!!  is_resonant
!!  fname
!!  comm=MPI communicator.
!!
!! OUTPUT
!!  The excitonic Hamiltonian is saved on an external binary file (see below).
!!
!! NOTES
!!  *) Version for K_V = K_C (q=0), thus KP_V = KP_C
!!  *) No exchange limit: use LDA energies in case 
!!  *) Symmetry of H(-k-k') = H*(k k') not used.
!!  *) Coulomb term can be approssimateed as diagonal in G
!!  *) Valence bands treated from lomo on
!!  *) Symmetries of the sub-blocks are used to reduced the number of elements to calculate.
!!
!!            ____________
!!           |_(cv)__(vc)_|
!!   H_exc = |  R      C  |
!!           | -C*    -R* |
!!
!!   where C is symmetric and R is Hermitian provided that the QP energies are real.
!!
!!  For nsppol=1 ==> R = diag-W+2v; C = -W+2v 
!!  since the Hamiltonian can be diagonalized in the spin-singlet basis set thanks to
!!  the fact that spin triplet do not contribute to the optical limit of epsilon. 
!!
!!  For nsppol=2 ==> R = diag-W+v; C = -W+v 
!!  Now the matrix elements depend on the spin of the transitions but only those
!!  transition in which the spin of the electron and of the hole are equal contribute 
!!  to the macroscopic dielectric function. Moreover only the exchange term can connect
!!  transitions of different spin.
!!  When nsppol==2 the transitions are ordered using | (cv up) | (cv dwn) | (vc up) | (vc down) |
!!
!!  The resonant block is given by:
!!      |  (v'c' up)       | (v'c' dwn)   | 
!!      -----------------------------------           where v_{-+} = v_{+-}^H when the momentum of the photon is neglected. 
!!      | [diag-W+v]++     |      v+-     | (vc up)   Note that v_{+-} is not Hermitian due to the presence of different spins.
!!  R = -----------------------------------           Actually it reduces to a Hermitian matrix when the system is not spin polarized.
!!      |     v-+          | [diag-W+v]-- | (vc dwn)  but in this case one should use nsppol=1.
!!      -----------------------------------           As a consequence the entire matrix is calculated and stored on file.
!!
!!  The coupling block is given by:
!!      |  (c'v' up)   |    (c'v dwn)     | 
!!      -----------------------------------           where v_{-+} = v_{+-}^t when the momentum of the photon is neglected. 
!!      | [-W+v]++     |      v+-         | (vc up)   Also in this case the entire matrix v_{+-} has to be calculated
!!  C = -----------------------------------           and stored on file.
!!      |     v-+      |    [-W+v]--      | (vc dwn)
!!      -----------------------------------   
!!
!! PARENTS
!!      exc_build_ham
!!
!! CHILDREN
!!      cprj_alloc,cprj_free,destroy_paw_pwij,exc_skip_bshdr
!!      exc_skip_bshdr_mpio,exc_write_bshdr,findqg0,flush_unit,get_bz_item
!!      gsph_fft_tabs,init_paw_pwij,mpi_file_close,mpi_file_open
!!      mpi_file_set_view,mpi_file_write_all,mpi_type_free,paw_rho_tw_g
!!      paw_symcprj_op,rho_tw_g,screen_symmetrizer,screen_w0gemv,timab
!!      wfd_change_ngfft,wfd_get_cprj,wfd_get_ur,wrtout,xbarrier_mpi,xexch_mpi
!!      xmpi_split_work2_i4b,xmpi_split_work2_i8b,xmpio_create_fherm_packed
!!      xmpio_create_fsubarray_2d,xmpio_write_frmarkers
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

subroutine exc_build_block(BSp,Cryst,Kmesh,Qmesh,ktabr,Gsph_x,Gsph_c,Vcp,Wfd,W,Hdr_bse,&
&  nfftot_osc,ngfft_osc,Psps,Pawtab,Pawang,Paw_pwff,rhxtwg_q0,is_resonant,fname)

 use defs_basis
 use defs_datatypes
 use defs_abitypes
 use m_profiling
 use m_bs_defs
 use m_bse_io
 use m_xmpi
 use m_timer
 use m_errors
#if defined HAVE_MPI2
 use mpi
#endif

 use m_gwdefs,       only : czero_gw, cone_gw, GW_TOLQ0
 use m_io_tools,     only : get_unit, flush_unit
 use m_blas,         only : xdotc, xgemv
 use m_geometry,     only : normv
 use m_header,       only : hdr_mpio_skip
 use m_crystal,      only : crystal_structure
 use m_gsphere,      only : gvectors_type, gsph_fft_tabs
 use m_vcoul,        only : vcoul_t
 use m_bz_mesh,      only : bz_mesh_type, get_BZ_item, get_BZ_diff, has_BZ_item, isamek, findqg0
 use m_paw_pwij,     only : paw_pwff_type, paw_pwij_type, init_paw_pwij, destroy_paw_pwij, paw_rho_tw_g
 use m_pawcprj,         only : cprj_type, cprj_alloc, cprj_free
 use m_wfs,          only : wfs_descriptor, wfd_get_ur, wfd_get_cprj, wfd_change_ngfft, wfd_ihave_ur, wfd_ihave_cprj
 use m_oscillators,  only : rho_tw_g, sym_rhotwgq0
 use m_screen

!This section has been created automatically by the script Abilint (TD).
!Do not modify the following lines by hand.
#undef ABI_FUNC
#define ABI_FUNC 'exc_build_block'
 use interfaces_14_hidewrite
 use interfaces_18_timing
 use interfaces_66_paw
!End of the abilint section

 implicit none

#if defined HAVE_MPI1
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: nfftot_osc
 character(len=fnlen),intent(in) :: fname
 logical,intent(in) :: is_resonant
 type(excparam),intent(in) :: BSp
 type(screen_t),intent(inout) :: W
 type(BZ_mesh_type),intent(in) :: Kmesh,Qmesh
 type(crystal_structure),intent(in) :: Cryst
 type(vcoul_t),intent(in) :: Vcp
 type(Gvectors_type),intent(in) :: Gsph_x,Gsph_c
 type(Pseudopotential_type),intent(in) :: Psps
 type(Hdr_type),intent(inout) :: Hdr_bse
 type(pawang_type),intent(in) :: Pawang
 type(wfs_descriptor),intent(inout) :: Wfd
!arrays
 integer,intent(in) :: ngfft_osc(18)
 integer,intent(in) :: ktabr(nfftot_osc,BSp%nkbz)
 complex(gwpc),intent(in) :: rhxtwg_q0(BSp%npweps,BSp%lomo:BSp%nbnds,BSp%lomo:BSp%nbnds,Wfd%nkibz,Wfd%nsppol)
 type(Pawtab_type),intent(in) :: Pawtab(Psps%ntypat)
 type(Paw_pwff_type),intent(in) :: Paw_pwff(Psps%ntypat*Wfd%usepaw)

!Local variables ------------------------------
!scalars
 integer,parameter :: tim_fourdp=0,map2sphere=1,paral_kgb=0
 integer(i8b) :: bsize_my_block
 integer :: nspinor,nsppol,ISg,ios,mpi_err,tmp_size,ngx
 integer :: ik_bz,ikp_bz,col_glob
 integer :: itpk_min,itpk_max
 integer :: dim_rtwg,bsh_unt,ncol,dump_unt
#ifdef HAVE_MPI_IO
 integer :: amode,mpi_fh,hmat_type,offset_err,old_type
 integer(XMPI_OFFSET_KIND) :: ehdr_offset,my_offset
 logical,parameter :: is_fortran_file=.TRUE.
#endif
 integer :: neh1,neh2,ig,nblocks
 integer :: ik_ibz,itim_k,ikp_ibz,itim_kp,isym_k,isym_kp
 integer :: iq_bz,iq_ibz,isym_q,itim_q,iqbz0,rank
 integer :: iv,ivp,ic,icp,jj,nrows,sender,my_ncols
 integer :: use_padfft,prev_nrows,spin1,spin2,block
 integer :: ierr,nproc,my_rank,master,mgfft_osc,fftalga_osc,comm
 integer(i8b) :: tot_nels,prev_nels,prev_ncols,nels,ir,it,itp,ist,iend,my_hsize
 real(dp) :: faq,kx_fact
 complex(spc) :: http,ctemp
 complex(dpc) :: ph_mkpt,ph_mkt,ene_t,ene_tp
 logical :: use_mpiio,do_coulomb_term,do_exchange_term,w_is_diagonal 
 logical :: is_qeq0
 character(len=500) :: msg
!arrays
 integer :: bidx(2,4),g0(3),spin_ids(2,3)
 integer(i8b) :: nels_block(3)
 integer :: my_cols(2),my_rows(2),proc_end(2),proc_start(2),extrema(2,2),my_starts(2),my_ends(2)
 integer,allocatable :: igfftg0(:),ktabr_k(:),ktabr_kp(:) 
 integer,allocatable :: ncols_of(:)
 integer(i8b),allocatable :: t_start(:),t_stop(:),hsize_of(:)
 integer,allocatable :: col_start(:),col_stop(:)
 integer,allocatable :: gbound(:,:)
 real(dp) :: kbz(3),kpbz(3),qbz(3),spinrot_k(4),spinrot_kp(4),kmkp(3),tsec(2)
 complex(dpc),allocatable :: my_bsham(:),buffer(:),buffer_2d(:,:),my_kxssp(:,:),prev_col(:)
 complex(gwpc),allocatable :: vc_sqrt_qbz(:)
 complex(gwpc),allocatable :: rhotwg1(:),rhotwg2(:),rhxtwg_vpv(:),rhxtwg_cpc(:),ctccp(:) 
 complex(gwpc),target,allocatable :: ur_ckp(:),ur_vkp(:),ur_vk(:),ur_ck(:)
 complex(gwpc),pointer :: ptur_ckp(:),ptur_vkp(:),ptur_vk(:),ptur_ck(:)
 type(Cprj_type),target,allocatable :: Cp_tmp1(:,:),Cp_tmp2(:,:)
 type(Cprj_type),target,allocatable :: Cp_tmp3(:,:),Cp_tmp4(:,:)
 type(Cprj_type),allocatable :: Cp_ckp(:,:),Cp_vkp(:,:)
 type(Cprj_type),allocatable :: Cp_vk(:,:),Cp_ck(:,:)
 type(cprj_type),pointer :: ptcp_ckp(:,:),ptcp_vkp(:,:),ptcp_vk(:,:),ptcp_ck(:,:)
 type(Paw_pwij_type),allocatable :: Pwij_q(:)
#ifdef HAVE_MPI_IO
 integer(XMPI_OFFSET_KIND) :: tmp_off,my_offpad
 integer(XMPI_OFFSET_KIND),allocatable :: bsize_frecord(:),offset_of_block(:)
#endif
#ifdef DEV_MG_DEBUG_MODE
 integer,allocatable :: ttp_check(:,:)
#endif

!************************************************************************

 call timab(680,1,tsec)
 call timab(681,1,tsec)

 DBG_ENTER("COLL")

 ABI_TIMER_START("")

 ABI_CHECK(Wfd%nspinor==1,"nspinor==2 not coded")
 ABI_CHECK(nfftot_osc==PRODUCT(ngfft_osc(1:3)),"mismatch in FFT size")

 if (Wfd%nsppol==2) then 
   MSG_WARNING("nsppol==2 is still under testing")
 end if
 !
 ! MPI variables.
 comm      = Wfd%comm
 nproc     = Wfd%nproc
 my_rank   = Wfd%my_rank
 master    = Wfd%master
 !
 ! Basic constants.
 nspinor = Wfd%nspinor
 nsppol  = Wfd%nsppol
 dim_rtwg=1; faq = one/(Cryst%ucvol*BSp%nkbz)
 !
 ! Prepare the FFT tables to have u(r) on the ngfft_osc mesh.
 mgfft_osc = MAXVAL(ngfft_osc(1:3)) 
 fftalga_osc = ngfft_osc(7)/100 
 if ( ANY(ngfft_osc(1:3) /= Wfd%ngfft(1:3)) ) call wfd_change_ngfft(Wfd,Cryst,Psps,ngfft_osc) 

 ABI_MALLOC(igfftg0,(BSp%npweps))
 ABI_MALLOC(ktabr_k,(nfftot_osc))
 ABI_MALLOC(ktabr_kp,(nfftot_osc))
 !
 ! Workspace arrays for wavefunctions and oscillator matrix elements.
 ABI_MALLOC(rhxtwg_vpv,(BSp%npweps))
 ABI_MALLOC(rhxtwg_cpc,(BSp%npweps))

 ABI_MALLOC(ur_ckp,(nspinor*nfftot_osc))
 ABI_MALLOC(ur_vkp,(nspinor*nfftot_osc))
 ABI_MALLOC(ur_ck ,(nspinor*nfftot_osc))
 ABI_MALLOC(ur_vk ,(nspinor*nfftot_osc))

 if (Wfd%usepaw==1) then 
   ABI_DATATYPE_ALLOCATE(Cp_vk,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_vk,0,Wfd%nlmn_atm)
   ABI_DATATYPE_ALLOCATE(Cp_ck,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_ck,0,Wfd%nlmn_atm)
   ABI_DATATYPE_ALLOCATE(Cp_ckp,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_ckp,0,Wfd%nlmn_atm)
   ABI_DATATYPE_ALLOCATE(Cp_vkp,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_vkp,0,Wfd%nlmn_atm)

   ABI_DATATYPE_ALLOCATE(Cp_tmp1,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_tmp1,0,Wfd%nlmn_atm)
   ABI_DATATYPE_ALLOCATE(Cp_tmp2,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_tmp2,0,Wfd%nlmn_atm)
   ABI_DATATYPE_ALLOCATE(Cp_tmp3,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_tmp3,0,Wfd%nlmn_atm)
   ABI_DATATYPE_ALLOCATE(Cp_tmp4,(Wfd%natom,nspinor))
   call cprj_alloc(Cp_tmp4,0,Wfd%nlmn_atm)
 end if
 !
 ! Identify the index of q==0
 iqbz0=0 
 do iq_bz=1,Qmesh%nbz
   if (ALL(ABS(Qmesh%bz(:,iq_bz))<tol3)) iqbz0 = iq_bz
 end do
 ABI_CHECK(iqbz0/=0,"q=0 not found")
 !
 ! Indeces used to loop over bands.
 ! bidx contains the starting and final indeces used to loop over bands.
 !
 !      (b3,b4)
 !         |... ...|
 ! (b1,b2) |... ...|
 !
 ! Resonant matrix is given by
 !      (v',c')
 !       |... ...|
 ! (v,c) |... ...|
 !
 ! Coupling matrix is given by
 !       (c',v')
 !       |... ...|
 ! (v,c) |... ...|

 if (is_resonant) then
   bidx(:,1) = (/BSp%lomo,BSp%homo/)  ! range for b1
   bidx(:,2) = (/BSp%lumo,BSp%nbnds/) ! range for b2
   bidx(:,3) = (/BSp%lomo,BSp%homo/)  ! range for b3
   bidx(:,4) = (/BSp%lumo,BSp%nbnds/) ! range for b4
 else 
   bidx(:,1) = (/BSp%lomo,BSp%homo/)  ! range for b1
   bidx(:,2) = (/BSp%lumo,BSp%nbnds/) ! range for b2
   bidx(:,3) = (/BSp%lumo,BSp%nbnds/) ! range for b3
   bidx(:,4) = (/BSp%lomo,BSp%homo/)  ! range for b4
 end if
 !
 ! Treat the spin polarization.
 spin_ids(:,1) = (/1,1/)
 spin_ids(:,2) = (/2,2/)
 spin_ids(:,3) = (/1,2/)

 nblocks=1
 kx_fact=two
 nels_block(:)=0
 nels_block(1)=BSp%nreh(1)*(BSp%nreh(1)+1_i8b)/2 
 tot_nels=nels_block(1)

 if (nsppol==2) then 
   nblocks=3
   kx_fact=one
   nels_block(1) = BSp%nreh(1)*(BSp%nreh(1)+1_i8b)/2   ! Only the upper triangle for block 1 and 2 
   nels_block(2) = BSp%nreh(2)*(BSp%nreh(2)+1_i8b)/2 
   nels_block(3) = BSp%nreh(1)*BSp%nreh(2)*1_i8b       ! Block 3 does not have symmetries. 
   tot_nels= SUM(nels_block)
 end if
 !
 ! Distribute the calculation of the matrix elements among the nodes.
 ! * istarst and t_stop gives the initial and final transition treated by each node.
 ! * my_hsize is the number of transitions treated by my_cols(1:2) gives the initial and final column treated by this node.
 !
 use_mpiio=.FALSE.
#ifdef HAVE_MPI_IO 
 use_mpiio = (nproc>1)
#endif
 !use_mpiio=.TRUE.
 use_mpiio=.FALSE.

 if (is_resonant) then
   if (use_mpiio) then
     write(msg,'(2a,f6.2,a)')&
&      ". Writing resonant excitonic Hamiltonian on file "//TRIM(fname)," via MPI-IO; file size= ",two*tot_nels*dpc*b2Gb," [Gb]."
   else
     write(msg,'(2a,f6.2,a)')&
&      ". Writing resonant excitonic Hamiltonian on file "//TRIM(fname),"; file size= ",two*dpc*tot_nels*b2Gb," [Gb]."
   end if
 else
   if (use_mpiio) then
     write(msg,'(2a,f6.2,a)')&
&      ". Writing coupling excitonic Hamiltonian on file "//TRIM(fname)," via MPI-IO; file size= ",tot_nels*2*dpc*b2Gb," [Gb]."
   else
     write(msg,'(2a,f6.2,a)')&
&      ". Writing coupling excitonic Hamiltonian on file "//TRIM(fname),"; file size= ",two*dpc*tot_nels*b2Gb," [Gb]."
   end if
 end if
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")
 call flush_unit(std_out)
 !
 ! Master writes the BSE header with Fortran IO.
 if (my_rank==master) then 
   bsh_unt = get_unit() 
   open(unit=bsh_unt,file=fname,form='unformatted',iostat=ios)
   ABI_CHECK(ios==0,"Opening file: "//TRIM(fname))
   call exc_write_bshdr(bsh_unt,Bsp,Hdr_bse)
   close(bsh_unt) ! To force the writing (needed for MPI-IO).
   if (.not.use_mpiio) then ! Reopen the file and skip the header.
     open(unit=bsh_unt,file=fname,form='unformatted',iostat=ios)
     call exc_skip_bshdr(bsh_unt,ierr)
   end if
 end if

 call xbarrier_mpi(comm)

 if (use_mpiio) then
#ifdef HAVE_MPI_IO 
   ! Open the file with MPI-IO
   amode=MPI_MODE_RDWR
   
   call MPI_FILE_OPEN(comm, fname, amode, MPI_INFO_NULL, mpi_fh, mpi_err)
   ABI_CHECK_MPI(mpi_err,"opening: "//TRIM(fname))

   call exc_skip_bshdr_mpio(mpi_fh,xmpio_at_all,ehdr_offset)
   !call hdr_mpio_skip(mpi_fh,fform,ehdr_offset) 
   !call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_at_all,fmarker,mpi_err)
   !
   ! Precompute the offset of the each block including the Fortran markers.
   ABI_MALLOC(offset_of_block,(nblocks))
   offset_of_block(1) = ehdr_offset
   do block=2,nblocks
     tmp_off = offset_of_block(block-1) + nels_block(block-1)*xmpi_bsize_dpc 
     tmp_off = tmp_off + Bsp%nreh(block-1)*2*xmpio_bsize_frm  ! markers.
     offset_of_block(block) = tmp_off
   end do
#endif
 end if

 call timab(681,2,tsec)
 !
 do block=1,nsppol
   spin1 = spin_ids(1,block)
   spin2 = spin_ids(2,block)
                                                                  
   do_coulomb_term  = (Bsp%use_coulomb_term .and. (spin1==spin2))
   do_exchange_term = (Bsp%exchange_term>0)
   w_is_diagonal    = BSp%use_diagonal_Wgg
   !
   ! Distribution of the matrix elements among the nodes.
   nels=nels_block(block) 
   ABI_MALLOC(t_start,(0:nproc-1))
   ABI_MALLOC(t_stop,(0:nproc-1))
   call xmpi_split_work2_i8b(nels,nproc,t_start,t_stop,msg,ierr) 
   if (ierr/=0) then 
     MSG_WARNING(msg)
   end if

   ABI_MALLOC(hsize_of,(0:nproc-1))
   hsize_of=0
   do rank=0,nproc-1
     if (t_stop(rank)>=t_start(rank)) hsize_of(rank) = t_stop(rank)-t_start(rank)+1
     write(std_out,*)"nels",nels,hsize_of(rank)
   end do

   my_hsize = hsize_of(my_rank)
   if (my_hsize<=0) then
     write(msg,'(a,i0)')"Wrong number of transitions: my_hsize= ",my_hsize
     MSG_ERROR(msg)
   end if
   if (my_hsize /= INT(my_hsize,KIND=i4b)) then
     write(msg,'(a,i0)')"Size of local block too large for a default integer, Increase the number of CPUs: my_hsize= ",my_hsize
     MSG_ERROR(msg)
   end if

   my_cols=0
   do itp=1,Bsp%nreh(block) 
     do it=1,itp 
       ir = it + itp*(itp-1_i8b)/2
       if (ir==t_start(my_rank)) then 
         my_rows(1) = it
         my_cols(1) = itp
       end if
       if (ir==t_stop(my_rank)) then
         my_rows(2) = it
         my_cols(2) = itp
       end if
     end do
   end do

   my_starts = (/my_rows(1),my_cols(1)/)
   my_ends   = (/my_rows(2),my_cols(2)/)
   !
   ! * Announce the treatment of submatrix treated by each node.
   bsize_my_block = 2*dpc*my_hsize
   write(msg,'(4(a,i0))')' Treating ',my_hsize,'/',nels,' matrix elements, from column ',my_cols(1),' up to column ',my_cols(2)
   call wrtout(std_out,msg,'PERS')

   if (is_resonant) then
     write(msg,'(a,f8.1,a)')&
&     ' Calculating resonant blocks. Memory required: ',bsize_my_block*b2Mb,' Mb. '
   else 
     write(msg,'(a,f8.1,a)')&
&     ' Calculating coupling blocks. Memory required: ',bsize_my_block*b2Mb,' Mb. '
   end if
   call wrtout(std_out,msg,"COLL")

   ABI_MALLOC(my_bsham,(t_start(my_rank):t_stop(my_rank)))
   ABI_CHECK_ALLOC('Not enough memory for exc Hamiltonian')
   my_bsham = czero

   if (do_coulomb_term) then ! Construct Coulomb term.

     call timab(682,1,tsec) ! exc_build_ham(Coulomb)

     write(msg,'(a,2i2,a)')" Calculating direct Coulomb term for (spin1,spin2) ",spin1,spin2," using full W_{GG'} ..."
     if (w_is_diagonal) then 
        write(msg,'(a,2i2,a)')&
&        " Calculating direct Coulomb term for (spin1, spin2) ",spin1,spin2," using diagonal approximation for W_{GG'} ..."
     end if
     call wrtout(std_out,msg,"COLL")

     ABI_MALLOC(ctccp,(BSp%npweps))
     ABI_MALLOC(vc_sqrt_qbz,(BSp%npweps))

#ifdef DEV_MG_DEBUG_MODE
     ABI_MALLOC(ttp_check,(BSp%nreh(block),BSp%nreh(block)))
     ttp_check=0
#endif

     do ikp_bz=1,BSp%nkbz ! Loop over kp  
       ! NOTE: this way of looping is good for bulk but it's not optimal in the
       !       case of systems sampled only at Gamma e.g. isolated systems in which
       !       one should take advantage of Hermiticity by looping over c-v !!!!

       ! Check whether (vp,cp,ikp_bz,spin2) belongs to the set of columns treated by me for some vp,cp
       ! Be careful since vcks2t contains zeros corresponding to transitions that should be skipped.
       itpk_min = MINVAL(Bsp%vcks2t(:,:,ikp_bz,spin2), MASK=(Bsp%vcks2t(:,:,ikp_bz,spin2)>0) )
       itpk_max = MAXVAL(Bsp%vcks2t(:,:,ikp_bz,spin2))
       if ( my_cols(2)<itpk_min .or. my_cols(1)>itpk_max) CYCLE 

       write(msg,'(3(a,i0))')" status: ",ikp_bz,"/",BSp%nkbz," done by node ",my_rank
       call wrtout(std_out,msg,"PERS")
       call flush_unit(std_out)

       ! * Get ikp_ibz, non-symmorphic phase, ph_mkpt, and symmetries from ikp_bz.
       call get_BZ_item(Kmesh,ikp_bz,kpbz,ikp_ibz,isym_kp,itim_kp,ph_mkpt)

       ktabr_kp(:) = ktabr(:,ikp_bz)
       spinrot_kp(:)=Cryst%spinrot(:,isym_kp)

       do ik_bz=1,ikp_bz ! Loop over k
         !
         ! * Get ik_ibz, non-symmorphic phase, ph_mkt, and symmetries from ik_bz
         call get_BZ_item(Kmesh,ik_bz,kbz,ik_ibz,isym_k,itim_k,ph_mkt)
                                                                                 
         ktabr_k(:) = ktabr(:,ik_bz)
         spinrot_k(:)=Cryst%spinrot(:,isym_k)
         !if(itim_k==2) CYCLE ! time-reversal or not       
         !
         ! * Find q = K-KP-G0 in the full BZ.
         kmkp = Kmesh%bz(:,ik_bz) - Kmesh%bz(:,ikp_bz)
         call findqg0(iq_bz,g0,kmkp,Qmesh%nbz,Qmesh%bz,BSp%mG0)

         ! Evaluate the tables needed for the padded FFT performed in rhotwg. Note that we have 
         ! to pass G-G0 to sphereboundary instead of G as we need FFT results on the shifted G-sphere, 
         ! If Gamma is not inside G-G0 one has to disable FFT padding as sphereboundary will give wrong tables.
         ! * Get the G-G0 shift for the FFT of the oscillators.
         !
         ABI_MALLOC(gbound,(2*mgfft_osc+8,2))
         call gsph_fft_tabs(Gsph_c,g0,mgfft_osc,ngfft_osc,use_padfft,gbound,igfftg0)
         if ( ANY(fftalga_osc == (/2,4/)) ) use_padfft=0 ! Pad-FFT is not coded in rho_tw_g
         if (use_padfft==0) then 
           ABI_FREE(gbound)
           ABI_MALLOC(gbound,(2*mgfft_osc+8,2*use_padfft))
         end if
         !
         ! * Get iq_ibz, and symmetries from iq_bz
         call get_BZ_item(Qmesh,iq_bz,qbz,iq_ibz,isym_q,itim_q)

         is_qeq0 = (normv(qbz,Cryst%gmet,'G')<GW_TOLQ0)

         ! Symmetrize em1(omega=0) 
         call screen_symmetrizer(W,iq_bz,Cryst,Gsph_c,Qmesh) 
         !
         ! * Set up table of |q_BZ+G|
         if (iq_ibz==1) then
           do ig=1,BSp%npweps
             isg = Gsph_c%rottb(ig,itim_q,isym_q) 
             vc_sqrt_qbz(isg)=Vcp%vcqlwl_sqrt(ig,1)
           end do
         else 
           do ig=1,BSp%npweps
             isg = Gsph_c%rottb(ig,itim_q,isym_q) 
             vc_sqrt_qbz(isg) = Vcp%vc_sqrt(ig,iq_ibz)
           end do
         end if

         ! === Evaluate oscillator matrix elements ===
         ! * $ <phj/r|e^{-i(q+G)}|phi/r> - <tphj/r|e^{-i(q+G)}|tphi/r> $ in packed form.
         if (Wfd%usepaw==1.and.ik_bz/=ikp_bz) then
           ABI_DATATYPE_ALLOCATE(Pwij_q,(Cryst%ntypat))
           call init_paw_pwij(Pwij_q,BSp%npweps,Qmesh%bz(:,iq_bz),Gsph_c%gvec,Cryst%rprimd,Psps,Pawtab,Paw_pwff)
         end if

         ! =======================================
         ! === Loop over the four band indeces ===
         ! =======================================
         !
         do ic=bidx(1,2),bidx(2,2)     !do ic=BSp%lumo,BSp%nbnds

           if (wfd_ihave_ur(Wfd,ic,ik_ibz,spin1,how="Stored")) then
             ptur_ck =>  Wfd%Wave(ic,ik_ibz,spin1)%ur
           else 
             call wfd_get_ur(Wfd,ic,ik_ibz,spin1,ur_ck)
             ptur_ck => ur_ck
           end if
           !
           ! Get cprj for this (c,kbz,s1) in the BZ.
           ! * phase due to the umklapp G0 in k-q is already included.
           if (Wfd%usepaw==1) then 
             if (wfd_ihave_cprj(Wfd,ic,ik_ibz,spin1,how="Stored")) then
               ptcp_ck =>  Wfd%Wave(ic,ik_ibz,spin1)%cprj
             else
               call wfd_get_cprj(Wfd,ic,ik_ibz,spin1,Cryst,Cp_tmp1,sorted=.FALSE.)
               ptcp_ck =>  Cp_tmp1
             end if
             call paw_symcprj_op(ik_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,ptcp_ck,Cp_ck) 
           end if

           do icp=bidx(1,4),bidx(2,4)  !do icp=BSp%lumo,BSp%nbnds
             ! * Calculate matrix-elements rhxtwg_cpc
             !
             if (ik_bz==ikp_bz) then ! Already in memory.
               rhxtwg_cpc(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,icp,ic,ik_ibz,spin1),Gsph_c)

             else ! Calculate matrix element from wfr.
               ! TODO: change the order of the loops.
               if (wfd_ihave_ur(Wfd,icp,ikp_ibz,spin2,how="Stored")) then
                 ptur_ckp => Wfd%Wave(icp,ikp_ibz,spin2)%ur
               else
                 call wfd_get_ur(Wfd,icp,ikp_ibz,spin2,ur_ckp)
                 ptur_ckp => ur_ckp  
               end if

               ! Load cprj for this (c,k,s2) in the BZ.
               ! * Do not care about umklapp G0 in k-q as the phase is already included.
               if (Wfd%usepaw==1) then 
                 if (wfd_ihave_cprj(Wfd,icp,ikp_ibz,spin2,how="Stored")) then
                   ptcp_ckp =>  Wfd%Wave(icp,ikp_ibz,spin2)%cprj
                 else
                   call wfd_get_cprj(Wfd,icp,ikp_ibz,spin2,Cryst,Cp_tmp2,sorted=.FALSE.)
                   ptcp_ckp =>  Cp_tmp2
                 end if
                 call paw_symcprj_op(ikp_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,ptcp_ckp,Cp_ckp) 
               end if

               call rho_tw_g(nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&                ptur_ckp,itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&                ptur_ck ,itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&                dim_rtwg,rhxtwg_cpc)
               
               if (Wfd%usepaw==1) then ! Add PAW onsite contribution.   
                 call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,Gsph_c%gvec,&
&                  Cp_ckp,Cp_ck,Pwij_q,rhxtwg_cpc)
               end if
             end if

             ! Prepare sum_GG' rho_c'c*(G) W_qbz(G,G') rho_v'v(G')
             ! First sum on G: sum_G rho_c'c(G) W_qbz*(G,G') (W_qbz conjugated)

             rhxtwg_cpc = rhxtwg_cpc * vc_sqrt_qbz
             call screen_w0gemv(W,"C",Bsp%npweps,nspinor,w_is_diagonal,cone_gw,czero_gw,rhxtwg_cpc,ctccp)
             
             do iv=bidx(1,1),bidx(2,1)    !do iv=BSp%lomo,BSp%homo
               it = BSp%vcks2t(iv,ic,ik_bz,spin1); if (it==0) CYCLE ! ir-uv-cutoff
               ene_t = BSp%Trans(it,spin1)%en

               ! TODO: use this but change the order of the loops.
               if (wfd_ihave_ur(Wfd,iv,ik_ibz,spin1,how="Stored")) then
                 ptur_vk => Wfd%Wave(iv,ik_ibz,spin1)%ur
               else
                 call wfd_get_ur(Wfd,iv,ik_ibz,spin1,ur_vk)
                 ptur_vk => ur_vk
               end if
               !
               ! Load cprj for this (v,k,s1) in the BZ.
               ! * Do not care about umklapp G0 in k-q as the phase is already included.
               if (Wfd%usepaw==1) then 
                 if (wfd_ihave_cprj(Wfd,iv,ik_ibz,spin1,how="Stored")) then
                   ptcp_vk =>  Wfd%Wave(iv,ik_ibz,spin1)%cprj
                 else
                   call wfd_get_cprj(Wfd,iv,ik_ibz,spin1,Cryst,Cp_tmp3,sorted=.FALSE.)
                   ptcp_vk => Cp_tmp3
                 end if
                 call paw_symcprj_op(ik_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,ptcp_vk,Cp_vk) 
               end if

               do ivp=bidx(1,3),bidx(2,3) !do ivp=BSp%lomo,BSp%homo

                 if (is_resonant) then
                   itp = BSp%vcks2t(ivp,icp,ikp_bz,spin2) 
                 else ! have to exchange band indeces
                   itp = BSp%vcks2t(icp,ivp,ikp_bz,spin2) 
                 end if
                 
                 if (itp==0) CYCLE ! ir-uv-cutoff

                 ! FIXME Temporary work around, when ikp_bz == ik it might happen that itp<it
                 ! should rewrite the loops using contracted k-dependent indeces for bands
                 if (itp<it) CYCLE 

                 ir = it + itp*(itp-1)/2
                 if (ir<t_start(my_rank).or.ir>t_stop(my_rank)) CYCLE

                 ene_tp = BSp%Trans(itp,spin2)%en
                 if (ABS(DBLE(ene_t - ene_tp)) > BSp%stripecut) CYCLE ! stripe

                 ! ============================================
                 ! === Calculate matrix elements rhxtwg_vpv ===
                 ! ============================================
                 if (ik_bz==ikp_bz) then ! Already in memory.

                   rhxtwg_vpv(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,ivp,iv,ik_ibz,spin1),Gsph_c)

                 else ! Calculate matrix element from wfr.
                   if (wfd_ihave_ur(Wfd,ivp,ikp_ibz,spin2,how="Stored")) then
                     ptur_vkp => Wfd%Wave(ivp,ikp_ibz,spin2)%ur
                   else
                     call wfd_get_ur(Wfd,ivp,ikp_ibz,spin2,ur_vkp)
                     ptur_vkp => ur_vkp
                   end if                                                                                
                   !
                   ! Load cprj for this (vp,kp,s2) in the BZ.
                   ! * Do not care about umklapp G0 in k-q as the phase is already included.
                   if (Wfd%usepaw==1) then 
                     if (wfd_ihave_cprj(Wfd,ivp,ikp_ibz,spin2,how="Stored")) then
                       ptcp_vkp =>  Wfd%Wave(ivp,ikp_ibz,spin2)%cprj
                     else
                       call wfd_get_cprj(Wfd,ivp,ikp_ibz,spin2,Cryst,Cp_tmp4,sorted=.FALSE.)
                       ptcp_vkp => Cp_tmp4
                     end if
                     call paw_symcprj_op(ikp_bz,nspinor,1,Cryst,Kmesh,Psps,Pawtab,Pawang,ptcp_vkp,Cp_vkp) 
                   end if

                   call rho_tw_g(nspinor,BSp%npweps,nfftot_osc,ngfft_osc,map2sphere,use_padfft,igfftg0,gbound,&
&                     ptur_vkp,itim_kp,ktabr_kp,ph_mkpt,spinrot_kp,&
&                     ptur_vk ,itim_k ,ktabr_k ,ph_mkt ,spinrot_k ,&
&                     dim_rtwg,rhxtwg_vpv)

                   if (Wfd%usepaw==1) then ! Add PAW onsite contribution.
                     call paw_rho_tw_g(Bsp%npweps,dim_rtwg,nspinor,Cryst%natom,Cryst%ntypat,Cryst%typat,Cryst%xred,&
&                      Gsph_c%gvec,Cp_vkp,Cp_vk,Pwij_q,rhxtwg_vpv)
                   end if
                 end if
                 !
                 ! sum_G2 rho_c'c(G) W_qbz(G,G') rho_v'v(G')
                 rhxtwg_vpv = vc_sqrt_qbz * rhxtwg_vpv 
                 http = - faq * XDOTC(BSp%npweps,ctccp,1,rhxtwg_vpv,1)

                 ! Save result taking into account the symmetry of the matrix.
                 ! Note that the diagonal of the resonant block is not forced to be real 
                 if (itp >= it) then
                   ir = it + itp*(itp-1_i8b)/2
                   if (ir<t_start(my_rank).or.ir>t_stop(my_rank)) then 
                     write(msg,'(a,3(1x,i0))')" Gonna SIGFAULT, ir, t_start, t_stop ",ir,t_start(my_rank),t_stop(my_rank)
                     MSG_ERROR(msg)
                   end if
                   my_bsham(ir) = http
                 else
                   MSG_ERROR("itp < it but You should not be here")
                   ir = itp + it*(it-1_i8b)/2
                   if (is_resonant) then
                     my_bsham(ir) = CONJG(http)
                   else ! Code for coupling block
                     my_bsham(ir) =       http
                   end if
                 end if

#ifdef DEV_MG_DEBUG_MODE
                 ttp_check(it,itp) = ttp_check(it,itp)+1
#endif
               end do !ivp
             end do !iv
           end do !icp
         end do !ic

         ABI_FREE(gbound)

         if (Wfd%usepaw==1.and.ik_bz/=ikp_bz) then ! Free the onsite contribution for this q.
           call destroy_paw_pwij(Pwij_q)
           ABI_DATATYPE_DEALLOCATE(Pwij_q)
         end if

       end do ! ik_bz
     end do ! Fat loop over ikp_bz

#ifdef DEV_MG_DEBUG_MODE
     do itp=1,BSp%nreh(block)
       do it=1,BSp%nreh(block)
        ir = it + itp*(itp-1_i8b)/2
         if (itp>=it .and. ttp_check(it,itp) /= 1) then
           if (ir>=t_start(my_rank).and.ir<=t_stop(my_rank)) then 
             write(std_out,*)"WARNING upper triangle is not 1 ",it,itp,ttp_check(it,itp)
             write(std_out,*)TRIM(repr_trans(Bsp%Trans(it ,spin1)))
             write(std_out,*)TRIM(repr_trans(Bsp%Trans(itp,spin2)))
           end if
         end if
         if (itp< it .and. ttp_check(it,itp) /= 0) then
           write(std_out,*)"WARNING: then lower triangle is not 0 ",it,itp,ttp_check(it,itp)
           write(std_out,*)TRIM(repr_trans(Bsp%Trans(it ,spin1)))
           write(std_out,*)TRIM(repr_trans(Bsp%Trans(itp,spin2)))
         end if
       end do
     end do
     ierr = SUM(SUM(ttp_check,DIM=2),DIM=1)
     if (ierr/=my_hsize) then 
       write(msg,'(a,2i0)')"ierr/=my_hsize",ierr,my_hsize
       MSG_ERROR(msg)
     end if
     ABI_FREE(ttp_check)
#endif

     ABI_FREE(ctccp)
     ABI_FREE(vc_sqrt_qbz)
     call wrtout(std_out,' Coulomb term completed',"COLL")

     call timab(682,2,tsec) ! exc_build_ham(Coulomb)

   end if ! do_coulomb_term
   !
   ! =====================
   ! === Exchange term ===
   ! =====================
   ! TODO might add treatment of <psi|q+G|psi> for q+G -> 0
   ! TODO might used enlarged G-sphere for better convergence.
   if (do_exchange_term) then

     call timab(683,1,tsec) ! exc_build_ham(exchange)

     write(msg,'(a,2i2,a)')" Calculating exchange term for (spin1,spin2) ",spin1,spin2," ..."
     call wrtout(std_out,msg,"COLL")

     ABI_MALLOC(rhotwg1,(BSp%npweps))
     ABI_MALLOC(rhotwg2,(BSp%npweps))

     ngx = Gsph_x%ng 
     ABI_MALLOC(vc_sqrt_qbz,(ngx))

     ! * Get iq_ibz, and symmetries from iq_bz.
     iq_bz = iqbz0 ! q = 0 -> iqbz0
     call get_BZ_item(Qmesh,iq_bz,qbz,iq_ibz,isym_q,itim_q)

     ! * Set up table of |q(BZ)+G|
     if (iq_ibz==1) then 
       do ig=1,ngx
         ISg = Gsph_x%rottb(ig,itim_q,isym_q) 
         vc_sqrt_qbz(ISg)=Vcp%vcqlwl_sqrt(ig,1)
       end do
     else 
        MSG_ERROR("iq_ibz should be 1")
     end if

     do itp=1,BSp%nreh(block) ! Loop over transition tp = (kp,vp,cp,spin2)

       if (itp<my_cols(1) .or. itp>my_cols(2)) CYCLE ! I dont have this column.
       ene_tp = Bsp%Trans(itp,spin2)%en
       ikp_bz = Bsp%Trans(itp,spin2)%k
       ivp    = Bsp%Trans(itp,spin2)%v
       icp    = Bsp%Trans(itp,spin2)%c
                                                                                                             
       ikp_ibz = Kmesh%tab (ikp_bz)
       isym_kp = Kmesh%tabo(ikp_bz)
       itim_kp = (3-Kmesh%tabi(ikp_bz))/2
                                                                                                             
       if (is_resonant) then 
         rhotwg2(:) = sym_rhotwgq0(itim_kp,isym_kp,dim_rtwg,BSp%npweps,rhxtwg_q0(:,ivp,icp,ikp_ibz,spin2),Gsph_c)
       else ! Code for coupling block.
         rhotwg2(:) = sym_rhotwgq0(itim_kp,isym_kp,dim_rtwg,BSp%npweps,rhxtwg_q0(:,icp,ivp,ikp_ibz,spin2),Gsph_c)
       end if
       !
       ! Multiply by the Coulomb term.
        do ig=2,BSp%npweps
          rhotwg2(ig) = rhotwg2(ig) * vc_sqrt_qbz(ig) * vc_sqrt_qbz(ig) 
        end do

       do it=1,itp ! Loop over transition t = (k,v,c,spin1)
         ir = it + itp*(itp-1_i8b)/2
         if (ir<t_start(my_rank) .or. ir>t_stop(my_rank)) CYCLE

         ene_t = Bsp%Trans(it,spin1)%en
         if (ABS(DBLE(ene_t - ene_tp)) > BSp%stripecut) CYCLE  ! Stripe cutoff
         ik_bz   = Bsp%Trans(it,spin1)%k
         iv      = Bsp%Trans(it,spin1)%v
         ic      = Bsp%Trans(it,spin1)%c

         ik_ibz = Kmesh%tab(ik_bz)
         isym_k = Kmesh%tabo(ik_bz)
         itim_k = (3-Kmesh%tabi(ik_bz))/2
         !if (itim_k==2) CYCLE ! time-reversal or not       

         rhotwg1(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,iv,ic,ik_ibz,spin1),Gsph_c)
         !
         ! sum over G/=0
         ctemp = XDOTC(BSp%npweps-1,rhotwg1(2:),1,rhotwg2(2:),1)
         ctemp = faq * kx_fact * ctemp

         my_bsham(ir) = my_bsham(ir) + ctemp
       end do !it
     end do !itp

     ABI_FREE(rhotwg1)
     ABI_FREE(rhotwg2)
     ABI_FREE(vc_sqrt_qbz)

     call timab(683,2,tsec) ! exc_build_ham(exchange)
   end if ! do_exchange_term
   !
   ! =====================
   ! === Diagonal term ===
   ! =====================
   if (is_resonant .and. spin1==spin2) then
     write(msg,'(a,2i2,a)')" Adding diagonal term for (spin1,spin2) ",spin1,spin2," ..."
     call wrtout(std_out,msg,"COLL")
     do it=1,BSp%nreh(block)
       ir = it + it*(it-1_i8b)/2
       if (ir>=t_start(my_rank) .and. ir<=t_stop(my_rank)) my_bsham(ir) = my_bsham(ir) + Bsp%Trans(it,spin1)%en
     end do
   end if

   if (.FALSE.) then
     dump_unt = get_unit()
     !dump_unt = 999
     msg=' Coupling Hamiltonian matrix elements: '
     if (is_resonant) msg=' Reasonant Hamiltonian matrix elements: '
     call wrtout(dump_unt,msg,"PERS")
     call wrtout(dump_unt,'    v  c  k      v" c  k"       H',"PERS")
     do itp=1,BSp%nreh(block)      
       ikp_bz = Bsp%Trans(itp,spin2)%k
       ivp    = Bsp%Trans(itp,spin2)%v
       icp    = Bsp%Trans(itp,spin2)%c
       do it=1,itp
         ik_bz = Bsp%Trans(it,spin1)%k
         iv    = Bsp%Trans(it,spin1)%v
         ic    = Bsp%Trans(it,spin1)%c
         ir = it + itp*(itp-1_i8b)/2
         if (ir>=t_start(my_rank).and.ir<=t_stop(my_rank)) then 
           http = my_bsham(ir)
           !if (ABS(http) > tol3) then
           write(msg,'(2(i0,1x),2(i5,3i3,3x),2f7.3)')it,itp, ik_bz,iv,ic,spin1, ikp_bz,ivp,icp,spin2, http
           call wrtout(dump_unt,msg,"PERS")
           !end if
         end if
       end do
     end do
   end if

   call timab(684,1,tsec) ! exc_build_ham(synchro)
   call xbarrier_mpi(comm)
   call timab(684,2,tsec) ! exc_build_ham(synchro)
   !
   ! =================================
   ! === Write Hamiltonian on disk ===
   ! =================================
   call timab(685,1,tsec) ! exc_build_ham(write_ham)
   if (use_mpiio) then
#ifdef HAVE_MPI_IO 
     ! Write the Hamiltonian with collective MPI-IO.
     !ABI_CHECK(nsppol==1,"nsppol==2 not coded, offset is wrong")
     !
     old_type = MPI_DOUBLE_COMPLEX
     call xmpio_create_fherm_packed(my_starts,my_ends,is_fortran_file,my_offset,old_type,hmat_type,offset_err)

     if (offset_err/=0) then 
       write(msg,"(3a)")&
&        " Global position index cannot be stored in a standard Fortran integer. ",ch10,&
&        " BSE matrix cannot be written with a single MPI-IO call. "
       MSG_ERROR(msg)
     end if
     !
     ! Each node uses a different offset to skip the header and the blocks written by the other CPUs.
     my_offset = offset_of_block(block) + my_offset

     call MPI_FILE_SET_VIEW(mpi_fh, my_offset, MPI_BYTE, hmat_type, 'native', MPI_INFO_NULL, mpi_err)
     ABI_CHECK_MPI(mpi_err,"SET_VIEW")

     call MPI_TYPE_FREE(hmat_type,mpi_err)
     ABI_CHECK_MPI(mpi_err,"MPI_TYPE_FREE")

     if (hsize_of(my_rank) /= INT(hsize_of(my_rank),kind=i4b) ) then
       MSG_ERROR("Wraparound error")
     end if

     tmp_size = INT(hsize_of(my_rank))
     call MPI_FILE_WRITE_ALL(mpi_fh, my_bsham, tmp_size, MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, mpi_err)
     ABI_CHECK_MPI(mpi_err,"FILE_WRITE")

     ! It seems that personal calls in make the code stuck
     !if (is_fortran_file .and. my_rank==master) then ! Master writes the Fortran record markers.
     ! Write the Fortran record markers.
     neh2=BSp%nreh(block)
     ABI_MALLOC(bsize_frecord,(neh2))
     bsize_frecord = (/(col_glob * xmpi_bsize_dpc, col_glob=1,neh2)/)
     ! ehdr_offset points to the end of the header.
     !call xmpio_write_frmarkers(mpi_fh,ehdr_offset,xmpio_at_all,neh2,bsize_frecord,mpi_err)
     my_offset = offset_of_block(block)
     call xmpio_write_frmarkers(mpi_fh,my_offset,xmpio_at_all,neh2,bsize_frecord,ierr)
     ABI_CHECK(ierr==0,"Error while writing Fortran markers")
     ABI_FREE(bsize_frecord)
#else
     MSG_BUG("You should not be here!")
#endif
   else
     ! Use FORTRAN IO with sequential access mode.
     ! * Each node sends its data to master node. 
     ! * Blocks are distributed according to the rank of the node.
     ! * Matrix is written by columns hence make sure that the last column is completely written.
     if (my_rank==master) then
       prev_nrows=0; if (my_cols(2) /= my_rows(2)) prev_nrows = my_rows(2)
       ncol = my_cols(2)-my_cols(1)+1
       ist=1
       do jj=1,ncol
         col_glob = my_starts(2) + jj - 1
         nrows = col_glob; if (jj==ncol) nrows=my_rows(2)
         iend = ist + nrows -1
         write(bsh_unt) my_bsham(ist:iend)
         ist=iend+1
       end do
       write(msg,'(2(a,i0))')" Wraparound error: iend=",iend," my_hsize=",hsize_of(my_rank)
       ABI_CHECK(iend==hsize_of(my_rank),msg)
       ABI_FREE(my_bsham)
     end if

     call xbarrier_mpi(comm)
     !
     ! Collect data from the other nodes.
     do sender=1,nproc-1
       if (my_rank==master)  then
         ABI_MALLOC(buffer,(hsize_of(sender)))
       end if
       tmp_size = INT(hsize_of(sender),kind=i4b)
       call xexch_mpi(my_bsham,tmp_size,sender,buffer,master,comm,mpi_err)

       ! TODO Be careful with the MPI TAG here, add optional Arguments in xexch_mpi so that the TAG can be specified!
       proc_start = (/my_rows(1),my_cols(1)/)
       proc_end   = (/my_rows(2),my_cols(2)/)
       extrema(:,1) = proc_start
       extrema(:,2) = proc_end
       call xexch_mpi(extrema,4,sender,extrema,master,comm,mpi_err)

       proc_start = extrema(:,1) 
       proc_end   = extrema(:,2)
       !write(std_out,*)"proc_start, proc_end",proc_start,proc_end
       !
       if (my_rank==master) then
         if (prev_nrows>0) then ! backspace the file if the last record written was not complete.
           !write(std_out,*)" master node had to call backspace"
           backspace(bsh_unt)
           ABI_MALLOC(prev_col,(prev_nrows))
           read(bsh_unt) prev_col
           backspace(bsh_unt)
         end if
         !
         ! Write the columns owned by sender.
         ncol = proc_end(2)-proc_start(2)+1
         ist=1
         do jj=1,ncol
           col_glob = proc_start(2) + jj-1
           nrows = col_glob
           if (jj==1   )  nrows=col_glob - proc_start(1) + 1
           if (jj==ncol) then 
             nrows=proc_end(1)
             if (ncol==1)  nrows=proc_end(1) - proc_start(1) + 1
           end if
           iend = ist + nrows -1
           !write(std_out,*)"Using nrows, ist, iend=",nrows,ist,iend
           if (jj==1 .and. prev_nrows>0) then ! join prev_col and this subcolumn.
             write(bsh_unt) CMPLX(prev_col,kind=dpc),CMPLX(buffer(ist:iend),kind=dpc)
             prev_nrows = prev_nrows + iend-ist+1
           else 
             write(bsh_unt) CMPLX(buffer(ist:iend),kind=dpc)
             prev_nrows=0
           end if
           ist=iend+1
         end do
         if (ncol>1) then ! Reset prev_nrows if a new column has begun.
           prev_nrows = proc_end(1)
           if (proc_end(1) == proc_end(2)) prev_nrows = 0
         end if
         if (iend/=hsize_of(sender)) then 
           write(msg,'(2(a,i0))')" Wraparound error: iend=",iend," my_hsize=",hsize_of(sender)
           MSG_ERROR(msg)
         end if
         if (allocated(prev_col)) then
           ABI_FREE(prev_col)
         end if
         ABI_FREE(buffer)
       end if ! master
       !
       call xbarrier_mpi(comm)
     end do ! sender
   end if ! use_mpiio
   call timab(685,2,tsec) ! exc_build_ham(write_ham)
   !
   if (allocated(my_bsham)) then
     ABI_FREE(my_bsham)
   end if
   ABI_FREE(t_start)
   ABI_FREE(t_stop)
   ABI_FREE(hsize_of)
 end do ! block
 !
 ! ===========================================
 ! === Exchange term for spin_up spin_down ===
 ! ===========================================

 if (nsppol==2) then
   call timab(686,2,tsec) ! exc_build_ham(exch.spin)
   block=3
   neh1=BSp%nreh(1)
   neh2=BSp%nreh(2)
   !
   ! The oscillators at q=0 are available on each node for both spin.
   ! Here the calculation of the block is parallelized over columns.
   ABI_MALLOC(col_start,(0:nproc-1))
   ABI_MALLOC(col_stop,(0:nproc-1))
   call xmpi_split_work2_i4b(neh2,nproc,col_start,col_stop,msg,ierr) !check this but it should be OK.
   if (ierr/=0) then 
     MSG_WARNING(msg)
   end if

   my_cols(1) = col_start(my_rank)
   my_cols(2) = col_stop (my_rank)
   if (my_cols(2)-my_cols(1)<=0) then
     MSG_ERROR("One of the processors has zero columns!")
   end if

   ABI_MALLOC(ncols_of,(0:nproc-1))
   ncols_of=0
   do rank=0,nproc-1
     if (col_stop(rank)>=col_start(rank)) ncols_of(rank) = col_stop(rank)-col_start(rank)+1
   end do

   ABI_FREE(col_start)
   ABI_FREE(col_stop)
   !
   ! TODO might add treatment of <psi|q+G|psi> for q+G -> 0
   ! TODO might used enlarged G-sphere for better convergence.
   ! Note that my_kxssp is always written on file when nsppol=2, even when 
   ! non-local field effects are neglected.
   ABI_MALLOC(my_kxssp,(neh1,my_cols(1):my_cols(2)))
   my_kxssp=czero

   if (do_exchange_term) then
     spin1=1; spin2=2
     write(msg,'(a,2i2,a)')" Calculating exchange term for (spin1,spin2) ",spin1,spin2," ..."
     call wrtout(std_out,msg,"COLL")

     ABI_MALLOC(rhotwg1,(BSp%npweps))
     ABI_MALLOC(rhotwg2,(BSp%npweps))

     ngx = Gsph_x%ng 
     ABI_MALLOC(vc_sqrt_qbz,(ngx))
     !
     ! * Get iq_ibz, and symmetries from iq_bz.
     iq_bz = iqbz0 ! q = 0 -> iqbz0
     call get_BZ_item(Qmesh,iq_bz,qbz,iq_ibz,isym_q,itim_q)
     !
     ! * Set up table of |q(BZ)+G|
     if (iq_ibz==1) then 
       do ig=1,ngx
         ISg = Gsph_x%rottb(ig,itim_q,isym_q) 
         vc_sqrt_qbz(ISg)=Vcp%vcqlwl_sqrt(ig,1)
       end do
     else 
        MSG_ERROR("iq_ibz should be 1")
     end if

     do itp=1,neh2 ! Loop over transition tp = (kp,vp,cp,spin2)

       if (itp<my_cols(1) .or. itp>my_cols(2)) CYCLE ! I dont have this column.
       ene_tp = Bsp%Trans(itp,spin2)%en
       ikp_bz = Bsp%Trans(itp,spin2)%k
       ivp    = Bsp%Trans(itp,spin2)%v
       icp    = Bsp%Trans(itp,spin2)%c
                                                                                                             
       ikp_ibz = Kmesh%tab (ikp_bz)
       isym_kp = Kmesh%tabo(ikp_bz)
       itim_kp = (3-Kmesh%tabi(ikp_bz))/2
                                                                                                             
       if (is_resonant) then 
         rhotwg2(:) = sym_rhotwgq0(itim_kp,isym_kp,dim_rtwg,BSp%npweps,rhxtwg_q0(:,ivp,icp,ikp_ibz,spin2),Gsph_c)
       else ! Code for coupling block.
         rhotwg2(:) = sym_rhotwgq0(itim_kp,isym_kp,dim_rtwg,BSp%npweps,rhxtwg_q0(:,icp,ivp,ikp_ibz,spin2),Gsph_c)
       end if
       !
       ! Multiply by the Coulomb term.
        do ig=2,BSp%npweps
          rhotwg2(ig) = rhotwg2(ig) * vc_sqrt_qbz(ig) * vc_sqrt_qbz(ig) 
        end do

       do it=1,neh1 ! Loop over transition t = (k,v,c,spin1) FULL matrix.

         ene_t = Bsp%Trans(it,spin1)%en
         if (ABS(DBLE(ene_t - ene_tp)) > BSp%stripecut) CYCLE  ! Stripe cutoff
         ik_bz   = Bsp%Trans(it,spin1)%k
         iv      = Bsp%Trans(it,spin1)%v
         ic      = Bsp%Trans(it,spin1)%c

         ik_ibz = Kmesh%tab(ik_bz)
         isym_k = Kmesh%tabo(ik_bz)
         itim_k = (3-Kmesh%tabi(ik_bz))/2
         !if (itim_k==2) CYCLE ! time-reversal or not       

         rhotwg1(:) = sym_rhotwgq0(itim_k,isym_k,dim_rtwg,BSp%npweps,rhxtwg_q0(:,iv,ic,ik_ibz,spin1),Gsph_c)
         !
         ! sum over G/=0
         ctemp = XDOTC(BSp%npweps-1,rhotwg1(2:),1,rhotwg2(2:),1)
         ctemp = faq * kx_fact * ctemp

         my_kxssp(it,itp) = ctemp
       end do !it
     end do !itp

     ABI_FREE(rhotwg1)
     ABI_FREE(rhotwg2)
     ABI_FREE(vc_sqrt_qbz)
   end if ! do_exchange_term
   call timab(686,2,tsec) ! exc_build_ham(exch.spin)
   !
   ! =====================================
   ! === Write the Hamiltonian on disk ===
   ! =====================================
   call timab(685,1,tsec) ! exc_build_ham(write_ham)
   if (use_mpiio) then
#ifdef HAVE_MPI_IO 
     my_ncols=ncols_of(my_rank); old_type=MPI_DOUBLE_COMPLEX
     call xmpio_create_fsubarray_2D((/neh1,my_ncols/),(/neh1,my_ncols/),(/1,1/),old_type,hmat_type,my_offpad,mpi_err)
     ABI_CHECK_MPI(mpi_err,"fsubarray_2D")
     !
     ! Each node uses a different offset to skip the header and the blocks written by the other CPUs.
     prev_nels=0
     prev_ncols=0
     if (my_rank>0) then 
       prev_ncols =SUM(ncols_of(0:my_rank-1))
       prev_nels = neh1*prev_ncols
     end if
     tmp_off = prev_nels*xmpi_bsize_dpc + prev_ncols*2*xmpio_bsize_frm

     my_offset = offset_of_block(block) + tmp_off + my_offpad

     call MPI_FILE_SET_VIEW(mpi_fh, my_offset, MPI_BYTE, hmat_type, 'native', MPI_INFO_NULL, mpi_err)
     ABI_CHECK_MPI(mpi_err,"SET_VIEW")
                                                                                                               
     call MPI_TYPE_FREE(hmat_type,mpi_err)
     ABI_CHECK_MPI(mpi_err,"MPI_TYPE_FREE")

     tmp_size = INT(neh1*my_ncols)
     call MPI_FILE_WRITE_ALL(mpi_fh, my_kxssp,tmp_size, MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, mpi_err)
     ABI_CHECK_MPI(mpi_err,"FILE_WRITE")

     ! It seems that personal calls in make the code stuck
     ! Master writes the Fortran record markers.
     ABI_MALLOC(bsize_frecord,(neh2))
     bsize_frecord = neh1 * xmpi_bsize_dpc
     ! ehdr_offset points to the end of the header.
     !call xmpio_write_frmarkers(mpi_fh,ehdr_offset,xmpio_at_all,neh2,bsize_frecord,mpi_err)
     my_offset = offset_of_block(block)
     call xmpio_write_frmarkers(mpi_fh,my_offset,xmpio_at_all,neh2,bsize_frecord,ierr)
     ABI_CHECK(ierr==0,"Error while writing Fortran markers")
     ABI_FREE(bsize_frecord)
#else
     MSG_BUG("You should not be here")
#endif
   else
     ! Use FORTRAN IO with sequential access mode.
     ! * Each node sends its data to master node. 
     ! * Columns are distributed according to the rank of the node.
     if (my_rank==master) then
       do jj=my_cols(1),my_cols(2)
         write(bsh_unt) my_kxssp(:,jj)
       end do
       ABI_FREE(my_kxssp)
     end if
                                                                                             
     call xbarrier_mpi(comm)
     !
     ! Collect data from the other nodes.
     do sender=1,nproc-1
       if (my_rank==master)  then
         ABI_MALLOC(buffer_2d,(neh1,ncols_of(sender)))
       end if
       call xexch_mpi(my_kxssp,neh1*ncols_of(sender),sender,buffer_2d,master,comm,mpi_err)
       !
       if (my_rank==master) then ! Write the columns owned by sender.
         do jj=1,ncols_of(sender)
           write(bsh_unt) buffer_2d(:,jj)
         end do
         ABI_FREE(buffer_2d)
       end if ! master
       !
       call xbarrier_mpi(comm)
     end do ! sender
   end if
   call timab(685,2,tsec) ! exc_build_ham(write_ham)

   ABI_FREE(ncols_of)
   if (allocated(my_kxssp))  then
     ABI_FREE(my_kxssp)
   end if
 end if
 !
 ! Close the file.
 if (use_mpiio) then
#ifdef HAVE_MPI_IO 
   call MPI_FILE_CLOSE(mpi_fh, mpi_err)               
   ABI_CHECK_MPI(mpi_err,"FILE_CLOSE")
   ABI_FREE(offset_of_block)
#endif
 end if

 if (my_rank==master) close(bsh_unt)
 !
 ! * Free memory.
 ABI_FREE(igfftg0)
 ABI_FREE(ktabr_k)
 ABI_FREE(ktabr_kp)
 ABI_FREE(rhxtwg_vpv)
 ABI_FREE(rhxtwg_cpc)
 ABI_FREE(ur_ckp)
 ABI_FREE(ur_vkp)
 ABI_FREE(ur_vk)
 ABI_FREE(ur_ck)
 !
 ! * Deallocation for PAW.
 if (Wfd%usepaw==1) then
   call cprj_free(Cp_vk)
   ABI_DATATYPE_DEALLOCATE(Cp_vk)
   call cprj_free(Cp_ck)
   ABI_DATATYPE_DEALLOCATE(Cp_ck)
   call cprj_free(Cp_ckp)
   ABI_DATATYPE_DEALLOCATE(Cp_ckp)
   call cprj_free(Cp_vkp)
   ABI_DATATYPE_DEALLOCATE(Cp_vkp)
   call cprj_free(Cp_tmp1)
   ABI_DATATYPE_DEALLOCATE(Cp_tmp1)
   call cprj_free(Cp_tmp2)
   ABI_DATATYPE_DEALLOCATE(Cp_tmp2)
   call cprj_free(Cp_tmp3)
   ABI_DATATYPE_DEALLOCATE(Cp_tmp3)
   call cprj_free(Cp_tmp4)
   ABI_DATATYPE_DEALLOCATE(Cp_tmp4)
 end if

 call xbarrier_mpi(comm) 

 ABI_TIMER_STOP("")

 DBG_EXIT("COLL")

 call timab(680,2,tsec)

end subroutine exc_build_block
!!***
