!{\src2tex{textfont=tt}}
!!****f* abinit/prep_nonlop
!! NAME
!! prep_nonlop
!!
!! FUNCTION
!! this routine prepares the data to the call of nonlop.
!!
!! COPYRIGHT
!! Copyright (C) 1998-2007 ABINIT group (FBottin)
!! this file is distributed under the terms of the
!! gnu general public license, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! INPUTS
!!  atindx1(natom)=index table for atoms, inverse of atindx
!!  choice: chooses possible output:
!!    choice=1 => a non-local energy contribution
!!          =2 => a gradient with respect to atomic position(s)
!!          =3 => a gradient with respect to strain(s)
!!          =23=> a gradient with respect to atm. pos. and strain(s)
!!          =4 => a 2nd derivative with respect to atomic pos.
!!          =24=> a gradient and 2nd derivative with respect to atomic pos.
!!          =5 => a gradient with respect to k wavevector
!!          =6 => 2nd derivatives with respect to strain and atm. pos.
!!  blocksize= size of block for FFT
!!  cpopt=flag defining the status of cprjin%cp(:)=<Proj_i|Cnk> scalars (see below, side effects)
!!  cwavef(2,npw*nspinor*ndat)=planewave coefficients of wavefunction.
!!  dimenl1,dimenl2=dimensions of enl (see enl)
!!  dimffnl=second dimension of ffnl (1+number of derivatives)
!!  enl(dimenl1,dimenl2)=
!!  ->Norm conserving : ==== when paw_opt=0 ====
!!                      (Real) Kleinman-Bylander energies (hartree)
!!                      dimenl1=lmnmax  -  dimenl2=ntypat
!!  ->PAW :             ==== when paw_opt=1 or 4 ====
!!                      (Real, symmetric) Dij coefs to connect projectors
!!                      dimenl1=lmnmax*(lmnmax+1)/2  -  dimenl2=natom
!!  ffnl(npw,dimffnl,lmnmax,ntypat)=nonlocal form factors on basis sphere.
!!  gmet(3,3)=metric tensor for G vecs (in bohr**-2)
!!  gprimd(3,3)=dimensional reciprocal space primitive translations
!!  gvnlc=matrix elements <G|Vnonlocal|C>
!!  kg_k(3,npw_k)=reduced planewave coordinates.
!!  kpg(npw,nkpg)= if nkpg==3 (k+G) components
!!                 if nkpg==9 [(k+G)_a].[(k+G)_b] quantities
!!  kpt(3)=k point in terms of recip. translations
!!  icall = order of call of this routine in lobpcgccwf
!!  idir=direction of the - atom to be moved in the case (choice=2,signs=2),
!!                        - k point direction in the case (choice=5,signs=2)
!!                        - strain component (1:6) in the case (choice=2,signs=2) or (choice=6,signs=1)
!!  indlmn(6,i,ntypat)= array giving l,m,n,lm,ln,s for i=ln  (if useylm=0)
!!                                                  or i=lmn (if useylm=1)
!!  istwf_k=option parameter that describes the storage of wfs
!!  lambda=factor to be used when computing (Vln-lambda.S) - only for paw_opt=2
!!  lmnmax=if useylm=1, max number of (l,m,n) comp. over all type of psps
!!        =if useylm=0, max number of (l,n)   comp. over all type of psps
!!  matblk=dimension of the array ph3d
!!  mgfft=maximum size of 1d ffts
!!  mpi_enreg=informations about mpi parallelization
!!  mpsang= 1+maximum angular momentum for nonlocal pseudopotentials
!!  mpssoang= 1+maximum (spin*angular momentum) for nonlocal pseudopotentials
!!  natom=number of atoms in cell.
!!  nattyp(ntypat)=number of atoms of each type
!!  nband_k=number of bands at this k point for that spin polarization
!!  nbdblock=
!!  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/input_variables/vargs.htm#ngfft
!!  nkpg=second dimension of array kpg
!!  nloalg(5)=governs the choice of the algorithm for nonlocal operator
!!  nnlout=dimension of enlout (when signs=1):
!!  npw_k=number of plane waves at this k point
!!  nspinor=number of spinorial components of the wavefunctions
!!  ntypat=number of types of atoms in cell
!!  paw_opt= define the nonlocal operator concerned with
!!  phkxred(2,natom)=phase factors exp(2 pi kpt.xred)
!!  ph1d(2,3*(2*mgfft+1)*natom)=1D structure factors phase information
!!  ph3d(2,npw,matblk)=3-dim structure factors, for each atom and plane wave.
!!  prtvol=control print volume and debugging output
!!  pspso(ntypat)=spin-orbit characteristic for each atom type
!!  signs= if 1, get contracted elements (energy, forces, stress, ...)
!!         if 2, applies the non-local operator to a function in reciprocal space
!!  sij(dimenl1,ntypat*(paw_opt/3))=overlap matrix components (only if paw_opt=2, 3 or 4)
!!  tim_nonlop=timing code of the calling routine (can be set to 0 if not attributed)
!!  ucvol=unit cell volume (bohr^3)
!!  useylm=governs the way the nonlocal operator is to be applied:
!!         1=using Ylm, 0=using Legendre polynomials
!!
!! OUTPUT
!!  ==== if (signs==1) ====
!!  enlout(nnlout)=
!!    if paw_opt==0, 1 or 2: contribution of this state to the nl part of various properties
!!    if paw_opt==3:        contribution of this state to <c|S|c>  (where S=overlap when PAW)
!!    if paw_opt==-1:       contribution of this state to various PAW Rhoij quantities
!!  ==== if (signs==2) ====
!!    if paw_opt==0, 1, 2 or 4:
!!       gvnlc(2,nspinor*npw)=result of the aplication of the nl operator
!!                        or one of its derivative to the input vect.
!!    if paw_opt==3 or 4:
!!       gsc(2,nspinor*npw*(paw_opt/3))=result of the aplication of (I+S)
!!                        to the input vect. (where S=overlap when PAW)
!!
!! SIDE EFFECTS
!!  ==== ONLY IF useylm=1
!!  cprj(natom) <type(cprj_type)>=projected input wave function |c> on non-local projector
!!                                  =<p_lmn|c> and derivatives
!!                                  Treatment depends on cpopt parameter:
!!                     if cpopt=-1, <p_lmn|in> (and derivatives)
!!                                  have to be computed (and not saved)
!!                     if cpopt= 0, <p_lmn|in> have to be computed and saved
!!                                  derivatives are eventually computed but not saved
!!                     if cpopt= 1, <p_lmn|in> and first derivatives have to be computed and saved
!!                                  other derivatives are eventually computed but not saved
!!                     if cpopt= 2  <p_lmn|in> are already in memory;
!!                                  only derivatives are computed here and not saved
!! (if useylm=0, should have cpopt=-1)
!!
!! PARENTS
!!      lobpcgccwf,vtowfk
!!
!! CHILDREN
!!      cprj1_alloc,cprj1_free,cprj1_nullify,mpi_allgather,nonlop,timab
!!      xallgather_mpi,xallgatherv_mpi,xalltoallv_mpi,xcomm_init
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine prep_nonlop(atindx1,choice,cpopt,cprj_block,dimenl1,dimenl2,dimffnl,enl,enlout_block,&
&                       ffnl,gmet,gprimd,iblock,icall,idir,indlmn,istwf_k,kg_k,&
&                       kpg,kpt,lambdablock,lmnmax,matblk,&
&                       blocksize,mgfft,mpi_enreg,mpsang,mpssoang,&
&                       natom,nattyp,nbdblock,nband_k,ngfft,nkpg,nloalg,nnlout,npw_k,&
&                       nspinor,ntypat,paw_opt,phkxred,ph1d,ph3d,prtvol,pspso,signs,sij,gsc,&
&                       tim_nonlop,ucvol,useylm,cwavef,gvnlc)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_01manage_mpi
 use interfaces_13nonlocal
 use interfaces_lib01hidempi
#else
 use defs_xfuncmpi
#endif
!End of the abilint section

 implicit none

#if defined MPI_FFT
          include 'mpif.h'
#endif
!Arguments ------------------------------------
 integer,intent(in) :: blocksize,choice,cpopt,dimenl1,dimenl2,dimffnl,iblock,icall,idir,istwf_k
 integer :: lmnmax,matblk,mgfft,mpsang,mpssoang,signs
 integer :: natom,nband_k,nbdblock,nkpg,nnlout,npw_k,nspinor,ntypat,paw_opt,prtvol,useylm
 real(dp),intent(in) :: ucvol
 type(mpi_type),intent(inout) :: mpi_enreg
 integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kg_k(3,npw_k)
 integer,intent(in) :: nattyp(ntypat),ngfft(18),nloalg(5),pspso(ntypat)
 real(dp),intent(in) :: enl(dimenl1,dimenl2)
 real(dp),intent(in) :: ffnl(npw_k,dimffnl,lmnmax,ntypat)
 real(dp),intent(in) :: gmet(3,3),gprimd(3,3),lambdablock(blocksize)
 real(dp),intent(in) :: kpt(3)
 real(dp),intent(in) :: kpg(npw_k,nkpg)
 real(dp),intent(in) :: ph1d(2,3*(2*mgfft+1)*natom),phkxred(2,natom)
 real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3))
 real(dp),intent(out) :: enlout_block(nnlout*blocksize),gvnlc(2,npw_k*nspinor*blocksize)
 real(dp),intent(out) :: gsc(2,npw_k*nspinor*blocksize*(paw_opt/3))
 real(dp),intent(inout) :: cwavef(2,npw_k*nspinor*blocksize),ph3d(2,npw_k,matblk)
 type(cprj_type) :: cprj_block(natom*blocksize*((cpopt+3)/3))

!Local variables-------------------------------
 integer :: spacecomm=0
 integer :: bufdim,iat,ier,ipw,ii,ilmn,mu,ncpgr
 integer :: old_paral_level,tim_nonlop
 real(dp) :: lambda
 complex(dp), allocatable :: dummy2(:,:)
 real(dp) :: tsec(2)

!local variables for mpialltoallv
 real(dp), allocatable :: buffer1(:),buffer2(:),enlout(:)
 real(dp), allocatable :: cwavef_alltoall(:,:),gvnlc_alltoall(:,:),&
 gsc_alltoall(:,:),ffnl_little(:,:,:,:),ffnl_little_gather(:,:,:,:),&
 ph3d_little(:,:,:),ph3d_little_gather(:,:,:)
 integer,allocatable :: dimlmn(:)
 integer, allocatable,save :: kg_k_gather(:,:)
! integer, allocatable,save :: kpg_little(:,:)
! integer, allocatable,save :: kpg_little_gather(:,:)
! real(dp), allocatable, save :: kpg_gather(:,:)
 real(dp), allocatable,save :: ffnl_gather(:,:,:,:),ph3d_gather(:,:,:)
 integer,save :: iproc,ndatarecv,ndatarecvloc,npw_tot
 integer,  allocatable :: recvcounts(:)
 integer,  allocatable :: sendcounts(:),sdispls(:),rdispls(:)
 integer,  allocatable :: sendcountsloc(:),sdisplsloc(:),recvcountsloc(:),rdisplsloc(:)
 type(cprj_type),allocatable :: cprj(:)
!no_abirules
!correspondence with abinit. here for real wf but in complex mode
!this is the index of a given band

! *************************************************************************
 old_paral_level= mpi_enreg%paral_level
 mpi_enreg%paral_level=3
 call xcomm_init(mpi_enreg,spaceComm)
 if(mpi_enreg%mode_para=='b') spaceComm=mpi_enreg%comm_band

 allocate(sendcountsloc(blocksize))
 allocate(sdisplsloc(blocksize))
 allocate(sdispls(blocksize))
 allocate(sendcounts(blocksize))
 allocate(recvcountsloc(blocksize))
 allocate(rdisplsloc(blocksize))
 allocate(rdispls(blocksize))
 allocate(recvcounts(blocksize))
 call timab(582,1,tsec)
 call xallgather_mpi(npw_k,recvcounts,spaceComm,ier)
 call timab(582,2,tsec)
 rdispls(1)=0
 do iproc=2,blocksize
  rdispls(iproc)=rdispls(iproc-1)+recvcounts(iproc-1)
 end do
 ndatarecv=rdispls(blocksize)+recvcounts(blocksize)

 if (icall==1 .and. iblock==1) then
  if (allocated(ffnl_gather)) deallocate(ffnl_gather)
  allocate(ffnl_gather(ndatarecv,dimffnl,lmnmax,ntypat))
  allocate(ffnl_little(dimffnl,lmnmax,ntypat,npw_k))
  allocate(ffnl_little_gather(dimffnl,lmnmax,ntypat,ndatarecv))
  do ipw=1,npw_k
   ffnl_little(:,:,:,ipw)=ffnl(ipw,:,:,:)
  end do
  recvcountsloc(:)=recvcounts(:)*dimffnl*lmnmax*ntypat
  rdisplsloc(:)=rdispls(:)*dimffnl*lmnmax*ntypat
  call timab(582,1,tsec)
  call xallgatherv_mpi(ffnl_little,npw_k*dimffnl*lmnmax*ntypat,ffnl_little_gather,&
  & recvcountsloc(:),rdisplsloc,spaceComm,ier)
  call timab(582,2,tsec)
  do ipw=1,ndatarecv
   ffnl_gather(ipw,:,:,:)=ffnl_little_gather(:,:,:,ipw)
  end do
  deallocate(ffnl_little,ffnl_little_gather)

  if (allocated(ph3d_gather)) deallocate(ph3d_gather)
  allocate(ph3d_gather(2,ndatarecv,matblk))
  allocate(ph3d_little(2,matblk,npw_k),ph3d_little_gather(2,matblk,ndatarecv))
  recvcountsloc(:)=recvcounts(:)*2*matblk
  rdisplsloc(:)=rdispls(:)*2*matblk
  do ipw=1,npw_k
   ph3d_little(:,:,ipw)=ph3d(:,ipw,:)
  end do
  call timab(582,1,tsec)
  call xallgatherv_mpi(ph3d_little,npw_k*2*matblk,ph3d_little_gather,recvcountsloc(:),rdisplsloc,spaceComm,ier)
  call timab(582,2,tsec)
  do ipw=1,ndatarecv
   ph3d_gather(:,ipw,:)=ph3d_little_gather(:,:,ipw)
  end do
  deallocate(ph3d_little_gather,ph3d_little)

  if (allocated(kg_k_gather)) deallocate(kg_k_gather)
  allocate(kg_k_gather(3,ndatarecv))
  recvcountsloc(:)=recvcounts(:)*3
  rdisplsloc(:)=rdispls(:)*3
  call timab(582,1,tsec)
  call xallgatherv_mpi(kg_k(1,:),npw_k,kg_k_gather(1,:),recvcounts(:),rdispls,spaceComm,ier)
  call xallgatherv_mpi(kg_k(2,:),npw_k,kg_k_gather(2,:),recvcounts(:),rdispls,spaceComm,ier)
  call xallgatherv_mpi(kg_k(3,:),npw_k,kg_k_gather(3,:),recvcounts(:),rdispls,spaceComm,ier)
  call timab(582,2,tsec)

!It is not needed to transfer the kpg if nloalg<10
!This is the defaut in norm-conserving calculations and
!this is forced for PAW calculations
!  if (allocated(kpg_gather)) deallocate(kpg_gather)
!  allocate(kpg_gather(ndatarecv,nkpg*useylm))
!  if (useylm==1) then
!   allocate(kpg_little(nkpg,npw_k),kpg_little_gather(nkpg,ndatarecv))
!   recvcountsloc(:)=recvcounts(:)*nkpg
!   rdisplsloc(:)=rdispls(:)*nkpg
!   do ipw=1,npw_k
!    kpg_little(:,ipw)=kpg(ipw,:)
!   end do
!   call timab(582,1,tsec)
!   call xallgatherv_mpi(kpg_little,npw_k*nkpg,kpg_little_gather,recvcountsloc(:),rdisplsloc,spaceComm,ier)
!   call timab(582,2,tsec)
!   do ipw=1,ndatarecv
!    kpg_gather(ipw,:)=real(kpg_little_gather(:,ipw), dp)
!   end do
!   deallocate(kpg_little_gather,kpg_little)
!  end if

!  if (nkpg>0) call mkkpg(kg_k,kpg_k,kpt,nkpg,npw_k)

 end if !End of the icall=1 and iblock=1 conditions

 sendcounts(:)=npw_k
 do iproc=1,blocksize
  sdispls(iproc)=(iproc-1)*npw_k
 end do

 allocate(cwavef_alltoall(2,ndatarecv*nspinor))
 allocate(gsc_alltoall(2,ndatarecv*nspinor*(paw_opt/3)))
 allocate(gvnlc_alltoall(2,ndatarecv*nspinor))
 allocate(enlout(nnlout))
 if (cpopt>=0) then
  ncpgr=cprj_block(1)%ncpgr
  allocate(dimlmn(natom))
  do iat=1,natom;dimlmn(iat)=cprj_block(iat)%nlmn;end do
  allocate(cprj(natom));call cprj1_alloc(cprj,ncpgr,dimlmn)
 end if
 recvcountsloc(:)=recvcounts(:)*2*nspinor
 rdisplsloc(:)=rdispls(:)*2*nspinor
 sendcountsloc(:)=sendcounts(:)*2*nspinor
 sdisplsloc(:)=sdispls(:)*2*nspinor
 call timab(581,1,tsec)
 call xalltoallv_mpi(cwavef,sendcountsloc,sdisplsloc,cwavef_alltoall,&
&         recvcountsloc,rdisplsloc,spaceComm,ier)
 call timab(581,2,tsec)
 if (paw_opt==2) lambda=lambdablock(mpi_enreg%me_band+1)  !MT060209, not sure of this line

 call nonlop(atindx1,choice,cpopt,cprj,dimenl1,dimenl2,dimffnl,dimffnl,&
&            enl,enlout,ffnl_gather,ffnl_gather,gmet,gprimd,idir,indlmn,&
&            istwf_k,kg_k_gather,kg_k_gather,kpg,kpg,kpt,kpt,lambda,lmnmax,matblk,mgfft,&
&            mpi_enreg,mpsang,mpssoang,natom,nattyp,ngfft,nkpg,nkpg,nloalg,&
&            nnlout,ndatarecv,ndatarecv,nspinor,ntypat,paw_opt,phkxred,&
&            phkxred,ph1d,ph3d_gather,ph3d_gather,pspso,signs,sij,gsc_alltoall,&
&            tim_nonlop,ucvol,useylm,cwavef_alltoall,gvnlc_alltoall)

!Transpose the gsc_alltoall or gvlnc_alltoall tabs
!according to the paw_opt and signs values
 call timab(581,1,tsec)
 if (signs==2 .and. (paw_opt==0 .or. paw_opt==1 .or. paw_opt==4)) then
  call xalltoallv_mpi(gvnlc_alltoall,recvcountsloc,rdisplsloc,gvnlc,&
&  sendcountsloc,sdisplsloc,spaceComm,ier)
 end if
 if (signs==2 .and. (paw_opt==3 .or. paw_opt==4)) then
  call xalltoallv_mpi(gsc_alltoall,recvcountsloc,rdisplsloc,gsc,&
&  sendcountsloc,sdisplsloc,spaceComm,ier)
 end if
 call timab(581,2,tsec)

#if defined MPI_FFT
 call MPI_ALLGATHER(enlout,nnlout,MPI_DOUBLE_PRECISION,enlout_block,nnlout, &
                    MPI_DOUBLE_PRECISION,spaceComm,ier)
 if (cpopt==0.or.cpopt==1) then
  bufdim=2*sum(dimlmn);allocate(buffer1(bufdim),buffer2(bufdim*blocksize))
  ii=1;do iat=1,natom
   do ilmn=1,dimlmn(iat)
    buffer1(ii:ii+1)=cprj(iat)%cp(1:2,ilmn)
    ii=ii+2
   end do
  end do
  call MPI_ALLGATHER(buffer1,bufdim,MPI_DOUBLE_PRECISION,&
                     buffer2,bufdim,MPI_DOUBLE_PRECISION,spaceComm,ier)
  ii=1;do iat=1,natom*blocksize
   do ilmn=1,dimlmn(1+mod(iat-1,natom))
    cprj_block(iat)%cp(1:2,ilmn)=buffer2(ii:ii+1)
    ii=ii+2
   end do
  end do
  deallocate(buffer1,buffer2)
 end if
 if (cpopt==1.and.ncpgr>0) then
  bufdim=2*sum(dimlmn)*ncpgr;allocate(buffer1(bufdim),buffer2(bufdim*blocksize))
  ii=1;do iat=1,natom
   do ilmn=1,dimlmn(iat)
    do mu=1,ncpgr
     buffer1(ii:ii+1)=cprj(iat)%dcp(1:2,mu,ilmn)
     ii=ii+2
    end do
   end do
  end do
  call MPI_ALLGATHER(buffer1,bufdim,MPI_DOUBLE_PRECISION,&
                     buffer2,bufdim,MPI_DOUBLE_PRECISION,spaceComm,ier)
  ii=1;do iat=1,natom*blocksize
   do ilmn=1,dimlmn(1+mod(iat-1,natom))
    do mu=1,ncpgr
     cprj_block(iat)%dcp(1:2,mu,ilmn)=buffer2(ii:ii+1)
     ii=ii+2
    end do
   end do
  end do
  deallocate(buffer1,buffer2)
 end if
#else
 enlout_block(:) = zero
 if (cpopt>=0) call cprj1_nullify(cprj_block)
#endif

 deallocate(enlout)
 if (cpopt>=0) then
  call cprj1_free(cprj)
  deallocate(cprj,dimlmn)
 end if

 mpi_enreg%paral_level= old_paral_level
 deallocate(sendcounts,recvcounts,sdispls,rdispls)
 deallocate(sendcountsloc,sdisplsloc)
 deallocate(recvcountsloc,rdisplsloc)
 deallocate(cwavef_alltoall,gvnlc_alltoall,gsc_alltoall)
end subroutine prep_nonlop
!!***
