!
  !---------------------------------------------------------------------
  subroutine elphel2 (npe, imode0, dvscfins, iunepmat)
  !---------------------------------------------------------------------
  !
  !      Calculation of the electron-phonon matrix elements el_ph_mat
  !      <\psi(k+q)|dV_{SCF}/du^q_{i a}|\psi(k)>
  !
  !      Written by Feliciano Giustino based on the original routine 
  !      of F. Mauri. Main difference w.r.t. to original routine is
  !      gauge fixing and shuffle (umklapp) mode. 
  !
  !      Shuffle mode implemented on may 7 2006
  !
  !      No ultrasoft now
  !
  !      Works for or Intel v9, IBM SP4, and Opteron with pgi
  !
  !      Nota Bene: this subroutine in SHUFFLE mode is intended only 
  !      for one proc per pool, i.e. with no G-vector parallelization 
  !      (some work on the igkq is required for that, as well as the 
  !      intra-pool communications)
  !
  !---------------------------------------------------------------------
  !
#include "f_defs.h"
  !
  USE para,        ONLY : kunit
#ifdef __PARA
  use para
  USE mp_global,   ONLY : nproc, my_pool_id, nproc_pool,    & 
                          intra_image_comm, inter_pool_comm, &
                          me_pool, root_pool, mpime
  USE mp,          ONLY : mp_barrier, mp_bcast, mp_put
#endif
  USE kinds, only : DP
  use io_global, only : stdout, ionode_id
  USE wavefunctions_module,  ONLY: evc
  USE io_files, ONLY: iunigk
  use wvfct, only : npwx
  use pwcom
  use phcom
  use el_phon
  ! 
  implicit none
  !
  integer :: npe, imode0, iunepmat 
  !  pert for this irrep
  !  mode number
  !  unit for e-ph matrix elements
  complex(kind=DP) :: dvscfins (nrxxs, nspin, npe)
  !  delta scf potential

  !
  ! work variables
  !
  integer :: ik, ikk, ikq, ipert, mode, ibnd, jbnd, ir, ig, jg, ng, nkq, ipool, &
       ios, nkl, nkr, ik0, iks, mbnd, pbnd, ig0, ik1, igkq_tmp (npwx), imap, ishift
  integer :: jpool, npool0
  complex(kind=DP), allocatable :: aux1 (:), aux2 (:), & 
       elphmat (:,:,:), eptmp (:,:)
  complex(kind=DP) :: ZDOTC
  real(kind=DP) :: xktmp(3), tmp
  complex(kind=DP), parameter :: czero = (0.d0, 0.d0), cone = (1.d0, 0.d0)
  !
  ! variables for folding of k+q grid
  !
  REAL(kind=DP) :: g0vec_all_r(3,27) 
  !   G-vectors needed to fold the k+q grid into the k grid, cartesian coord.
  INTEGER :: iukgmap, ng0vec, ngxx           
  !   unit with map of folding G-vector indexes 
  !   number of inequivalent such translations
  !   bound for the allocation of the array gmap
  !

  allocate (elphmat ( nbnd , nbnd , npe), eptmp ( nbnd, nbnd) )    
  allocate (aux1( nrxxs), aux2( nrxxs))
#ifdef __PARA
  if (tshuffle.and.nproc_pool>1) call errore &
       ('elphel2', 'only one proc per pool in shuffle mode', 1)
#endif

  !
  ik0 = 0
  !
#ifdef __PARA
  !
  !   output on /dev/null for silent nodes 
  !
  if  (me.ne.1) then
     iunepmat = stdout
  endif
  !
  !   number of kpoint blocks, kpoints per pool and reminder
  !
  npool = nproc/nproc_pool
  nkbl  = nkstot / kunit
  nkl   = kunit * ( nkbl / npool )
  nkr   = ( nkstot - nkl * npool ) / kunit
  !
  !  the reminder goes to the first nkr pools  
  !   (0...nkr-1)
  !
  if ( my_pool_id < nkr ) nkl = nkl + kunit
  !
  !  the index of the first k point in this pool
  !
  iks = nkl * my_pool_id + 1
  if ( my_pool_id >= nkr ) iks = iks + nkr * kunit
  !
  !  the index of the first k point block in this pool - 1
  !  (I will need the index of ik, not ikk)
  !
  ik0 = ( iks - 1 ) / kunit
  !
#else
  if (lgamma) then
    kunit = 1
  else
    kunit = 2
  endif
  nkbl  = nkstot / kunit
#endif
  !
  if (tshuffle.and.lgamma) call errore &
     ('elphel2','shuffle mode cannot be used with q = 0',1)
  !
  if (tshuffle) then 
    !
    ! --------------------------------------------------------
    !   shuffle mode: read map of G vectors G -> G-G_0 
    ! --------------------------------------------------------
    !
    !  this is for the folding of k+q into the first BZ 
    !  (readin is done for every irrep)
    !
    allocate ( shift(nkbl) )
    !
    !  OBSOLETE: now we read directly igkq and take the proper ngxx
    !
    !  read only a piece of the map to save time 
    !  the proper allocation bound would be ngxx = max(max(igkq))
    !  where the max is taken over the ig and the ik
    !  Here I use a simpler estimate: take the sphere npwx + two
    !  extra shells. This may not work for strange shapes of the
    !  reciproc latt. In this case just set ngxx = ngm_g
    !
    !  ngxx = nint(4./3.*3.14*(2+(3.0/4.0/3.14*float(npwx))**(1./3.))**3.)
    !
    ngxx = 0
    if (nksq.gt.1) then
      rewind (unit = iunigk)
      do ik = 1, nksq
        !
        read (iunigk, err = 100, iostat = ios) npw,  igk
        read (iunigk, err = 200, iostat = ios) npwq, igkq
        if (maxval(igkq(1:npwq)).gt.ngxx) ngxx = maxval(igkq(1:npwq))
        !
      enddo
    endif
    !
#ifdef __PARA
    tmp = float (ngxx) 
    call poolextreme( tmp, 1 )
    ngxx = nint (tmp)
#endif
    if (imode0.eq.0) write(stdout, 5) ngxx
5   format (5x,'Estimated size of gmap: ngxx =',i5)
    !
#ifdef __PARA
    if (mpime.eq.ionode_id) then
#endif
      !
      iukgmap = 96
      open ( unit = iukgmap, file = 'kgmap.dat', form = 'formatted')
      !
      do ik = 1, nkbl
        read (iukgmap,*) ik1, shift (ik1)
      enddo
      read (iukgmap,*) ng0vec
#ifdef __PARA
    endif
    !
    ! first node broadcasts ng0vec to all nodes for allocation of gmap
    !
    call mp_bcast( ng0vec, ionode_id, inter_pool_comm )
#endif
    !
    allocate ( gmap (ngxx * ng0vec) )
    !
#ifdef __PARA
    if (mpime.eq.ionode_id) then
#endif
      !
      do ig0 = 1, ng0vec
        read (iukgmap,*) g0vec_all_r (:,ig0)
      enddo
      do ig = 1, ngxx
        ! 
        ! at variance with the nscf calculation, here gmap is read as a vector,
        ! I could not get the bcast working for the matrix (too lazy...)
        ! 
        read (iukgmap,*) (gmap ( ng0vec * ( ig - 1 ) + ishift ), ishift = 1, ng0vec)
      enddo
      !
      close (iukgmap)
      !
#ifdef __PARA
    endif
    !
    ! first node broadcasts everything to all nodes
    !
    call mp_bcast( g0vec_all_r, ionode_id, inter_pool_comm )
    call mp_bcast( shift, ionode_id, inter_pool_comm )
    call mp_bcast( gmap, ionode_id, inter_pool_comm )
    !
#endif
    !
  endif
  !
  !  Start the loops over the k-points
  !
  if (nksq.gt.1) rewind (unit = iunigk)
  do ik = 1, nksq
     !
     if (nksq.gt.1) then
        read (iunigk, err = 100, iostat = ios) npw, igk
100     call errore ('elphel', 'reading igk', abs (ios) )
     endif
     !
     !  ik = counter of k-points with vector k
     !  ikk= index of k-point with vector k
     !  ikq= index of k-point with vector k+q
     !       k and k+q are alternated if q!=0, are the same if q=0
     !
     if (lgamma) then
        ikk = ik
        ikq = ik
        npwq = npw
     else
        ikk = 2 * ik - 1
        ikq = ikk + 1
     endif
     if (lsda) current_spin = isk (ikk)
     if (.not.lgamma.and.nksq.gt.1) then
        read (iunigk, err = 200, iostat = ios) npwq, igkq
200     call errore ('elphel', 'reading igkq', abs (ios) )
        if (tshuffle) then
          if (maxval(igkq(1:npwq)).gt.ngxx) & 
            call errore ('elphel2', 'ngxx too small', 1 )
        endif
     endif
     !
     ! read wavefuctions
     !
     call davcio (evc, lrwfc, iuwfc, ikk, - 1)
     call davcio (evq, lrwfc, iuwfc, ikq, - 1)
     !
     ! ----------------------------------------------------------------
     ! Set the gauge for the eigenstates: unitary transform and phases
     ! ----------------------------------------------------------------
     !
     ! With this option, different compilers and different machines
     ! should always give the same wavefunctions.
     !
     if ( tphases .and. (imode0.eq.0) ) then
       !
       if ( .not. allocated (umat) ) allocate ( umat(nbnd,nbnd,nks) )
       if (lgamma) then
          call setphases ( kunit, ikk, npw, umat(:,:,ikk))
       else
          call setphases ( kunit, ikk, npw, umat(:,:,ikk))
          call setphases ( kunit, ikq, npwq,umat(:,:,ikq))
       endif
       !
     endif 
     !
     ! the k-vector needed for the KB projectors
     xktmp = xk (:, ikq)
     !
     if (tshuffle) then
       !
       ! --------------------------------------------------
       !   Fourier translation of the G-sphere igkq
       ! --------------------------------------------------
       !
       !  build the map G --> G-G_0 
       !  Nota Bene: this amounts to translate by G_0 
       !  the G-sphere where evq is defined. None of the 
       !  G-points are lost
       !
       do ig = 1, npwq
          imap = ng0vec * ( igkq (ig) - 1 ) + shift( ik+ik0 )
          igkq_tmp (ig) = gmap( imap )
          !  the old matrix version... 
          !  igkq_tmp (ig) = gmap( igkq (ig),  shift( ik+ik0 ) )
       enddo
       igkq = igkq_tmp
       !
       !  find k+q from k+q+G_0
       !  (this is needed in the calculation of the KB terms
       !  for nonlocal pseudos)
       !
       xktmp = xk (:, ikq) - g0vec_all_r ( :, shift( ik+ik0 ) )
       !
     endif
     !
     call init_us_2 (npwq, igkq, xktmp, vkb)
     !
     ! --------------------------------------------------
     !   Calculation of the matrix element
     ! --------------------------------------------------
     !
     do ipert = 1, npe
        !
        !  recalculate dvbare_q*psi_k 
        !  the call to dvqpsi_us3 differs from the old one to dvqpsi_us 
        !  ony by the xktmp passed. In fact, in dvqpsi_us_only the
        !  array xk is used from the klist module with the save 
        !  attribute, which means that if I change it here to xk+G0, 
        !  that subroutine will not see the change 
        !
        mode = imode0 + ipert
        call dvqpsi_us3 (ik, mode, u (1, mode), nlcc_any, xktmp, xq )
        !
        !  calculate dvscf_q*psi_k
        !
        do ibnd = 1, nbnd
           aux1(:) = (0.d0, 0.d0)
           do ig = 1, npw
              aux1 (nls (igk (ig) ) ) = evc (ig, ibnd)
           enddo
           call cft3s (aux1, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, + 2)
           do ir = 1, nrxxs
             aux1 (ir) = aux1 (ir) * dvscfins (ir, current_spin, ipert)
           enddo
           call cft3s (aux1, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, - 2)
           do ig = 1, npwq
             dvpsi (ig, ibnd) = dvpsi (ig, ibnd) + aux1 (nls (igkq (ig) ) )
           enddo
        end do
        call adddvscf (ipert, ik)
        !
        ! calculate elphmat(j,i)=<psi_{k+q,j}|dvscf_q*psi_{k,i}> for this pertur
        !
        do ibnd =1, nbnd
          do jbnd = 1, nbnd
            elphmat (jbnd, ibnd, ipert) = ZDOTC (npwq, evq(1, jbnd), 1, dvpsi(1, ibnd), 1)
          enddo
        enddo
        !
     enddo
#ifdef __PARA
     call reduce (2 * nbnd * nbnd * npe, elphmat)
#endif
     !
     !  Rotate elphmat with the gauge matrices (this should be equivalent 
     !  to calculate elphmat with the truely rotated eigenstates)
     ! 
     if (tphases) then
       !
       do ipert = 1, npe
         !
         ! the two zgemm call perform the following ops:
         !  eptmp = elphmat * umat(k)
         !  elphmat = umat(k+q)^\dagger * eptmp
         ! [fortran equivalent in the remarks at the bottom]
         !
         call zgemm ('n', 'n', nbnd, nbnd, nbnd, cone, elphmat(:,:,ipert), & 
              nbnd, umat(:,:,ikk), nbnd, czero, eptmp, nbnd)
         call zgemm ('c', 'n', nbnd, nbnd, nbnd, cone, umat(:,:,ikq), & 
              nbnd, eptmp, nbnd, czero, elphmat(:,:,ipert), nbnd)
         !
       enddo
     endif
     !
     !  save all e-ph matrix elements into el_ph_mat
     !
     do ipert = 1, npe
        do jbnd = 1, nbnd
           do ibnd = 1, nbnd
              el_ph_mat (ibnd, jbnd, ik, ipert + imode0) = elphmat (ibnd, jbnd, ipert)
           enddo
        enddo
     enddo
  enddo
  !
  if ( .not. elinterp ) then
    !
    !  interpolation performed separately: write e-ph matrix elements onto EPMAT.C 
    !
    do ipert = 1, npe
      do ik = 1, nksq
        do jbnd = 1, nbnd
          do ibnd = 1, nbnd
             write(iunepmat,*) el_ph_mat (ibnd, jbnd, ik, ipert + imode0)
          enddo
        enddo
      enddo
    enddo
    !
  endif
  !
  deallocate (elphmat, eptmp)
  deallocate (aux1, aux2)
  if (tshuffle) deallocate ( gmap, shift )
  !
  return
end subroutine elphel2
!------------------------------------------------------------
!
! rotation of elphmat through fortran intrinsic:
!
! elphmat(:,:,ipert) = matmul( elphmat(:,:,ipert), umat(:,:,ikk) )
! elphmat(:,:,ipert) = matmul( conjg( transpose( umat(:,:,ikq)) ), &
!      elphmat(:,:,ipert) ) 
!