! 
  !---------------------------------------------------------------------
  subroutine elphel2_shuffle (npe, imode0, dvscfins, gmapsym, eigv, isym, invs, xq0, timerev)
  !---------------------------------------------------------------------
  !
  !      Calculation of the electron-phonon matrix elements el_ph_mat
  !      <\psi(k+q)|dV_{SCF}/du^q_{i a}|\psi(k)>
  !
  !      Written by Feliciano Giustino based on the routine elphel. 
  !      Main difference w.r.t. to original routine is gauge fixing, 
  !      shuffle (umklapp) mode and all-q implementation.
  !
  !      Shuffle mode implemented on may 7 2006
  !
  !      No ultrasoft now
  !
  !      Nota Bene: this subroutine is intended only for one proc per pool, 
  !      i.e. with no G-vector parallelization (some work on the igkq is 
  !      required for that in the g-mapping)
  !
  !      In order to allow a pool reading the wfc file of another
  !      pool, I had to modify the bound npwx in PW/n_plane_waves.f90
  !      which is now the max across all pools. In this way lrwfc is
  !      the same for all pools.
  !
  !---------------------------------------------------------------------
  !
#include "f_defs.h"
  !
  USE para,        ONLY : kunit
#ifdef __PARA
  use para
  USE mp_global,   ONLY : nproc, my_pool_id, nproc_pool,    & 
                          intra_image_comm, inter_pool_comm, &
                          me_pool, root_pool, mpime
  USE mp,          ONLY : mp_barrier, mp_bcast, mp_put
#endif
  USE kinds, only : DP
  use io_global, only : stdout, ionode_id
  USE wavefunctions_module,  ONLY: evc
  USE io_files, ONLY: prefix, iunigk, tmp_dir, nd_nmbr
  use wvfct, only : npwx
  use pwcom, only : current_spin, isk, nls, tpiba, g, lsda, nrx1s, nrx2s, nrx3s, &
       nr1s, nr2s, nr3s, nbnd, npw, xk, nrxxs, nspin, ngm, s, nkb, vkb, et, igk, nks
  use phcom, only : alphap, becp1, tphases, u, lrwfc, dvpsi, nlcc_any, nksq, iuwfc, &
       lgamma, npwq, igkq, evq, xq
  use el_phon, only : shift, gmap, el_ph_mat, umat, nkbl, xk_all, et_all
  USE control_flags, only : iverbosity
  ! 
  implicit none
  !
  integer :: npe, imode0 
  !  pert for this irrep
  !  mode number
  !  unit for e-ph matrix elements
  complex(kind=DP) :: dvscfins (nrxxs, nspin, npe)
  !  delta scf potential

  integer :: unf_recl
  ! integer variable for I/O control
  character (len=256) :: filint, tempfile
  ! the name of the file
  character (len=3) :: nd_nmbr0
  ! node number for shuffle
  logical :: exst
  !
  ! work variables
  !
  integer :: ik, ikk, ikq, ipert, mode, ibnd, jbnd, ir, ig, jg, ng, nkq, ipool, &
       ios, nkl, nkr, ik0, iks, mbnd, pbnd, ig0, ik1, igkq_tmp (npwx), imap, ishift, &
       ipooltmp, nkq_abs, igk0 (npwx), jsym, ism1, ipol, jpol
  integer :: jpool, npool0
  complex(kind=DP), allocatable :: aux1 (:), elphmat (:,:,:), eptmp (:,:), aux2(:,:)
  complex(kind=DP) :: ZDOTC
  real(kind=DP) :: xktmp(3), tmp, invsxk(3), raq(3)
  complex(kind=DP), parameter :: czero = (0.d0, 0.d0), cone = (1.d0, 0.d0), ci = (0.d0, 1.d0)
  !
  ! variables for folding of k+q grid
  !
!@  REAL(kind=DP) :: g0vec_all_r(3,27) 
  REAL(kind=DP) :: g0vec_all_r(3,125) ! for LSCO
 
  !   G-vectors needed to fold the k+q grid into the k grid, cartesian coord.
  INTEGER :: ng0vec, ngxx           
  !   number of inequivalent such translations
  !   bound for the allocation of the array gmap
  !
  ! variables for rotating the wavefunctions (in order to use q in the irr wedge)
  !
  INTEGER :: gmapsym ( ngm, 48 ), isym, invs(48)
  ! the map G->S(G)
  ! the symmetry which generates the current q in the star
  ! index of the inverse operation
  complex(kind=DP) :: eigv (ngm, 48), eig0v
  real(kind=DP) :: v(3), arg, xq0(3)
  !  the fractional traslation
  !  work variable
  !  the first q in the star (cartesian)
  real(kind=DP), parameter :: twopi = 6.28318530717959
  logical :: timerev
  !  true if we are using time reversal
  !
  allocate (elphmat( nbnd, nbnd, npe), eptmp (nbnd, nbnd), aux1(nrxxs),&
            aux2( npwx, nbnd) )

#ifdef __PARA
  if (nproc_pool>1) call errore &
    ('elphel2_shuffle', 'only one proc per pool in shuffle mode', 1)
#endif

  if (.not.lgamma) then
    !
    ! setup stuff for k+q folding
    !
    call kpointdivision ( ik0 )
    call readgmap ( nkbl, ngxx, ng0vec, g0vec_all_r )
    !
    if (imode0.eq.0 .and. iverbosity.eq.1) write(stdout, 5) ngxx
5   format (5x,'Estimated size of gmap: ngxx =',i5)
   !
  endif
  !
  ! close all sequential files in order to re-open them as direct access
  ! close all .wfc files in order to prepare shuffled read
  !
  close (unit = iunigk, status = 'keep')
  close (unit = iuwfc,  status = 'keep')
#ifdef __PARA
  ! never remove this barrier
  call mp_barrier()
#endif
  !
  ism1 = invs (isym)
  !
!  ! fractional traslation, cartesian coord
!  !
!  v(1) = - float (ftau (1, isym) )  / float (nr1)
!  v(2) = - float (ftau (2, isym) )  / float (nr2)
!  v(3) = - float (ftau (3, isym) )  / float (nr3)
!  call cryst_to_cart (1, v, bg, +1)

  !
  !  Start the loops over the k-points
  !
  do ik = 1, nksq
    !
    !  ik = counter of k-points with vector k
    !  ikk= index of k-point with vector k
    !  ikq= index of k-point with vector k+q
    !       k and k+q are alternated if q!=0, are the same if q=0
    !
    if (lgamma) then
      ikk = ik
      ikq = ik
      npwq = npw
    else
      ikk = 2 * ik - 1
      ikq = ikk + 1
    endif
    !
    ! find index, and possibly pool, of k+q 
    ! the index nkq (nkq_abs) takes into account the even/odd ordering 
    ! of the nscf calc
    ! we also redefine the ikq points and the corresponding energies
    ! (we need to make sure that xk(:,ikq) is really k+q for the KB projectors
    ! below and that also that the eigenvalues are taken correctly in ephwann)
    !
#ifdef __PARA
    ipooltmp = mypool
#endif
    !
    if (lgamma) then
       nkq   = ikk
       ipool = ipooltmp
    else
       call ktokpmq ( xk (:, ikk), xq, +1, ipool, nkq, nkq_abs)
       !
       !   we redefine xk(ikq) and et(ikq) for the current xq
       !
       xk(:, ikq) = xk_all(:, nkq_abs)
       et(:, ikq) = et_all(:, nkq_abs) 
       !
    endif
    !
    ! in serial execution ipool is not used, in parallel 
    ! mypool is for k and ipool is for k+q
    !
    call readwfc (  ipooltmp, ikk, evc)
    if (nksq.gt.1) call readigk (  ipooltmp, ikk, npw, igk)
    !
    call readwfc ( ipool, nkq, evq)
    if (nksq.gt.1) call readigk ( ipool, nkq, npwq, igkq)
    !
#ifdef __PARA
    if (.not.lgamma.and.nksq.gt.1.and.maxval(igkq(1:npwq)).gt.ngxx) &
      call errore ('elphel2_shuffle', 'ngxx too small', 1 )
#endif
    !
    ! ----------------------------------------------------------------
    ! Set the gauge for the eigenstates: unitary transform and phases
    ! ----------------------------------------------------------------
    !
    ! With this option, different compilers and different machines
    ! should always give the same wavefunctions.
    !
    if ( tphases .and. (imode0.eq.0) ) then
      !
      if ( .not. allocated (umat) ) allocate ( umat(nbnd,nbnd,nks) )
      !
      call setphases ( kunit, ikk, npw, umat(:,:,ikk))
      if (.not.lgamma) call setphases ( kunit, ikq, npwq,umat(:,:,ikq))
      !
    endif 
    !
    ! the k-vector needed for the KB projectors
    xktmp = xk (:, ikq)
    !
    ! --------------------------------------------------
    !   Fourier translation of the G-sphere igkq
    ! --------------------------------------------------
    !
    if (.not.lgamma) then
      !
      !  Translate by G_0 the G-sphere where evq is defined, 
      !  none of the G-points are lost.
      !
      do ig = 1, npwq
         imap = ng0vec * ( igkq (ig) - 1 ) + shift( ik+ik0 )
         igkq_tmp (ig) = gmap( imap )
         !  the old matrix version... 
         !  igkq_tmp (ig) = gmap( igkq (ig),  shift( ik+ik0 ) )
      enddo
      igkq = igkq_tmp
      !
      !  find k+q from k+q+G_0
      !  (this is needed in the calculation of the KB terms
      !  for nonlocal pseudos)
      !
      xktmp = xk (:, ikq) - g0vec_all_r ( :, shift( ik+ik0 ) )
      !
    endif
    !
    ! ---------------------------------------------------------------------
    ! phase factor arising from fractional traslations
    ! ---------------------------------------------------------------------
    !
    !  u_{k+q+G_0} carries and additional factor e^{i G_0 v}
    !
!   arg = twopi * dot_product ( g0vec_all_r ( :, shift( ik+ik0 ) ), v ) 
!   eig0v = dcmplx ( cos(arg), sin(arg) )
!   call fractrasl ( nbnd, npwq, npwx, ngm, igkq, evq, eigv (:,isym), eig0v)
    !
    call fractrasl ( nbnd, npw,  npwx, ngm, igk,  evc, eigv (:,isym), cone)
    call fractrasl ( nbnd, npwq, npwx, ngm, igkq, evq, eigv (:,isym), cone)
    !
    ! ---------------------------------------------------------------------
    ! wave function rotation to generate matrix elements for the star of q
    ! ---------------------------------------------------------------------
    !
    ! ps. don't use npwx instead of npw, npwq since the unused elements
    ! may be large and blow up gmapsym (personal experience)
    !
    igk (1:npw ) = gmapsym ( igk (1:npw ), isym )
    igkq(1:npwq) = gmapsym ( igkq(1:npwq), isym )
    !
    ! ---------------------------------------------------------------------
    ! In case of time-reversal symmetry we take the c.c. of the wfs
    ! ---------------------------------------------------------------------
    ! 
    ! we need the c.c. of dV_hartree + dV_loc + dV_NL, but it is easier to do c.c. of 
    ! the (phase-fixed) wfs and then take the c.c. of the final matrix element
    !
    if (timerev) then
       evc = conjg ( evc )
       evq = conjg ( evq )
       umat(:,:,ikk) = conjg ( umat(:,:,ikk) )
       umat(:,:,ikq) = conjg ( umat(:,:,ikq) )
    endif
    !


    !
    ! In dvqpsi_us_only3 we need becp1 and alphap for the rotated wfs. 
    ! The other quantities (deeq and qq) do not depend on the wfs, in
    ! particular in the KB case (not ultrasoft), the deeq's are the
    ! unscreened coefficients, and the qq's are zero.
    !
    ! For the KB part, remember dV_NL[q_0] ~ |S^-1(k)+q_0> <S^-1(k)|
    ! the total momentum transfer must be q_0 and the rotation 
    ! tranforms k+Sq_0 into S^-1(k)+q_0, k into S^-1(k)
    ! [see Eqs. (A9),(A14) Baroni et al. RMP]
    !
    call rotate_cart( xk(:,ikk), s (1, 1, isym), invsxk)
    !
    ! here we generate vkb on the igk() set and for k ...
    call init_us_2 (npw, igk, invsxk, vkb)
    !
    ! ... and we recompute the becp terms with the wfs (rotated through igk)
    !
    call ccalbec( nkb, npwx, npw, nbnd, becp1(1,1,ik), vkb, evc )
    !
    ! we also recompute the derivative of the becp terms with the (rotated) wfs
    !
    do ipol = 1, 3
      do ibnd = 1, nbnd
        do ig = 1, npw
           aux2(ig,ibnd) = evc(ig,ibnd) * tpiba * ci * ( invsxk(ipol) + g(ipol,igk(ig)) )
        enddo
      enddo
      call ccalbec( nkb, npwx, npw, nbnd, alphap(1,1,ipol,ik), vkb, aux2 )
    enddo
    !
    ! now we generate vkb on the igkq() set because dvpsi is needed on that set
    ! we need S^-1(k)+q_0 in the KB projector: total momentum transfer must be q_0
    !
    xktmp = invsxk + xq0
    call init_us_2 (npwq, igkq, xktmp, vkb)
    !
    ! --------------------------------------------------
    !   Calculation of the matrix element
    ! --------------------------------------------------
    !
    if (lsda) current_spin = isk (ikk)
    !
    do ipert = 1, npe
       !
       !  recalculate dvbare_q*psi_k 
       !  the call to dvqpsi_us3 differs from the old one to dvqpsi_us 
       !  ony by the xktmp passed. 
       !
       !  we have to use the first q in the star in the dvqpsi_us3 call below (xq0)
       !  
       mode = imode0 + ipert
       call dvqpsi_us3 (ik, mode, u (1, mode), nlcc_any, xktmp, xq0 ) 
       !
       !  calculate dvscf_q*psi_k
       !
       do ibnd = 1, nbnd
          aux1(:) = (0.d0, 0.d0)
          do ig = 1, npw
             aux1 (nls (igk (ig) ) ) = evc ( ig, ibnd)
          enddo
          call cft3s (aux1, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, + 2)
          do ir = 1, nrxxs
             aux1 (ir) = aux1 (ir) * dvscfins (ir, current_spin, ipert)
          enddo
          call cft3s (aux1, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, - 2)
          do ig = 1, npwq
            dvpsi (ig, ibnd) = dvpsi (ig, ibnd) + aux1 (nls (igkq (ig) ) )
          enddo
       end do
       !
       ! calculate elphmat(j,i)=<psi_{k+q,j}|dvscf_q*psi_{k,i}> for this pertur
       !
       do ibnd =1, nbnd
         do jbnd = 1, nbnd
           elphmat (jbnd, ibnd, ipert) = &
              ZDOTC (npwq, evq(1, jbnd), 1, dvpsi(1, ibnd), 1)
         enddo
       enddo
       !
    enddo
#ifdef __PARA
    call reduce (2 * nbnd * nbnd * npe, elphmat)
#endif
    !
    !  Rotate elphmat with the gauge matrices (this should be equivalent 
    !  to calculate elphmat with the truely rotated eigenstates)
    ! 
    if (tphases) then
      !
      do ipert = 1, npe
        !
        ! the two zgemm call perform the following ops:
        !  elphmat = umat(k+q)^\dagger * [ elphmat * umat(k) ]
        !
        call zgemm ('n', 'n', nbnd, nbnd, nbnd, cone, elphmat(:,:,ipert), & 
             nbnd, umat(:,:,ikk), nbnd, czero, eptmp, nbnd)
        call zgemm ('c', 'n', nbnd, nbnd, nbnd, cone, umat(:,:,ikq), & 
             nbnd, eptmp, nbnd, czero, elphmat(:,:,ipert), nbnd)
        !
      enddo
    endif
    !
    ! remember we have to take the c.c. of the final matrix element 
    ! when we use time-reversal
    !
    if (timerev) elphmat = conjg ( elphmat ) 
    !
    !  save eph matrix elements into el_ph_mat
    !
    do ipert = 1, npe
      do jbnd = 1, nbnd
      do ibnd = 1, nbnd
        el_ph_mat (ibnd, jbnd, ik, ipert + imode0) = & 
          elphmat (ibnd, jbnd, ipert)
      enddo
      enddo
    enddo
    !
  enddo
  !
  !  restore original configuration of files
  !
  filint = trim(prefix) // '.igk'
  call seqopn (iunigk, filint, 'unformatted', exst)
  filint = trim(prefix) // '.wfc'
  call diropn (iuwfc, filint, lrwfc, exst)
#ifdef __PARA
  ! never remove this barrier
  call mp_barrier()
#endif
  !
  deallocate (elphmat, eptmp, aux1, aux2)
  if (.not.lgamma) deallocate (gmap, shift)
  !
  return
  end subroutine elphel2_shuffle
  !
  !------------------------------------------------------------
  subroutine fractrasl ( nbnd, npw, npwx, ngm, igk, evc, eigv1, eig0v)
  !------------------------------------------------------------
  !
  USE kinds, only : DP
  implicit none
  integer :: nbnd, npw, npwx, ngm, igk (npwx)
  complex(kind=DP) :: evc (npwx, nbnd), eigv1 (ngm), eig0v
  integer :: ig, ibnd
  ! 
  do ibnd = 1, nbnd
    do ig = 1, npw
      evc (ig, ibnd) = evc (ig, ibnd) * eigv1 ( igk(ig) ) * eig0v
    enddo
  enddo
  !
  return
  end subroutine fractrasl
  !
  !------------------------------------------------------------
  subroutine rotate_cart( x, s, sx)
  !------------------------------------------------------------
  !
  ! a simple symmetry operation in cartesian coordinates 
  ! ( s is integer and in crystal coord!)
  !
  USE kinds, only : DP
  USE cell_base, ONLY : at, bg
  implicit none
  real(kind=DP) :: x(3), sx(3), xcrys(3)
  integer :: s(3,3),i
  !
  xcrys = x
  call cryst_to_cart (1, xcrys, at, -1)
  do i = 1, 3
     sx (i) = float(s (i, 1)) * xcrys (1) &
            + float(s (i, 2)) * xcrys (2) &
            + float(s (i, 3)) * xcrys (3)
  enddo
  call cryst_to_cart (1, sx, bg, +1)
  !
  return
  end subroutine rotate_cart
  !------------------------------------------------------------
  ! 
  ! I used the following to check the correctness of the symmetry
  ! operation on the wavefunctions: symmetrization in real-space 
  !
  !     integer :: i,j,k,ri,rj,rk
  !     complex(kind=DP), allocatable :: aux2(:)
  !     allocate (aux2(nrxxs))
  !
  !     do ibnd = 1, nbnd
  !       !
  !       aux1 = czero
  !       aux2 = czero
  !       do ig = 1, npw
  !          aux1 (nls (igk (ig) ) ) = evc ( ig, ibnd)
  !       enddo
  !       call cft3s (aux1, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, + 2)
  !       do k = 1, nr3
  !        do j = 1, nr2
  !         do i = 1, nr1
  !            call ruotaijk (s (1, 1, isym), ftau (1, isym), &
  !                i, j, k, nr1, nr2, nr3, ri, rj, rk )
  !            aux2 (1 + ( i-1) + nr1*( j-1) + nr1*nr2*( k-1)) = &
  !            aux1 (1 + (ri-1) + nr1*(rj-1) + nr1*nr2*(rk-1)) 
  !         enddo
  !        enddo
  !       enddo
  !       call cft3s (aux2, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, - 2)
  !       do ig = 1, npw
  !         evc ( ig, ibnd) = aux2 (nls (igk (ig) ) )
  !       enddo
  !       !
  !       aux1 = czero
  !       aux2 = czero
  !       do ig = 1, npwq
  !          aux1 (nls (igkq (ig) ) ) = evq ( ig, ibnd)
  !       enddo
  !       call cft3s (aux1, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, + 2)
  !       do k = 1, nr3
  !        do j = 1, nr2
  !         do i = 1, nr1
  !            call ruotaijk (s (1, 1, isym), ftau (1, isym), &
  !                i, j, k, nr1, nr2, nr3, ri, rj, rk )
  !            aux2 (1 + ( i-1) + nr1*( j-1) + nr1*nr2*( k-1)) = &
  !            aux1 (1 + (ri-1) + nr1*(rj-1) + nr1*nr2*(rk-1)) 
  !         enddo
  !        enddo
  !       enddo
  !       call cft3s (aux2, nr1s, nr2s, nr3s, nrx1s, nrx2s, nrx3s, - 2)
  !       do ig = 1, npwq
  !         evq ( ig, ibnd) = aux2 (nls (igkq (ig) ) )
  !       enddo
  !       !
  !     enddo
  !
  !  Instead of rotating the wfs, we can also perform the operation
  !  directly on the dvscf: here is how it goes. Note that in order
  !  to use this we have to switch off the local and the nonlocal
  !  pseudo contributions (i.e. we keep only the hartree term)
  !  [set dvpsi = czero after the dvqpsi_us3 call]
  !       do k = 1, nr3
  !        do j = 1, nr2
  !         do i = 1, nr1
  !            call ruotaijk (s (1, 1, ism1), ftau (1, ism1), &
  !                i, j, k, nr1, nr2, nr3, ri, rj, rk )
  !            aux2 (1 + ( i-1) + nr1*( j-1) + nr1*nr2*( k-1)) = &
  !            dvscfins (1 + (ri-1) + nr1*(rj-1) + nr1*nr2*(rk-1), current_spin, ipert)
  !         enddo
  !        enddo
  !       enddo
  !       do ir = 1, nrxxs
  !         aux1 (ir) = aux1 (ir) * aux2 (ir)
  !       enddo
  !