!
  !-----------------------------------------------------------------------
  subroutine bcgsolve_all_optimized (Ax, e, b, x, h_diag, &
     ndmx, ndim, ethr, ik, kter, conv_root, anorm, nbnd, g2kin, vr, evq)
  !-----------------------------------------------------------------------
  !
  !  rewritten by FG following the book
  !  "Templates for the solution of linear systems: building blocks
  !  for iterative methods", by R. Barret et al, SIAM (http://www.siam.org/books)
  !  pag. 13. The only changes are:
  !  1. the product r^T*z on line 4 has been rewritten as z^T*r
  !  t obe consistent with the BiConjugate gradient method on pag. 20
  !  (look for [*] below).
  !  2. Instead of introducing p in the if/then/else, we overwrite z.
  !  Therefore our z at some point corresponds to their p.
  !
  !-----------------------------------------------------------------------
  !
  !     iterative solution of the linear system:
  !
  !                 ( h - e + Q ) * x = b                      (1)
  !
  !                 where h is a complex hermitean matrix, e is a real sca
  !                 x and b are complex vectors
  !
  !     on input:
  !                 Ax    EXTERNAL  name of a subroutine:
  !                          Ax(ndim,psi,psip)
  !                          Calculates  H*psi products.
  !                          Vectors psi and psip should be dimensined
  !                          (ndmx,nvec). nvec=1 is used!
  !
  !                 cg_psi   EXTERNAL  name of a subroutine:
  !                          g_psi(ndmx,ndim,notcnv,psi,e)
  !                          which calculates (h-e)^-1 * psi, with
  !                          some approximation, e.g. (diag(h)-e)
  !
  !                 e        real     unperturbed eigenvalue.
  !
  !                 x     contains an estimate of the solution
  !                          vector.
  !
  !                 b    contains the right hand side vector
  !                          of the system.
  !
  !                 ndmx     integer row dimension of x, ecc.
  !
  !                 ndim     integer actual row dimension of x
  !
  !                 ethr     real     convergence threshold. solution
  !                          improvement is stopped when the error in
  !                          eq (1), defined as l.h.s. - r.h.s., becomes
  !                          less than ethr in norm.
  !
  !     on output:  x     contains the refined estimate of the
  !                          solution vector.
  !
  !                 b    is corrupted on exit
  !
  !   revised (extensively)       6 Apr 1997 by A. Dal Corso & F. Mauri
  !   revised (to reduce memory) 29 May 2004 by S. de Gironcoli
  !
  !-----------------------------------------------------------------------
  !
  use parameters, only : DP, nbnd_occ, eta
  use gspace, only : nr, ngm
  use constants
  implicit none
  integer :: i
  complex(DP) :: vr(nr)
  real(DP) :: g2kin (ngm)
  complex(kind=DP) :: evq (ngm, nbnd_occ)
  !
  !   first the I/O variables
  !
  integer :: ndmx, & ! input: the maximum dimension of the vectors
             ndim, & ! input: the actual dimension of the vectors
             kter, & ! output: counter on iterations
             nbnd, & ! input: the number of bands
             ik      ! input: the k point

  real(kind=DP) :: &
             e(nbnd), &        ! input: the actual eigenvalue
             anorm,   &        ! output: the norm of the error in the solution
             h_diag(ndmx,nbnd), & ! input: preconditioner
             ethr               ! input: the required precision
  complex(kind=DP) :: &
             x (ndmx, nbnd), & ! output: the solution of the linear syst
             b (ndmx, nbnd)    ! input: the known term
  logical :: conv_root         ! output: if true the root is converged
  external Ax                  ! input: the routine computing Ax
  !
  ! local variables
  !
  integer, parameter :: maxter = 200
  integer :: iter, ibnd, lbnd
  complex(kind=DP), allocatable :: r  (:,:), q  (:,:), p  (:,:), pold  (:,:)
  complex(kind=DP), allocatable :: rt (:,:), qt (:,:), pt (:,:), ptold (:,:)
  complex(kind=DP) :: a, c, beta, alpha, ZDOTC
  real(kind=DP) :: rho (nbnd)
  real(kind=DP) :: eu (nbnd)
  integer :: conv (nbnd)
  real(kind=DP) :: kter_eff
  !
  allocate ( r (ndmx,nbnd), q (ndmx,nbnd), p (ndmx,nbnd), pold (ndmx ,nbnd) )    
  allocate ( rt(ndmx,nbnd), qt(ndmx,nbnd), pt(ndmx,nbnd), ptold(ndmx ,nbnd) )    
  !
  kter_eff = zero
  conv = 0
  !
  do iter = 1, maxter
     !
     ! the residuals r and r~ 
     !
     if (iter .eq. 1) then
        call Ax ( x, r , e, ik, nbnd, g2kin, vr, evq, + eta )
        do ibnd = 1, nbnd
           call ZAXPY (ndim, - cone, b(1,ibnd), 1, r (1,ibnd), 1)
           call DSCAL (2 * ndim, - one, r  (1, ibnd), 1)
        enddo
        !
        rt = conjg ( r )
        !
     endif
     !
     lbnd = 0
     do ibnd = 1, nbnd
        if (conv (ibnd) .eq.0) then
           lbnd = lbnd+1
           rho(lbnd) = ZDOTC (ndim, r(1,ibnd), 1, r(1,ibnd), 1)
        endif
     enddo
     !
     ! convergence check
     !
     kter_eff = kter_eff + float (lbnd) / float (nbnd)
     do ibnd = nbnd, 1, -1
        if (conv(ibnd).eq.0) then
           rho(ibnd) = rho(lbnd)
           lbnd = lbnd -1
           anorm = sqrt ( rho (ibnd) )
           if (anorm.lt.ethr) conv (ibnd) = 1
        endif
     enddo
     conv_root = .true.
     do ibnd = 1, nbnd
        conv_root = conv_root .and. (conv (ibnd) .eq.1)
     enddo
     if (conv_root) goto 100
     !
     ! compute the step direction z
     !
     lbnd = 0
     do ibnd = 1, nbnd
        if (conv (ibnd) .eq.0) then
           !
           call ZCOPY (ndim, r  (1, ibnd), 1, p  (1, ibnd), 1)
           call ZCOPY (ndim, rt (1, ibnd), 1, pt (1, ibnd), 1)
           !
           if (iter.ne.1) then
              !
              a = ZDOTC (ndim, qt(1,ibnd), 1, r(1,ibnd), 1)
              c = ZDOTC (ndim, pt(1,ibnd), 1, q(1,ibnd), 1)
              beta = - a / c
              !
              call ZAXPY (ndim, beta,        pold  (1, ibnd), 1, p  (1, ibnd), 1)
              call ZAXPY (ndim, conjg(beta), ptold (1, ibnd), 1, pt (1, ibnd), 1)
           endif
           !
           ! here pold, ptold are used as auxiliary vectors 
           ! in order to efficiently compute q = A*p and qt = A\dagger*pt
           !
           lbnd = lbnd + 1
           call ZCOPY (ndim, p  (1, ibnd), 1, pold  (1, lbnd), 1)
           call ZCOPY (ndim, pt (1, ibnd), 1, ptold (1, lbnd), 1)
           eu (lbnd) = e (ibnd)
        endif
     enddo
     !
     ! the entire business of "lbnd" is to have H*psi for
     ! a smaller number of bands in these calls
     !
     call Ax ( pold , q , eu, ik, lbnd, g2kin, vr, evq, + eta )
     call Ax ( ptold, qt, eu, ik, lbnd, g2kin, vr, evq, - eta )
     !
     lbnd=0
     do ibnd = 1, nbnd
        if (conv (ibnd) .eq.0) then
           lbnd=lbnd+1
           !
           a = ZDOTC (ndim, rt(1,ibnd), 1, r(1,ibnd), 1)
           c = ZDOTC (ndim, pt(1,ibnd), 1, q(1,lbnd), 1)
           alpha = a / c
           !
           call ZAXPY (ndim,   alpha,        p (1,ibnd), 1, x (1,ibnd), 1)
           call ZAXPY (ndim, - alpha,        q (1,lbnd), 1, r (1,ibnd), 1)
           call ZAXPY (ndim, - conjg(alpha), qt(1,lbnd), 1, rt(1,ibnd), 1)
           !
           !    save current (now old) p and pt for next iteration
           ! 
           call ZCOPY (ndim, p (1,ibnd), 1, pold (1,ibnd), 1)
           call ZCOPY (ndim, pt(1,ibnd), 1, ptold(1,ibnd), 1)
        endif
     enddo
  enddo
100 continue
  kter = kter_eff
  deallocate (r , q , p , pold )
  deallocate (rt, qt, pt, ptold)
  !
  return
  end subroutine bcgsolve_all_optimized
  !