!-----------------------------------------------------------------------
  ! Copyright (C) 2010-2015 Henry Lambert, Feliciano Giustino
  ! This file is distributed under the terms of the GNU General Public         
  ! License. See the file `LICENSE' in the root directory of the               
  ! present distribution, or http://www.gnu.org/copyleft.gpl.txt .
  !-----------------------------------------------------------------------
SUBROUTINE cbicgstabl(h_psi, cg_psi, e, d0psi, dpsi, h_diag, &
     ndmx, ndim, ethr, ik, kter, conv_root, anorm, nbnd, npol, cw, lmresloc, tprec)
!
!-----------------------------------------------------------------------
!
!   Iterative solution of the linear system:
!
!                 ( h - e + w + i * eta ) * x = b
!                 ( h - cw + i * eta ) * G(G,G') = -\delta(G,G')
!
!   where h is a complex hermitian matrix, e, w, and eta are
!   real scalar, x and b are complex vectors
!
USE kinds,       ONLY: DP
USE mp_global,   ONLY: intra_pool_comm
USE mp,          ONLY: mp_sum
USE control_gw,  ONLY: maxter_green
USE units_gw,    ONLY: iunresid, lrresid, iunalphabeta, lralphabeta
USE mp_global,   ONLY: inter_pool_comm, intra_pool_comm, mp_global_end, mpime, &
                        nproc_pool, nproc, me_pool, my_pool_id, npool
USE mp,          ONLY : mp_barrier, mp_bcast, mp_sum

implicit none

!tprec true or false conditioning?
logical :: tprec

integer :: ndmx, &   ! input: the maximum dimension of the vectors
           ndim, &   ! input: the actual dimension of the vectors
           kter, &   ! output: counter on iterations
           nbnd, &   ! input: the number of bands
           npol, &   ! input: number of components of the wavefunctions
           ik,   &   ! input: the k point
           niters, &  ! number of iterations for this BiCG min
           nrec, &    ! for composite rec numbers
           lmresloc     ! set number of loops over minimal residuals...

real(DP) :: &
             anorm,   &        ! output: the norm of the error in the solution
             ethr,    &        ! input: the required precision
             h_diag(ndmx,nbnd) ! input: an estimate of ( H - \epsilon )


COMPLEX(DP) :: alphabeta(2), beta_old

complex(DP) :: &
             dpsi (ndmx*npol, nbnd), & ! output: the solution of the linear syst
             d0psi (ndmx*npol, nbnd)   ! input: the known term

!GMRES part
  complex(DP) :: tau(lmresloc + 1,lmresloc + 1)
  complex(DP) :: sigma(lmresloc + 1)
  complex(DP) :: gmgamma(lmresloc + 1)
  complex(DP) :: gmgammapp(lmresloc + 1)
  complex(DP) :: gmgammap(lmresloc + 1)
  complex(DP) :: gschmidt

  logical :: conv_root ! output: if true the root is converged
  external h_psi       ! input: the routine computing h_psi
  external cg_psi      ! input: the routine computing cg_psi
  !here the local variables
  !integer, parameter :: maxter = 200
  !integer, parameter :: maxter = 600
  !the maximum number of iterations
  integer :: iter, ibnd, lbnd, iterj, iteri
  !counters on iteration, bands
  integer , allocatable :: conv (:)
  !if 1 the root is converged
  !HL NB GWTC: g t h hold -> SGW: r q p pold
  complex(DP), allocatable :: g (:,:), t (:,:), h (:,:), hold (:,:)
  !the gradient of psi
  !the preconditioned gradient
  !the delta gradient
  !the conjugate gradient
  !work space
  COMPLEX(DP)  :: cw
  !HL need to introduce gt tt ht htold for BICON
  !also gp grp for preconditioned systems
  complex(DP), allocatable :: gt (:,:), hn (:,:) 
  complex(DP), allocatable :: gp (:,:), gtp (:,:)
  complex(DP) ::  dcgamma, dclambda, alpha, beta, omega
  !the ratio between rho
  !step length
  complex(DP), external :: zdotc
  !HL (eigenvalue + iw) 
  complex(DP) :: e(nbnd)
  complex(DP) :: rho, rhoold
  !min residual vector
  !complex(DP), allocatable :: s (:,:)
  !the scalar product
  !why is this real??
  !real(DP), allocatable :: a(:), c(:)
    complex(DP), allocatable :: a(:), c(:)
  !the residue
  !auxiliary for h_diag
  real(DP) :: kter_eff
  !account the number of iterations with b
  !coefficient of quadratic form

  call start_clock ('cgsolve')
  allocate ( g(ndmx*npol,lmresloc+1), t(ndmx*npol,nbnd), h(ndmx*npol,nbnd), &
             hold(ndmx*npol, nbnd) )

  allocate ( gt(ndmx*npol,nbnd), hn(ndmx*npol,nbnd))
             
  allocate ( gp(ndmx*npol,nbnd), gtp(ndmx*npol,nbnd))
  allocate (a(nbnd), c(nbnd))
  allocate (conv ( nbnd))

  kter_eff = 0.d0

  do ibnd = 1, nbnd
     conv (ibnd) = 0
  enddo
  conv_root = .false.

  g        = dcmplx(0.d0,0.d0)
  t        = dcmplx(0.d0,0.d0)
  h        = dcmplx(0.d0,0.d0)
  hold     = dcmplx(0.d0,0.d0)
  gt       = dcmplx(0.d0,0.d0)
  hn       = dcmplx(0.d0,0.d0)
  gp(:,:)  = dcmplx(0.d0,0.0d0)
  gtp(:,:) = dcmplx(0.d0,0.0d0)
  rho      = dcmplx(1.0d0,0.0d0)
  rhoold   = dcmplx(1.0d0,0.0d0)
  omega    = dcmplx(1.0d0,0.0d0)
  alpha    = dcmplx(0.0d0,0.0d0)
  sigma(:) = dcmplx(1.0d0, 0.0d0)

  do iter = 1, maxter_green
  ! r    = b - Ax 
  ! rt   = conjg ( r )
!!!!!!!!!!!!!!!!!!cBICG PART!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  ! iterj allows for the bicgstab(2,3,4,...) generalizations.
  ! a choice of lmresloc = 1 corresponds to the standard bicgstab.
  ! H. A. Van Der Corst Siam J. Sci. Stat. Comput.
  ! Vol. 13, No. 2, pp. 631-644
            if (iter .eq. 1) then
               !initialize iter 0 stuff
               !r = b - A* x
               !rt = conjg (r) 
               !call h_psi (ndim, dpsi, g, e, cw, ik, nbnd)
               do ibnd = 1, nbnd
               !initial residual should be r = b
               ! I/O for multishift.
               ! call davcio (d0psi(:,1), lrresid, iunresid, iter, +1)
                  call zaxpy (ndim, (-1.d0,0.d0), d0psi(1,ibnd), 1, g(1,ibnd), 1)
                  call zscal (ndim, (-1.0d0, 0.0d0), g(1,ibnd), 1)
  ! copy r -> u, i.e. u = r_{0}
  ! call zcopy (ndim, g (1, ibnd), 1, h (1, ibnd), 1)
  ! set \tilde{r} = r^{*}
  ! gt(:,ibnd)    = conjg ( g(:,ibnd) )
                  gt(:,ibnd) = g(:,ibnd)
  ! for preconditioning we solve the system MAx = Mb:
  !               if(tprec) call cg_psi(ndmx, ndim, 1, g(1,ibnd), h_diag(1,ibnd) )
  !               if(tprec) call cg_psi(ndmx, ndim, 1, gt(1,ibnd), h_diag(1,ibnd) )
               enddo
            endif
     rhoold  = -omega*rhoold
     do iterj = 1, lmresloc
        do ibnd = 1, nbnd
                rho     = ZDOTC (ndim, gt(1,ibnd), 1, g(1,ibnd), 1)
                beta    = alpha*(rho/rhoold)
                rhoold  = rho
                do iteri = 1, iterj 
  !\hat{u}_{i} =  \hat{r}_{i} - \beta \hat{u}_{i}
                   hold(:,:) = dcmplx(0.0d0, 0.d0)
                   call ZCOPY (ndmx*npol,  g(1, iteri),      1, hold (1, iteri), 1)
                   call ZAXPY (ndmx*npol,  -beta, h(1,iteri), 1, hold(1,iteri), 1)
                   call ZCOPY (ndmx*npol,  hold(1, iteri),   1, h (1, iteri), 1)
                enddo
            lbnd = nbnd
            kter_eff = kter_eff + DBLE (lbnd) / DBLE (nbnd)
        enddo 
            conv_root = .true.
            do ibnd = 1, nbnd
               anorm = sqrt ( abs ( ZDOTC (ndim, g(1,ibnd), 1, g(1,ibnd), 1)  ) )
               if (anorm.lt.ethr) conv (ibnd) = 1
               conv_root = conv_root.and.(conv (ibnd).eq.1)
            enddo
            if (conv_root) goto 100
! HLstab
! t = u_{j+1} = A u_{j}
        do ibnd = 1, nbnd
           call h_psi (ndim, h(:,ibnd), hn(:,ibnd), e(1), cw, ik, nbnd)
           if(tprec) call cg_psi(ndmx, ndim, 1, hn(1,ibnd), h_diag(1,ibnd))
        enddo

            lbnd=0
            do ibnd = 1, nbnd
               if (conv (ibnd) .eq.0) then
                   lbnd = ibnd
! alpha = <\tilde{r}|M^{-1}r>/<\tilde{u}|A{u}>
! the denominator is stored for subsequent use in beta
! HLSTAB in sleijpen notation
! gamma = (u_{j+1}, r_{0}) = (t,gp)
        a(ibnd) = rho
        c(ibnd) = ZDOTC (ndim, gt(1,ibnd), 1, hn(1,ibnd), 1)
        alpha = a(ibnd) / c(ibnd)
! HLSTAB do i = 0; iterj:
!          ri  = ri  - alpha*hn
!       end
               do iteri = 1, iterj
                  call ZAXPY (ndmx*npol, -alpha, hn(1,ibnd), 1, g(1,ibnd), 1)
               enddo
! FOR MINIMAL RESIDUAL SCHEMA WE NEED TO UPDATE THE RESIDUALS AND THE SOLUTION VECTORS here
! but we need to hold off on the solution vectors u0
! Also we introduce the min res part:
! t  =  r_{j+1} = A r_{j}
! t = Hg = Ar i.e. the min resid. part
            call h_psi (ndim, g(:,iterj), t(:,iterj), e(1), cw, ik, nbnd)
            if(tprec) call cg_psi(ndmx, ndim, 1, t(1,ibnd), h_diag(1,ibnd))
           !MA r
           !x_{0}  = x_{0}  + alpha * u_{0}
               call ZAXPY (ndmx*npol, alpha, h(1,ibnd), 1, dpsi (1,ibnd), 1)
               endif
            enddo!do ibnd
     enddo!iterj
!!!!!!!!!!!!!!!!!!!Min Residual Part!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   do ibnd = 1, nbnd
     do iterj = 2, lmresloc + 1
        if (lmresloc.gt.1)  then
            do iteri = 2, iterj
               tau(iteri, iterj) = (1.0d0/sigma(iteri))*ZDOTC(ndim, g(1,iterj), g(1,iteri), 1)
!orthogonalizes each residual:
               call ZAXPY (ndmx*npol, -tau(iteri, iterj),  g(1,iteri), 1, g(1,iterj), 1)
            enddo
        endif
!should correspond to van der vorst's:sleipjen:SGW
!w=(t,s)(t,t):(\sigma_{1} = (\r_{1},\r_{1})), \gammap=(1/\sigma_{j})(\r_{0}, \r_{j}): 
        sigma(iterj)   = ZDOTC(ndmx*npol, t(1,ibnd), 1, t(1,ibnd), 1) 
        gmgammap(iterj) = (1.0d0/sigma(iterj)) * ZDOTC(ndim, t(1,ibnd), 1, g(1,ibnd), 1)
     enddo
   enddo
!w = (t,s)(t,t) = (\sigma_{j} = (\r_{j}, \r_{j})), \gammap = (1/\sigma_{j})(\r_{0}, \r_{0})
     gmgamma(lmresloc+1) = gmgammap(lmresloc+1)
     omega      = gmgamma(lmresloc+1)
     if (lmresloc.gt.1)  then
         do iterj = lmresloc+1, 2
            gschmidt = dcmplx(0.0d0, 0.0d0)
         do iteri = iterj + 1, lmresloc+1
            gschmidt = gschmidt + tau(iterj, iteri)*gmgamma(iteri)
         enddo
            gmgamma(iterj) = gmgammap(iterj) - gschmidt
         enddo
     endif
     if(lmresloc.gt.1) then
        do iterj = 1, lmresloc-1
           do iteri = iterj + 1, lmresloc-1
              gschmidt = gschmidt + tau(iterj, iteri)*gmgamma(iteri + 1)
           enddo
              gmgammapp(iterj) = gmgamma(iterj+1) + gschmidt
        enddo
     endif
!The second update phase:
   do ibnd = 1, nbnd
!      x  = x  + gmgamma(1)  * \hat{r}_{0}
       call ZAXPY (ndim,  (gmgamma(lmresloc+1)), g(1,ibnd),  1, dpsi(1,ibnd), 1)
!     \hat{u}_{0}  = \hat{u}_{0}  +  beta  * u_old
       call ZAXPY (ndim, -(gmgamma(lmresloc+1)), hn(1,ibnd), 1, h(1,ibnd), 1)
!     \hat{r}_{0}  = \hat{r}_{0}  - gammap_{l}*r_{lmresloc}
       call ZAXPY (ndim, -(gmgammap(lmresloc+1)), t(1,ibnd), 1, g(1,ibnd), 1)
      !and again here we get to a point and then stop progressing...
      !call ZAXPY (ndmx*npol, (gmgamma(lmresloc+1)), hn(1,ibnd), 1, h(1,ibnd), 1)
     if(lmresloc.gt.1) then
        do iterj = 2, lmresloc
!      \hat{u_0}  = \hat{u_0} + gmgamma(lmresloc)*u_{j}
       call ZAXPY (ndmx*npol, -(gmgamma(iterj)),   hn  (1,iterj), 1, h (1,iterj), 1)
!      x_{0}  = x_{0} + gmgamma''_{j}  * r_{j}
       call ZAXPY (ndmx*npol,  gmgammapp(iterj), g(1,iterj),  1, dpsi  (1,iterj), 1)
!      r_{0}  = r_{0}  - gamma'_{j}*r_{j}
       call ZAXPY (ndmx*npol, -(gmgammap(iterj)), t(1,iterj), 1, g  (1,ibnd), 1)
        enddo
     endif
   enddo
  enddo!maxter

100 continue

  niters =  iter
  kter   =  kter_eff
  deallocate (conv)
  deallocate (a,c)
  deallocate (g, t, h, hold)
  deallocate (gt, hn)
  deallocate (gtp, gp)
  call stop_clock ('cgsolve')
  return
END SUBROUTINE cbicgstabl