!
! Copyright (C) 2001-2009 Quantum ESPRESSO group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
!-------------------------------------------------------------------------------
SUBROUTINE solve_linter(dvbarein, iw, drhoscf)
!-----------------------------------------------------------------------------
  !  HL
  !  Driver routine for the solution of the linear system which
  !  defines the change of the wavefunction due to a perturbing potential
  !  parameterized in r and iw. i.e. v_{r, iw} (r')
  !  It performs the following tasks:
  !   a) computes the bare potential term Delta V | psi > 
  !   b) adds to it the screening term Delta V_{SCF} | psi >
  !   c) applies P_c^+ (orthogonalization to valence states)
  !   d) calls c_bi_cgsolve_all to solve the linear system
  !   e) computes Delta rho, Delta V_{SCF}.
  !   Currently symmetrized in terms of mode etc. Might need to strip this out
  !   and check PW for how it stores/symmetrizes charge densities.
!----------------------------------------------------------------------------
!------------------------------------------------------------------------------
! grep @10TION for all the instances where I've turned off the regular
! PH parallelism which always assumes we have split k-points across processors.
! Each processor has a full slice of k-points. 


  USE kinds,                ONLY : DP
  USE ions_base,            ONLY : nat, ntyp => nsp, ityp
  USE io_global,            ONLY : stdout, ionode
  USE io_files,             ONLY : prefix, iunigk
  USE check_stop,           ONLY : check_stop_now
  USE wavefunctions_module, ONLY : evc
  USE constants,            ONLY : degspin
  USE cell_base,            ONLY : tpiba2
  USE ener,                 ONLY : ef
  USE klist,                ONLY : lgauss, degauss, ngauss, xk, wk, nkstot
  USE lsda_mod,             ONLY : lsda, nspin, current_spin, isk
  USE spin_orb,             ONLY : domag
  USE wvfct,                ONLY : nbnd, npw, npwx, igk,g2kin,  et
  USE scf,                  ONLY : rho
  USE uspp,                 ONLY : okvan, vkb
  USE uspp_param,           ONLY : upf, nhm, nh
  USE noncollin_module,     ONLY : noncolin, npol, nspin_mag
  USE paw_variables,        ONLY : okpaw
  USE control_gw,           ONLY : rec_code, niter_gw, nmix_gw, tr2_gw, &
                                   alpha_pv, lgamma, lgamma_gamma, convt, &
                                   nbnd_occ, alpha_mix, ldisp, rec_code_read, &
                                   where_rec, flmixdpot, current_iq, &
                                   ext_recover, eta, high_io
  USE nlcc_gw,              ONLY : nlcc_any
  USE units_gw,             ONLY : iudrho, lrdrho, iudwf, lrdwf, iubar, lrbar, &
                                   iuwfc, lrwfc, iunrec, iudvscf, iudwfm, iudwfp 
  USE output,               ONLY : fildrho, fildvscf
  USE eqv,                  ONLY : dvpsi, dpsi, evq, eprec
  USE qpoint,               ONLY : xq, npwq, igkq, nksq, ikks, ikqs
  USE modes,                ONLY : npertx, npert, u, t, irotmq, tmq, &
                                   minus_q, irgq, nsymq, rtau 
  USE freq_gw,              ONLY : fpol, fiu, nfs, nfsmax
  USE mp,                   ONLY : mp_sum, mp_barrier
  USE buffers,              ONLY : save_buffer, get_buffer
  USE mp_pools,             ONLY : inter_pool_comm
  USE mp_bands,             ONLY : intra_bgrp_comm, ntask_groups, me_bgrp
  USE mp_world,             ONLY : mpime
  USE mp_images, ONLY : intra_image_comm
  USE gvect,           ONLY : ngm, g, nl
  USE gvecs,           ONLY : nls, doublegrid
  USE fft_base,        ONLY : dfftp, dffts
  USE fft_interfaces,  ONLY : invfft, fwfft

  implicit none

  ! counter on frequencies.

  integer :: iw, ir 
  integer :: irr, imode0, npe

  ! input: the irreducible representation
  ! input: the number of perturbation
  ! input: the position of the modes

  complex(DP) :: drhoscf (dfftp%nnr, nspin_mag)
  ! output: the change of the scf charge
  complex(DP) :: dvbarein (dffts%nnr)

! HL prec
! HL careful now... complexifying preconditioner:
  real(DP) , allocatable :: h_diag (:,:)
! h_diag: diagonal part of the Hamiltonian
  real(DP) :: thresh, anorm, averlt, dr2

  ! thresh: convergence threshold
  ! anorm : the norm of the error
  ! averlt: average number of iterations
  ! dr2   : self-consistency error
  real(DP) :: dos_ef, weight, aux_avg (2)

  ! Misc variables for metals
  ! dos_ef: density of states at Ef
  real(DP), external :: w0gauss, wgauss

  ! functions computing the delta and theta function
  complex(DP), allocatable, target :: dvscfin(:,:)

  ! change of the scf potential 
  complex(DP), pointer :: dvscfins (:,:)

  ! change of the scf potential (smooth part only)
  complex(DP), allocatable :: drhoscfh (:,:), dvscfout (:,:)

  ! change of rho / scf potential (output)
  ! change of scf potential (output)
  complex(DP), allocatable :: ldos (:,:), ldoss (:,:), mixin(:), mixout(:), &
      dbecsum (:,:,:), dbecsum_nc(:,:,:,:), aux1 (:,:)
  complex(DP) :: cw
  complex(DP), allocatable :: etc(:,:)


  !HL dbecsum (:,:,:,:), dbecsum_nc(:,:,:,:,:), aux1 (:,:)
  ! Misc work space
  ! ldos : local density of states af Ef
  ! ldoss: as above, without augmentation charges
  ! dbecsum: the derivative of becsum
  ! becsum1 PAW array.
  REAL(DP), allocatable :: becsum1(:,:,:)

  !For approx, mixing scheme.
  real(kind=DP) :: DZNRM2
  complex(kind=DP) :: ZDOTC
  external ZDOTC, DZNRM2

  logical :: conv_root,  & ! true if linear system is converged
             exst,       & ! used to open the recover file
             lmetq0,     & ! true if xq=(0,0,0) in a metal
             cgsolver          
  
  integer :: kter,       & ! counter on iterations
             iter0,      & ! starting iteration
             ipert,      & ! counter on perturbations
             ibnd,       & ! counter on bands
             iter,       & ! counter on iterations
             lter,       & ! counter on iterations of linear system
             ltaver,     & ! average counter
             lintercall, & ! average number of calls to cgsolve_all
             ik, ikk,    & ! counter on k points
             ikq,        & ! counter on k+q points
             ig,         & ! counter on G vectors
             ndim,       & ! integer actual row dimension of dpsi
             is,         & ! counter on spin polarizations
             nt,         & ! counter on types
             na,         & ! counter on atoms
             nrec, nrec1,& ! the record number for dvpsi and dpsi
             ios,        & ! integer variable for I/O control
             mode,       & ! mode index
             igpert,     & ! bare perturbation g vector.
             lmres         ! number of gmres iterations to include when using bicgstabl.

  real(DP) :: tcpu, get_clock ! timing variables
  real(DP) :: meandvb
 
  external ch_psi_all, cg_psi, h_psi_all
  COMPLEX(DP) :: dpsip(npwx*npol, nbnd), dpsim(npwx*npol, nbnd)

  allocate (dpsi(npwx*npol, nbnd))
 
  IF (rec_code_read > 20 ) RETURN

!HL- Allocate arrays for dV_scf (need to alter these from (dfftp%nnr, nspin_mag, npe) 
!to just (dfftp%nnr, nspin_mag).
  npe    = 1
  imode0 = 1
  irr    = 1
  ipert  = 1
  lter   = 0
  lmres  = 1

!HLallocate (hpsi(npwx*npol, 4)) ! Test array for whether linear system is being properly solved
  call start_clock ('solve_linter')

  allocate (dvscfout ( dfftp%nnr , nspin_mag))    
  allocate (drhoscfh ( dfftp%nnr , nspin_mag))    
  allocate (etc(nbnd, nkstot))
  allocate (dbecsum ( (nhm * (nhm + 1))/2 , nat, nspin_mag))

  allocate (dvscfin  ( dfftp%nnr , nspin_mag))    
  if (doublegrid) then
     allocate (dvscfins ( dffts%nnr , nspin_mag))    
  else
     dvscfins => dvscfin
  endif

!Complex eigenvalues
  IF (noncolin) allocate (dbecsum_nc (nhm, nhm, nat, nspin))
  allocate (aux1 ( dffts%nnr, npol))    
  allocate (h_diag ( npwx*npol, nbnd))    

  iter0 = 0
  convt =.FALSE.
  where_rec='no_recover'

! The outside loop is over the iterations.
! niter_gw := maximum number of iterations
  
  do kter = 1, niter_gw
     iter = kter + iter0
     ltaver = 0
     lintercall = 0
     drhoscf(:,:)   = (0.d0, 0.d0)
     drhoscfh(:,:)  = (0.d0, 0.d0)
     dbecsum(:,:,:) = (0.d0, 0.d0)

     IF (noncolin) dbecsum_nc = (0.d0, 0.d0)
     if (nksq.gt.1) rewind (unit = iunigk)
!start kpoints loop
     do ik = 1, nksq
        if (nksq.gt.1) then
           read (iunigk, err = 100, iostat = ios) npw, igk
100        call errore ('solve_linter', 'reading igk', abs (ios) )
        endif
! lgamma is a q=0 computation
        if (lgamma)  npwq = npw
! k and k+q mesh defined in initialize_gw:
!       ikks(ik) = 2 * ik - 1
!       ikqs(ik) = 2 * ik
        ikk = ikks(ik)
        ikq = ikqs(ik)

        if (lsda) current_spin = isk (ikk)
        if (.not.lgamma.and.nksq.gt.1) then
           read (iunigk, err = 200, iostat = ios) npwq, igkq
200        call errore ('solve_linter', 'reading igkq', abs (ios) )
        endif
       !Calculates beta functions (Kleinman-Bylander projectors), with
       !structure factor, for all atoms, in reciprocal space
       !HL the beta functions (vkb) are being generated properly.  
        call init_us_2 (npwq, igkq, xk (1, ikq), vkb)

       !Reads unperturbed wavefuctions psi(k) and psi(k+q)

        if (nksq.gt.1) then
           if (lgamma) then
              call get_buffer (evc, lrwfc, iuwfc, ikk)
           else
              call get_buffer (evc, lrwfc, iuwfc, ikk)
              call get_buffer (evq, lrwfc, iuwfc, ikq)
           endif
        endif

!IS TPA preconditioner better?
        do ig = 1, npwq
           g2kin (ig) = ( (xk (1,ikq) + g (1, igkq(ig)) ) **2 + &
                          (xk (2,ikq) + g (2, igkq(ig)) ) **2 + &
                          (xk (3,ikq) + g (3, igkq(ig)) ) **2 ) * tpiba2
        enddo
        h_diag = 0.d0
        do ibnd = 1, nbnd_occ (ikk)
           do ig = 1, npwq
              h_diag(ig,ibnd)= 1.d0/max(1.0d0, g2kin(ig)/eprec(ibnd,ik))
           enddo
           IF (noncolin) THEN
              do ig = 1, npwq
                 h_diag(ig+npwx,ibnd)=1.d0/max(1.0d0,g2kin(ig)/eprec(ibnd,ik))
              enddo
           END IF
        enddo
!HL indices freezing perturbations.
           nrec = ik
!and now adds the contribution of the self consistent term
           if (where_rec =='solve_lint'.or.iter>1) then
             ! After the first iteration dvbare_q*psi_kpoint is read from file
              call get_buffer (dvpsi, lrbar, iubar, nrec)
             ! calculates dvscf_q*psi_k in G_space, for all bands, k=kpoint
             ! dvscf_q from previous iteration (mix_potential)
              call start_clock ('vpsifft')
              do ibnd = 1, nbnd_occ (ikk)
                 call cft_wave (evc (1, ibnd), aux1, +1)
                 call apply_dpot(aux1, dvscfins(1,1), current_spin)
                 call cft_wave (dvpsi (1, ibnd), aux1, -1)
              enddo
              call stop_clock ('vpsifft')
           else
               call dvqpsi_us (dvbarein, ik, .false.)
               call save_buffer (dvpsi, lrbar, iubar, nrec)
           endif
        ! Orthogonalize dvpsi to valence states: ps = <evq|dvpsi>
        ! Apply -P_c^+.
        ! -P_c^ = - (1-P_v^):
        !
           CALL orthogonalize(dvpsi, evq, ikk, ikq, dpsi)
        
           if(where_rec=='solve_lint'.or.iter > 1) then
              if(high_io) then
                 call get_buffer( dpsip, lrdwf, iudwfp, ik)
                 call get_buffer( dpsim, lrdwf, iudwfm, ik)
              else
                 dpsim(:,:)     = (0.d0, 0.d0) 
                 dpsip(:,:)     = (0.d0, 0.d0) 
              endif
              thresh = min (1.d-1 * sqrt (dr2), 1.d-2)
           else
            !
            ! At the first iteration dpsi and dvscfin are set to zero
            !
             dpsi(:,:)      = (0.d0, 0.d0) 
             dpsim(:,:)     = (0.d0, 0.d0) 
             dpsip(:,:)     = (0.d0, 0.d0) 
             if(iw.ge.2.and.high_io) then
                call get_buffer( dpsip, lrdwf, iudwfp, ik)
                call get_buffer( dpsim, lrdwf, iudwfm, ik)
              endif
              dvscfin(:, :)  = (0.d0, 0.d0)
              dvscfout(:, :) = (0.d0, 0.d0)
            !
            ! starting threshold for iterative solution of the linear system
              thresh = 1.0d-2
           endif

       conv_root = .true.
       etc(:,:)  = CMPLX( et(:,:), 0.0d0 , kind=DP)
       cw        = fiu(iw) 

       IF (iw.eq.1) THEN
               CALL cgsolve_all (h_psi_all, cg_psi, et(1,ikk), dvpsi, dpsip, h_diag, & 
                      npwx, npwq, thresh, ik, lter, conv_root, anorm, nbnd_occ(ikk), npol)
               dpsim(:,:) = dpsip(:,:)
               dpsi(:,:)  = dpsim(:,:)  
       ELSE
               CALL cbcg_solve(ch_psi_all, cg_psi, etc(1,ikk), dvpsi, dpsip, h_diag, &
                     npwx, npwq, thresh, ik, lter, conv_root, anorm, nbnd_occ(ikk), npol, cw, .true.)
               CALL cbcg_solve(ch_psi_all, cg_psi, etc(1,ikk), dvpsi, dpsim, h_diag, &
                     npwx, npwq, thresh, ik, lter, conv_root, anorm, nbnd_occ(ikk), npol, -cw, .true.)
               dpsi(:,:) = dcmplx(0.5d0,0.0d0)*(dpsim(:,:) + dpsip(:,:) ) 
       ENDIF

       ltaver = ltaver + lter
       lintercall = lintercall + 1

       IF (.NOT.conv_root) WRITE(1000+mpime, '(5x,"kpoint ",i4,"  ibnd ",i4, &
                  &              " solve_linter: root not converged ",e10.3)')  &
                  &                ik , ibnd, anorm
           nrec1 =  ik
         ! calculates dvscf, sum over k => dvscf_q_ipert
         ! incdrhoscf:  This routine computes the change of the charge density due to the
         !HL low/io
          if(high_io) then
           call save_buffer (dpsim, lrdwf, iudwfp, ik)
           call save_buffer (dpsip, lrdwf, iudwfm, ik)
          endif

         ! perturbation. It is called at the end of the computation of the
         ! change of the wavefunction for a given k point.
           weight = wk (ikk)

         IF (noncolin) THEN
           call incdrhoscf_w (drhoscf(1, 1) , weight, ik, &
                              dbecsum(1,1,current_spin), dpsi)
         ELSE
           call incdrhoscf_w (drhoscf(1, 1) , weight, ik, &
                              dbecsum(1,1,current_spin), dpsi(1,1))
         ENDIF

     enddo !on k-points

!#ifdef __MPI
     !
     !  The calculation of dbecsum is distributed across processors (see addusdbec)
     !  Sum over processors the contributions coming from each slice of bands
     !
!     IF (noncolin) THEN
!        call mp_sum ( dbecsum_nc, intra_pool_comm )
!     ELSE
!        call mp_sum ( dbecsum, intra_pool_comm )
!     ENDIF
!#endif


     if (doublegrid) then
         do is = 1, nspin_mag
            call cinterpolate (drhoscfh(1,1), drhoscf(1,1), 1)
         enddo
     else
            call zcopy (nspin_mag*dfftp%nnr, drhoscf(1,1), 1, drhoscfh(1,1), 1)
     endif
     !
     !In the noncolinear, spin-orbit case rotate dbecsum
     !
     !IF (noncolin.and.okvan) CALL set_dbecsum_nc(dbecsum_nc, dbecsum, npe)
     !
     !Now we compute for all perturbations the total charge and potential
     !
!HL NEED FOR ULTRASOFT:
!    call addusddens (drhoscfh, dbecsum, imode0, npe, 0)
!    if (.not.lgamma_gamma) then
!         call psyme (dvscfout)
!         IF ( noncolin.and.domag ) CALL psym_dmage(dvscfout)
!    endif

     call mp_sum ( drhoscf, inter_pool_comm )
     call mp_sum ( drhoscfh, inter_pool_comm )
     call zcopy (dfftp%nnr*nspin_mag, drhoscfh(1,1), 1, dvscfout(1,1),1)

     meandvb = sqrt ((sum(dreal(dvbarein)))**2.d0 + (sum(aimag(dvbarein)))**2.d0 )/float(dffts%nnr)
     if (meandvb.lt.1.d-8) then 
         CALL fwfft ('Dense', dvscfout(:,1), dfftp)
         dvscfout ( nl(1), current_spin ) = (0.d0, 0.0d0)
         CALL invfft ('Dense', dvscfout(:,1), dfftp)
     endif
!
! IF (okpaw) call mp_sum ( dbecsum, inter_pool_comm )
! for q->0 the Fermi level can shift.
! IF (lmetq0) call ef_shift(drhoscfh,ldos,ldoss,dos_ef,irr,npe,.false.)
!
     call dv_of_drho (1, dvscfout(1,1), .true.)

     nmix_gw = 4
     if (iw.eq.1) then
!Density reponse in real space should be real at zero freq no matter what!
!just using standard broyden for the zero freq. case.
        call mix_potential(2*dfftp%nnr*nspin_mag, dvscfout, dvscfin, alpha_mix(kter), &
                           dr2, tr2_gw, iter, nmix_gw, flmixdpot, convt)
     else
    !Is the hermitian mixing scheme still okay?
        call mix_potential_c(dfftp%nnr*nspin_mag, dvscfout(1,1), dvscfin(1,1), &
                             alpha_mix(kter), dr2, tr2_gw, iter, &
                             nmix_gw, convt)
     endif
 !if (lmetq0.and.convt) &
 !    call ef_shift (drhoscf, ldos, ldoss, dos_ef, irr, npe, .true.)
     CALL check_all_convt(convt)
     if (convt) goto 155
  !HL need this if we aren't dealing with electric fields:
  !lmetq0 = lgauss.and.lgamma
  !if (lmetq0) then
  !      allocate ( ldos ( dfftp%nnr  , nspin_mag) )
  !      allocate ( ldoss( dffts%nnr , nspin_mag) )
  !      call localdos ( ldos , ldoss , dos_ef )
  !endif
     if (doublegrid) then
           do is = 1, nspin_mag
              call cinterpolate (dvscfin(1,is), dvscfins(1,is), -1)
           enddo
     endif

#ifdef __PARA
     aux_avg (1) = DBLE (ltaver)
     aux_avg (2) = DBLE (lintercall)
     averlt = aux_avg (1) / aux_avg (2)
#else
     averlt = DBLE (ltaver) / lintercall
#endif
     tcpu = get_clock ('SGW')
     dr2 = dr2 
     CALL flush_unit( stdout )
     rec_code=10
  enddo !loop on kter (iterations)

155 iter0=0

!   WRITE( stdout, '(/,5x," iter # ",i3," total cpu time :",f8.1, &
!         "secs av.it.:",f5.1)') iter, tcpu, averlt
!   WRITE( stdout, '(5x," thresh=",es10.3, " alpha_mix = ",f6.3, &
!          &      " |ddv_scf|^2 = ",es10.3 )') thresh, alpha_mix (kter) , dr2
!   WRITE(1000+mpime, '(/,5x," iter # ",i3," total cpu time :",f8.1, &
!        "secs   av.it.: ",f5.1)') iter, tcpu, averlt

  drhoscf(:,:) = dvscfin(:,:)
  
  call mp_barrier(intra_image_comm)
 
  deallocate (h_diag)
  deallocate (aux1)
  deallocate (dbecsum)
  IF (noncolin) deallocate (dbecsum_nc)
  deallocate (dvscfout)
  deallocate (drhoscfh)
  if (doublegrid) deallocate (dvscfins)
  deallocate (dvscfin)
  deallocate (dpsi)
  call stop_clock ('solve_linter')
END SUBROUTINE solve_linter

SUBROUTINE check_all_convt(convt)
  USE mp,        ONLY : mp_sum
  USE mp_images, ONLY : nproc_image, me_image, intra_image_comm
  IMPLICIT NONE
  LOGICAL,INTENT(in) :: convt
  INTEGER,ALLOCATABLE :: convt_check(:)
  !
  IF(nproc_image==1) RETURN
  !
  ALLOCATE(convt_check(nproc_image+1))
  !
  convt_check = 1
  IF(convt) convt_check(me_image+1) = 0
  !
  CALL mp_sum(convt_check, intra_image_comm)
  !CALL mp_sum(ios, inter_pool_comm)
  !CALL mp_sum(ios, intra_bgrp_comm)
  !
!  convt = ALL(convt_check==0)
  IF(ANY(convt_check==0).and..not.ALL(convt_check==0) ) THEN
    CALL errore('check_all_convt', 'Only some processors converged: '&
               &' something is wrong with solve_linter', 1)
  ENDIF
  !
  DEALLOCATE(convt_check)
  RETURN
  !
END SUBROUTINE