module para_mod

  integer maxproc, ncplanex
  parameter (maxproc=64, ncplanex=30000)
  
  character*2 node
! node:    node number, useful for opening files
  integer nproc, me
! nproc:   number of processors
! me:      index (1,...,nproc) of this processor
! parallel fft information for the dense grid
!
! npp:     number of planes per processor
! n3:      n3(me)+1 = first  plane on proc. me            
! ncp:     number of (density) columns per proc  
! ncp0:    starting column for each processor
! ncplane: number of columns in a plane                   
! nct:     total number of non-zero columns               
! nnr_:    local fft data size                            
! ipc:     index saying which proc owns columns in a plane
! icpl:    index relating columns and pos. in the plane   
!
  integer  npp(maxproc), n3(maxproc), ncp(maxproc), ncp0(maxproc), &
           ncplane, nct, nnr_, ipc(ncplanex), icpl(ncplanex)
!
! parallel fft information for the smooth mesh
!
! npps:    number of planes per processor
! ncps:    number of (density) columns per proc 
! ncpw:    number of (wfs) columns per processor
! ncp0s:   starting column for each processor
! ncplanes: number of columns in a plane (smooth grid)
! ncts:    total number of non-zero columns
! nnrs_:   local fft data size
! ipcs:    index saying which proc owns columns in a plane
! icpls:   index relating columns and pos. in the plane 
!
  integer npps(maxproc), ncps(maxproc), ncpw(maxproc), ncp0s(maxproc), &
       ncplanes, ncts,  nnrs_, ipcs(ncplanex), icpls(ncplanex)

end module para_mod
!
!-----------------------------------------------------------------------
      subroutine startup
!-----------------------------------------------------------------------
!
!  This subroutine initializes MPI and sets NPROC and ME
!  (under IBM POE the number of processes is set by the
!  environment variable MP_PROCS)
!
      use para_mod
!
      implicit none
      include 'mpif.h'
      integer ierr
!
      call mpi_init(ierr)
      if (ierr.ne.0) call error('startup','mpi_init',ierr)
      call mpi_comm_size(MPI_COMM_WORLD,nproc,ierr)
      if (ierr.ne.0) call error('startup','mpi_comm_size',ierr)
      call mpi_comm_rank(MPI_COMM_WORLD,   me,ierr)
      if (ierr.ne.0) call error('startup','mpi_comm_rank',ierr)
!
      me=me+1
!
! the parent process (source) has me=1, child processes me=2,...,NPROC
! (for historical reasons; MPI ranks run from 0 to NPROC-1 instead)
!
      if (nproc.gt.maxproc)                                             &
     &   call error('startup',' too many processors ',nproc)
!
      if (me.lt.10) then
         write(node,'(i1,1x)') me
      else if (me.lt.100) then
         write(node,'(i2)') me
      else
         call error('startup','wow, 100 nodes or more !!',nproc)
      end if
!
! only the first processor writes
!
      if (me.eq.1) then
         write(6,'(/5x,''Parallel version (MPI)'')')
         write(6,'(5x,''Number of processors in use:   '',i4)') nproc
      else
          open(6,file='/dev/null',status='unknown')
!
! useful for debugging purposes 
!         open(6,file='out.'//node,status='unknown')
      end if
!
      return
      end
!
!-----------------------------------------------------------------------
      subroutine openfiles (tbuff,nbeg,ndr,ndw)
!-----------------------------------------------------------------------
!
! Open scratch files, parallel case
! Scratch files (units 21 and ndw) are opened 
! - in the current directory if the variable SCRDIR is not set, or
! - in the directory pointed by the environment variable SCRDIR, if set
! Other files - including input files - are opened in the current directory
!
! For IBM SP machines without a fast parallel file system, the file ndw
! is written to the current directory at the end of the run.
! If the current directory is on a distributed filesystem (like the home
! directory) this makes the final file visible from all nodes. For optimal
! performance, SCRDIR should be a local filesystem (to reduce network I/O).
!
      use para_mod
!
      implicit none
      logical tbuff
      integer nbeg,ndr, ndw
!
      integer i, ios
      character unitndr*2, unitndw*2, scrdir*80
!
!
      if (ndr.lt.10.or.ndr.ge.100) call error('openfiles','ndr ?!?',ndr)
      if (ndw.lt.10.or.ndw.ge.100) call error('openfiles','ndw ?!?',ndw)
      write(unitndr,'(i2)') ndr
      write(unitndw,'(i2)') ndw
#ifdef T3D
      call pxfgetenv('SCRDIR',0,scrdir,i,ios)
#else
      call getenv('SCRDIR',scrdir)
#endif
!
! add a '/' at the end if absent and needed
!
      i=len(trim(scrdir))
      if (i.gt.0) then
         if (scrdir(i:i).ne.'/') then
            if (i+1.le.len(scrdir)) then
               scrdir(i+1:i+1)='/'
            else
               call error('openfiles','scratch dir too long',i)
            end if
         end if
      end if
!
! unit 21 contains wavefunctions in real space - untested!
!
      if (tbuff) open(unit=21,file=trim(scrdir)//'fort21.'//node,       &
     &                        form='unformatted')
!
! in parallel execution only the first node reads/writes wavefunctions
!
      if (me.eq.1) then
         open(unit=ndw,file=trim(scrdir)//'fort.'//unitndw,             &
     &                           form='unformatted')
         if (nbeg.ge.-1) then
            if(ndr.ne.ndw) then
               open(unit=ndr,file='fort.'//unitndr,                     &
     &                       status='old',form='unformatted')
            else if (trim(scrdir).ne.' ') then
!
! if ndr=ndw, everything works if we simply do not open ndr;
! but since ndr is read from the current directory while ndw is written
! to SCRDIR, the trick fails unless SCRDIR is the current directory
!
               call error('openfiles','ndr=ndw not possible',1)
            end if
         end if
      end if
!
      return
      end
!
!-----------------------------------------------------------------------
      subroutine reopenfile (ndw)
!-----------------------------------------------------------------------
!
! IBM SP hack :
! close unit ndw (opened in SCRDIR) and reopen it in the current directory
!
      use para_mod
!
      implicit none
      integer ndw
      character unitndw*2
!
      close(unit=ndw)
      write(unitndw,'(i2)') ndw
      open(unit=ndw,file='fort.'//unitndw,form='unformatted')
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine read_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! read rhor(nnr,nspin) from file on the first node and distribute it to the other nodes
!
      use para_mod
      use parm
      implicit none
      include 'mpif.h'
      integer unit, nspin
      real*8 rhor(nnr,nspin)
!
      integer ir, is
      integer root, proc, ierr, n, displs(nproc), sendcount(nproc)
      real*8, allocatable:: rhodist(:)
!
!
      if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
      root = 0
      do proc=1,nproc
         sendcount(proc) =  ncplane*npp(proc)
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + sendcount(proc-1)
         end if
      end do
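!
! illustration of the layout built above (hypothetical values):
! nproc=3, ncplane=100, npp=(4,3,3) gives sendcount=(400,300,300)
! and displs=(0,400,700), i.e. each processor receives its own
! contiguous slab of xy planes from rhodist
!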
      do is=1,nspin
!
! read the charge density from unit "unit" on first node only
!
         if (me.eq.1) read(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
!
! distribute the charge density to the other nodes
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         call mpi_scatterv(rhodist, sendcount, displs, MPI_REAL8,       &
     &                     rhor(1,is),sendcount(me),   MPI_REAL8,       &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('read_rho','mpi_scatterv',ierr)
!
! just in case: set to zero unread elements (if any)
!
         do ir=sendcount(me)+1,nnr
            rhor(ir,is)=0.d0
         end do
      end do
      if (me.eq.1) deallocate(rhodist)
!
      return
      end subroutine read_rho
!
!----------------------------------------------------------------------
      subroutine write_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! collect rhor(nnr,nspin) on the first node and write it to file
!
      use para_mod
      use parm
      implicit none
      include 'mpif.h'
      integer unit, nspin
      real*8 rhor(nnr,nspin)
!
      integer ir, is
      integer root, proc, ierr, displs(nproc), recvcount(nproc)
      real*8, allocatable:: rhodist(:)
!
!
      if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
!
      root = 0
      do proc=1,nproc
         recvcount(proc) =  ncplane*npp(proc)
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + recvcount(proc-1)
         end if
      end do
!
      do is=1,nspin
!
! gather the charge density on the first node
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         call mpi_gatherv (rhor(1,is), recvcount(me), MPI_REAL8,        &
     &                     rhodist,recvcount, displs, MPI_REAL8,        &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_rho','mpi_gatherv',ierr)
!
! write the charge density to unit "unit" from first node only
!
         if (me.eq.1) write(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
      end do
      if (me.eq.1) deallocate(rhodist)
!
      return
      end subroutine write_rho
!
!
!-----------------------------------------------------------------------
      subroutine set_fft_para( b1, b2, b3, gcut, gcuts, gcutw,          &
     &                        nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr,     &
     &                        nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs )
!-----------------------------------------------------------------------
!
! distribute columns to processors for the parallel fft
! based on code written by Stefano de Gironcoli for PWSCF
! columns are sets of G-vectors along z: G = i1*b1 + i2*b2 + i3*b3,
! with G^2 < gcut, (i1,i2) fixed, and i3 running over the grid.
! A column at (i1,i2) is "active" if it contains a nonzero number
! of wavefunction plane waves (i.e. G-vectors with G^2 < gcutw)
!
      use para_mod
!
      implicit none
      real*8 b1(3), b2(3), b3(3), gcut, gcuts, gcutw
      integer nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr,                     &
     &        nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs
!
      integer ngc(ncplanex),&! number of g-vectors per column (dense grid)
     &        ngcs(ncplanex),&! number of g-vectors per column (smooth grid)
     &        ngcw(ncplanex),&! number of wavefct plane waves per column
     &        in1(ncplanex),&! index i1 for column (i1,i2)
     &        in2(ncplanex),&! index i2 for column (i1,i2)
     &        ic,           &! fft index for this column (dense grid)
     &        ics,          &! as above for the smooth grid
     &        icm,          &! fft index for column (-i1,-i2) (dense grid)
     &        icms,         &! as above for the smooth grid
     &        index(ncplanex),&! used to order columns
     &        ncp_(maxproc),&! number of columns per processor (work space)
     &        ngp(maxproc), &! number of g-vectors per proc (dense grid)
     &        ngps(maxproc),&! number of g-vectors per proc (smooth grid)
     &        ngpw(maxproc)  ! number of wavefct plane waves per proc
!
      integer np, nps1,            &! counters on planes 
     &        nq, nqs,             &! counters on planes
     &        max1,min1,max2,min2, &! aux. variables
     &        m1, m2, n1, n2, i, mc,&! generic counters
     &        idum, nct_,          &! check variables 
     &        j,jj,                &! counters on processors
     &        n1m1,n2m1,n3m1,      &! nr1-1 and so on
     &        i1, i2, i3,          &! counters on G space
     &        good_fft_dimension    ! a function with obvious meaning
      real*8                                                              &
     &        aux(ncplanex), &! used to order columns
     &        amod            ! modulus of G vectors
!
      call tictac(27,0)
!
! set the dimensions of fft arrays
!
      nr1x =good_fft_dimension(nr1 )
      nr2x = nr2
      nr3x =good_fft_dimension(nr3 )
!
      nr1sx=good_fft_dimension(nr1s)
      nr2sx=nr2s
      nr3sx=good_fft_dimension(nr3s)
!
!     compute number of columns for each processor
!
      ncplane = nr1x*nr2x
      ncplanes= nr1sx*nr2sx
      if (ncplane.gt.ncplanex .or. ncplanes.gt.ncplanex)                &
     &     call error('set_fft_para','ncplanex too small',ncplane)
!
! set the number of planes per processor
!
      if (nr3.lt.nproc) call error('set_fft_para',                      &
     &                'some processors have no planes ',-1)      
      if (nr3s.lt.nproc) call error('set_fft_para',                     &
     &                'some processors have no smooth planes ',-1)      
!
      if (nproc.eq.1) then
         npp(1) = nr3
         npps(1)= nr3s
      else
         np = nr3/nproc
         nq = nr3 - np*nproc
         nps1 = nr3s/nproc
         nqs = nr3s - nps1*nproc
         do i = 1, nproc
            npp(i) = np
            if (i.le.nq) npp(i) = np + 1
            npps(i) = nps1
            if (i.le.nqs) npps(i) = nps1 + 1
         end do
      end if
      n3(1)= 0
      do i = 2, nproc
         n3(i)=n3(i-1)+npp(i-1)
      end do
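!
! example of the distribution above: nr3=14 on nproc=4 gives np=3,
! nq=2, hence npp=(4,4,3,3) and n3=(0,4,8,11): the first nq
! processors get one extra plane
!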
!
!     Now compute, for each point of the big plane, how many columns
!     have nonzero G-vectors on the dense and smooth grids
!
      do mc = 1,ncplane
         ipc(mc) = 0
      end do 
      do mc = 1,ncplanes
         ipcs(mc)= 0
      end do 
!
      nct = 0
      ncts= 0
!
! NOTA BENE: the exact limits for a correctly sized FFT grid are:
! -nr/2,..,+nr/2  for nr even; -(nr-1)/2,..,+(nr-1)/2  for nr odd.
! If the following limits are increased, a slightly undersized fft 
! grid, with some degree of G-vector refolding, can be used
! (at your own risk - a check is done in ggen).
!
      n1m1=nr1/2
      n2m1=nr2/2
      n3m1=nr3/2
!
      do i1=-n1m1,n1m1
         do i2=-n2m1,n2m1
!
! nct counts columns containing G-vectors for the dense grid
!
            nct=nct+1
            if (nct.gt.ncplane)                                         &
     &           call error('set_fft_para','too many columns',1)
            ngc (nct) = 0
            ngcs(nct) = 0
            ngcw(nct) = 0
!
            do i3 = -n3m1,n3m1
               amod = (b1(1)*i1 + b2(1)*i2 + b3(1)*i3)**2 +             &
     &                (b1(2)*i1 + b2(2)*i2 + b3(2)*i3)**2 +             &
     &                (b1(3)*i1 + b2(3)*i2 + b3(3)*i3)**2 
               if (amod.le.gcut )                                       &
     &              ngc (nct)= ngc (nct)+ 1
               if (amod.le.gcuts)                                       &
     &              ngcs(nct)= ngcs(nct)+ 1
               if (amod.le.gcutw)                                       &
     &              ngcw(nct)= ngcw(nct)+ 1
            enddo
            if (ngc(nct).gt.0) then
!
! this column contains G-vectors
!
               in1(nct) = i1
               in2(nct) = i2
               if (ngcs(nct).gt.0) then
!
! ncts counts columns containing G-vectors for the smooth grid
!
                  ncts=ncts+1
                  if (ncts.gt.ncplanes)                                 &
     &                 call error('set_fft_para','too many columns',2)
               end if
            else
!
! this column has no G-vectors: reset the counter
!
               nct=nct-1
            end if
         enddo
      end do
!
      if(nct .eq.0) call error('set_fft_para','number of columns 0', 1)
      if(ncts.eq.0) call error('set_fft_para',                          &
     &                                 'number of smooth columns 0', 1)
!
!   Sort the columns: first those with the largest number of G-vectors
!   on the wavefunction sphere (the active columns),
!   then on the smooth sphere, then on the dense sphere. Dirty trick:
!
      do mc = 1,nct
         aux(mc)=-(ngcw(mc)*nr3x**2 + ngcs(mc)*nr3x + ngc(mc))
      end do
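!
! the key built above packs the three counts into a single number so
! that one sort orders columns by ngcw first, then ngcs, then ngc
! (assuming each count stays below nr3x). E.g. with nr3x=20,
! ngcw=5, ngcs=8, ngc=12 the key is -(5*400 + 8*20 + 12) = -2172
!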
      call kb07ad(aux,nct,index) 
!
! assign columns to processes
!
      do j=1,nproc
         ncp (j) = 0
         ncps(j) = 0
         ncpw(j) = 0
         ngp (j) = 0
         ngps(j) = 0
         ngpw(j) = 0
      end do
!
      do mc=1, nct
         i = index(mc)
!
! index contains the desired ordering of columns (see above)
!
         i1=in1(i)
         i2=in2(i)
!
         if ( i1.lt.0.or.(i1.eq.0.and.i2.lt.0) ) go to 30
!
! only half of the columns, plus column (0,0), are scanned:
! column (-i1,-i2) must be assigned to the same proc as column (i1,i2)
! 
! ic  :  position, in fft notation, in dense grid, of column ( i1, i2)
! icm :      "         "      "          "    "         "    (-i1,-i2)
! ics :      "         "      "        smooth "         "    ( i1, i2)
! icms:      "         "      "        smooth "         "    (-i1,-i2)
!
         m1 = i1 + 1
         if (m1.lt.1) m1 = m1 + nr1
         m2 = i2 + 1
         if (m2.lt.1) m2 = m2 + nr2
         ic = m1 + (m2-1)*nr1x
!
         n1 = -i1 + 1
         if (n1.lt.1) n1 = n1 + nr1
         n2 = -i2 + 1
         if (n2.lt.1) n2 = n2 + nr2
         icm = n1 + (n2-1)*nr1x
!
         m1 = i1 + 1
         if (m1.lt.1) m1 = m1 + nr1s
         m2 = i2 + 1
         if (m2.lt.1) m2 = m2 + nr2s
         ics = m1 + (m2-1)*nr1sx
!
         n1 =-i1 + 1
         if (n1.lt.1) n1 = n1 + nr1s
         n2 =-i2 + 1
         if (n2.lt.1) n2 = n2 + nr2s
         icms = n1 + (n2-1)*nr1sx
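!
! illustration of the refolding above: with nr1=12, nr1x=13, column
! (i1,i2)=(-1,0) gives m1=0 -> m1=12, m2=1, hence ic=12, while its
! opposite column (1,0) gives n1=2, n2=1, hence icm=2
!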
!
         jj=1
         if (ngcw(i).gt.0) then
!
! this is an active column: find which processor has currently
! the smallest number of plane waves 
!
            do j=1,nproc
               if (ngpw(j).lt.ngpw(jj)) jj = j
            end do     
         else
!
! this is an inactive column: find which processor has currently
! the smallest number of G-vectors
!
            do j=1,nproc
               if (ngp(j).lt.ngp(jj)) jj = j
            end do 
         end if
!
! jj is the processor to which this column is assigned
! use -jj for inactive columns, jj for active columns
!
         ipc(ic) = -jj
         ncp(jj) = ncp(jj) + 1
         ngp(jj) = ngp(jj) + ngc(i)
         if (ngcs(i).gt.0) then
            ncps(jj)=ncps(jj)+1
            ngps(jj)=ngps(jj)+ngcs(i)
            ipcs(ics)=-jj
         endif
         if (ngcw(i).gt.0) then
            ipcs(ics)=jj
            ipc(ic) = jj
            ngpw(jj)= ngpw(jj) + ngcw(i)
            ncpw(jj)= ncpw(jj) + 1
         endif
!
! now assign the (-i1,-i2) column to the same processor
!
         if (i1.eq.0.and.i2.eq.0) go to 30
!
! do not count twice column (0,0) !
!
         ipc(icm) = -jj
         ncp(jj) = ncp(jj) + 1
         ngp(jj) = ngp(jj) + ngc(i)
         if (ngcs(i).gt.0) then
            ncps(jj)=ncps(jj)+1
            ngps(jj)=ngps(jj)+ngcs(i)
            ipcs(icms)=-jj
         endif
         if (ngcw(i).gt.0) then
            ipcs(icms)=jj
            ipc(icm) = jj
            ngpw(jj)= ngpw(jj) + ngcw(i)
            ncpw(jj)= ncpw(jj) + 1
         endif
 30      continue
      enddo 
!
! ipc  is the processor for this column in the dense grid
! ipcs is the same, for the smooth grid
!
      write(6,'(                                                        &
     & '' Proc  planes cols    G   planes cols    G    columns  G''/    &
     & ''       (dense grid)     (smooth grid)   (wavefct grid)'')')
      do i=1,nproc
         write(6,'(i3,2x,3(i5,2i7))') i, npp(i),ncp(i),ngp(i),          &
     &        npps(i),ncps(i),ngps(i), ncpw(i), ngpw(i)
      end do
!
! nnr_ and nnrs_ are copies of nnr and nnrs, the local fft data sizes,
! stored in the "parallel" module. Not a very elegant solution.
!
      if (nproc.eq.1) then
         nnr =nr1x*nr2x*nr3x
         nnrs=nr1sx*nr2sx*nr3sx
      else
         nnr = max(nr3x*ncp(me), nr1x*nr2x*npp(me))
         nnrs= max(nr3sx*ncps(me), nr1sx*nr2sx*npps(me))
      end if
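!
! the max above is taken because the same work array must hold both
! the "columns" representation (nr3x elements per column, ncp(me)
! columns) and the "planes" representation (ncplane elements per
! plane, npp(me) planes) during the parallel fft
!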
      nnr_= nnr
      nnrs_= nnrs
!
!   computing the starting column for each processor
!
      do i=1,nproc
         if(ngpw(i).eq.0)                                               &
     &        call error('set_fft_para',                                &
     &        'some processors have no pencils, not yet implemented',1)
         if (i.eq.1) then 
            ncp0(i) = 0
            ncp0s(i)= 0
         else
            ncp0(i) = ncp0 (i-1) + ncp (i-1)
            ncp0s(i)= ncp0s(i-1) + ncps(i-1)
         endif
      enddo
!
!  Now compute the arrays ipc and icpl (dense grid):
!     ipc contains the local column number on the owning processor,
!         zero if the column does not belong to this processor.
!         Note that the input ipc is used and overwritten.
!     icpl contains the position in the plane of each column
!
!- active columns first........
!
      do j=1,nproc
         ncp_(j) = 0
      end do
      do mc =1,ncplane
         if (ipc(mc).gt.0) then
            j = ipc(mc)
            ncp_(j) = ncp_(j) + 1
            icpl(ncp_(j) + ncp0(j)) = mc 
            if (j.eq.me) then
               ipc(mc) = ncp_(j)
            else
               ipc(mc) = 0
            end if 
         end if
      end do
!
!-..... ( intermediate check ) ....
!
      do j=1,nproc
         if (ncp_(j).ne.ncpw(j))                                        &
     &        call error('set_fft_para','ncp_(j).ne.ncpw(j)',j)
      end do
!
!- ........then the remaining columns
!
      do mc =1,ncplane
         if (ipc(mc).lt.0) then
            j = -ipc(mc)
            ncp_(j) = ncp_(j) + 1
            icpl(ncp_(j) + ncp0(j)) = mc 
            if (j.eq.me) then
               ipc(mc) = ncp_(j)
            else
               ipc(mc) = 0
            end if 
         end if
      end do
!
!-... ( final check )
!
      nct_ = 0
      do j=1,nproc
         if (ncp_(j).ne.ncp(j))                                         &
     &        call error('set_fft_para','ncp_(j).ne.ncp(j)',j)
         nct_ = nct_ + ncp_(j)
      end do
      if (nct_.ne.nct)                                                  &
     &     call error('set_fft_para','nct_.ne.nct',1)
!
!   now compute the arrays ipcs and icpls 
!   (as ipc and icpl, for the smooth grid)
!
!   active columns first...
!
      do j=1,nproc
         ncp_(j) = 0
      end do
      do mc =1,ncplanes
         if (ipcs(mc).gt.0) then
            j = ipcs(mc)
            ncp_(j)=ncp_(j) + 1
            icpls(ncp_(j) + ncp0s(j)) = mc
            if (j.eq.me) then
               ipcs(mc) = ncp_(j)
            else
               ipcs(mc) = 0
            endif
         endif
      enddo
!
!-..... ( intermediate check ) ....
!
      do j=1,nproc
         if (ncp_(j).ne.ncpw(j))                                        &
     &        call error('set_fft_para','ncp_(j).ne.ncpw(j)',j)
      end do
!
!    and then all the others
!
      do mc =1,ncplanes
         if (ipcs(mc).lt.0) then
            j = -ipcs(mc)
            ncp_(j) = ncp_(j) + 1
            icpls(ncp_(j) + ncp0s(j)) = mc
            if (j.eq.me) then
               ipcs(mc) = ncp_(j)
            else
               ipcs(mc) = 0
            end if
         end if
      end do
!
!-... ( final check )
!
      nct_ = 0
      do j=1,nproc
         if (ncp_(j).ne.ncps(j))                                        &
     &        call error('set_fft_para','ncp_(j).ne.ncps(j)',j)
         nct_ = nct_ + ncp_(j)
      end do
      if (nct_.ne.ncts)                                                 &
     &     call error('set_fft_para','nct_.ne.ncts',1)
      call tictac(27,1)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine cfftp(f,nr1,nr2,nr3,nr1x,nr2x,nr3x,sign)
!----------------------------------------------------------------------
!
!   sign = +-1 : parallel 3d fft for rho and for the potential
!
!   sign = +1 : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!               fft along z using pencils (cft_1)
!               transpose across nodes    (fft_scatter)
!                  and reorder
!               fft along y and x         (cft_2)
!   sign = -1 : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega 
!               fft along x and y         (cft_2)
!               transpose across nodes    (fft_scatter)
!                  and reorder
!               fft along z using pencils (cft_1)
!
!   based on code written by Stefano de Gironcoli for PWSCF
!
      use para_mod
      use work_fft
!
      implicit none
      integer nr1,nr2,nr3,nr1x,nr2x,nr3x, sign, nppx
      complex*16  f(nnr_)
      integer  mc, i, j, ii
!
! the following is needed if the fft is distributed over only one processor
! for the special case nr3x.ne.nr3. Not an elegant solution, but simple, fast,
! and better than the preceding one that did not work in some cases. Note 
! that fft_scatter does nothing if nproc=1. PG
!
      if (nproc.eq.1) then
         nppx=nr3x
      else
         nppx=npp(me)
      end if
      if (sign.eq.1) then
         call cft_1(f,ncp(me),nr3,nr3x,sign,aux)
         call fft_scatter(nproc,me,aux,nr3x,nnr_,f,ncp,npp,sign)
         call zero(2*nnr_,f)
         do i=1,nct
            mc = icpl(i)
            do j=1,npp(me)
               f(mc+(j-1)*ncplane) = aux(j + (i-1)*nppx)
            end do
         end do
!
         call cft_2(f,npp(me),nr1,nr2,nr1x,nr2x,sign)
!
      else if (sign.eq.-1) then
!
         call cft_2(f,npp(me),nr1,nr2,nr1x,nr2x,sign)
!
         do i=1,nct
            mc = icpl(i)
            do j=1,npp(me)
               aux(j + (i-1)*nppx) = f(mc+(j-1)*ncplane)
            end do
         end do
         call fft_scatter(nproc,me,aux,nr3x,nnr_,f,ncp,npp,sign)
         call cft_1(aux,ncp(me),nr3,nr3x,sign,f)
      else
          call error('cfftp','not allowed',abs(sign))
      end if
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine cfftps (f,nr1,nr2,nr3,nr1x,nr2x,nr3x,sign)
!----------------------------------------------------------------------
!
!   sign = +-1 : parallel 3d fft for rho and for the potential
!   sign = +-2 : parallel 3d fft for wavefunctions
!
!   sign = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!              fft along z using pencils        (cft_1)
!              transpose across nodes           (fft_scatter)
!                 and reorder
!              fft along y (using planes) and x (cft_2)
!   sign = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega 
!              fft along x and y (using planes)  (cft_2)
!              transpose across nodes           (fft_scatter)
!                 and reorder
!              fft along z using pencils        (cft_1)
!
!   The array "planes" signals whether an fft is needed along y :
!     planes(i)=0 : column f(i,*,*) empty , don't do fft along y
!     planes(i)=1 : column f(i,*,*) filled, fft along y needed
!   "empty" = no active components are present in f(i,*,*) 
!             after (sign>0) or before (sign<0) the fft along z
!
!   Note that if sign=+/-1 (fft on rho and pot.) all fft's are needed
!   and all planes(i) are set to 1
!
!   based on code written by Stefano de Gironcoli for PWSCF
!
      use para_mod
      use work_fft
!
      implicit none
      integer nr1,nr2,nr3,nr1x,nr2x,nr3x,sign
      complex*16  f(nnrs_)
      integer  mc, i, j, ii, proc, k, nppx
      integer planes(nr1x)
!
! see comments in cfftp above for the logic (or lack of it) of the following
!
      if (nproc.eq.1) then
         nppx=nr3x
      else
         nppx=npps(me)
      end if
      if (sign.gt.0) then
         if (sign.ne.2) then
            call cft_1s(f,ncps(me),nr3,nr3x,sign,aux)
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncps,npps,sign)
            call zero(2*nnrs_,f)
            do i=1,ncts
               mc = icpls(i)
               do j=1,npps(me)
                  f(mc+(j-1)*ncplanes) = aux(j + (i-1)*nppx)
               end do
            end do
            do i=1,nr1x
              planes(i) = 1 
            enddo
         else
            call cft_1s(f,ncpw(me),nr3,nr3x,sign,aux)
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncpw,npps,sign)
            call zero(2*nnrs_,f)
            ii = 0
            do i=1,nr1x
              planes(i) = 0 
            enddo
            do proc=1,nproc
               do i=1,ncpw(proc)
                  mc = icpls(i+ncp0s(proc))
                  ii = ii + 1 
                  k=mod(mc-1,nr1x)+1
                  planes(k) = 1
                  do j=1,npps(me)
                     f(mc+(j-1)*ncplanes) = aux(j + (ii-1)*nppx)
                  end do
               end do
            end do
         end if
!
         call cft_2s(f,npps(me),nr1,nr2,nr1x,nr2x,sign,planes)
!
      else
!
         if (sign.ne.-2) then
            do i=1,nr1x
               planes(i) = 1
            end do
         else
            do i=1,nr1x
               planes(i) = 0
            end do
            do proc=1,nproc
               do i=1,ncpw(proc)
                  mc = icpls(i+ncp0s(proc))
                  k=mod(mc-1,nr1x)+1
                  planes(k) = 1
               enddo
            enddo
         endif
!
         call cft_2s(f,npps(me),nr1,nr2,nr1x,nr2x,sign,planes)
!
         if (sign.ne.-2) then
            do i=1,ncts
               mc = icpls(i)
               do j=1,npps(me)
                  aux(j + (i-1)*nppx) = f(mc+(j-1)*ncplanes)
               end do
            end do
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncps,npps,sign)
            call cft_1s(aux,ncps(me),nr3,nr3x,sign,f)
         else
            ii = 0
            do proc=1,nproc
               do i=1,ncpw(proc)
                  mc = icpls(i+ncp0s(proc))
                  ii = ii + 1 
                  do j=1,npps(me)
                     aux(j + (ii-1)*nppx) = f(mc+(j-1)*ncplanes)
                  end do
               end do
            end do
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncpw,npps,sign)
            call cft_1s(aux,ncpw(me),nr3,nr3x,sign,f)
         end if
      end if
!
      return
      end
!
!------------------------------------------------------------------------
      subroutine fft_scatter                                            &
     &     (nproc, me, f_in, nr3x, nnr_, f_aux, ncp_, npp_, sign)
!------------------------------------------------------------------------
!
! transpose the fft grid across nodes
! a) From columns to planes (sign > 0)
!
!    "columns" (or "pencil") representation:
!    processor "me" has ncp_(me) contiguous columns along z
!    Each column has nr3x elements for an fft of order nr3
!    (nr3x is typically nr3+1; the extra element reduces memory conflicts)
!
!    The transpose takes place in two steps:
!    1) on each processor the columns are divided into slices along z
!       that are stored contiguously. On processor "me", the slices
!       destined to processor "proc" are npp_(proc)*ncp_(me) elements
!    2) all processors communicate to exchange slices
!       (all columns with z in the slice belonging to "me"
!        must be received, all the others must be sent to "proc")
!    Finally one gets the "planes" representation:
!    processor "me" has npp_(me) complete xy planes
!
!  b) From planes to columns (sign < 0)
!
!  Quite the same in the opposite direction
!
!  The output is overwritten on f_in ; f_aux is used as work space
!
!  based on code written by Stefano de Gironcoli for PWSCF
!
      implicit none
      integer maxproc
      parameter (maxproc=64)
!
      include 'mpif.h'
      integer  nproc, me, nr3x, nnr_, sign, ncp_(nproc), npp_(nproc)
      real*8     f_in(2*nnr_), f_aux(2*nnr_)
!
      integer  dest, from, k, offset1(maxproc),                         &
     &         sendcount(maxproc), sdispls(maxproc),                    &
     &         recvcount(maxproc), rdispls(maxproc),                    &
     &         proc, ierr
!
!
      if (nproc.eq.1) return
      call tictac(28,0)
!
! sendcount(proc): amount of data processor "me" must send to processor proc
! recvcount(proc): amount of data processor "me" must receive from      proc
!
      do proc = 1, nproc
         sendcount(proc) = 2*npp_(proc)*ncp_(me)
         recvcount(proc) = 2*npp_(me)*ncp_(proc)
      end do
!
! offset1(proc) is used to locate the slices to be sent to proc
! sdispls(proc)+1 is the beginning of data that must be sent to proc
! rdispls(proc)+1 is the beginning of data that must be received from proc
!
      offset1(1) = 1
      sdispls(1)=0
      rdispls(1)=0
      do proc = 2, nproc
         offset1(proc) = offset1(proc-1) + 2 * npp_(proc-1)
         sdispls(proc) = sdispls(proc-1) + sendcount(proc-1)
         rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
      end do
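!
! illustration (hypothetical values): nproc=2, me=1, npp_=(5,4),
! ncp_=(3,4) gives sendcount=(30,24), recvcount=(30,40),
! offset1=(1,11), sdispls=(0,30), rdispls=(0,30);
! the factors of 2 account for real and imaginary parts
!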
!
      if(sign.gt.0) then
!
! "forward" scatter from columns to planes
!
! step one: store contiguously the slices
!
         do proc = 1, nproc
            from = offset1(proc)
            dest = 1 + sdispls(proc)
            do k = 1, ncp_(me)
               call COPY ( 2 * npp_(proc),                              &
     &              f_in ( from + 2*(k-1)*nr3x) , 1,                    &
     &              f_aux( dest + 2*(k-1)*npp_(proc) ) , 1 )
            end do
         end do
!
! maybe useless; ensures that no garbage is present in the output
!
         call zero (2*nnr_,f_in)
!
! step two: communication
!
         call mpi_barrier( MPI_COMM_WORLD, ierr )
         call mpi_alltoallv(f_aux,sendcount,sdispls,MPI_REAL8,          &
     &                      f_in ,recvcount,rdispls,MPI_REAL8,          &
     &                      MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('fft_scatter','ierr<>0',ierr)
!
      else
!
!  step two: communication
!
         call mpi_barrier( MPI_COMM_WORLD, ierr )
         call mpi_alltoallv(f_in ,recvcount,rdispls,MPI_REAL8,          &
     &                      f_aux,sendcount,sdispls,MPI_REAL8,          &
     &                      MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('fft_scatter','ierr<>0',ierr)
!
!  step one: store contiguously the columns
!
         call zero (2*nnr_,f_in)
!
         do proc = 1, nproc
            from = 1 + sdispls(proc)
            dest = offset1(proc)
            do k = 1, ncp_(me)
               call COPY ( 2 * npp_(proc),                              &
     &              f_aux( from + 2*(k-1)*npp_(proc) ) , 1 ,            &
     &              f_in ( dest + 2*(k-1)*nr3x) , 1 )
            end do
         end do
!
      end if
      call tictac(28,1)
!
      return
      end 
!
!-----------------------------------------------------------------------
      subroutine reduce(size,ps)
!-----------------------------------------------------------------------
!
!     sums the distributed variable ps(size) over the processors.
!     This version uses a fixed-length buffer of appropriate (?) size
!
      use para_mod
!
      implicit none
      integer size
      real*8  ps(size)
!
      include 'mpif.h'
      integer ierr, n, nbuf
      integer, parameter:: MAXB=10000
      real*8 buff(MAXB)
!
      if (nproc.le.1) return
      if (size.le.0) return
      call tictac(29,0)
!
!  synchronize processes
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0) call error('reduce','error in barrier',ierr)
!
      nbuf=size/MAXB
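!
! e.g. size=25000 with MAXB=10000 gives nbuf=2 full buffers,
! plus a remainder of 5000 elements handled below
!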
!
      do n=1,nbuf
         call mpi_allreduce (ps(1+(n-1)*MAXB), buff, MAXB,              &
     &        MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0)                                                 &
     &        call error('reduce','error in allreduce1',ierr)
         call COPY(MAXB,buff,1,ps(1+(n-1)*MAXB),1)
      end do
!
!    possibly remaining elements ( size-nbuf*MAXB < MAXB )
!
      if (size-nbuf*MAXB.gt.0) then
          call mpi_allreduce (ps(1+nbuf*MAXB), buff, size-nbuf*MAXB,    &
     &          MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
          if (ierr.ne.0)                                                &
     &         call error('reduce','error in allreduce2',ierr)
          call COPY(size-nbuf*MAXB,buff,1,ps(1+nbuf*MAXB),1)
      endif
      call tictac(29,1)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine print_para_times
!----------------------------------------------------------------------
!
      use para_mod
      use timex_mod
!
      implicit none
      include 'mpif.h'
      integer i, ierr
      real*8 mincpu(maxclock), maxcpu(maxclock)
      character*10 routine(maxclock)
      data routine / '   full   ',                                      &
     &               '   phbox  ',                                      &
     &               '  strucf  ',                                      &
     &               '  rhoofr  ',                                      &
     &               '  vofrho  ',                                      &
     &               '   newd   ',                                      &
     &               '  dforce  ',                                      &
     &               '   nlfor  ',                                      &
     &               '  calphi  ',                                      &
     &               '  r_iter  ',                                      &
     &               '   ortho  ',                                      &
     &               '  wfs**2  ',                                      &
     &               '   rhov   ',                                      &
     &               '   fft    ',                                      &
     &               '   ffts+w ',                                      &
     &               '   fftb   ',                                      &
     &               '  2*nlsm1 ',                                      &
     &               '   v*wf   ',                                      &
     &               '    abc   ',                                      &
     &               '    rsg   ',                                      &
     &               '  x_iter  ',                                      &
     &               '  nlsm2   ',                                      &
     &               ' nl_dforce',                                      &
     &               ' becp-phi ',                                      &
     &               '    bhs   ',                                      &
     &               '  prefor  ',                                      &
     &               'setfftpara',                                      &
     &               'fftscatter',                                      &
     &               '  reduce  ',                                      &
     &               '          '/
!
!  synchronize processes
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0)                                                    &
     &     call error('print_para_times','error in barrier',ierr)
!
      call mpi_allreduce (cputime, mincpu, maxclock,                    &
     &     MPI_REAL8, MPI_MIN, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0)                                                    &
     &     call error('print_para_times','error in minimum',ierr)
      call mpi_allreduce (cputime, maxcpu, maxclock,                    &
     &     MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0)                                                    &
     &     call error('print_para_times','error in maximum',ierr)
!
      write(6,*) ' routine     calls     cpu time      elapsed'
      write(6,*) '             node0  node0, min, max   node0'
      write(6,*)
      do i=1, maxclock
         if (ntimes(i).gt.0) write(6,30) routine(i),                    &
     &        ntimes(i), cputime(i), mincpu(i),maxcpu(i), elapsed(i)
      end do
 30   format(a10,i7,3f7.1,f7.1)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine write_wfc(unit,c)
!----------------------------------------------------------------------
!
! collect wavefunctions on first node and write to file
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
!
      implicit none
      include 'mpif.h'
      integer unit
      complex*16 c(ngw,nx)
      complex*16, pointer:: psis(:)
!
      integer i, ii, ig, proc, ierr, ntot, ncol, mc
      integer nmin(3), nmax(3), n1,n2,nzx,nz,nz_
      integer root, displs(nproc), recvcount(nproc)
      complex*16, allocatable:: psitot(:), psiwr(:,:,:)
!
! nmin, nmax are the bounds on (i,j,k) indexes of wavefunction G-vectors
!
      call nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!
! nzx is the maximum length of a column along z
!
      nzx=nmax(3)-nmin(3)+1
!
      root = 0
! root is the first node
      ntot = 0
      do proc=1,nproc
         recvcount(proc) = ncpw(proc)*nzx
!
! recvcount(proc) = size of data received from processor proc
!                   (number of columns times length of each column)
!
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + recvcount(proc-1)
         end if
!
! displs(proc) is the position of data received from processor proc
!
         ntot = ntot + recvcount(proc)
!
! ntot = total size of gathered data
!
      end do
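!
! example of the layout (hypothetical values): nproc=2, ncpw=(3,2),
! nzx=11 gives recvcount=(33,22), displs=(0,33) and ntot=55
! complex elements gathered in psitot
!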
!
! allocate the needed work spaces
!
      psis=> wrk1
      call zero(2*nnrs,psis)
      if (me.eq.1) then
         allocate(psitot(ntot))
         allocate(psiwr(nmin(3):nmax(3),nmin(1):nmax(1),nmin(2):nmax(2)))
         write(unit) n, nmin, nmax
      end if
!
! fill array psis with c_i(G) (as packed columns along z)
!
      do i=1,n
         do ig=1,ngw
!
! ncol+1 is the index of the column            
!
            ncol=(nps(ig)-1)/nr3sx
!
! nz_ is the z component in FFT style (refolded between 1 and nr3s)
!
            nz_ =nps(ig)-ncol*nr3sx
!
! nz is the z component in "natural" style (between nmin(3) and nmax(3))
!
            nz  =nz_-1
            if (nz.ge.nr3s/2) nz=nz-nr3s
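!
! e.g. with nr3s=12: nps(ig) pointing to nz_=12 gives nz=11, which is
! .ge. nr3s/2 and is therefore refolded to nz=-1
!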
!
! ncpw(me) columns along z are stored in contiguous order on each node 
!
            psis(nz-nmin(3)+1+ncol*nzx)=c(ig,i)
         end do
!
! gather all psis arrays on the first node, in psitot
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_wfc','mpi_barrier 2',ierr)
         call mpi_gatherv (psis, recvcount(me),     MPI_DOUBLE_COMPLEX, &
    &                      psitot,recvcount, displs,MPI_DOUBLE_COMPLEX, &
    &                 root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_wfc','mpi_gatherv',ierr)
!
! now order the columns with a node-number-independent ordering
! We use n1,n2,nz for ordering, where G=n1*g(1)+n2*g(2)+nz*g(3)
! and g(1), g(2), g(3) are basis vectors of the reciprocal lattice.
! Note that the entire column is written, even if there are no
! wavefunction components associated with it. This wastes some
! disk space but is simpler to implement than other schemes.
!
         if (me.eq.1) then
            ncol=0
            do proc=1,nproc
               do ii=1,ncpw(proc)
                  ncol=ncol+1
                  mc=icpls(ii+ncp0s(proc))
!
! mc is the position in the xy plane of this column in FFT style
! we need to calculate "natural" indexes n1,n2, centered on the 
! origin, for the smallest possible box.
!
                  n2=(mc-1)/nr1sx
                  n1= mc-1-n2*nr1sx
                  if (n2.ge.nr2s/2) n2=n2-nr2s
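!
! e.g. with nr1sx=13, nr2s=12: mc=14 gives n1=0, n2=1, while
! mc=144 gives n1=0, n2=11 -> refolded to n2=-1
!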
                  if (n1.le.nmax(1).and.(n1.ne.0.or.n2.ge.0)) then
!
! NB: n1.gt.nmax(1) corresponds to negative indexes that should be
!     refolded into negative n1. These are absent because
!     only half of the G sphere is considered. The other condition
!     excludes the case n1=0, n2<0, which is also not present
!
                     do nz=nmin(3),nmax(3)
                        psiwr(nz,n1,n2)=psitot(nz-nmin(3)+1+(ncol-1)*nzx)
                     end do
                  end if
               end do
            end do
!
! write the node-number-independent array 
!
            write(unit) psiwr
!
         end if
      end do
!
      if (me.eq.1) then
         deallocate(psiwr)
         deallocate(psitot)
      end if
!
      return
!
      end subroutine write_wfc
!
!----------------------------------------------------------------------
      subroutine read_wfc(unit,c)
!----------------------------------------------------------------------
!
! read wavefunctions from file and distribute to all nodes
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
!
      implicit none
      include 'mpif.h'
      integer unit
      complex*16 c(ngw,nx)
      complex*16, pointer:: psis(:)
!
      integer i, ii, ig, proc, ierr, ntot, ncol, mc, nr
      integer nmin0(3), nmax0(3), nmin(3), nmax(3), n1,n2,nzx,nz,nz_
      integer root, displs(nproc), sendcount(nproc)
      complex*16, allocatable:: psitot(:), psird(:,:,:)
!
! nmin, nmax are the bounds on (i,j,k) indexes of wavefunction G-vectors
!
      call nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!
! nzx is the maximum length of a column along z
!
      nzx=nmax(3)-nmin(3)+1
!
      root = 0
! root is the first node
      ntot = 0
      do proc=1,nproc
         sendcount(proc) =  ncpw(proc)*nzx
!
! sendcount(proc) = size of data sent to processor proc
!                   (number of columns times length of each column)
!
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + sendcount(proc-1)
         end if
!
! displs(proc) is the position of data sent to processor proc
!
         ntot = ntot + sendcount(proc)
!
! ntot = total size of the data to be scattered
!
      end do
!
! allocate the needed work spaces
!
      psis=> wrk1
      call zero(2*nnrs,psis)
      if (me.eq.1) then
         allocate(psitot(ntot))
!
! read the smallest FFT box that contains all wavefunction G-vectors
!
         read(unit,end=10,err=10) nr, nmin0, nmax0
! check
         if (nmin(1).ne.nmin0(1) .or. nmin(2).ne.nmin0(2) .or.          &
     &       nmin(3).ne.nmin0(3) .or. nmax(1).ne.nmax0(1) .or.          &
     &       nmax(2).ne.nmax0(2) .or. nmax(3).ne.nmax0(3) ) then
            write(6,*) 'read  nmin, nmax =',nmin, nmax
            write(6,*) 'found nmin, nmax =',nmin0, nmax0
            call error('read_wfc','wavefunction mismatch',1)
         end if
         if (nr.lt.n) call error('read_wfc','not enough wavefcts',nr)
         allocate(psird(nmin(3):nmax(3),nmin(1):nmax(1),nmin(2):nmax(2)))
      end if
!
      do i=1,n
!
! read the node-number-independent array 
!
         if (me.eq.1) then
            read(unit,end=20,err=20) psird
!
! reorder as contiguously stored columns along z. See comments in 
! write_wfc for the logic (or lack thereof) of the storage
!
            ncol=0
            do proc=1,nproc
               do ii=1,ncpw(proc)
                  ncol=ncol+1
                  mc=icpls(ii+ncp0s(proc))
                  n2=(mc-1)/nr1sx
                  n1= mc-1-n2*nr1sx
                  if (n2.ge.nr2s/2) n2=n2-nr2s
                  if (n1.le.nmax(1).and.(n1.ne.0.or.n2.ge.0)) then
                     do nz=nmin(3),nmax(3)
                        psitot(nz-nmin(3)+1+(ncol-1)*nzx) = psird(nz,n1,n2)
                     end do
                  end if
               end do
            end do
         end if
!
! distribute the array psitot on all nodes (in psis)
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('read_wfc','mpi_barrier',ierr)
         call mpi_scatterv(psitot,sendcount,displs,MPI_DOUBLE_COMPLEX,  &
    &                      psis  ,sendcount(me),   MPI_DOUBLE_COMPLEX,  &
    &                 root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('read_wfc','mpi_scatterv',ierr)
!
! fill c_i(G) (see write_wfc for the logic, or lack thereof, of the ordering)
!
         do ig=1,ngw
            ncol=(nps(ig)-1)/nr3sx
            nz_ =nps(ig)-ncol*nr3sx
            nz  =nz_-1
            if (nz.ge.nr3s/2) nz=nz-nr3s
            c(ig,i)=psis(nz-nmin(3)+1+ncol*nzx)
         end do
      end do
!
      if (me.eq.1) then
         deallocate(psird)
         deallocate(psitot)
      end if
!
      return
 10   call error('read_wfc','file missing or wrong',ierr)
 20   call error('read_wfc','wavefunction missing or wrong',ierr)
!
      end subroutine read_wfc

!-----------------------------------------------------------------------
      subroutine readpfile                                              &
     &     ( flag,nx,nax,nsp,ndr,nfi,ngw,n,nr,c0,cm,tau0,vol0,cdm0,cdmm,&
     &       taum,volm,acc,taui,a1,a2,a3,alat,lambda,lambdam,           &
     &       xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,vel,ekincm)
!-----------------------------------------------------------------------
!
! read from file and distribute data calculated in preceding iterations
!
      use para_mod
      implicit none
      include 'mpif.h'
!
      integer flag,nx,nax,nsp,ndr,nfi,ngw,n,nr
      complex*16 c0(ngw,n), cm(ngw,n)
      real*8 cdm0(3),cdmm(3),acc(10), a1(3),a2(3),a3(3), alat
      real*8 lambda(nx,nx), lambdam(nx,nx)
      real*8 xnhe0, xnhem, vnhe, xnhp0, xnhpm, vnhp, ekincm, volm,vol0
      real*8 taum(3,nax,nsp),tau0(3,nax,nsp),taui(3,nax,nsp)
      real*8 vel(3,nax,nsp)
!
      integer i,ia,is,j,nd,ndummy,root,ierr
      real*8, allocatable:: dummy(:)
!
! Only the first node reads
!
      if (me.eq.1) rewind ndr
      if (flag.eq.0) then
         write(6,'((a,i3,a))') ' ### reading from file ',ndr,' only c0  ##'
      else
         write(6,'((a,i3))') ' ### reading from file ',ndr
      end if
!
! now read and distribute the wave functions
!
      call read_wfc(ndr,c0)
!
      if (flag.eq.0) return
!
      call read_wfc(ndr,cm)
!
!
! variables to be read on node 1 and broadcast on other nodes are stored
! in an array "dummy", whose length ndummy must be calculated by hand
!
      ndummy = 1 + 4*3*nax*nsp + 2*3 +  3*3  +  1  + 2*n*n + 10 + 9
!             nfi    tau+vel     cdm  a1 a2 a3 alat lambda  acc misc
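!
! e.g. for nsp=1, nax=2, n=4: ndummy = 1+24+6+9+1+32+10+9 = 92
!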
      allocate(dummy(ndummy))
      if (me.eq.1) read(ndr,end=10,err=10) (dummy(i),i=1,ndummy)
!
! broadcast variables to the other nodes...
!
      root = 0
      call mpi_barrier( MPI_COMM_WORLD, ierr)
      call mpi_bcast(dummy,ndummy, MPI_REAL8, root, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('broadcast','ierr<>0',ierr)
!
! and now unpack the array!
!
      nfi=dummy(1)
      nd=1
      do is=1,nsp
         do ia=1,nax
            do i=1,3
               nd=nd+1
               tau0(i,ia,is)=dummy(nd)
               nd=nd+1
               taum(i,ia,is)=dummy(nd)
               nd=nd+1
               taui(i,ia,is)=dummy(nd)
               nd=nd+1
               vel(i,ia,is) =dummy(nd)
            end do
         end do
      end do
      do i=1,3
         nd=nd+1
         cdm0(i)=dummy(nd)
         nd=nd+1
         cdmm(i)=dummy(nd)
         nd=nd+1
         a1(i)=dummy(nd)
         nd=nd+1
         a2(i)=dummy(nd)
         nd=nd+1
         a3(i)=dummy(nd)
      end do
      nd=nd+1
      alat=dummy(nd)
      do j=1,n
         do i=1,n
            nd=nd+1
            lambda (i,j)=dummy(nd)
            nd=nd+1
            lambdam(i,j)=dummy(nd)
         end do
      end do
      do i=1,10
         nd=nd+1
         acc(i)=dummy(nd)
      end do
      vol0  =dummy(nd+1)
      volm  =dummy(nd+2)
      xnhe0 =dummy(nd+3)
      xnhem =dummy(nd+4)
      vnhe  =dummy(nd+5)
      xnhp0 =dummy(nd+6)
      xnhpm =dummy(nd+7)
      vnhp  =dummy(nd+8)
      ekincm=dummy(nd+9)
      if (nd+9.ne.ndummy) call error('readpfile','dummy!!!',nd+9)
      deallocate(dummy)
!
      return
 10   call error('readpfile','end of file detected',1)
      end
!-----------------------------------------------------------------------
      subroutine writepfile                                             &
     &     ( nx,nax,nsp,ndw,nfi,ngw,n,c0,cm,tau0,vol0,cdm0,cdmm,        &
     &       taum,volm,acc,taui,a1,a2,a3,alat,lambda,lambdam,           &
     &       xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,vel,ekincm)
!-----------------------------------------------------------------------
!
! collect and write data to file for subsequent iterations
!
      use para_mod
      implicit none
      integer nx,nax,nsp,ndw,nfi,ngw,n
      complex*16 c0(ngw,n), cm(ngw,n)
      real*8 cdm0(3),cdmm(3),acc(10), a1(3),a2(3),a3(3), alat
      real*8 lambda(nx,nx), lambdam(nx,nx)
      real*8 xnhe0, xnhem, vnhe, xnhp0, xnhpm, vnhp, ekincm, volm,vol0
      real*8 taum(3,nax,nsp),tau0(3,nax,nsp),taui(3,nax,nsp)
      real*8 vel(3,nax,nsp)
!
      integer i,ia,is,j,nd,ndummy
      real*8, allocatable:: dummy(:)
!
! Only the first node writes
!
      if (me.eq.1) rewind ndw
      call write_wfc(ndw,c0)
      call write_wfc(ndw,cm)
      if (me.ne.1) return
!
! the remaining variables are packed into an array "dummy", 
! whose length ndummy must be calculated by hand
!
      ndummy = 1 + 4*3*nax*nsp + 2*3 +  3*3  +  1  + 2*n*n + 10 + 9
      !       nfi    tau+vel     cdm  a1 a2 a3 alat lambda  acc misc
      allocate(dummy(ndummy))
!
! and now pack the array dummy!
!
      dummy(1)=nfi
      nd=1
      do is=1,nsp
         do ia=1,nax
            do i=1,3
               nd=nd+1
               dummy(nd)=tau0(i,ia,is)
               nd=nd+1
               dummy(nd)=taum(i,ia,is)
               nd=nd+1
               dummy(nd)=taui(i,ia,is)
               nd=nd+1
               dummy(nd)=vel(i,ia,is)
            end do
         end do
      end do
      do i=1,3
         nd=nd+1
         dummy(nd)=cdm0(i)
         nd=nd+1
         dummy(nd)=cdmm(i)
         nd=nd+1
         dummy(nd)=a1(i)
         nd=nd+1
         dummy(nd)=a2(i)
         nd=nd+1
         dummy(nd)=a3(i)
      end do
      nd=nd+1
      dummy(nd)=alat
      do j=1,n
         do i=1,n
            nd=nd+1
            dummy(nd)=lambda (i,j)
            nd=nd+1
            dummy(nd)=lambdam(i,j)
         end do
      end do
      do i=1,10
         nd=nd+1
         dummy(nd)=acc(i)
      end do
      dummy(nd+1)=vol0
      dummy(nd+2)=volm
      dummy(nd+3)=xnhe0
      dummy(nd+4)=xnhem
      dummy(nd+5)=vnhe
      dummy(nd+6)=xnhp0
      dummy(nd+7)=xnhpm
      dummy(nd+8)=vnhp
      dummy(nd+9)=ekincm
      if (nd+9.ne.ndummy) call error('writepfile','dummy!!!',nd+9)
!
      write(ndw) dummy
!
      deallocate(dummy)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!----------------------------------------------------------------------
!
! find the bounds for (i,j,k) indexes of all wavefunction G-vectors
! The (i,j,k) indexes are defined as: G=i*g(1)+j*g(2)+k*g(3)
! where g(1), g(2), g(3) are basis vectors of the reciprocal lattice
!
      implicit none
! input
      integer ngw,nr1s,nr2s,nr3s,in1p(ngw),in2p(ngw),in3p(ngw)
! output
      integer nmin(3), nmax(3)
! local
      include 'mpif.h'
      integer nmin0(3), nmax0(3), ig, ierr
!
!
      nmin0(1)=  nr1s
      nmax0(1)= -nr1s
      nmin0(2)=  nr2s
      nmax0(2)= -nr2s
      nmin0(3)=  nr3s
      nmax0(3)= -nr3s
!
      do ig=1,ngw
         nmin0(1) = min(nmin0(1),in1p(ig))
         nmin0(2) = min(nmin0(2),in2p(ig))
         nmin0(3) = min(nmin0(3),in3p(ig))
         nmax0(1) = max(nmax0(1),in1p(ig))
         nmax0(2) = max(nmax0(2),in2p(ig))
         nmax0(3) = max(nmax0(3),in3p(ig))
      end do
!
! find minima and maxima for the FFT box across all nodes
!
      call mpi_barrier( MPI_COMM_WORLD, ierr )
      if (ierr.ne.0) call error('nrbounds','mpi_barrier 1',ierr)
      call mpi_allreduce (nmin0, nmin, 3, MPI_INTEGER, MPI_MIN,         &
     &           MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('nrbounds','mpi_allreduce min',ierr)
      call mpi_allreduce (nmax0, nmax, 3, MPI_INTEGER, MPI_MAX,         &
     &           MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('nrbounds','mpi_allreduce max',ierr)

      return
      end subroutine nrbounds