!
! Copyright (C) 2001-2004 PWSCF group
! This file is distributed under the terms of the
! GNU General Public License. See the file `License'
! in the root directory of the present distribution,
! or http://www.gnu.org/copyleft/gpl.txt .
!
#include "f_defs.h"
!
!----------------------------------------------------------------------------
MODULE para_const
  !----------------------------------------------------------------------------
  !
  SAVE
  !
  INTEGER, PARAMETER :: &
      maxproc  = 128     !  maximum number of processors
END MODULE para_const
!
!
!----------------------------------------------------------------------------
MODULE pfft
  !----------------------------------------------------------------------------
  !
  ! ... parallel fft information for the dense grid
  !
  USE para_const
  !
  SAVE
  !
  INTEGER :: &
      npp(maxproc),     &!  number of plane per processor
      ncp(maxproc),     &!  number of (density) columns per proc
      ncp0(maxproc),    &!  starting column for each processor
      ncplane,          &!  number of columns in a plane
      nct,              &!  total number of non-zero columns
      nxx                !  local fft data dim
  !
END MODULE pfft
!
!
!----------------------------------------------------------------------------
MODULE pffts
  !----------------------------------------------------------------------------
  !
  ! ... parallel fft information for the smooth grid
  !
  USE para_const
  !
  SAVE
  !
  INTEGER :: &
       nkcp(maxproc)     !  number of (wfs) columns per processor
  INTEGER :: &
       npps(maxproc),   &!  number of plane per processor
       ncps(maxproc),   &!  number of (density) columns per proc
       ncp0s(maxproc),  &!  starting column for each processor
       ncplanes,        &!  number of columns in a plane
       ncts,            &!  total number of non-zero columns
       nxxs              !  local fft data dim
  !
END MODULE pffts
!
!
!----------------------------------------------------------------------------
MODULE para
  !----------------------------------------------------------------------------
  !
  ! ... tis module contains data and methods needed for parallel version of
  ! ... PWscf code
  !
  USE pfft
  USE pffts
!@@  USE kinds, ONLY : DP  
  USE parameters, ONLY : DP  
  !
  SAVE
  !
  ! ... number of processors =  # of tasks
  !
  ! ... general parallel information
  !
  INTEGER :: &
      npool  = 1,         &!  number of pools
      nprocp = 1,         &!  number of processors in this task pool
      mypool = 1,         &!  identifier of this task pool
      me     = 1,         &!  identifier of this task within his pool
      kunit  = 1           !  granularity of k-point distribution
  !
END MODULE para
!
! ... here are all parallel subroutines (wrappers to MPI calls) used 
! ... by the PWscf code
!
! ... "reduce"-like subroutines
!
!----------------------------------------------------------------------------
SUBROUTINE reduce( dim, ps )
  !----------------------------------------------------------------------------
  !
  ! ... sums a distributed variable ps(dim) over the processors.
  ! ... This version uses a fixed-length buffer of appropriate (?) dim
  ! ...              uses SHMEM for the T3D/T3E, MPI otherwhise
  !
#if defined (__PARA)
  !
  USE mp_global, ONLY : intra_pool_comm, my_pool_id, nproc_pool, npool
  USE mp,        ONLY : mp_barrier
!@@  USE kinds,     ONLY : DP
  USE parameters,     ONLY : DP
  USE parallel_include  
#  if defined (__SHMEM)
  USE para,      ONLY : nprocp
#  endif
  !
  IMPLICIT NONE
  !
  INTEGER            :: dim
  REAL (KIND=DP)     :: ps(dim)
  INTEGER            :: info, n, nbuf
  INTEGER, PARAMETER :: maxb = 10000
# if (defined __SHMEM && defined __ALTIX) || (defined __SHMEM && defined __ORIGIN)
  INTEGER            :: sym_len
  LOGICAL            :: first
  REAL (KIND=DP)     :: buff(*), snd_buff(*)
  POINTER               (buff_p, buff), (snd_buff_p, snd_buff)
  COMMON /sym_heap1/    buff_p, snd_buff_p, sym_len, first
#else
  REAL (KIND=DP)     :: buff(maxb)  
# endif
  !
#  if defined (__SHMEM)
  !
  ! ... SHMEM specific 
  !
  INCLUDE 'mpp/shmem.fh'
# if defined (__ALTIX) || defined (__ORIGIN)
      INTEGER    :: pWrkSync(SHMEM_REDUCE_SYNC_SIZE),                    &
     &              pWrkData(1024*1024), start
      DATA pWrkSync /SHMEM_REDUCE_SYNC_SIZE*SHMEM_SYNC_VALUE/
      DATA pWrkData / 1048576 * 0 /
# else
  INTEGER :: pWrkSync, pWrkData, start
  COMMON / SH_SYNC / pWrkSync(SHMEM_BARRIER_SYNC_dim)
  COMMON / SH_DATA / pWrkData(1024*1024)
  DATA pWrkData / 1048576 * 0 /
  DATA pWrkSync / SHMEM_BARRIER_SYNC_dim * SHMEM_SYNC_VALUE /
!DIR$ CACHE_ALIGN /SH_SYNC/
!DIR$ CACHE_ALIGN /SH_DATA/
#  endif
  !
#  endif
  !
  !
  IF ( dim <= 0 .OR. nproc_pool <= 1 ) RETURN
  !
!@  CALL start_clock( 'reduce' )
  !
  ! ... syncronize processes - maybe unneeded on T3D but necessary on T3E !!!
  !
  CALL mp_barrier( intra_pool_comm )
  !
  nbuf = dim / maxb
  !
#  if defined (__SHMEM)
# if defined (__ALTIX) || defined (__ORIGIN)
  IF (dim .GT. sym_len) THEN
     IF (sym_len .NE. 0) THEN
        CALL shpdeallc( snd_buff_p, info, -1 )
     END IF
     sym_len = dim
     CALL shpalloc( snd_buff_p, 2*sym_len, info, -1 )
  END IF
  IF (first .NE. .TRUE.) THEN
     CALL shpalloc( buff_p, 2*maxb, info, -1 )
     first = .TRUE.
  END IF
  snd_buff(1:dim) = ps(1:dim)
# endif
  !
  start = my_pool_id * nproc_pool
  !
#  endif
  !
  DO n = 1, nbuf
     !
#  if defined (__SHMEM)
     !
# if defined (__ALTIX) || defined (__ORIGIN)
     CALL SHMEM_REAL8_SUM_TO_ALL( buff, snd_buff(1+(n-1)*maxb), &
                                  maxb, start, 0, nprocp, pWrkData, pWrkSync )
# else
     CALL SHMEM_REAL8_SUM_TO_ALL( buff, ps(1+(n-1)*maxb), &
                                  maxb, start, 0, nprocp, pWrkData, pWrkSync )
#endif
     !                             
#  else
     !
     CALL MPI_ALLREDUCE( ps(1+(n-1)*maxb), buff, maxb, MPI_REAL8, &
                         MPI_SUM, intra_pool_comm, info )
     !                    
     CALL errore( 'reduce', 'error in allreduce1', info )
     !
#  endif
     !
     ps((1+(n-1)*maxb):(n*maxb)) = buff(1:maxb)
     !
  END DO
  !
  ! ... possible remaining elements < maxb
  !
  IF ( ( dim - nbuf * maxb ) > 0 ) THEN
     !
#  if defined (__SHMEM)
     !
# if defined (__ALTIX) || defined (__ORIGIN)
     CALL SHMEM_REAL8_SUM_TO_ALL( buff, snd_buff(1+nbuf*maxb),          &
     &                            (dim-nbuf*maxb), start, 0, nprocp,    &
     &                            pWrkData, pWrkSync )
# else
     CALL SHMEM_REAL8_SUM_TO_ALL( buff, ps(1+nbuf*maxb), (dim-nbuf*maxb), &
                                  start, 0, nprocp, pWrkData, pWrkSync )
# endif
     !                             
#  else
     !
     CALL MPI_ALLREDUCE( ps(1+nbuf*maxb), buff, (dim-nbuf*maxb), MPI_REAL8, &
                         MPI_SUM, intra_pool_comm, info )
     !
     CALL errore( 'reduce', 'error in allreduce2', info )
     !
#  endif
     !
     ps((1+nbuf*maxb):dim) = buff(1:(dim-nbuf*maxb))
     !
  END IF
  !
!@  CALL stop_clock( 'reduce' )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE reduce
!
!----------------------------------------------------------------------------
SUBROUTINE ireduce( dim, is )
  !----------------------------------------------------------------------------
  !
  ! ... sums a distributed variable is(dim) over the processors.
  !
#if defined (__PARA)
  !
  USE mp_global, ONLY : intra_pool_comm, nproc_pool, npool
  USE mp,        ONLY : mp_barrier
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  INTEGER            :: dim, is(dim)
  INTEGER            :: info, n, m, nbuf
  INTEGER, PARAMETER :: maxi = 500
  INTEGER            :: buff(maxi)
  !
  !
  IF ( dim <= 0 .OR. nproc_pool <= 1 ) RETURN
  !
  ! ... syncronize processes
  !
  CALL mp_barrier( intra_pool_comm )
  !
  nbuf = dim / maxi
  !
  DO n = 1, nbuf
     !
     CALL MPI_ALLREDUCE( is(1+(n-1)*maxi), buff, maxi, MPI_INTEGER, &
                         MPI_SUM, intra_pool_comm, info )
     !   
     CALL errore( 'ireduce', 'error in allreduce 1', info )
     !
     is((1+(n-1)*maxi):(n*maxi)) = buff(1:maxi)
     !
  END DO
  !
  ! ... possible remaining elements < maxi
  !
  IF ( ( dim - nbuf * maxi ) > 0 ) THEN
     !
     CALL MPI_ALLREDUCE( is(1+nbuf*maxi), buff, (dim-nbuf*maxi), MPI_INTEGER, &
                         MPI_SUM, intra_pool_comm, info )
     !
     CALL errore( 'reduce', 'error in allreduce 2', info )
     !
     is((1+nbuf*maxi):dim) = buff(1:(dim-nbuf*maxi))
     !
  END IF
  !
#endif
  !
  RETURN
  !
END SUBROUTINE ireduce
!
!------------------------------------------------------------------------
SUBROUTINE poolreduce( dim, ps )
  !-----------------------------------------------------------------------
  !
  ! ... Sums a distributed variable ps(dim) over the pools.
  ! ... This MPI-only version uses a fixed-length buffer
  !       
#if defined (__PARA)
  !  
  USE mp_global, ONLY : inter_pool_comm, intra_image_comm, &
                        my_pool_id, nproc_pool, npool
  USE mp,        ONLY : mp_barrier
!@@  USE kinds,     ONLY : DP
  USE parameters,     ONLY : DP
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  INTEGER            :: dim
  REAL (KIND=DP)     :: ps(dim)
  INTEGER, PARAMETER :: maxb = 10000
  REAL (KIND=DP)     :: buff(maxb)
  INTEGER            :: info, nbuf, n
  !
  !
  IF ( dim <= 0 .OR. npool <= 1 ) RETURN
  !
!@  CALL start_clock( 'poolreduce' )
  !
  ! ... MPI syncronize processes
  !
  CALL mp_barrier( intra_image_comm )
  !
  nbuf = dim / maxb
  !
  DO n = 1, nbuf
     !
     CALL MPI_ALLREDUCE( ps(1+(n-1)*maxb), buff, maxb, MPI_REAL8, &
                         MPI_SUM, inter_pool_comm, info )
     !
     CALL errore( 'poolreduce', 'info<>0 at allreduce1', info )
     !
     ps((1+(n-1)*maxb):(n*maxb)) = buff(1:maxb)
     !CALL DCOPY( maxb, buff, 1, ps(1+(n-1)*maxb), 1 )
     !
  END DO
  !
  IF ( ( dim - nbuf * maxb ) > 0 ) THEN
     !
     CALL MPI_ALLREDUCE( ps(1+nbuf*maxb), buff, (dim-nbuf*maxb), MPI_REAL8, &
                         MPI_SUM, inter_pool_comm, info )
     !
     CALL errore( 'poolreduce', 'info<>0 at allreduce2', info )
     !
     ps((1+nbuf*maxb):dim) = buff(1:(dim-nbuf*maxb))
     !CALL DCOPY( dim-nbuf*maxb, buff, 1, ps(1+nbuf*maxb), 1 )
     !
  END IF
  !
!@  CALL stop_clock( 'poolreduce' )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE poolreduce
!
! ... "gather"-like subroutines
!
!----------------------------------------------------------------------------
SUBROUTINE gather( f_in, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... gathers nprocp distributed data on the first processor of every pool
  !
  ! ... REAL*8  f_in  = distributed variable (nxx)
  ! ... REAL*8  f_out = gathered variable (nrx1*nrx2*nrx3)
  !
#if defined (__PARA)
  !
  USE pfft,      ONLY : ncplane, npp, nxx   
  USE mp_global, ONLY : intra_pool_comm, nproc_pool, me_pool, root_pool
  USE mp,        ONLY : mp_barrier
!@@  USE kinds,     ONLY : DP
  USE parameters,     ONLY : DP
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  REAL (KIND=DP) :: f_in(nxx), f_out(*)
  INTEGER        :: proc, info
  INTEGER        :: displs(0:nproc_pool-1), recvcount(0:nproc_pool-1)
  !
  !
!@  CALL start_clock( 'gather' )
  !
  DO proc = 0, ( nproc_pool - 1 )
     !
     recvcount(proc) = ncplane * npp(proc+1)
     !
     IF ( proc == 0 ) THEN
        !
        displs(proc) = 0
        !
     ELSE
        !
        displs(proc) = displs(proc-1) + recvcount(proc-1)
        !
     END IF
     !
  END DO
  !
  CALL mp_barrier( intra_pool_comm )
  !
  CALL MPI_GATHERV( f_in, recvcount(me_pool), MPI_REAL8, f_out, &
                    recvcount, displs, MPI_REAL8, root_pool,    &
                    intra_pool_comm, info )
  !
  CALL errore( 'gather', 'info<>0', info )
  !
!@  CALL stop_clock( 'gather' )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE gather
!
!-----------------------------------------------------------------------
SUBROUTINE cgather_sym( f_in, f_out )
  !-----------------------------------------------------------------------
  !
  ! ... gather complex data for symmetrization (in phonon code)
  ! ... COMPLEX*16  f_in  = distributed variable (nrxx)
  ! ... COMPLEX*16  f_out = gathered variable (nrx1*nrx2*nrx3)
  !
#if defined (__PARA)
  !
  USE pfft,      ONLY : ncplane, npp, nxx     
  USE mp_global, ONLY : intra_pool_comm, intra_image_comm, &
                        nproc_pool, me_pool
  USE mp,        ONLY : mp_barrier
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  COMPLEX(KIND=DP) :: f_in(nxx), f_out(*)
  INTEGER          :: proc, info
  INTEGER          :: displs(0:nproc_pool-1), recvcount(0:nproc_pool-1)
  !
  !
!@  CALL start_clock( 'cgather' )
  !
  DO proc = 0, ( nproc_pool - 1 )
     !
     recvcount(proc) = 2 * ncplane * npp(proc+1)
     !
     IF ( proc == 0 ) THEN
        !
        displs(proc) = 0
        !
     ELSE
        !
        displs(proc) = displs(proc-1) + recvcount(proc-1)
        !
     END IF
     !
  END DO
  !
  CALL mp_barrier( intra_pool_comm )
  !
  CALL MPI_ALLGATHERV( f_in, recvcount(me_pool), MPI_REAL8, &
                       f_out, recvcount, displs, MPI_REAL8, &
                       intra_pool_comm, info )
  !
  CALL errore( 'cgather_sym', 'info<>0', info )
  !
  CALL mp_barrier( intra_image_comm )
  !
!@  CALL stop_clock( 'cgather' )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE cgather_sym
!
! ... "scatter"-like subroutines
!
!----------------------------------------------------------------------------
SUBROUTINE scatter( f_in, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... scatters data from the first processor of every pool
  !
  ! ... REAL*8  f_in  = gathered variable (nrx1*nrx2*nrx3)
  ! ... REAL*8  f_out = distributed variable (nxx)
  !
#if defined (__PARA)
  !
  USE pfft,      ONLY : ncplane, npp, nxx 
  USE mp_global, ONLY : intra_pool_comm, nproc_pool, &
                        me_pool, root_pool
  USE mp,        ONLY : mp_barrier
!@@  USE kinds,     ONLY : DP
  USE parameters,     ONLY : DP
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  REAL (KIND=DP) :: f_in(*), f_out(nxx)
  INTEGER        :: proc, info
  INTEGER        :: displs(0:nproc_pool-1), sendcount(0:nproc_pool-1)
  !
  !
!@  CALL start_clock( 'scatter' )
  !
  DO proc = 0, ( nproc_pool - 1 )
     !
     sendcount(proc) = ncplane * npp(proc+1)
     !
     IF ( proc == 0 ) THEN
        !
        displs(proc) = 0
        !
     ELSE
        !
        displs(proc) = displs(proc-1) + sendcount(proc-1)
        !
     END IF
     !
  END DO
  !
  CALL mp_barrier( intra_pool_comm )
  !  
  CALL MPI_SCATTERV( f_in, sendcount, displs, MPI_REAL8,   &
                     f_out, sendcount(me_pool), MPI_REAL8, &
                     root_pool, intra_pool_comm, info )
  !
  CALL errore( 'scatter', 'info<>0', info )
  !
  IF ( sendcount(me_pool) /= nxx ) f_out(sendcount(me_pool)+1:nxx) = 0.D0
  !
!@  CALL stop_clock( 'scatter' )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE scatter
!
!----------------------------------------------------------------------------
SUBROUTINE poolscatter( nsize, nkstot, f_in, nks, f_out )
  !----------------------------------------------------------------------------
  !
  ! ... This routine scatters a quantity ( typically the eigenvalues )
  ! ... among the pools. 
  ! ... On input, f_in is required only on the first node of the first pool. 
  ! ... f_in and f_out may coincide.
  ! ... Not a smart implementation!
  !  
#if defined (__PARA)
  !
!@@  USE kinds,     ONLY : DP
  USE parameters,     ONLY : DP
  USE mp_global, ONLY : intra_pool_comm, inter_pool_comm, &
                        my_pool_id, npool, me_pool, root_pool
  USE para,      ONLY : kunit
  USE mp,        ONLY : mp_bcast  
  !
  IMPLICIT NONE
  !
  INTEGER :: nsize, nkstot, nks
    ! first dimension of vectors f_in and f_out
    ! number of k-points per pool
    ! total number of k-points
  REAL (KIND=DP) :: f_in(nsize,nkstot), f_out(nsize,nks)
    ! input  ( contains values for all k-point )
    ! output ( only for k-points of mypool )
  INTEGER :: rest, nbase
    ! the rest of the integer division nkstot / npo
    ! the position in the original list
  !
  !
  ! ... copy from the first node of the first pool
  ! ... to the first node of all the other pools
  !
  IF ( me_pool == root_pool ) &
     CALL mp_bcast( f_in, root_pool, inter_pool_comm )
  !
  ! ... distribute the vector on the first node of each pool
  !
  rest = nkstot / kunit - ( nkstot / kunit / npool ) * npool 
  !
  nbase = nks * my_pool_id
  !
  IF ( ( my_pool_id + 1 ) > rest ) nbase = nbase + rest * kunit
  !
  f_out(:,1:nks) = f_in(:,(nbase+1):(nbase+nks))
  !CALL DCOPY( nsize * nks, f_in(1,nbase+1), 1, f_out, 1 )
  !
  ! ... copy from the first node of every pool
  ! ... to the other nodes of every pool
  !
  CALL mp_bcast( f_out, root_pool, intra_pool_comm )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE poolscatter
!
!----------------------------------------------------------------------------
SUBROUTINE fft_scatter1( f_in, nrx3, nxx_, f_aux, ncp_, npp_, sign )
  !----------------------------------------------------------------------------
  !
  ! ... transpose the fft grid across nodes
  !
  ! ... a)  From columns to planes (sign > 0)
  !
  ! ...     "columns" (or "pencil") representation:
  !
  ! ...        processor "me" has ncp_(me) contiguous columns along z
  ! ...        Each column has nrx3 elements for a fft of order nr3
  ! ...        nrx3 can be = nr3 + 1 in order to reduce memory conflicts.
  !
  ! ...     The transpose take places in two steps:
  !
  ! ...     1) on each processor the columns are divided into slices along z
  ! ...        that are stored contiguously. On processor "me", slices for
  ! ...        processor "proc" are npp_(proc)*ncp_(me) big
  ! ...     2) all processors communicate to exchange slices
  ! ...        (all columns with z in the slice belonging to "me"
  ! ...        must be received, all the others must be sent to "proc")
  !     
  ! ...     Finally one gets the "planes" representation:
  !
  ! ...        processor "me" has npp_(me) complete xy planes
  !
  ! ... b)  From planes to columns (sign < 0)
  !
  ! ...     Quite the same in the opposite direction
  !
  ! ...  The output is overwritten on f_in ; f_aux is used as work space
  !
#if defined (__PARA)
  !
  USE para_const, ONLY : maxproc
  USE mp_global,  ONLY : intra_pool_comm, nproc_pool, me_pool
  USE mp,         ONLY : mp_barrier
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  INTEGER        :: nrx3, nxx_, sign, ncp_(maxproc), npp_(maxproc)
  REAL (KIND=DP) :: f_in(2*nxx_), f_aux(2*nxx_)
  INTEGER        :: dest, from, k, proc, ierr
  INTEGER        :: offset1(0:maxproc-1), sendcount(0:maxproc-1), &
                    sdispls(0:maxproc-1), recvcount(0:maxproc-1), &
                    rdispls(0:maxproc-1)
  !
  !
  IF ( nproc_pool == 1 ) RETURN
  !
!@  CALL start_clock( 'fft_scatter' )
  !
  ! ... sendcount(proc): amount of data processor "me" must send to processor
  ! ... recvcount(proc): amount of data processor "me" must receive from
  !
  DO proc = 0, ( nproc_pool - 1 )
     !
     sendcount(proc) = 2 * npp_(proc+1) * ncp_(me_pool+1)
     recvcount(proc) = 2 * npp_(me_pool+1) * ncp_(proc+1)
     !
  END DO
  !
  ! ... offset1(proc)   is used to locate the slices to be sent to proc
  ! ... sdispls(proc)+1 is the beginning of data that must be sent to proc
  ! ... rdispls(proc)+1 is the beginning of data that must be received from proc
  !
  offset1(0) = 1
  sdispls(0) = 0
  rdispls(0) = 0
  !
  DO proc = 1, ( nproc_pool - 1 )
     !
     offset1(proc) = offset1(proc-1) + 2 * npp_(proc)
     sdispls(proc) = sdispls(proc-1) + sendcount(proc-1)
     rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
     !
  END DO
  !
  ierr = 0
  !
  IF ( sign > 0 ) THEN
     !
     ! ... "forward" scatter from columns to planes
     !
     ! ... step one: store contiguously the slices
     !
     DO proc = 0, ( nproc_pool - 1 )
        !
        from = offset1(proc)
        dest = 1 + sdispls(proc)
        !
        DO k = 1, ncp_(me_pool+1)
           !
           CALL DCOPY( 2*npp_(proc+1), f_in(from+2*(k-1)*nrx3), &
                       1, f_aux(dest+2*(k-1)*npp_(proc+1)), 1 )
           !
        END DO
        !
     END DO
     !
     ! ... maybe useless; ensures that no garbage is present in the output
     !
     f_in(:) = ( 0.D0, 0.D0 )
     !
     ! ... step two: communication
     !
     CALL mp_barrier( intra_pool_comm )
     !
     CALL MPI_ALLTOALLV( f_aux, sendcount, sdispls, MPI_REAL8, &
                         f_in, recvcount, rdispls, MPI_REAL8,  &
                         intra_pool_comm, ierr )
     !     
     CALL errore( 'fft_scatter', 'info<>0', ierr )
     !
  ELSE
     !
     ! ... "backward" scatter from planes to columns
     !
     ! ... step two: communication
     !
     CALL mp_barrier( intra_pool_comm )
     !
     CALL MPI_ALLTOALLV( f_in, recvcount, rdispls, MPI_REAL8,  &
                         f_aux, sendcount, sdispls, MPI_REAL8, &
                         intra_pool_comm, ierr )
     !     
     CALL errore( 'fft_scatter', 'info<>0', ierr )
     !
     ! ... step one: store contiguously the columns
     !
     f_in(:) = ( 0.D0, 0.D0 )
     !
     DO proc = 0,( nproc_pool - 1 ) 
        !
        from = 1 + sdispls(proc)
        !
        dest = offset1(proc)
        !
        DO k = 1, ncp_(me_pool+1)
           !
           CALL DCOPY( 2*npp_(proc+1), f_aux(from+2*(k-1)*npp_(proc+1)), &
                       1, f_in (dest+2*(k-1)*nrx3), 1 )
           !
        END DO
        !
     END DO
     !
  END IF
  !
!@  CALL stop_clock( 'fft_scatter' )
  !
#endif
  !
  RETURN
  !
END SUBROUTINE fft_scatter1
!
! ... other parallel subroutines
!
!------------------------------------------------------------------------
SUBROUTINE poolextreme( ps, iflag )
  !-----------------------------------------------------------------------
  !
  ! ... Finds the maximum (iflag.gt.0) or the minimum (iflag.le.0) value o
  ! ... a real variable among the values distributed across the pools and
  ! ... returns this value to all pools.
  !       
#if defined (__PARA)
  !
  USE mp_global, ONLY : inter_pool_comm, intra_image_comm, npool
  USE mp,        ONLY : mp_barrier  
  USE parallel_include  
  !
  IMPLICIT NONE
  !
  INTEGER        :: iflag, info
  REAL (KIND=DP) :: ps, psr
  !
  !
  IF ( npool <= 1 ) RETURN
  !
  CALL mp_barrier( intra_image_comm )
  !
  IF ( iflag > 0 ) THEN
     !
     CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MAX, &
                         inter_pool_comm, info )
     !
     CALL errore( 'poolextreme', 'info<>0 in allreduce1', info )
     !
  ELSE
     !
     CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MIN, &
                         inter_pool_comm, info )
     !
     CALL errore( 'poolextreme', 'info<>0 in allreduce2', info )
     !
  END IF
  !
  ps = psr
  !
#endif
  !
  RETURN
  !
END SUBROUTINE poolextreme
!
!-----------------------------------------------------------------------
SUBROUTINE poolrecover( vec, length, nkstot, nks )
  !----------------------------------------------------------------------- 
  !
  ! ... recovers on the first processor of the first pool a 
  ! ... distributed vector
  !
#if defined (__PARA)
  !
  USE mp_global, ONLY : inter_pool_comm, intra_image_comm, &
                        npool, me_pool, root_pool, my_pool_id
  USE para,      ONLY : kunit
  USE mp,        ONLY : mp_barrier  
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  INTEGER        :: status(MPI_STATUS_SIZE)
  INTEGER        :: length, i, nks1, rest, fine, nbase, info, nks, nkstot
  REAL (KIND=DP) :: vec(length,nkstot)
  !
  !
  IF ( npool <= 1 ) RETURN
  !
  IF ( MOD( nkstot, kunit ) /= 0 ) &
       CALL errore( 'poolrecover', 'nkstot/kunit is not an integer', nkstot )
  !
  nks1 = kunit * ( nkstot / kunit / npool )
  !
  rest = ( nkstot - nks1 * npool ) / kunit
  !
  CALL mp_barrier( intra_image_comm )
  !
  IF ( me_pool == root_pool .AND. my_pool_id > 0 ) THEN
     !
     CALL MPI_SEND( vec, (length*nks), MPI_REAL8, 0, 17, &
                    inter_pool_comm, info )
     !     
     CALL errore( 'poolrecover', 'info<>0 in send', info )
     !
  END IF
  !
  DO i = 2, npool
     !
     IF ( i <= rest ) THEN
        !
        fine = nks1 + kunit
        !
        nbase = ( nks1 + kunit ) * ( i - 1 )
        !
     ELSE
        !
        fine = nks1
        !
        nbase = rest * (nks1 + kunit) + (i - 1 - rest) * nks1
        !
     END IF
     !
     IF ( me_pool == root_pool .AND. my_pool_id == 0 ) THEN
        !
        CALL MPI_RECV( vec(1,nbase+1), (length*fine), MPI_REAL8, &
                       (i-1), 17, inter_pool_comm, status, info )
        !
        CALL errore( 'poolrecover', 'info<>0 in recv', info )
        !
     END IF
     !
  END DO
  !
#endif
  !
  RETURN
  !
END SUBROUTINE poolrecover
!
!------------------------------------------------------------------------
SUBROUTINE ipoolrecover( ivec, length, nkstot, nks )
  !------------------------------------------------------------------------
  !
  ! ... as above, for an integer vector
  !
#if defined (__PARA)
  !  
  USE mp_global, ONLY : inter_pool_comm, intra_image_comm, &
                        npool, me_pool, root_pool, my_pool_id
  USE para,      ONLY : kunit
  USE mp,        ONLY : mp_barrier  
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  INTEGER :: status(MPI_STATUS_SIZE)
  INTEGER :: length, i, nks1, rest, fine, nbase, info, nks, nkstot
  INTEGER :: ivec(length,nkstot)
  !
  !
  IF ( npool <= 1 ) RETURN
  !
  IF ( MOD( nkstot, kunit ) /= 0 ) & 
       CALL errore( 'poolrecover', 'nkstot/kunit is not an integer', nkstot )
  !
  nks1 = kunit * ( nkstot / kunit / npool )
  !
  rest = ( nkstot - nks1 * npool ) / kunit
  !
  CALL mp_barrier( intra_image_comm )
  !
  IF ( me_pool == root_pool .AND. my_pool_id > 0 ) THEN
     !
     CALL MPI_SEND( ivec, (length*nks), MPI_INTEGER, 0, 17, &
                    inter_pool_comm, info )
     !
     CALL errore( 'ipoolrecover', 'info<>0 in send', info )
     !
  END IF
  !
  DO i = 2, npool
     !
     IF ( i <= rest ) THEN
        !
        fine = nks1 + kunit
        !
        nbase = ( nks1 + kunit ) * ( i - 1 )
        !
     ELSE
        !
        fine = nks1
        !
        nbase = rest * ( nks1 + kunit ) + ( i - 1 - rest ) * nks1
        !
     END IF
     !
     IF ( me_pool == root_pool .AND. my_pool_id == 0 ) THEN
        !
        CALL MPI_RECV( ivec(1,nbase+1), (length*fine), MPI_INTEGER, &
                       (i-1), 17, inter_pool_comm, status, info )
        !
        CALL errore( 'ipoolrecover', 'info<>0 in recv', info )
        !
     END IF
     !
  END DO
  !
#endif
  !
  RETURN
  !
END SUBROUTINE ipoolrecover
!
!-----------------------------------------------------------------------
SUBROUTINE extreme( ps, iflag )
  !-----------------------------------------------------------------------
  !
  ! ... Finds the maximum (iflag.gt.0) or the minimum (iflag.le.0) value
  ! ... of a real variable among the values distributed on a given pool
  !
#if defined (__PARA)
  !
  USE mp_global, ONLY : intra_image_comm
  USE mp,        ONLY : mp_barrier  
  USE parallel_include    
  !
  IMPLICIT NONE
  !
  REAL (KIND=DP) :: ps, psr
  INTEGER        :: iflag, info
  !
  !
  CALL mp_barrier( intra_image_comm )
  !
  IF ( iflag > 0 ) THEN
     !
     CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MAX, &
                         intra_image_comm, info )
     !
  ELSE
     !
     CALL MPI_ALLREDUCE( ps, psr, 1, MPI_REAL8, MPI_MIN, &
                         intra_image_comm, info )
     !
  END IF
  !
  ps = psr
  !
#endif
  !
  RETURN
  !
END SUBROUTINE extreme