!------------------------------------------------------------------------------
!
! This file is part of the Sternheimer-GW code.
!
! Copyright (C) 2010 - 2016
! Henry Lambert, Martin Schlipf, and Feliciano Giustino
!
! Sternheimer-GW is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! Sternheimer-GW is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License
! along with Sternheimer-GW. If not, see
! http://www.gnu.org/licenses/gpl.html .
!
!------------------------------------------------------------------------------
!> This module provides the subroutines to parallelize a task over a communicator.
!!
!! In addition, it provides the subroutines to gather the parallelized results
!! on a single process or on all processes, or to sum them onto a single process.
MODULE parallel_module

  IMPLICIT NONE

  !> wrapper for the MPI_GATHERV routine
  !!
  !! The subroutines are called with the part of the array that is present on
  !! the local process; the parts are collected into an output array on the
  !! root process only. We must know which part of the array is done by which
  !! process.
  !!
  !! It is recommended that you use the parallel_task subroutine to generate
  !! the distribution. Then you can just pass the num_task array generated by
  !! that subroutine.
  INTERFACE mp_gatherv
    MODULE PROCEDURE mp_gatherv_rv, mp_gatherv_rm, mp_gatherv_r3d, &
                     mp_gatherv_cv, mp_gatherv_cm, mp_gatherv_c3d
  END INTERFACE mp_gatherv

  !> wrapper for the MPI_ALLGATHERV routine
  !!
  !! The subroutines are called with an array that is distributed across the
  !! communicator. We must know which part of the array is done by which
  !! process.
  !!
  !! It is recommended that you use the parallel_task subroutine to generate
  !! the distribution. Then you can just pass the num_task array generated by
  !! that subroutine.
  INTERFACE mp_allgatherv
    MODULE PROCEDURE mp_allgatherv_rv, mp_allgatherv_rm, mp_allgatherv_r3d, &
                     mp_allgatherv_cv, mp_allgatherv_cm, mp_allgatherv_c3d
  END INTERFACE mp_allgatherv

  PRIVATE

  !> wrapper for the MPI_REDUCE routine with MPI_SUM operator
  !!
  !! The subroutine sums a distributed array onto the root process.
  !! In contrast to the QE implementation, an in-place operation is used
  !! and the functionality is provided for more array shapes.
  INTERFACE mp_root_sum
    MODULE PROCEDURE mp_root_sum_rv, mp_root_sum_rm, mp_root_sum_r3d, &
                     mp_root_sum_cv, mp_root_sum_cm, mp_root_sum_c3d
  END INTERFACE mp_root_sum

  PUBLIC parallel_task, mp_allgatherv, mp_gatherv, mp_root_sum

CONTAINS

  !> Parallelize a given number of tasks over a given communicator.
  !!
  !! The tasks are distributed over the processes in the communicator
  !! according to the following strategy.
  !!
  SUBROUTINE parallel_task(comm, num_task_total, first_task, last_task, num_task)

    USE mp_global, ONLY: mp_rank, mp_size

    !> the communicator over which the tasks are distributed
    INTEGER, INTENT(IN)  :: comm
    !> total number of tasks to assign
    INTEGER, INTENT(IN)  :: num_task_total
    !> first task assigned to this process
    INTEGER, INTENT(OUT) :: first_task
    !> last task assigned to this process
    INTEGER, INTENT(OUT) :: last_task
    !> the number of tasks assigned to each process
    !!
    !! @note you can use this for the mp_gather wrapper in this module
    INTEGER, INTENT(OUT), ALLOCATABLE :: num_task(:)

    !> number of processes in communicator
    INTEGER num_proc
    !> rank of this process in communicator
    INTEGER my_rank
    !> minimal number of tasks per process
    INTEGER num_task_min
    !> remainder after assigning an equal number of tasks to every process
    INTEGER num_remain
    !> after this process we add one additional task
    INTEGER last_proc

    ! determine rank and size
    ! note: add 1 to rank, because Fortran indices start counting at 1
    my_rank = mp_rank(comm) + 1
    num_proc = mp_size(comm)

    ! allocate array for assigned tasks
    ALLOCATE(num_task(num_proc))

    !! 1. Distribute num_task_total / num_proc tasks on every process.
    num_task_min = num_task_total / num_proc

    !! 2. Determine the remainder num_remain of unassigned tasks.
    num_remain = MOD(num_task_total, num_proc)

    !! 3. Assign one extra task to the last num_remain processes in the list.
    last_proc = num_proc - num_remain
    num_task(:last_proc) = num_task_min
    num_task(last_proc+1:) = num_task_min + 1

    !! 4. Determine first and last task for the current process.
    last_task = SUM(num_task(:my_rank))
    first_task = last_task - num_task(my_rank) + 1

  END SUBROUTINE parallel_task
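  !
  ! Illustrative usage sketch (comments only, not part of the library): split
  ! num_task_total tasks over a communicator, evaluate only the local range,
  ! and recombine the result with the mp_allgatherv wrapper below. The names
  ! work_array and itask are placeholders.
  !
  !   CALL parallel_task(comm, num_task_total, first_task, last_task, num_task)
  !   ALLOCATE(work_array(num_task_total))
  !   DO itask = first_task, last_task
  !     ! evaluate task itask and store its result in the corresponding slot
  !     work_array(itask) = REAL(itask, KIND=dp)
  !   END DO ! itask
  !   ! afterwards every process holds the complete array
  !   CALL mp_allgatherv(comm, num_task, work_array)
  !
  ! For example, 10 tasks on 4 processes yield num_task = (/2, 2, 3, 3/), i.e.,
  ! rank 0 works on tasks 1-2, rank 1 on 3-4, rank 2 on 5-7, and rank 3 on 8-10.
  !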
  !> Gather a vector of reals.
  SUBROUTINE mp_allgatherv_rv(comm, num_task, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN)    :: comm
    !> The distribution of the tasks across the processes.
    INTEGER,  INTENT(IN)    :: num_task(:)
    !> The vector that is distributed across the processes.
    REAL(dp), INTENT(INOUT) :: array(:)

    !> displacement array - sum of the number of tasks
    INTEGER, ALLOCATABLE :: displ(:)

    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the vector if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! calculate displacement
    ALLOCATE(displ(num_proc))
    ! note: C array storage - starting at 0
    displ(1) = 0
    DO iproc = 1, num_proc - 1
      displ(iproc + 1) = displ(iproc) + num_task(iproc)
    END DO ! iproc

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in allgatherv - communicator size and distribution don't match", 1)
    IF (displ(num_proc) + num_task(num_proc) /= SIZE(array)) &
      CALL errore(__FILE__, "array size does not match with assigned tasks", 1)

    CALL MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, array, num_task, displ, MPI_DOUBLE_PRECISION, comm, ierr)
    CALL errore(__FILE__, "error in mpi_allgatherv call", ierr)

    DEALLOCATE(displ)

#endif

  END SUBROUTINE mp_allgatherv_rv

  !> Gather a vector of complex.
  SUBROUTINE mp_allgatherv_cv(comm, num_task, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN)    :: comm
    !> The distribution of the tasks across the processes.
    INTEGER,     INTENT(IN)    :: num_task(:)
    !> The vector that is distributed across the processes.
    COMPLEX(dp), INTENT(INOUT) :: array(:)

    !> displacement array - sum of the number of tasks
    INTEGER, ALLOCATABLE :: displ(:)

    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the vector if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! calculate displacement
    ALLOCATE(displ(num_proc))
    ! note: C array storage - starting at 0
    displ(1) = 0
    DO iproc = 1, num_proc - 1
      displ(iproc + 1) = displ(iproc) + num_task(iproc)
    END DO ! iproc

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in allgatherv - communicator size and distribution don't match", 1)
    IF (displ(num_proc) + num_task(num_proc) /= SIZE(array)) &
      CALL errore(__FILE__, "array size does not match with assigned tasks", 1)

    CALL MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, array, num_task, displ, MPI_DOUBLE_COMPLEX, comm, ierr)
    CALL errore(__FILE__, "error in mpi_allgatherv call", ierr)

    DEALLOCATE(displ)

#endif

  END SUBROUTINE mp_allgatherv_cv
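  !
  ! Note on the in-place convention used by the mp_allgatherv_* routines: each
  ! process only has to fill its own section of the full-size array, i.e., the
  ! elements first_task:last_task returned by parallel_task; MPI_ALLGATHERV
  ! with MPI_IN_PLACE then completes the remaining sections on every process.
  ! As a worked example for the displacement array: num_task = (/3, 3, 4/)
  ! leads to displ = (/0, 3, 6/), so the contribution of process iproc starts
  ! at element displ(iproc) + 1 of the gathered array.
  !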
  !> Gather a matrix of reals.
  SUBROUTINE mp_allgatherv_rm(comm, num_task, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN)    :: comm
    !> The distribution of the tasks across the processes.
    INTEGER,  INTENT(IN)    :: num_task(:)
    !> The matrix that is distributed across the processes.
    REAL(dp), INTENT(INOUT) :: array(:,:)

    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the matrix if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! calculate the amount of received data
    ALLOCATE(receive(num_proc))
    receive = num_task * SIZE(array, 1)

    ! calculate displacement
    ALLOCATE(displ(num_proc))
    ! note: C array storage - starting at 0
    displ(1) = 0
    DO iproc = 1, num_proc - 1
      displ(iproc + 1) = displ(iproc) + receive(iproc)
    END DO ! iproc

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in allgatherv - communicator size and distribution don't match", 1)
    IF (displ(num_proc) + receive(num_proc) /= SIZE(array)) &
      CALL errore(__FILE__, "array size does not match with assigned tasks", 1)

    CALL MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, array, receive, displ, MPI_DOUBLE_PRECISION, comm, ierr)
    CALL errore(__FILE__, "error in mpi_allgatherv call", ierr)

    DEALLOCATE(displ)
    DEALLOCATE(receive)

#endif

  END SUBROUTINE mp_allgatherv_rm

  !> Gather a matrix of complex.
  SUBROUTINE mp_allgatherv_cm(comm, num_task, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN)    :: comm
    !> The distribution of the tasks across the processes.
    INTEGER,     INTENT(IN)    :: num_task(:)
    !> The matrix that is distributed across the processes.
    COMPLEX(dp), INTENT(INOUT) :: array(:,:)

    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the matrix if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! calculate the amount of received data
    ALLOCATE(receive(num_proc))
    receive = num_task * SIZE(array, 1)

    ! calculate displacement
    ALLOCATE(displ(num_proc))
    ! note: C array storage - starting at 0
    displ(1) = 0
    DO iproc = 1, num_proc - 1
      displ(iproc + 1) = displ(iproc) + receive(iproc)
    END DO ! iproc

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in allgatherv - communicator size and distribution don't match", 1)
    IF (displ(num_proc) + receive(num_proc) /= SIZE(array)) &
      CALL errore(__FILE__, "array size does not match with assigned tasks", 1)

    CALL MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, array, receive, displ, MPI_DOUBLE_COMPLEX, comm, ierr)
    CALL errore(__FILE__, "error in mpi_allgatherv call", ierr)

    DEALLOCATE(displ)
    DEALLOCATE(receive)

#endif

  END SUBROUTINE mp_allgatherv_cm
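  !
  ! Note on the matrix and tensor versions: the arrays are distributed along
  ! their last dimension, so all counts passed to MPI are in units of elements,
  ! i.e., the number of tasks times the size of the leading (undistributed)
  ! dimensions. As a worked example, an array(5, 10) distributed with
  ! num_task = (/4, 6/) results in receive = (/20, 30/) and displ = (/0, 20/).
  !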
  !> Gather a 3d tensor of reals.
  SUBROUTINE mp_allgatherv_r3d(comm, num_task, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN)    :: comm
    !> The distribution of the tasks across the processes.
    INTEGER,  INTENT(IN)    :: num_task(:)
    !> The tensor that is distributed across the processes.
    REAL(dp), INTENT(INOUT) :: array(:,:,:)

    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the tensor if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! calculate the amount of received data
    ALLOCATE(receive(num_proc))
    receive = num_task * SIZE(array, 1) * SIZE(array, 2)

    ! calculate displacement
    ALLOCATE(displ(num_proc))
    ! note: C array storage - starting at 0
    displ(1) = 0
    DO iproc = 1, num_proc - 1
      displ(iproc + 1) = displ(iproc) + receive(iproc)
    END DO ! iproc

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in allgatherv - communicator size and distribution don't match", 1)
    IF (displ(num_proc) + receive(num_proc) /= SIZE(array)) &
      CALL errore(__FILE__, "array size does not match with assigned tasks", 1)

    CALL MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, array, receive, displ, MPI_DOUBLE_PRECISION, comm, ierr)
    CALL errore(__FILE__, "error in mpi_allgatherv call", ierr)

    DEALLOCATE(displ)
    DEALLOCATE(receive)

#endif

  END SUBROUTINE mp_allgatherv_r3d

  !> Gather a 3d tensor of complex.
  SUBROUTINE mp_allgatherv_c3d(comm, num_task, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN)    :: comm
    !> The distribution of the tasks across the processes.
    INTEGER,     INTENT(IN)    :: num_task(:)
    !> The tensor that is distributed across the processes.
    COMPLEX(dp), INTENT(INOUT) :: array(:,:,:)

    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the tensor if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! calculate the amount of received data
    ALLOCATE(receive(num_proc))
    receive = num_task * SIZE(array, 1) * SIZE(array, 2)

    ! calculate displacement
    ALLOCATE(displ(num_proc))
    ! note: C array storage - starting at 0
    displ(1) = 0
    DO iproc = 1, num_proc - 1
      displ(iproc + 1) = displ(iproc) + receive(iproc)
    END DO ! iproc

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in allgatherv - communicator size and distribution don't match", 1)
    IF (displ(num_proc) + receive(num_proc) /= SIZE(array)) &
      CALL errore(__FILE__, "array size does not match with assigned tasks", 1)

    CALL MPI_ALLGATHERV(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, array, receive, displ, MPI_DOUBLE_COMPLEX, comm, ierr)
    CALL errore(__FILE__, "error in mpi_allgatherv call", ierr)

    DEALLOCATE(displ)
    DEALLOCATE(receive)

#endif

  END SUBROUTINE mp_allgatherv_c3d
  !> Gather a vector of reals.
  SUBROUTINE mp_gatherv_rv(comm, root, num_task, array_in, array_out)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size, mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN) :: comm
    !> The root process on which the output is gathered.
    INTEGER,  INTENT(IN) :: root
    !> The distribution of the tasks across the processes.
    INTEGER,  INTENT(IN) :: num_task(:)
    !> The array that is distributed across the processes.
    REAL(dp), INTENT(IN) :: array_in(:)
    !> The array that is gathered on the root process. Will be allocated by
    !! this routine, only on the root process.
    REAL(dp), INTENT(OUT), ALLOCATABLE :: array_out(:)

    !> displacement array - sum of the number of tasks
    INTEGER, ALLOCATABLE :: displ(:)

    ! the size of the gathered array
    INTEGER size_out
    ! rank of this process
    INTEGER my_rank
    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the vector if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in gatherv - communicator size and distribution don't match", 1)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! create the information that is relevant at the root
    IF (my_rank == root) THEN

      ! calculate displacement
      ALLOCATE(displ(num_proc))
      ! note: C array storage - starting at 0
      displ(1) = 0
      DO iproc = 1, num_proc - 1
        displ(iproc + 1) = displ(iproc) + num_task(iproc)
      END DO ! iproc

      ! create array for output
      size_out = SUM(num_task)
      ALLOCATE(array_out(size_out))

    END IF

    ! gather everything at the root process
    ! note: my_rank + 1 to convert from C to Fortran indices
    CALL MPI_GATHERV(array_in, num_task(my_rank + 1), MPI_DOUBLE_PRECISION, &
                     array_out, num_task, displ, MPI_DOUBLE_PRECISION, &
                     root, comm, ierr)
    CALL errore(__FILE__, "error in mpi_gatherv call", ierr)

    IF (my_rank == root) THEN
      DEALLOCATE(displ)
    END IF

#else

    ! create a copy of the input array
    ALLOCATE(array_out(SIZE(array_in)))
    array_out = array_in

#endif

  END SUBROUTINE mp_gatherv_rv

  !> Gather a vector of complex.
  SUBROUTINE mp_gatherv_cv(comm, root, num_task, array_in, array_out)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size, mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN) :: comm
    !> The root process on which the output is gathered.
    INTEGER,     INTENT(IN) :: root
    !> The distribution of the tasks across the processes.
    INTEGER,     INTENT(IN) :: num_task(:)
    !> The array that is distributed across the processes.
    COMPLEX(dp), INTENT(IN) :: array_in(:)
    !> The array that is gathered on the root process. Will be allocated by
    !! this routine, only on the root process.
    COMPLEX(dp), INTENT(OUT), ALLOCATABLE :: array_out(:)

    !> displacement array - sum of the number of tasks
    INTEGER, ALLOCATABLE :: displ(:)

    ! the size of the gathered array
    INTEGER size_out
    ! rank of this process
    INTEGER my_rank
    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the vector if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in gatherv - communicator size and distribution don't match", 1)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! create the information that is relevant at the root
    IF (my_rank == root) THEN

      ! calculate displacement
      ALLOCATE(displ(num_proc))
      ! note: C array storage - starting at 0
      displ(1) = 0
      DO iproc = 1, num_proc - 1
        displ(iproc + 1) = displ(iproc) + num_task(iproc)
      END DO ! iproc

      ! create array for output
      size_out = SUM(num_task)
      ALLOCATE(array_out(size_out))

    END IF

    ! gather everything at the root process
    ! note: my_rank + 1 to convert from C to Fortran indices
    CALL MPI_GATHERV(array_in, num_task(my_rank + 1), MPI_DOUBLE_COMPLEX, &
                     array_out, num_task, displ, MPI_DOUBLE_COMPLEX, &
                     root, comm, ierr)
    CALL errore(__FILE__, "error in mpi_gatherv call", ierr)

    IF (my_rank == root) THEN
      DEALLOCATE(displ)
    END IF

#else

    ! create a copy of the input array
    ALLOCATE(array_out(SIZE(array_in, 1)))
    array_out = array_in

#endif

  END SUBROUTINE mp_gatherv_cv
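  !
  ! Illustrative usage sketch for the mp_gatherv wrapper (local_part,
  ! full_array, and root are placeholder names): in contrast to mp_allgatherv,
  ! every process passes only its local part, and only the root process leaves
  ! the routine with an allocated output array.
  !
  !   CALL parallel_task(comm, num_task_total, first_task, last_task, num_task)
  !   ALLOCATE(local_part(num_task(mp_rank(comm) + 1)))
  !   ! ... fill local_part with the result of the local tasks ...
  !   CALL mp_gatherv(comm, root, num_task, local_part, full_array)
  !   IF (mp_rank(comm) == root) THEN
  !     ! full_array now contains SUM(num_task) elements in task order
  !   END IF
  !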
  !> Gather an array of reals.
  SUBROUTINE mp_gatherv_rm(comm, root, num_task, array_in, array_out)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size, mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN) :: comm
    !> The root process on which the output is gathered.
    INTEGER,  INTENT(IN) :: root
    !> The distribution of the tasks across the processes.
    INTEGER,  INTENT(IN) :: num_task(:)
    !> The array that is distributed across the processes.
    REAL(dp), INTENT(IN) :: array_in(:,:)
    !> The array that is gathered on the root process. Will be allocated by
    !! this routine, only on the root process.
    REAL(dp), INTENT(OUT), ALLOCATABLE :: array_out(:,:)

    !> actual size of sent data - number of tasks x dimension of the array
    INTEGER send
    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! the size of the gathered array
    INTEGER size_out
    ! rank of this process
    INTEGER my_rank
    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the array if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in gatherv - communicator size and distribution don't match", 1)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! we send number of tasks * first dimension entries
    ! note: my_rank + 1 to convert from C to Fortran indices
    send = num_task(my_rank + 1) * SIZE(array_in, 1)

    ! create the information that is relevant at the root
    IF (my_rank == root) THEN

      ! calculate the amount of received data
      ALLOCATE(receive(num_proc))
      receive = num_task * SIZE(array_in, 1)

      ! calculate displacement
      ALLOCATE(displ(num_proc))
      ! note: C array storage - starting at 0
      displ(1) = 0
      DO iproc = 1, num_proc - 1
        displ(iproc + 1) = displ(iproc) + num_task(iproc) * SIZE(array_in, 1)
      END DO ! iproc

      ! create array for output
      size_out = SUM(num_task)
      ALLOCATE(array_out(SIZE(array_in, 1), size_out))

    END IF

    ! gather everything at the root process
    CALL MPI_GATHERV(array_in, send, MPI_DOUBLE_PRECISION, &
                     array_out, receive, displ, MPI_DOUBLE_PRECISION, &
                     root, comm, ierr)
    CALL errore(__FILE__, "error in mpi_gatherv call", ierr)

    IF (my_rank == root) THEN
      DEALLOCATE(displ)
      DEALLOCATE(receive)
    END IF

#else

    ! create a copy of the input array
    ALLOCATE(array_out(SIZE(array_in, 1), SIZE(array_in, 2)))
    array_out = array_in

#endif

  END SUBROUTINE mp_gatherv_rm
  !> Gather an array of complex.
  SUBROUTINE mp_gatherv_cm(comm, root, num_task, array_in, array_out)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size, mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN) :: comm
    !> The root process on which the output is gathered.
    INTEGER,     INTENT(IN) :: root
    !> The distribution of the tasks across the processes.
    INTEGER,     INTENT(IN) :: num_task(:)
    !> The array that is distributed across the processes.
    COMPLEX(dp), INTENT(IN) :: array_in(:,:)
    !> The array that is gathered on the root process. Will be allocated by
    !! this routine, only on the root process.
    COMPLEX(dp), INTENT(OUT), ALLOCATABLE :: array_out(:,:)

    !> actual size of sent data - number of tasks x dimension of the array
    INTEGER send
    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! the size of the gathered array
    INTEGER size_out
    ! rank of this process
    INTEGER my_rank
    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr

    ! we only need to gather the array if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in gatherv - communicator size and distribution don't match", 1)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! we send number of tasks * first dimension entries
    ! note: my_rank + 1 to convert from C to Fortran indices
    send = num_task(my_rank + 1) * SIZE(array_in, 1)

    ! create the information that is relevant at the root
    IF (my_rank == root) THEN

      ! calculate the amount of received data
      ALLOCATE(receive(num_proc))
      receive = num_task * SIZE(array_in, 1)

      ! calculate displacement
      ALLOCATE(displ(num_proc))
      ! note: C array storage - starting at 0
      displ(1) = 0
      DO iproc = 1, num_proc - 1
        displ(iproc + 1) = displ(iproc) + num_task(iproc) * SIZE(array_in, 1)
      END DO ! iproc

      ! create array for output
      size_out = SUM(num_task)
      ALLOCATE(array_out(SIZE(array_in, 1), size_out))

    END IF

    ! gather everything at the root process
    CALL MPI_GATHERV(array_in, send, MPI_DOUBLE_COMPLEX, &
                     array_out, receive, displ, MPI_DOUBLE_COMPLEX, &
                     root, comm, ierr)
    CALL errore(__FILE__, "error in mpi_gatherv call", ierr)

    IF (my_rank == root) THEN
      DEALLOCATE(displ)
      DEALLOCATE(receive)
    END IF

#else

    ! create a copy of the input array
    ALLOCATE(array_out(SIZE(array_in, 1), SIZE(array_in, 2)))
    array_out = array_in

#endif

  END SUBROUTINE mp_gatherv_cm
  !> Gather a 3d tensor of reals.
  SUBROUTINE mp_gatherv_r3d(comm, root, num_task, array_in, array_out)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size, mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN) :: comm
    !> The root process on which the output is gathered.
    INTEGER,  INTENT(IN) :: root
    !> The distribution of the tasks across the processes.
    INTEGER,  INTENT(IN) :: num_task(:)
    !> The array that is distributed across the processes.
    REAL(dp), INTENT(IN) :: array_in(:,:,:)
    !> The array that is gathered on the root process. Will be allocated by
    !! this routine, only on the root process.
    REAL(dp), INTENT(OUT), ALLOCATABLE :: array_out(:,:,:)

    !> actual size of sent data - number of tasks x dimension of the array
    INTEGER send
    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! the size of the gathered array
    INTEGER size_out
    ! rank of this process
    INTEGER my_rank
    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr
    ! the size of the undistributed part of the array
    INTEGER array_dim

    ! we only need to gather the array if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in gatherv - communicator size and distribution don't match", 1)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! evaluate array dimension of the part not split across processes
    array_dim = SIZE(array_in, 1) * SIZE(array_in, 2)

    ! we send number of tasks * array_dim entries
    ! note: my_rank + 1 to convert from C to Fortran indices
    send = num_task(my_rank + 1) * array_dim

    ! create the information that is relevant at the root
    IF (my_rank == root) THEN

      ! calculate the amount of received data
      ALLOCATE(receive(num_proc))
      receive = num_task * array_dim

      ! calculate displacement
      ALLOCATE(displ(num_proc))
      ! note: C array storage - starting at 0
      displ(1) = 0
      DO iproc = 1, num_proc - 1
        displ(iproc + 1) = displ(iproc) + num_task(iproc) * array_dim
      END DO ! iproc

      ! create array for output
      size_out = SUM(num_task)
      ALLOCATE(array_out(SIZE(array_in, 1), SIZE(array_in, 2), size_out))

    END IF

    ! gather everything at the root process
    CALL MPI_GATHERV(array_in, send, MPI_DOUBLE_PRECISION, &
                     array_out, receive, displ, MPI_DOUBLE_PRECISION, &
                     root, comm, ierr)
    CALL errore(__FILE__, "error in mpi_gatherv call", ierr)

    IF (my_rank == root) THEN
      DEALLOCATE(displ)
      DEALLOCATE(receive)
    END IF

#else

    ! create a copy of the input array
    ALLOCATE(array_out(SIZE(array_in, 1), SIZE(array_in, 2), SIZE(array_in, 3)))
    array_out = array_in

#endif

  END SUBROUTINE mp_gatherv_r3d
  !> Gather a 3d tensor of complex.
  SUBROUTINE mp_gatherv_c3d(comm, root, num_task, array_in, array_out)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_size, mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN) :: comm
    !> The root process on which the output is gathered.
    INTEGER,     INTENT(IN) :: root
    !> The distribution of the tasks across the processes.
    INTEGER,     INTENT(IN) :: num_task(:)
    !> The array that is distributed across the processes.
    COMPLEX(dp), INTENT(IN) :: array_in(:,:,:)
    !> The array that is gathered on the root process. Will be allocated by
    !! this routine, only on the root process.
    COMPLEX(dp), INTENT(OUT), ALLOCATABLE :: array_out(:,:,:)

    !> actual size of sent data - number of tasks x dimension of the array
    INTEGER send
    !> actual size of received data - number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: receive(:)
    !> displacement array - sum of the number of tasks x dimension of the array
    INTEGER, ALLOCATABLE :: displ(:)

    ! the size of the gathered array
    INTEGER size_out
    ! rank of this process
    INTEGER my_rank
    ! number of processes
    INTEGER num_proc
    ! loop variable for processes
    INTEGER iproc
    ! MPI error code
    INTEGER ierr
    ! the size of the undistributed part of the array
    INTEGER array_dim

    ! we only need to gather the array if it is distributed
#if defined(__MPI)

    ! determine number of processes
    num_proc = SIZE(num_task)

    ! sanity test of the input
    IF (mp_size(comm) /= num_proc) &
      CALL errore(__FILE__, "error in gatherv - communicator size and distribution don't match", 1)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! evaluate array dimension of the part not split across processes
    array_dim = SIZE(array_in, 1) * SIZE(array_in, 2)

    ! we send number of tasks * array_dim entries
    ! note: my_rank + 1 to convert from C to Fortran indices
    send = num_task(my_rank + 1) * array_dim

    ! create the information that is relevant at the root
    IF (my_rank == root) THEN

      ! calculate the amount of received data
      ALLOCATE(receive(num_proc))
      receive = num_task * array_dim

      ! calculate displacement
      ALLOCATE(displ(num_proc))
      ! note: C array storage - starting at 0
      displ(1) = 0
      DO iproc = 1, num_proc - 1
        displ(iproc + 1) = displ(iproc) + num_task(iproc) * array_dim
      END DO ! iproc

      ! create array for output
      size_out = SUM(num_task)
      ALLOCATE(array_out(SIZE(array_in, 1), SIZE(array_in, 2), size_out))

    END IF

    ! gather everything at the root process
    CALL MPI_GATHERV(array_in, send, MPI_DOUBLE_COMPLEX, &
                     array_out, receive, displ, MPI_DOUBLE_COMPLEX, &
                     root, comm, ierr)
    CALL errore(__FILE__, "error in mpi_gatherv call", ierr)

    IF (my_rank == root) THEN
      DEALLOCATE(displ)
      DEALLOCATE(receive)
    END IF

#else

    ! create a copy of the input array
    ALLOCATE(array_out(SIZE(array_in, 1), SIZE(array_in, 2), SIZE(array_in, 3)))
    array_out = array_in

#endif

  END SUBROUTINE mp_gatherv_c3d

  !> sum a real vector to the root process
  SUBROUTINE mp_root_sum_rv(comm, root, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN)    :: comm
    !> The root process on which the output is gathered.
    INTEGER,  INTENT(IN)    :: root
    !> The array that is distributed across the processes.
    REAL(dp), INTENT(INOUT) :: array(:)

    !> the rank of this process
    INTEGER my_rank
    ! MPI error code
    INTEGER ierr

    ! we only need to sum if the array is distributed
#if defined(__MPI)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! at the root process
    IF (my_rank == root) THEN

      CALL MPI_REDUCE(MPI_IN_PLACE, array, SIZE(array), MPI_DOUBLE_PRECISION, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    ELSE ! not root

      CALL MPI_REDUCE(array, array, SIZE(array), MPI_DOUBLE_PRECISION, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    END IF ! root

#endif

  END SUBROUTINE mp_root_sum_rv
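  !
  ! Illustrative usage sketch for the mp_root_sum wrapper (partial_sum is a
  ! placeholder name): every process accumulates its contribution in a
  ! full-size array; the element-wise sum over all processes is then formed
  ! in place on the root process, while the arrays on the other processes are
  ! left unchanged.
  !
  !   partial_sum = 0.0_dp
  !   ! ... add the contribution of the local tasks to partial_sum ...
  !   CALL mp_root_sum(comm, root, partial_sum)
  !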
  !> sum a complex vector to the root process
  SUBROUTINE mp_root_sum_cv(comm, root, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN)    :: comm
    !> The root process on which the output is gathered.
    INTEGER,     INTENT(IN)    :: root
    !> The array that is distributed across the processes.
    COMPLEX(dp), INTENT(INOUT) :: array(:)

    !> the rank of this process
    INTEGER my_rank
    ! MPI error code
    INTEGER ierr

    ! we only need to sum if the array is distributed
#if defined(__MPI)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! at the root process
    IF (my_rank == root) THEN

      CALL MPI_REDUCE(MPI_IN_PLACE, array, SIZE(array), MPI_DOUBLE_COMPLEX, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    ELSE ! not root

      CALL MPI_REDUCE(array, array, SIZE(array), MPI_DOUBLE_COMPLEX, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    END IF ! root

#endif

  END SUBROUTINE mp_root_sum_cv

  !> sum a real matrix to the root process
  SUBROUTINE mp_root_sum_rm(comm, root, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN)    :: comm
    !> The root process on which the output is gathered.
    INTEGER,  INTENT(IN)    :: root
    !> The array that is distributed across the processes.
    REAL(dp), INTENT(INOUT) :: array(:,:)

    !> the rank of this process
    INTEGER my_rank
    ! MPI error code
    INTEGER ierr

    ! we only need to sum if the array is distributed
#if defined(__MPI)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! at the root process
    IF (my_rank == root) THEN

      CALL MPI_REDUCE(MPI_IN_PLACE, array, SIZE(array), MPI_DOUBLE_PRECISION, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    ELSE ! not root

      CALL MPI_REDUCE(array, array, SIZE(array), MPI_DOUBLE_PRECISION, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    END IF ! root

#endif

  END SUBROUTINE mp_root_sum_rm

  !> sum a complex matrix to the root process
  SUBROUTINE mp_root_sum_cm(comm, root, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN)    :: comm
    !> The root process on which the output is gathered.
    INTEGER,     INTENT(IN)    :: root
    !> The array that is distributed across the processes.
    COMPLEX(dp), INTENT(INOUT) :: array(:,:)

    !> the rank of this process
    INTEGER my_rank
    ! MPI error code
    INTEGER ierr

    ! we only need to sum if the array is distributed
#if defined(__MPI)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! at the root process
    IF (my_rank == root) THEN

      CALL MPI_REDUCE(MPI_IN_PLACE, array, SIZE(array), MPI_DOUBLE_COMPLEX, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    ELSE ! not root

      CALL MPI_REDUCE(array, array, SIZE(array), MPI_DOUBLE_COMPLEX, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    END IF ! root

#endif

  END SUBROUTINE mp_root_sum_cm
  !> sum a real 3d tensor to the root process
  SUBROUTINE mp_root_sum_r3d(comm, root, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,  INTENT(IN)    :: comm
    !> The root process on which the output is gathered.
    INTEGER,  INTENT(IN)    :: root
    !> The array that is distributed across the processes.
    REAL(dp), INTENT(INOUT) :: array(:,:,:)

    !> the rank of this process
    INTEGER my_rank
    ! MPI error code
    INTEGER ierr

    ! we only need to sum if the array is distributed
#if defined(__MPI)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! at the root process
    IF (my_rank == root) THEN

      CALL MPI_REDUCE(MPI_IN_PLACE, array, SIZE(array), MPI_DOUBLE_PRECISION, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    ELSE ! not root

      CALL MPI_REDUCE(array, array, SIZE(array), MPI_DOUBLE_PRECISION, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    END IF ! root

#endif

  END SUBROUTINE mp_root_sum_r3d

  !> sum a complex 3d tensor to the root process
  SUBROUTINE mp_root_sum_c3d(comm, root, array)

    USE kinds, ONLY: dp
    USE mp,    ONLY: mp_rank
    USE parallel_include

    !> The communicator across which the tasks are distributed.
    INTEGER,     INTENT(IN)    :: comm
    !> The root process on which the output is gathered.
    INTEGER,     INTENT(IN)    :: root
    !> The array that is distributed across the processes.
    COMPLEX(dp), INTENT(INOUT) :: array(:,:,:)

    !> the rank of this process
    INTEGER my_rank
    ! MPI error code
    INTEGER ierr

    ! we only need to sum if the array is distributed
#if defined(__MPI)

    ! determine rank of this process
    my_rank = mp_rank(comm)

    ! at the root process
    IF (my_rank == root) THEN

      CALL MPI_REDUCE(MPI_IN_PLACE, array, SIZE(array), MPI_DOUBLE_COMPLEX, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    ELSE ! not root

      CALL MPI_REDUCE(array, array, SIZE(array), MPI_DOUBLE_COMPLEX, &
                      MPI_SUM, root, comm, ierr)
      CALL errore(__FILE__, "error in mpi_reduce call", ierr)

    END IF ! root

#endif

  END SUBROUTINE mp_root_sum_c3d

END MODULE parallel_module