      module para_mod
      integer maxproc, ncplanex
      parameter (maxproc=64, ncplanex=30000)
      character*2 node
! node: node number, useful for opening files
      integer nproc, me
! nproc: number of processors
! me:    number of this processor
!
! parallel fft information for the dense grid
!
! npp:     number of planes per processor
! n3:      n3(me)+1 = first plane on proc. me
! ncp:     number of (density) columns per proc
! ncp0:    starting column for each processor
! ncplane: number of columns in a plane
! nct:     total number of non-zero columns
! nnr_:    local fft data size
! ipc:     index saying which proc owns columns in a plane
! icpl:    index relating columns and pos. in the plane
!
      integer npp(maxproc), n3(maxproc), ncp(maxproc), ncp0(maxproc), &
              ncplane, nct, nnr_, ipc(ncplanex), icpl(ncplanex)
!
! parallel fft information for the smooth mesh
!
! npps:    number of planes per processor
! ncps:    number of (density) columns per proc
! ncpw:    number of (wfs) columns per processor
! ncp0s:   starting column for each processor
! ncplanes:number of columns in a plane (smooth)
! ncts:    total number of non-zero columns
! nnrs_:   local fft data size
! ipcs:    index saying which proc owns columns in a plane
! icpls:   index relating columns and pos. in the plane
!
      integer npps(maxproc), ncps(maxproc), ncpw(maxproc), ncp0s(maxproc), &
              ncplanes, ncts, nnrs_, ipcs(ncplanex), icpls(ncplanex)
      end module para_mod
!
!-----------------------------------------------------------------------
      subroutine startup
!-----------------------------------------------------------------------
!
! This subroutine initializes MPI
! NPROC is set by the environment variable MP_PROCS
!
      use para_mod
!
      implicit none
      include 'mpif.h'
      integer ierr
!
      call mpi_init(ierr)
      if (ierr.ne.0) call error('startup','mpi_init',ierr)
      call mpi_comm_size(MPI_COMM_WORLD,nproc,ierr)
      if (ierr.ne.0) call error('startup','mpi_comm_size',ierr)
      call mpi_comm_rank(MPI_COMM_WORLD, me,ierr)
      if (ierr.ne.0) call error('startup','mpi_comm_rank',ierr)
!
      me=me+1
!
! parent process (source) will have me=1 - child processes me=2,...,NPROC
! (for historical reasons: MPI uses 0,...,NPROC-1 instead)
!
      if (nproc.gt.maxproc) &
     &   call error('startup',' too many processors ',nproc)
!
      if (me.lt.10) then
         write(node,'(i1,1x)') me
      else if (me.lt.100) then
         write(node,'(i2)') me
      else
         call error('startup','wow, >100 nodes !!',nproc)
      end if
!
! only the first processor writes
!
      if (me.eq.1) then
         write(6,'(/5x,''Parallel version (MPI)'')')
         write(6,'(5x,''Number of processors in use: '',i4)') nproc
      else
         open(6,file='/dev/null',status='unknown')
!
! useful for debugging purposes:
!        open(6,file='out.'//node,status='unknown')
      end if
!
      return
      end
!
!-----------------------------------------------------------------------
      subroutine openfiles (tbuff,nbeg,ndr,ndw)
!-----------------------------------------------------------------------
!
! Open scratch files, parallel case
! Scratch files (units 21 and ndw) are opened
!  - in the current directory if the variable SCRDIR is not set, or
!  - in the directory pointed to by the environment variable SCRDIR, if set
! Other files - including input files - are opened in the current directory
!
! For IBM SP machines without a fast parallel file system, the file ndw
! is written to the current directory at the end of the execution.
! If the current directory is a distributed filesystem (like the home
! directory) this makes it possible to find the final file. For optimal
! performance, SCRDIR should be a local filesystem (in order to reduce
! network I/O).
!
      use para_mod
!
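! Illustration (values are hypothetical): with SCRDIR=/scratch and ndw=91,
! unit ndw is opened below as /scratch/fort.91 (and unit 21, if used, as
! /scratch/fort21.<node>), while unit ndr is always opened as fort.<ndr>
! in the current directory
!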
      implicit none
      logical tbuff
      integer nbeg, ndr, ndw
!
      integer i, ios
      character unitndr*2, unitndw*2, scrdir*80
!
!
      if (ndr.lt.10.or.ndr.ge.100) call error('openfile','ndr ?!?',ndr)
      if (ndw.lt.10.or.ndw.ge.100) call error('openfile','ndw ?!?',ndw)
      write(unitndr,'(i2)') ndr
      write(unitndw,'(i2)') ndw
#ifdef T3D
      call pxfgetenv('SCRDIR',0,scrdir,i,ios)
#else
      call getenv('SCRDIR',scrdir)
#endif
!
! add a '/' at the end if absent and needed
!
      i=len(trim(scrdir))
      if (i.gt.0) then
         if (scrdir(i:i).ne.'/') then
            if (i+1.le.len(scrdir)) then
               scrdir(i+1:i+1)='/'
            else
               call error('openfile','scratch dir too long',i)
            end if
         end if
      end if
!
! unit 21 contains wavefunctions in real space - untested!
!
      if (tbuff) open(unit=21,file=trim(scrdir)//'fort21.'//node, &
     &                form='unformatted')
!
! in parallel execution only the first node reads/writes wavefunctions
!
      if (me.eq.1) then
         open(unit=ndw,file=trim(scrdir)//'fort.'//unitndw, &
     &        form='unformatted')
         if (nbeg.ge.-1) then
            if(ndr.ne.ndw) then
               open(unit=ndr,file='fort.'//unitndr, &
     &              status='old',form='unformatted')
            else if (trim(scrdir).ne.' ') then
!
! if ndr=ndw, everything will work if we simply do not open ndr,
! but since ndr is read from the current directory and ndw is written
! to SCRDIR, the trick will not work if SCRDIR != current directory
!
               call error('openfile','ndr=ndw not possible',1)
            end if
         end if
      end if
!
      return
      end
!
!-----------------------------------------------------------------------
      subroutine reopenfile (ndw)
!-----------------------------------------------------------------------
!
! IBM SP hack:
! closes unit ndw on scrdir, reopens it in the current directory
!
      use para_mod
!
      implicit none
      integer ndw
      character unitndw*2
!
      close(unit=ndw)
      write(unitndw,'(i2)') ndw
      open(unit=ndw,file='fort.'//unitndw,form='unformatted')
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine read_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! read rhor(nnr,nspin) from file on the first node and distribute it
! to the other nodes
!
      use para_mod
      use parm
      implicit none
      include 'mpif.h'
      integer unit, nspin
      real*8 rhor(nnr,nspin)
!
      integer ir, is
      integer root, proc, ierr, n, displs(nproc), sendcount(nproc)
      real*8, allocatable:: rhodist(:)
!
!
      if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
      root = 0
      do proc=1,nproc
         sendcount(proc) = ncplane*npp(proc)
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + sendcount(proc-1)
         end if
      end do
      do is=1,nspin
!
! read the charge density from unit "unit" on first node only
!
         if (me.eq.1) read(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
!
! distribute the charge density to the other nodes
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         call mpi_scatterv(rhodist, sendcount, displs, MPI_REAL8, &
     &                     rhor(1,is),sendcount(me), MPI_REAL8, &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('mpi_scatterv','ierr<>0',ierr)
!
! just in case: set to zero unread elements (if any)
!
         do ir=sendcount(me)+1,nnr
            rhor(ir,is)=0.d0
         end do
      end do
      if (me.eq.1) deallocate(rhodist)
!
      return
      end subroutine read_rho
!
!----------------------------------------------------------------------
      subroutine write_rho(unit,nspin,rhor)
!----------------------------------------------------------------------
!
! collect rhor(nnr,nspin) on the first node and write it to file
!
      use para_mod
      use parm
      implicit none
      include 'mpif.h'
      integer unit, nspin
      real*8 rhor(nnr,nspin)
!
      integer ir, is
      integer root, proc, ierr, displs(nproc), recvcount(nproc)
      real*8, allocatable:: rhodist(:)
!
!
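! Note: the charge density is gathered plane by plane; node "proc"
! contributes its ncplane*npp(proc) local points, and rhodist (allocated
! on the first node only) holds the complete nr1x*nr2x*nr3x grid
!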
      if (me.eq.1) allocate(rhodist(nr1x*nr2x*nr3x))
!
      root = 0
      do proc=1,nproc
         recvcount(proc) = ncplane*npp(proc)
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + recvcount(proc-1)
         end if
      end do
!
      do is=1,nspin
!
! gather the charge density on the first node
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         call mpi_gatherv (rhor(1,is), recvcount(me), MPI_REAL8, &
     &                     rhodist,recvcount, displs, MPI_REAL8, &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('mpi_gatherv','ierr<>0',ierr)
!
! write the charge density to unit "unit" from the first node only
!
         if (me.eq.1) write(unit) (rhodist(ir),ir=1,nr1x*nr2x*nr3x)
      end do
      if (me.eq.1) deallocate(rhodist)
!
      return
      end subroutine write_rho
!
!
!-----------------------------------------------------------------------
      subroutine set_fft_para( b1, b2, b3, gcut, gcuts, gcutw, &
     &                         nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr, &
     &                         nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs )
!-----------------------------------------------------------------------
!
! distribute columns to processes for the parallel fft
! based on code written by Stefano de Gironcoli for PWSCF
! columns are sets of g-vectors along z: g(k) = i1*b1+i2*b2+i3*b3 ,
! with g^2<gcut and (i1,i2) running over the (xy) plane.
! Columns are "active" for a given (i1,i2) if they contain a nonzero
! number of wavevectors
!
      use para_mod
!
      implicit none
      real*8 b1(3), b2(3), b3(3), gcut, gcuts, gcutw
      integer nr1, nr2, nr3, nr1s, nr2s, nr3s, nnr, &
     &        nr1x,nr2x,nr3x,nr1sx,nr2sx,nr3sx,nnrs
!
      integer ngc(ncplanex),  &! number of g-vectors per column (dense grid)
     &        ngcs(ncplanex), &! number of g-vectors per column (smooth grid)
     &        ngcw(ncplanex), &! number of wavefct plane waves per column
     &        in1(ncplanex),  &! index i for column (i1,i2)
     &        in2(ncplanex),  &! index j for column (i1,i2)
     &        ic,             &! fft index for this column (dense grid)
     &        ics,            &! as above for the smooth grid
     &        icm,            &! fft index for column (-i1,-i2) (dense grid)
     &        icms,           &! as above for the smooth grid
     &        index(ncplanex),&! used to order columns
     &        ncp_(maxproc),  &! number of columns per processor (work space)
     &        ngp(maxproc),   &! number of g-vectors per proc (dense grid)
     &        ngps(maxproc),  &! number of g-vectors per proc (smooth grid)
     &        ngpw(maxproc)    ! number of wavefct plane waves per proc
!
      integer np, nps1,       &! counters on planes
     &        nq, nqs,        &! counters on planes
     &        max1,min1,max2,min2, &! aux. variables
     &        m1, m2, n1, n2, i, mc, &! generic counters
     &        idum, nct_,     &! check variables
     &        j,jj,           &! counters on processors
     &        n1m1,n2m1,n3m1, &! nr1/2 and so on
     &        i1, i2, i3,     &! counters on G space
     &        good_fft_dimension ! a function with obvious meaning
      real*8 &
     &        aux(ncplanex),  &! used to order columns
     &        amod             ! modulus of G vectors
!
      call tictac(27,0)
!
! set the dimensions of fft arrays
!
      nr1x =good_fft_dimension(nr1 )
      nr2x = nr2
      nr3x =good_fft_dimension(nr3 )
!
      nr1sx=good_fft_dimension(nr1s)
      nr2sx=nr2s
      nr3sx=good_fft_dimension(nr3s)
!
! compute the number of columns per plane
!
      ncplane = nr1x*nr2x
      ncplanes= nr1sx*nr2sx
      if (ncplane.gt.ncplanex .or. ncplanes.gt.ncplanex) &
     &   call error('set_fft_para','ncplanex too small',ncplane)
!
! set the number of planes per process
!
      if (nr3.lt.nproc) call error('set_fft_para', &
     &     'some processors have no planes ',-1)
      if (nr3s.lt.nproc) call error('set_fft_para', &
     &     'some processors have no smooth planes ',-1)
!
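! Example of the distribution computed below: nr3=11 planes on nproc=4
! processors gives np=2, nq=3, hence npp = 3 3 3 2 (the first nq
! processors get one extra plane); npps is built in the same way from nr3s
!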
      if (nproc.eq.1) then
         npp(1) = nr3
         npps(1)= nr3s
      else
         np = nr3/nproc
         nq = nr3 - np*nproc
         nps1 = nr3s/nproc
         nqs = nr3s - nps1*nproc
         do i = 1, nproc
            npp(i) = np
            if (i.le.nq) npp(i) = np + 1
            npps(i) = nps1
            if (i.le.nqs) npps(i) = nps1 + 1
         end do
      end if
      n3(1)= 0
      do i = 2, nproc
         n3(i)=n3(i-1)+npp(i-1)
      end do
!
! Now compute, for each point of the big plane, how many columns have
! non-zero vectors on the smooth and dense grid
!
      do mc = 1,ncplane
         ipc(mc) = 0
      end do
      do mc = 1,ncplanes
         ipcs(mc)= 0
      end do
!
      nct = 0
      ncts= 0
!
! NOTA BENE: the exact limits for a correctly sized FFT grid are:
! -nr/2,..,+nr/2 for nr even; -(nr-1)/2,..,+(nr-1)/2 for nr odd.
! If the following limits are increased, a slightly undersized fft
! grid, with some degree of G-vector refolding, can be used
! (at your own risk - a check is done in ggen).
!
      n1m1=nr1/2
      n2m1=nr2/2
      n3m1=nr3/2
!
      do i1=-n1m1,n1m1
         do i2=-n2m1,n2m1
!
! nct counts columns containing G-vectors for the dense grid
!
            nct=nct+1
            if (nct.gt.ncplane) &
     &         call error('set_fft_para','too many columns',1)
            ngc (nct) = 0
            ngcs(nct) = 0
            ngcw(nct) = 0
!
            do i3 = -n3m1,n3m1
               amod = (b1(1)*i1 + b2(1)*i2 + b3(1)*i3)**2 + &
     &                (b1(2)*i1 + b2(2)*i2 + b3(2)*i3)**2 + &
     &                (b1(3)*i1 + b2(3)*i2 + b3(3)*i3)**2
               if (amod.le.gcut ) &
     &            ngc (nct)= ngc (nct)+ 1
               if (amod.le.gcuts) &
     &            ngcs(nct)= ngcs(nct)+ 1
               if (amod.le.gcutw) &
     &            ngcw(nct)= ngcw(nct)+ 1
            enddo
            if (ngc(nct).gt.0) then
!
! this column contains G-vectors
!
               in1(nct) = i1
               in2(nct) = i2
               if (ngcs(nct).gt.0) then
!
! ncts counts columns containing G-vectors for the smooth grid
!
                  ncts=ncts+1
                  if (ncts.gt.ncplanes) &
     &               call error('set_fft_para','too many columns',2)
               end if
            else
!
! this column has no G-vectors: reset the counter
!
               nct=nct-1
            end if
         enddo
      end do
!
      if(nct .eq.0) call error('set_fft_para','number of column 0', 1)
      if(ncts.eq.0) call error('set_fft_para', &
     &                         'number smooth column 0', 1)
!
! Sort the columns: first the columns with the largest number of G
! vectors on the wavefunction sphere (active columns),
! then on the smooth sphere, then on the big sphere. Dirty trick:
!
      do mc = 1,nct
         aux(mc)=-(ngcw(mc)*nr3x**2 + ngcs(mc)*nr3x + ngc(mc))
      end do
      call kb07ad(aux,nct,index)
!
! assign columns to processes
!
      do j=1,nproc
         ncp (j) = 0
         ncps(j) = 0
         ncpw(j) = 0
         ngp (j) = 0
         ngps(j) = 0
         ngpw(j) = 0
      end do
!
      do mc=1, nct
         i = index(mc)
!
! index contains the desired ordering of columns (see above)
!
         i1=in1(i)
         i2=in2(i)
!
         if ( i1.lt.0.or.(i1.eq.0.and.i2.lt.0) ) go to 30
!
! only half of the columns, plus column (0,0), are scanned:
! column (-i1,-i2) must be assigned to the same proc as column (i1,i2)
!
! ic  : position, in fft notation, in the dense  grid, of column ( i1, i2)
! icm :    "            "             "    "      "         "    (-i1,-i2)
! ics :    "            "             " smooth    "         "    ( i1, i2)
! icms:    "            "             " smooth    "         "    (-i1,-i2)
!
         m1 = i1 + 1
         if (m1.lt.1) m1 = m1 + nr1
         m2 = i2 + 1
         if (m2.lt.1) m2 = m2 + nr2
         ic = m1 + (m2-1)*nr1x
!
         n1 = -i1 + 1
         if (n1.lt.1) n1 = n1 + nr1
         n2 = -i2 + 1
         if (n2.lt.1) n2 = n2 + nr2
         icm = n1 + (n2-1)*nr1x
!
         m1 = i1 + 1
         if (m1.lt.1) m1 = m1 + nr1s
         m2 = i2 + 1
         if (m2.lt.1) m2 = m2 + nr2s
         ics = m1 + (m2-1)*nr1sx
!
         n1 =-i1 + 1
         if (n1.lt.1) n1 = n1 + nr1s
         n2 =-i2 + 1
         if (n2.lt.1) n2 = n2 + nr2s
         icms = n1 + (n2-1)*nr1sx
!
         jj=1
         if (ngcw(i).gt.0) then
!
! this is an active column: find which processor currently has
! the smallest number of plane waves
!
            do j=1,nproc
               if (ngpw(j).lt.ngpw(jj)) jj = j
            end do
         else
!
! this is an inactive column: find which processor currently has
! the smallest number of G-vectors
!
            do j=1,nproc
               if (ngp(j).lt.ngp(jj)) jj = j
            end do
         end if
!
! jj is the processor to which this column is assigned:
! use -jj for inactive columns, jj for active columns
!
         ipc(ic) = -jj
         ncp(jj) = ncp(jj) + 1
         ngp(jj) = ngp(jj) + ngc(i)
         if (ngcs(i).gt.0) then
            ncps(jj)=ncps(jj)+1
            ngps(jj)=ngps(jj)+ngcs(i)
            ipcs(ics)=-jj
         endif
         if (ngcw(i).gt.0) then
            ipcs(ics)=jj
            ipc(ic) = jj
            ngpw(jj)= ngpw(jj) + ngcw(i)
            ncpw(jj)= ncpw(jj) + 1
         endif
!
! now assign the (-i1,-i2) column to the same processor
!
         if (i1.eq.0.and.i2.eq.0) go to 30
!
! do not count twice column (0,0)
!
!
         ipc(icm) = -jj
         ncp(jj) = ncp(jj) + 1
         ngp(jj) = ngp(jj) + ngc(i)
         if (ngcs(i).gt.0) then
            ncps(jj)=ncps(jj)+1
            ngps(jj)=ngps(jj)+ngcs(i)
            ipcs(icms)=-jj
         endif
         if (ngcw(i).gt.0) then
            ipcs(icms)=jj
            ipc(icm) = jj
            ngpw(jj)= ngpw(jj) + ngcw(i)
            ncpw(jj)= ncpw(jj) + 1
         endif
 30      continue
      enddo
!
! ipc  is the processor for this column in the dense grid
! ipcs is the same, for the smooth grid
!
      write(6,'( &
     &  ''  Proc  planes cols     G    planes cols     G    columns    G''/ &
     &  ''        (dense grid)         (smooth grid)       (wavefct grid)'')')
      do i=1,nproc
         write(6,'(i3,2x,3(i5,2i7))') i, npp(i),ncp(i),ngp(i), &
     &        npps(i),ncps(i),ngps(i), ncpw(i), ngpw(i)
      end do
!
! nnr_ and nnrs_ are copies of nnr and nnrs, the local fft data size,
! to be stored in "parallel" commons. Not a very elegant solution.
!
      if (nproc.eq.1) then
         nnr =nr1x*nr2x*nr3x
         nnrs=nr1sx*nr2sx*nr3sx
      else
         nnr = max(nr3x*ncp(me), nr1x*nr2x*npp(me))
         nnrs= max(nr3sx*ncps(me), nr1sx*nr2sx*npps(me))
      end if
      nnr_ = nnr
      nnrs_= nnrs
!
! compute the starting column for each processor
!
      do i=1,nproc
         if(ngpw(i).eq.0) &
     &      call error('set_fft_para', &
     &      'some processors have no pencils, not yet implemented',1)
         if (i.eq.1) then
            ncp0(i) = 0
            ncp0s(i)= 0
         else
            ncp0(i) = ncp0 (i-1) + ncp (i-1)
            ncp0s(i)= ncp0s(i-1) + ncps(i-1)
         endif
      enddo
!
! Now compute the arrays ipc and icpl (dense grid):
! ipc contains the number of the column for that processor,
! or zero if the column does not belong to the processor.
! Note that the input ipc is used and overwritten.
! icpl contains the point in the plane for each column
!
!- active columns first........
!
      do j=1,nproc
         ncp_(j) = 0
      end do
      do mc =1,ncplane
         if (ipc(mc).gt.0) then
            j = ipc(mc)
            ncp_(j) = ncp_(j) + 1
            icpl(ncp_(j) + ncp0(j)) = mc
            if (j.eq.me) then
               ipc(mc) = ncp_(j)
            else
               ipc(mc) = 0
            end if
         end if
      end do
!
!-..... ( intermediate check ) ....
!
      do j=1,nproc
         if (ncp_(j).ne.ncpw(j)) &
     &      call error('set_fft_para','ncp_(j).ne.ncpw(j)',j)
      end do
!
!- ........then the remaining columns
!
      do mc =1,ncplane
         if (ipc(mc).lt.0) then
            j = -ipc(mc)
            ncp_(j) = ncp_(j) + 1
            icpl(ncp_(j) + ncp0(j)) = mc
            if (j.eq.me) then
               ipc(mc) = ncp_(j)
            else
               ipc(mc) = 0
            end if
         end if
      end do
!
!-... ( final check )
!
      nct_ = 0
      do j=1,nproc
         if (ncp_(j).ne.ncp(j)) &
     &      call error('set_fft_para','ncp_(j).ne.ncp(j)',j)
         nct_ = nct_ + ncp_(j)
      end do
      if (nct_.ne.nct) &
     &   call error('set_fft_para','nct_.ne.nct',1)
!
! now compute the arrays ipcs and icpls
! (as ipc and icpl, for the smooth grid)
!
! active columns first...
!
      do j=1,nproc
         ncp_(j) = 0
      end do
      do mc =1,ncplanes
         if (ipcs(mc).gt.0) then
            j = ipcs(mc)
            ncp_(j)=ncp_(j) + 1
            icpls(ncp_(j) + ncp0s(j)) = mc
            if (j.eq.me) then
               ipcs(mc) = ncp_(j)
            else
               ipcs(mc) = 0
            endif
         endif
      enddo
!
!-..... ( intermediate check ) ....
!
      do j=1,nproc
         if (ncp_(j).ne.ncpw(j)) &
     &      call error('set_fft_para','ncp_(j).ne.ncpw(j)',j)
      end do
!
! and then all the others
!
      do mc =1,ncplanes
         if (ipcs(mc).lt.0) then
            j = -ipcs(mc)
            ncp_(j) = ncp_(j) + 1
            icpls(ncp_(j) + ncp0s(j)) = mc
            if (j.eq.me) then
               ipcs(mc) = ncp_(j)
            else
               ipcs(mc) = 0
            end if
         end if
      end do
!
!-... ( final check )
!
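! (what is checked below: the number of smooth-grid columns assigned to
!  each processor must equal ncps(j), and their sum must equal ncts)
!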
      nct_ = 0
      do j=1,nproc
         if (ncp_(j).ne.ncps(j)) &
     &      call error('set_fft_para','ncp_(j).ne.ncps(j)',j)
         nct_ = nct_ + ncp_(j)
      end do
      if (nct_.ne.ncts) &
     &   call error('set_fft_para','nct_.ne.ncts',1)
      call tictac(27,1)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine cfftp(f,nr1,nr2,nr3,nr1x,nr2x,nr3x,sign)
!----------------------------------------------------------------------
!
! sign = +-1 : parallel 3d fft for rho and for the potential
!
! sign = +1 : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!             fft along z using pencils   (cft_1)
!             transpose across nodes      (fft_scatter)
!                and reorder
!             fft along y and x           (cft_2)
! sign = -1 : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
!             fft along x and y           (cft_2)
!             transpose across nodes      (fft_scatter)
!                and reorder
!             fft along z using pencils   (cft_1)
!
! based on code written by Stefano de Gironcoli for PWSCF
!
      use para_mod
      use work_fft
!
      implicit none
      integer nr1,nr2,nr3,nr1x,nr2x,nr3x, sign, nppx
      complex*16 f(nnr_)
      integer mc, i, j, ii
!
! the following is needed if the fft is distributed over only one processor
! for the special case nr3x.ne.nr3. Not an elegant solution, but simple, fast,
! and better than the preceding one that did not work in some cases. Note
! that fft_scatter does nothing if nproc=1. PG
!
      if (nproc.eq.1) then
         nppx=nr3x
      else
         nppx=npp(me)
      end if
      if (sign.eq.1) then
         call cft_1(f,ncp(me),nr3,nr3x,sign,aux)
         call fft_scatter(nproc,me,aux,nr3x,nnr_,f,ncp,npp,sign)
         call zero(2*nnr_,f)
         do i=1,nct
            mc = icpl(i)
            do j=1,npp(me)
               f(mc+(j-1)*ncplane) = aux(j + (i-1)*nppx)
            end do
         end do
!
         call cft_2(f,npp(me),nr1,nr2,nr1x,nr2x,sign)
!
      else if (sign.eq.-1) then
!
         call cft_2(f,npp(me),nr1,nr2,nr1x,nr2x,sign)
!
         do i=1,nct
            mc = icpl(i)
            do j=1,npp(me)
               aux(j + (i-1)*nppx) = f(mc+(j-1)*ncplane)
            end do
         end do
         call fft_scatter(nproc,me,aux,nr3x,nnr_,f,ncp,npp,sign)
         call cft_1(aux,ncp(me),nr3,nr3x,sign,f)
      else
         call error('cfftp','not allowed',abs(sign))
      end if
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine cfftps (f,nr1,nr2,nr3,nr1x,nr2x,nr3x,sign)
!----------------------------------------------------------------------
!
! sign = +-1 : parallel 3d fft for rho and for the potential
! sign = +-2 : parallel 3d fft for wavefunctions
!
! sign = + : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
!            fft along z using pencils        (cft_1)
!            transpose across nodes           (fft_scatter)
!               and reorder
!            fft along y (using planes) and x (cft_2)
! sign = - : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
!            fft along x and y (using planes) (cft_2)
!            transpose across nodes           (fft_scatter)
!               and reorder
!            fft along z using pencils        (cft_1)
!
! The array "planes" signals whether a fft is needed along y :
!   planes(i)=0 : column f(i,*,*) empty , don't do fft along y
!   planes(i)=1 : column f(i,*,*) filled, fft along y needed
! "empty" = no active components are present in f(i,*,*)
!           after (sign>0) or before (sign<0) the fft on the z direction
!
! Note that if sign=+/-1 (fft on rho and pot.) all fft's are needed
! and all planes(i) are set to 1
!
! based on code written by Stefano de Gironcoli for PWSCF
!
      use para_mod
      use work_fft
!
      implicit none
      integer nr1,nr2,nr3,nr1x,nr2x,nr3x,sign
      complex*16 f(nnrs_)
      integer mc, i, j, ii, proc, k, nppx
      integer planes(nr1x)
!
! see the comments in cfftp above for the logic (or lack of it) of the following
!
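! Note: for sign=+-2 only the ncpw wavefunction columns take part in the
! transpose and the y-fft is skipped on planes flagged as empty; for
! sign=+-1 all ncps smooth-grid columns are used and every plane is
! transformed
!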
      if (nproc.eq.1) then
         nppx=nr3x
      else
         nppx=npps(me)
      end if
      if (sign.gt.0) then
         if (sign.ne.2) then
            call cft_1s(f,ncps(me),nr3,nr3x,sign,aux)
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncps,npps,sign)
            call zero(2*nnrs_,f)
            do i=1,ncts
               mc = icpls(i)
               do j=1,npps(me)
                  f(mc+(j-1)*ncplanes) = aux(j + (i-1)*nppx)
               end do
            end do
            do i=1,nr1x
               planes(i) = 1
            enddo
         else
            call cft_1s(f,ncpw(me),nr3,nr3x,sign,aux)
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncpw,npps,sign)
            call zero(2*nnrs_,f)
            ii = 0
            do i=1,nr1x
               planes(i) = 0
            enddo
            do proc=1,nproc
               do i=1,ncpw(proc)
                  mc = icpls(i+ncp0s(proc))
                  ii = ii + 1
                  k=mod(mc-1,nr1x)+1
                  planes(k) = 1
                  do j=1,npps(me)
                     f(mc+(j-1)*ncplanes) = aux(j + (ii-1)*nppx)
                  end do
               end do
            end do
         end if
!
         call cft_2s(f,npps(me),nr1,nr2,nr1x,nr2x,sign,planes)
!
      else
!
         if (sign.ne.-2) then
            do i=1,nr1x
               planes(i) = 1
            end do
         else
            do i=1,nr1x
               planes(i) = 0
            end do
            do proc=1,nproc
               do i=1,ncpw(proc)
                  mc = icpls(i+ncp0s(proc))
                  k=mod(mc-1,nr1x)+1
                  planes(k) = 1
               enddo
            enddo
         endif
!
         call cft_2s(f,npps(me),nr1,nr2,nr1x,nr2x,sign,planes)
!
         if (sign.ne.-2) then
            do i=1,ncts
               mc = icpls(i)
               do j=1,npps(me)
                  aux(j + (i-1)*nppx) = f(mc+(j-1)*ncplanes)
               end do
            end do
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncps,npps,sign)
            call cft_1s(aux,ncps(me),nr3,nr3x,sign,f)
         else
            ii = 0
            do proc=1,nproc
               do i=1,ncpw(proc)
                  mc = icpls(i+ncp0s(proc))
                  ii = ii + 1
                  do j=1,npps(me)
                     aux(j + (ii-1)*nppx) = f(mc+(j-1)*ncplanes)
                  end do
               end do
            end do
            call fft_scatter(nproc,me,aux,nr3x,nnrs_,f,ncpw,npps,sign)
            call cft_1s(aux,ncpw(me),nr3,nr3x,sign,f)
         end if
      end if
!
      return
      end
!
!------------------------------------------------------------------------
      subroutine fft_scatter &
     &   (nproc, me, f_in, nr3x, nnr_, f_aux, ncp_, npp_, sign)
!------------------------------------------------------------------------
!
! transpose the fft grid across nodes
! a) From columns to planes (sign > 0)
!
!    "columns" (or "pencil") representation:
!    processor "me" has ncp_(me) contiguous columns along z
!    Each column is nr3x=nr3+1 elements for a fft of order nr3
!    (the additional element is added to reduce memory conflicts)
!
!    The transpose takes place in two steps:
!    1) on each processor the columns are divided into slices along z
!       that are stored contiguously. On processor "me", slices for
!       processor "proc" are npp_(proc)*ncp_(me) big
!    2) all processors communicate to exchange slices
!       (all columns with z in the slice belonging to "me"
!        must be received, all the others must be sent to "proc")
!    Finally one gets the "planes" representation:
!    processor "me" has npp_(me) complete xy planes
!
! b) From planes to columns (sign < 0)
!
!    Quite the same in the opposite direction
!
! The output is overwritten on f_in ; f_aux is used as work space
!
! based on code written by Stefano de Gironcoli for PWSCF
!
      implicit none
      integer maxproc
      parameter (maxproc=64)
!
      include 'mpif.h'
      integer nproc, me, nr3x, nnr_, sign, ncp_(nproc), npp_(nproc)
      real*8 f_in(2*nnr_), f_aux(2*nnr_)
!
      integer dest, from, k, offset1(maxproc), &
     &        sendcount(maxproc), sdispls(maxproc), &
     &        recvcount(maxproc), rdispls(maxproc), &
     &        proc, ierr
!
!
      if (nproc.eq.1) return
      call tictac(28,0)
!
! sendcount(proc): amount of data processor "me" must send to processor proc
! recvcount(proc): amount of data processor "me" must receive from proc
!
      do proc = 1, nproc
         sendcount(proc) = 2*npp_(proc)*ncp_(me)
         recvcount(proc) = 2*npp_(me)*ncp_(proc)
      end do
!
! offset1(proc)   is used to locate the slices to be sent to proc
! sdispls(proc)+1 is the beginning of data that must be sent to proc
! rdispls(proc)+1 is the beginning of data that must be received from proc
!
      offset1(1) = 1
      sdispls(1) = 0
      rdispls(1) = 0
      do proc = 2, nproc
         offset1(proc) = offset1(proc-1) + 2 * npp_(proc-1)
         sdispls(proc) = sdispls(proc-1) + sendcount(proc-1)
         rdispls(proc) = rdispls(proc-1) + recvcount(proc-1)
      end do
!
      if(sign.gt.0) then
!
! "forward" scatter from columns to planes
!
! step one: store the slices contiguously
!
         do proc = 1, nproc
            from = offset1(proc)
            dest = 1 + sdispls(proc)
            do k = 1, ncp_(me)
               call COPY ( 2 * npp_(proc), &
     &                     f_in ( from + 2*(k-1)*nr3x) , 1, &
     &                     f_aux( dest + 2*(k-1)*npp_(proc) ) , 1 )
            end do
         end do
!
! maybe useless; ensures that no garbage is present in the output
!
         call zero (2*nnr_,f_in)
!
! step two: communication
!
         call mpi_barrier( MPI_COMM_WORLD, ierr )
         call mpi_alltoallv(f_aux,sendcount,sdispls,MPI_REAL8, &
     &                      f_in ,recvcount,rdispls,MPI_REAL8, &
     &                      MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('fft_scatter','ierr<>0',ierr)
!
      else
!
! step two: communication
!
         call mpi_barrier( MPI_COMM_WORLD, ierr )
         call mpi_alltoallv(f_in ,recvcount,rdispls,MPI_REAL8, &
     &                      f_aux,sendcount,sdispls,MPI_REAL8, &
     &                      MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('fft_scatter','ierr<>0',ierr)
!
! step one: store the columns contiguously
!
         call zero (2*nnr_,f_in)
!
         do proc = 1, nproc
            from = 1 + sdispls(proc)
            dest = offset1(proc)
            do k = 1, ncp_(me)
               call COPY ( 2 * npp_(proc), &
     &                     f_aux( from + 2*(k-1)*npp_(proc) ) , 1 , &
     &                     f_in ( dest + 2*(k-1)*nr3x) , 1 )
            end do
         end do
!
      end if
      call tictac(28,1)
!
      return
      end
!
!-----------------------------------------------------------------------
      subroutine reduce(size,ps)
!-----------------------------------------------------------------------
!
! sums the distributed variable ps(size) over the processors.
! This version uses a fixed-length buffer of appropriate (?) size
!
      use para_mod
!
      implicit none
      integer size
      real*8 ps(size)
!
      include 'mpif.h'
      integer ierr, n, nbuf
      integer, parameter:: MAXB=10000
      real*8 buff(MAXB)
!
      if (nproc.le.1) return
      if (size.le.0) return
      call tictac(29,0)
!
! synchronize processes
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0) call error('reduce','error in barrier',ierr)
!
      nbuf=size/MAXB
!
      do n=1,nbuf
         call mpi_allreduce (ps(1+(n-1)*MAXB), buff, MAXB, &
     &                       MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) &
     &      call error('reduce','error in allreduce1',ierr)
         call COPY(MAXB,buff,1,ps(1+(n-1)*MAXB),1)
      end do
!
! possible remaining elements < MAXB
!
      if (size-nbuf*MAXB.gt.0) then
         call mpi_allreduce (ps(1+nbuf*MAXB), buff, size-nbuf*MAXB, &
     &                       MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) &
     &      call error('reduce','error in allreduce2',ierr)
         call COPY(size-nbuf*MAXB,buff,1,ps(1+nbuf*MAXB),1)
      endif
      call tictac(29,1)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine print_para_times
!----------------------------------------------------------------------
!
      use para_mod
      use timex_mod
!
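! For each instrumented routine this prints the number of calls and the
! cpu time measured on node 0, the minimum and maximum cpu time across
! all nodes, and the elapsed time on node 0
!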
      implicit none
      include 'mpif.h'
      integer i, ierr
      real*8 mincpu(maxclock), maxcpu(maxclock)
      dimension routine(maxclock)
      character*10 routine
      data routine / '   full   ', &
     &               '   phbox  ', &
     &               '  strucf  ', &
     &               '  rhoofr  ', &
     &               '  vofrho  ', &
     &               '   newd   ', &
     &               '  dforce  ', &
     &               '   nlfor  ', &
     &               '  calphi  ', &
     &               '  r_iter  ', &
     &               '   ortho  ', &
     &               '  wfs**2  ', &
     &               '   rhov   ', &
     &               '    fft   ', &
     &               '  ffts+w  ', &
     &               '   fftb   ', &
     &               ' 2*nlsm1  ', &
     &               '   v*wf   ', &
     &               '    abc   ', &
     &               '    rsg   ', &
     &               '  x_iter  ', &
     &               '   nlsm2  ', &
     &               ' nl_dforce', &
     &               ' becp-phi ', &
     &               '    bhs   ', &
     &               '  prefor  ', &
     &               'setfftpara', &
     &               'fftscatter', &
     &               '  reduce  ', &
     &               '          '/
!
! synchronize processes
!
      call mpi_barrier(MPI_COMM_WORLD,ierr)
      if (ierr.ne.0) &
     &   call error('print_para_times','error in barrier',ierr)
!
      call mpi_allreduce (cputime, mincpu, maxclock, &
     &                    MPI_REAL8, MPI_MIN, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) &
     &   call error('print_para_times','error in minimum',ierr)
      call mpi_allreduce (cputime, maxcpu, maxclock, &
     &                    MPI_REAL8, MPI_MAX, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) &
     &   call error('print_para_times','error in maximum',ierr)
!
      write(6,*) '  routine     calls      cpu time        elapsed'
      write(6,*) '              node0   node0, min, max     node0'
      write(6,*)
      do i=1, maxclock
         if (ntimes(i).gt.0) write(6,30) routine(i), &
     &      ntimes(i), cputime(i), mincpu(i),maxcpu(i), elapsed(i)
      end do
 30   format(a10,i7,3f7.1,f7.1)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine write_wfc(unit,c)
!----------------------------------------------------------------------
!
! collect wavefunctions on the first node and write them to file
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
!
      implicit none
      include 'mpif.h'
      integer unit
      complex*16 c(ngw,nx)
      complex*16, pointer:: psis(:)
!
      integer i, ii, ig, proc, ierr, ntot, ncol, mc
      integer nmin(3), nmax(3), n1,n2,nzx,nz,nz_
      integer root, displs(nproc), recvcount(nproc)
      complex*16, allocatable:: psitot(:), psiwr(:,:,:)
!
! nmin, nmax are the bounds on (i,j,k) indexes of wavefunction G-vectors
!
      call nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!
! nzx is the maximum length of a column along z
!
      nzx=nmax(3)-nmin(3)+1
!
      root = 0        ! root is the first node
      ntot = 0
      do proc=1,nproc
         recvcount(proc) = ncpw(proc)*nzx
!
! recvcount(proc) = size of data received from processor proc
!                   (number of columns times length of each column)
!
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + recvcount(proc-1)
         end if
!
! displs(proc) is the position of data received from processor proc
!
         ntot = ntot + recvcount(proc)
!
! ntot = total size of gathered data
!
      end do
!
! allocate the needed work spaces
!
      psis=> wrk1
      call zero(2*nnrs,psis)
      if (me.eq.1) then
         allocate(psitot(ntot))
         allocate(psiwr(nmin(3):nmax(3),nmin(1):nmax(1),nmin(2):nmax(2)))
         write(unit) n, nmin, nmax
      end if
!
! fill array psis with c_i(G) (as packed columns along z)
!
      do i=1,n
         do ig=1,ngw
!
! ncol+1 is the index of the column
!
            ncol=(nps(ig)-1)/nr3sx
!
! nz_ is the z component in FFT style (refolded between 1 and nr3s)
!
            nz_ =nps(ig)-ncol*nr3sx
!
! nz is the z component in "natural" style (between nmin(3) and nmax(3))
!
            nz =nz_-1
            if (nz.ge.nr3s/2) nz=nz-nr3s
!
! ncpw(me) columns along z are stored in contiguous order on each node
!
            psis(nz-nmin(3)+1+ncol*nzx)=c(ig,i)
         end do
!
! gather all psis arrays on the first node, in psitot
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_wfc','mpi_barrier 2',ierr)
         call mpi_gatherv (psis, recvcount(me), MPI_DOUBLE_COMPLEX, &
     &                     psitot,recvcount, displs,MPI_DOUBLE_COMPLEX, &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('write_wfc','mpi_gatherv',ierr)
!
! now order the columns with a node-number-independent ordering.
! We use n1,n2,nz for ordering, where G=n1*g(1)+n2*g(2)+nz*g(3)
! and g(1), g(2), g(3) are basis vectors of the reciprocal lattice.
! Note that the entire column is written, even if there are no
! wavefunction components associated with it. This is a waste of
! disk space but it is simpler to implement than other schemes.
!
         if (me.eq.1) then
            ncol=0
            do proc=1,nproc
               do ii=1,ncpw(proc)
                  ncol=ncol+1
                  mc=icpls(ii+ncp0s(proc))
!
! mc is the position in the xy plane of this column in FFT style.
! We need to calculate "natural" indexes n1,n2, centered on the
! origin, for the smallest possible box.
!
                  n2=(mc-1)/nr1sx
                  n1= mc-1-n2*nr1sx
                  if (n2.ge.nr2s/2) n2=n2-nr2s
                  if (n1.le.nmax(1).and.(n1.ne.0.or.n2.ge.0)) then
!
! NB: n1.gt.nmax(1) corresponds to negative indexes that should be
! refolded into negative n1. However these are absent because
! only half of the G sphere is considered. The other condition
! excludes the case n1=0, n2<0, which is also not present
!
                     do nz=nmin(3),nmax(3)
                        psiwr(nz,n1,n2)=psitot(nz-nmin(3)+1+(ncol-1)*nzx)
                     end do
                  end if
               end do
            end do
!
! write the node-number-independent array
!
            write(unit) psiwr
!
         end if
      end do
!
      if (me.eq.1) then
         deallocate(psiwr)
         deallocate(psitot)
      end if
!
      return
!
      end subroutine write_wfc
!
!----------------------------------------------------------------------
      subroutine read_wfc(unit,c)
!----------------------------------------------------------------------
!
! read wavefunctions from file and distribute them to all nodes
!
      use gvec
      use gvecs
      use elct
      use para_mod
      use parms
      use work1
!
      implicit none
      include 'mpif.h'
      integer unit
      complex*16 c(ngw,nx)
      complex*16, pointer:: psis(:)
!
      integer i, ii, ig, proc, ierr, ntot, ncol, mc, nr
      integer nmin0(3), nmax0(3), nmin(3), nmax(3), n1,n2,nzx,nz,nz_
      integer root, displs(nproc), sendcount(nproc)
      complex*16, allocatable:: psitot(:), psird(:,:,:)
!
! nmin, nmax are the bounds on (i,j,k) indexes of wavefunction G-vectors
!
      call nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!
! nzx is the maximum length of a column along z
!
      nzx=nmax(3)-nmin(3)+1
!
      root = 0        ! root is the first node
      ntot = 0
      do proc=1,nproc
         sendcount(proc) = ncpw(proc)*nzx
!
! sendcount(proc) = size of data sent to processor proc
!                   (number of columns times length of each column)
!
         if (proc.eq.1) then
            displs(proc)=0
         else
            displs(proc)=displs(proc-1) + sendcount(proc-1)
         end if
!
! displs(proc) is the position of data sent to processor proc
!
         ntot = ntot + sendcount(proc)
!
! ntot = total size of the distributed data
!
      end do
!
! allocate the needed work spaces
!
      psis=> wrk1
      call zero(2*nnrs,psis)
      if (me.eq.1) then
         allocate(psitot(ntot))
!
! read the smallest FFT box that contains all wavefunction G-vectors
!
         read(unit,end=10,err=10) nr, nmin0, nmax0
! check
         if (nmin(1).ne.nmin0(1) .or. nmin(2).ne.nmin0(2) .or. &
     &       nmin(3).ne.nmin0(3) .or. nmax(1).ne.nmax0(1) .or. &
     &       nmax(2).ne.nmax0(2) .or. nmax(3).ne.nmax0(3) ) then
            write(6,*) 'read  nmin, nmax =',nmin, nmax
            write(6,*) 'found nmin, nmax =',nmin0, nmax0
            call error('read_wfc','wavefunction mismatch',1)
         end if
         if (nr.lt.n) call error('read_wfc','not enough wavefcts',nr)
         allocate(psird(nmin(3):nmax(3),nmin(1):nmax(1),nmin(2):nmax(2)))
      end if
!
      do i=1,n
!
! read the node-number-independent array
!
         if (me.eq.1) then
            read(unit,end=20,err=20) psird
!
! reorder as contiguously stored columns along z. See the comments in
! write_wfc for the logic (or lack thereof) of the storage
!
            ncol=0
            do proc=1,nproc
               do ii=1,ncpw(proc)
                  ncol=ncol+1
                  mc=icpls(ii+ncp0s(proc))
                  n2=(mc-1)/nr1sx
                  n1= mc-1-n2*nr1sx
                  if (n2.ge.nr2s/2) n2=n2-nr2s
                  if (n1.le.nmax(1).and.(n1.ne.0.or.n2.ge.0)) then
                     do nz=nmin(3),nmax(3)
                        psitot(nz-nmin(3)+1+(ncol-1)*nzx) = psird(nz,n1,n2)
                     end do
                  end if
               end do
            end do
         end if
!
! distribute the array psitot to all nodes (in psis)
!
         call mpi_barrier ( MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('read_wfc','mpi_barrier 2',ierr)
         call mpi_scatterv(psitot,sendcount,displs,MPI_DOUBLE_COMPLEX, &
     &                     psis ,sendcount(me), MPI_DOUBLE_COMPLEX, &
     &                     root, MPI_COMM_WORLD, ierr)
         if (ierr.ne.0) call error('read_wfc','mpi_scatterv',ierr)
!
! fill c_i(G) (see write_wfc for the logic - or lack thereof - of the ordering)
!
         do ig=1,ngw
            ncol=(nps(ig)-1)/nr3sx
            nz_ =nps(ig)-ncol*nr3sx
            nz =nz_-1
            if (nz.ge.nr3s/2) nz=nz-nr3s
            c(ig,i)=psis(nz-nmin(3)+1+ncol*nzx)
         end do
      end do
!
      if (me.eq.1) then
         deallocate(psird)
         deallocate(psitot)
      end if
!
      return
 10   call error('read_wfc','file missing or wrong',ierr)
 20   call error('read_wfc','wavefunction missing or wrong',ierr)
!
      end subroutine read_wfc
!-----------------------------------------------------------------------
      subroutine readpfile &
     &   ( flag,nx,nax,nsp,ndr,nfi,ngw,n,nr,c0,cm,tau0,vol0,cdm0,cdmm, &
     &     taum,volm,acc,taui,a1,a2,a3,alat,lambda,lambdam, &
     &     xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,vel,ekincm)
!-----------------------------------------------------------------------
!
! read from file and distribute data calculated in preceding iterations
!
      use para_mod
      implicit none
      include 'mpif.h'
!
      integer flag,nx,nax,nsp,ndr,nfi,ngw,n,nr
      complex*16 c0(ngw,n), cm(ngw,n)
      real*8 cdm0(3),cdmm(3),acc(10), a1(3),a2(3),a3(3), alat
      real*8 lambda(nx,nx), lambdam(nx,nx)
      real*8 xnhe0, xnhem, vnhe, xnhp0, xnhpm, vnhp, ekincm, volm,vol0
      real*8 taum(3,nax,nsp),tau0(3,nax,nsp),taui(3,nax,nsp)
      real*8 vel(3,nax,nsp)
!
      integer i,ia,is,j,nd,ndummy,root,ierr
      real*8, allocatable:: dummy(:)
!
! Only the first node reads
!
      if (me.eq.1) rewind ndr
      if (flag.eq.0) then
         write(6,'((a,i3,a))') ' ### reading from file ',ndr,' only c0 ##'
      else
         write(6,'((a,i3))') ' ### reading from file ',ndr
      end if
!
! now read and distribute the wave functions
!
      call read_wfc(ndr,c0)
!
      if (flag.eq.0) return
!
      call read_wfc(ndr,cm)
!
!
! variables to be read on node 1 and broadcast to the other nodes are stored
! in an array "dummy", whose length ndummy must be calculated by hand
!
      ndummy = 1 + 4*3*nax*nsp + 2*3 + 3*3 + 1  + 2*n*n + 10 +  9
!            nfi    tau+vel     cdm  a1a2a3 alat lambda  acc  misc
      allocate(dummy(ndummy))
      if (me.eq.1) read(ndr,end=10,err=10) (dummy(i),i=1,ndummy)
!
! broadcast variables to the other nodes...
!
      root = 0
      call mpi_barrier( MPI_COMM_WORLD, ierr)
      call mpi_bcast(dummy,ndummy, MPI_REAL8, root, MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('broadcast','ierr<>0',ierr)
!
! and now unpack the array!
!
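! storage order in dummy (the same order used when packing in writepfile):
! nfi; then, for each species, atom and cartesian component, the four
! values tau0, taum, taui, vel; then cdm0, cdmm, a1, a2, a3 component by
! component; alat; lambda and lambdam element by element; acc(1:10); and
! finally vol0, volm, xnhe0, xnhem, vnhe, xnhp0, xnhpm, vnhp, ekincm
!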
      nfi=dummy(1)
      nd=1
      do is=1,nsp
         do ia=1,nax
            do i=1,3
               nd=nd+1
               tau0(i,ia,is)=dummy(nd)
               nd=nd+1
               taum(i,ia,is)=dummy(nd)
               nd=nd+1
               taui(i,ia,is)=dummy(nd)
               nd=nd+1
               vel(i,ia,is) =dummy(nd)
            end do
         end do
      end do
      do i=1,3
         nd=nd+1
         cdm0(i)=dummy(nd)
         nd=nd+1
         cdmm(i)=dummy(nd)
         nd=nd+1
         a1(i)=dummy(nd)
         nd=nd+1
         a2(i)=dummy(nd)
         nd=nd+1
         a3(i)=dummy(nd)
      end do
      nd=nd+1
      alat=dummy(nd)
      do j=1,n
         do i=1,n
            nd=nd+1
            lambda (i,j)=dummy(nd)
            nd=nd+1
            lambdam(i,j)=dummy(nd)
         end do
      end do
      do i=1,10
         nd=nd+1
         acc(i)=dummy(nd)
      end do
      vol0  =dummy(nd+1)
      volm  =dummy(nd+2)
      xnhe0 =dummy(nd+3)
      xnhem =dummy(nd+4)
      vnhe  =dummy(nd+5)
      xnhp0 =dummy(nd+6)
      xnhpm =dummy(nd+7)
      vnhp  =dummy(nd+8)
      ekincm=dummy(nd+9)
      if (nd+9.ne.ndummy) call error('readpfile','dummy!!!',nd+9)
      deallocate(dummy)
!
      return
 10   call error('readpfile','end of file detected',1)
      end
!-----------------------------------------------------------------------
      subroutine writepfile &
     &   ( nx,nax,nsp,ndw,nfi,ngw,n,c0,cm,tau0,vol0,cdm0,cdmm, &
     &     taum,volm,acc,taui,a1,a2,a3,alat,lambda,lambdam, &
     &     xnhe0,xnhem,vnhe,xnhp0,xnhpm,vnhp,vel,ekincm)
!-----------------------------------------------------------------------
!
! collect and write data to file for subsequent iterations
!
      use para_mod
      implicit none
      integer nx,nax,nsp,ndw,nfi,ngw,n
      complex*16 c0(ngw,n), cm(ngw,n)
      real*8 cdm0(3),cdmm(3),acc(10), a1(3),a2(3),a3(3), alat
      real*8 lambda(nx,nx), lambdam(nx,nx)
      real*8 xnhe0, xnhem, vnhe, xnhp0, xnhpm, vnhp, ekincm, volm,vol0
      real*8 taum(3,nax,nsp),tau0(3,nax,nsp),taui(3,nax,nsp)
      real*8 vel(3,nax,nsp)
!
      integer i,ia,is,j,nd,ndummy
      real*8, allocatable:: dummy(:)
!
! Only the first node writes
!
      if (me.eq.1) rewind ndw
      call write_wfc(ndw,c0)
      call write_wfc(ndw,cm)
      if (me.ne.1) return
!
! the remaining variables are packed into an array "dummy",
! whose length ndummy must be calculated by hand
!
      ndummy = 1 + 4*3*nax*nsp + 2*3 + 3*3 + 1  + 2*n*n + 10 +  9
!            nfi    tau+vel     cdm  a1a2a3 alat lambda  acc  misc
      allocate(dummy(ndummy))
!
! and now pack the array dummy!
!
      dummy(1)=nfi
      nd=1
      do is=1,nsp
         do ia=1,nax
            do i=1,3
               nd=nd+1
               dummy(nd)=tau0(i,ia,is)
               nd=nd+1
               dummy(nd)=taum(i,ia,is)
               nd=nd+1
               dummy(nd)=taui(i,ia,is)
               nd=nd+1
               dummy(nd)=vel(i,ia,is)
            end do
         end do
      end do
      do i=1,3
         nd=nd+1
         dummy(nd)=cdm0(i)
         nd=nd+1
         dummy(nd)=cdmm(i)
         nd=nd+1
         dummy(nd)=a1(i)
         nd=nd+1
         dummy(nd)=a2(i)
         nd=nd+1
         dummy(nd)=a3(i)
      end do
      nd=nd+1
      dummy(nd)=alat
      do j=1,n
         do i=1,n
            nd=nd+1
            dummy(nd)=lambda (i,j)
            nd=nd+1
            dummy(nd)=lambdam(i,j)
         end do
      end do
      do i=1,10
         nd=nd+1
         dummy(nd)=acc(i)
      end do
      dummy(nd+1)=vol0
      dummy(nd+2)=volm
      dummy(nd+3)=xnhe0
      dummy(nd+4)=xnhem
      dummy(nd+5)=vnhe
      dummy(nd+6)=xnhp0
      dummy(nd+7)=xnhpm
      dummy(nd+8)=vnhp
      dummy(nd+9)=ekincm
      if (nd+9.ne.ndummy) call error('writepfile','dummy!!!',nd+9)
!
      write(ndw) dummy
!
      deallocate(dummy)
!
      return
      end
!
!----------------------------------------------------------------------
      subroutine nrbounds(ngw,nr1s,nr2s,nr3s,in1p,in2p,in3p,nmin,nmax)
!----------------------------------------------------------------------
!
! find the bounds for the (i,j,k) indexes of all wavefunction G-vectors
! The (i,j,k) indexes are defined as: G=i*g(1)+j*g(2)+k*g(3)
! where g(1), g(2), g(3) are basis vectors of the reciprocal lattice
!
      implicit none
! input
      integer ngw,nr1s,nr2s,nr3s,in1p(ngw),in2p(ngw),in3p(ngw)
! output
      integer nmin(3), nmax(3)
! local
      include 'mpif.h'
      integer nmin0(3), nmax0(3), ig, ierr
!
!
      nmin0(1)=  nr1s
      nmax0(1)= -nr1s
      nmin0(2)=  nr2s
      nmax0(2)= -nr2s
      nmin0(3)=  nr3s
      nmax0(3)= -nr3s
!
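! nmin0 and nmax0 are initialized to the opposite extremes of the allowed
! range, so that the first G-vector examined sets both the minimum and the
! maximum in the loop below
!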
      do ig=1,ngw
         nmin0(1) = min(nmin0(1),in1p(ig))
         nmin0(2) = min(nmin0(2),in2p(ig))
         nmin0(3) = min(nmin0(3),in3p(ig))
         nmax0(1) = max(nmax0(1),in1p(ig))
         nmax0(2) = max(nmax0(2),in2p(ig))
         nmax0(3) = max(nmax0(3),in3p(ig))
      end do
!
! find minima and maxima for the FFT box across all nodes
!
      call mpi_barrier( MPI_COMM_WORLD, ierr )
      if (ierr.ne.0) call error('nrbounds','mpi_barrier 1',ierr)
      call mpi_allreduce (nmin0, nmin, 3, MPI_INTEGER, MPI_MIN, &
     &                    MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('nrbounds','mpi_allreduce min',ierr)
      call mpi_allreduce (nmax0, nmax, 3, MPI_INTEGER, MPI_MAX, &
     &                    MPI_COMM_WORLD, ierr)
      if (ierr.ne.0) call error('nrbounds','mpi_allreduce max',ierr)
      return
      end subroutine nrbounds