!*******************************************************************    
!   oned.f - a solution to the Poisson equation using Jacobi             
!   iteration on a 1D decomposition (the y-dimension is split)
!                                                                       
!   The size of the computational domain is read by the master and 
!   broadcast to all other processors.  The Jacobi iteration is run
!   until the change in successive elements is below the tolerance 
!   The difference is printed out every 100th or 1000th step. 
!
!   The Poisson equation in 2D: u(x,y)_xx+u(x,y)_yy=f(x,y)
!                               u(x,y)=g(x,y) on the boundary (see onedinit)
! 
!   Polished by Ulf Andersson 2002-08-30
!   Original version written by anonymous.
!
!*******************************************************************    
PROGRAM oneD
IMPLICIT NONE
! ..
! .. Include Lines ..
Include 'mpif.h'
! ..
! .. Local Scalars ..
integer, parameter :: NDIM=1, master_id=0, stag=0, etag=1
Real(8) :: diffnorm=1.0, dwork, t1, t2, rtmin, rtmax, tolerance=1.E-6
Integer :: comm1d, ey, ierr, it, my_id, belowY, aboveY, numprocs, nx,        &
           ny, sy, py, c1, c2, ctmin, ctmax, dist, dir, jj, sender_id
Logical :: reorder
! ..
! .. Local Arrays ..
Real(8), Allocatable, Dimension(:,:) :: a, b, f
Integer :: status(mpi_status_size)
Integer, dimension(NDIM) :: dims, coords
Logical, dimension(NDIM) :: periodic
! ..
! .. External Functions ..   ! Note, mclock is NOT standard Fortran. It is part
Integer, External :: mclock  ! of IBMs "utility functions" for Fortran. See
! http://www.pdc.kth.se/doc/SP/manuals/xlf-7.1/html/lr379.HTM#IDX4871 (ulfa)
! It is also part of the Intel Fortran library, see page 1.250 in
! /pdc/vol/i-compilers/7.1-41/compiler70/docs/for_lib.pdf (ulfa)
Call mpi_init(ierr)
Call mpi_comm_rank(mpi_comm_world,my_id,ierr)
Call mpi_comm_size(mpi_comm_world,numprocs,ierr)
!                                                                       
If (my_id==master_id) Then
  !!                                                                       
  !! Get the size of the problem                                   
  !!                                                                       
  Print *, 'Enter nx'                                          
  Read *, nx                                                   
End If

Call mpi_bcast(nx,1,mpi_integer,master_id,mpi_comm_world,ierr)
ny = nx ! A square domain
!                                                                       
! Get a new communicator for a decomposition of the domain              
!
periodic = .false.
reorder = .true.
dims(1) = numprocs
Call mpi_cart_create(mpi_comm_world,NDIM,dims,periodic,reorder,comm1d,ierr)
!                                                                       
! Get my position in this communicator, and my neighbors                
!                                                                       
Call mpi_comm_rank(comm1d,my_id,ierr)
Call mpi_cart_coords(comm1d,my_id,NDIM,coords,ierr)
dir = 0 ; dist = 1
Call mpi_cart_shift(comm1d,dir,dist,belowY,aboveY,ierr)
!Write(*,*) 'Rank:', my_id, ' ,Pos. in Virt. Top.:', coords(1),               &
!           ' ,neighbor below (Y): ', belowY, ' ,neighbor above (Y): ', aboveY
!                                                                       
! Compute the actual decomposition                                      
!                                                                       
Call mpe_decomp1d(ny,dims(1),coords(1),sy,ey)
Write(*,*) 'Task: ', my_id, ' column ', sy, ' to ', ey
!
! Allocate node local arrays with ghost cells. Note, we have a two-dimensional
! grid which we distribute on a one-dimensional virtual Cartesian topology.
!
Allocate( a(0:nx+1,sy-1:ey+1), b(0:nx+1,sy-1:ey+1), f(0:nx+1,sy-1:ey+1) )
!                                                                       
! Initialize the right-hand-side (f) and the initial solution guess (a) 
!                                                                       
Call onedinit(a,b,f,nx,sy,ey)
!                                                                       
! Actually do the computation.  Note the use of a collective operation to
! check for convergence, and a do-loop to bound the number of iterations
!                                                                       
Call mpi_barrier(comm1d,ierr)
t1 = mpi_wtime()
c1 = mclock()
it = 0
  
Do While ( diffnorm > tolerance )
  !! Note that we take two steps in each Do iteration
  it = it + 1
  Call exchng1(a,nx,sy,ey,comm1d,belowY,aboveY)
  Call sweep1d(a,f,nx,sy,ey,b)
  it = it + 1
  Call exchng1(b,nx,sy,ey,comm1d,belowY,aboveY)
  Call sweep1d(b,f,nx,sy,ey,a)
  dwork = diff(a,b,nx,sy,ey)
  Call mpi_allreduce(dwork,diffnorm,1,mpi_double_precision,mpi_sum,comm1d,ierr)
  If (my_id==master_id) Then
    If (it<=1000 .and. mod(it,100)==0) Then
      Write(*,*) it, ' Difference is ', diffnorm
    End If
    If (it>1000 .and. mod(it,1000)==0) Then
      Write(*,*) it, ' Difference is ', diffnorm
    End If
  End If

End Do

c2 = mclock() - c1
t2 = mpi_wtime() - t1
Call mpi_reduce(c2,ctmax,1,mpi_integer,         mpi_max,master_id,comm1d,ierr)
Call mpi_reduce(c2,ctmin,1,mpi_integer,         mpi_min,master_id,comm1d,ierr)
Call mpi_reduce(t2,rtmax,1,mpi_double_precision,mpi_max,master_id,comm1d,ierr)
Call mpi_reduce(t2,rtmin,1,mpi_double_precision,mpi_min,master_id,comm1d,ierr)

If (my_id==master_id) Then
  Write(*,*)
  Write(*,*) 'Converged after ', it, ' Iterations'
  If (my_id==master_id) Write(*,*) it, ' Difference is ', diffnorm
  Write(*,*) '                        Min         Max'
  Write(*,'(a,2f12.2)') ' CPU time (s):  ', ctmin/100.d0, ctmax/100.d0
  Write(*, '(a, 2f12.2)') ' Real time (s): ', rtmin, rtmax

  !! A safety check
  If (coords(1)/=master_id) Then
    Write(*,*) ' FATAL ERROR! This code assumes that the process with rank 0'
    Write(*,*) 'also have coord=0 in the Cartesian virtual topology'
    Write(*,*) 'rank:', my_id, ' ,coord:', coords(1)
  End If
  Open(unit=20,file='oned.out',status='UNKNOWN')
  !! Write the master's part of the solution to file oned.out        
  Do jj=1,ey                   !! In order to get the same solution file 
    Write(20,*) b(1:nx,jj)     !! independently of the value of numprocs
  End Do                       !! we write the file column by column.
  Do py=1,dims(1)-1
    !! Find the senders rank from the senders virtual Cartesian coords.
    Call mpi_cart_rank(comm1d,py,sender_id,ierr)
    !! Receive the messages
    Call mpi_recv(sy,1,mpi_integer,sender_id,stag,comm1d,status,ierr)
    Call mpi_recv(ey,1,mpi_integer,sender_id,etag,comm1d,status,ierr)
    Call mpi_recv(b(1:nx,1:ey-sy+1),nx*(ey-sy+1),mpi_double_precision,        &
                  sender_id,0,comm1d,status,ierr)
    Do jj=1,ey-sy+1
      Write(20,*) b(1:nx,jj)
    End Do
  End Do
  close(unit=20, status='KEEP')
Else
  Call mpi_ssend(sy,1,mpi_integer,master_id,stag,comm1d,ierr)
  Call mpi_ssend(ey,1,mpi_integer,master_id,etag,comm1d,ierr)
  Call mpi_ssend(b(1:nx,sy:ey),nx*(ey-sy+1),mpi_double_precision,             &
                 master_id,0,comm1d,ierr)
End If
                                                                       
Deallocate( a, b, f )
Call mpi_finalize(ierr)

CONTAINS

! ============================================================================
! This is a routine for producing a decomposition of n on nprocs processes.
! The values returned assume a "global" domain in [1:n]
! Example of decomposition:
! n=30, nprocs=4, gives nlocal = 8, 8, 7, 7
!                            s = 1, 9,17,24
!                            e = 8,16,23,30 
! Note that since no work is done in the ghostcells, they are not considered 
! when doing the decomposition.
!
Subroutine mpe_decomp1d(n,nprocs,coord,s,e)
! .. Scalar Arguments ..
Integer, intent(in) :: coord, n, nprocs
Integer, intent(out) :: e, s
! ..
! .. Local Scalars ..
Integer :: deficit, nlocal
! ..
! .. Intrinsic Functions ..
Intrinsic min, mod
! ..
!                                                                       
nlocal = n/nprocs
s = coord*nlocal + 1
deficit = mod(n,nprocs)
s = s + min(coord,deficit)
If (coord<deficit) Then
  nlocal = nlocal + 1
End If
e = s + nlocal - 1
Return
End Subroutine mpe_decomp1d

! ============================================================================
Function diff(a,b,nx,sy,ey)
!                                                                       
! .. Function Return Value ..
Double Precision :: diff
! ..
! .. Scalar Arguments ..
Integer :: nx, sy, ey
! ..
! .. Array Arguments ..
Real (8), Dimension (0:nx+1,sy-1:ey+1) :: a, b
! ..
! .. Local Scalars ..
Real (8) :: sum
Integer :: i, j
! ..

sum = 0.0D0
Do j = sy, ey
  Do i = 1, nx
    sum = sum + (a(i,j)-b(i,j))**2
  End Do
End Do
diff = sum
Return
End Function diff

! ============================================================================
Subroutine exchng1(ab,nx,sy,ey,comm1d,belowY,aboveY)
! ..
! .. Scalar Arguments ..
Integer, intent(in) :: nx, sy, ey, comm1d, belowY, aboveY
! ..
! .. Array Arguments ..
Real (8), Dimension (0:nx+1,sy-1:ey+1), intent(inout) :: ab
! ..
! .. Local Scalars ..
Integer :: ierr
! ..
! .. Local Arrays ..
Integer, Dimension (mpi_status_size) :: status
! ..
!          
! Send nx values ab(1:nx,ey) upwards. Note that these nx elements are stored
! contiguously in memory.
Call mpi_sendrecv(ab(1,ey)  ,nx,mpi_double_precision,aboveY,0,                &
                  ab(1,sy-1),nx,mpi_double_precision,belowY,0,                &
                  comm1d,status,ierr)
! Send nx values ab(1:nx,sy) downwards. Note that these nx elements are stored
! contiguously in memory.
Call mpi_sendrecv(ab(1,sy)  ,nx,mpi_double_precision,belowY,1,                &
                  ab(1,ey+1),nx,mpi_double_precision,aboveY,1,                &
                  comm1d,status,ierr)
Return
End Subroutine exchng1

! ============================================================================
!
! Initialization routine. Also sets the constant boundary conditions.
!
! If we use f===0, we are in fact solving the Laplace equation.
! Our boundary conditions u(0,y)=1
!                         u(1,y)=0
!                         u(x,0)=1
!                         u(x,1)=0
!
Subroutine onedinit(a,b,f,nx,sy,ey)
!                                                                       
! .. Scalar Arguments ..
Integer, intent(in) :: nx, sy, ey
! ..
! .. Array Arguments ..
Real (8), Dimension (0:nx+1,sy-1:ey+1), intent(out) :: a, b, f
! ..
!                                                                       
a = 0.D0
b = 0.D0
f = 0.D0
!                                                                       
! Handle boundary conditions                                         
!                                                                       
a(0,sy:ey) = 1.D0
b(0,sy:ey) = 1.D0

If (sy==1) Then
  a(1:nx,0) = 1.D0
  b(1:nx,0) = 1.D0
End If
!                                                                       
Return
End Subroutine onedinit

! ============================================================================
!                                                                       
! Perform a Jacobi sweep for a 1D decomposition.                       
! Sweep from old into new                                                   
!                                                                       
Subroutine sweep1d(old,f,nx,sy,ey,new)
!                                                                       
! .. Scalar Arguments ..
Integer, intent(in) :: nx, sy, ey
! ..
! .. Array Arguments ..
Real (8), Dimension (0:nx+1,sy-1:ey+1), intent(in) :: old, f
Real (8), Dimension (0:nx+1,sy-1:ey+1), intent(out) :: new
! ..
! .. Local Scalars ..
Real (8) :: h
Integer :: i, j
! ..
! .. Intrinsic Functions ..
Intrinsic dble
!                                                                       
h = 1.0D0/dble(nx+1)
Do j = sy, ey
  Do i = 1, nx
    new(i,j) = 0.25*(old(i-1,j)+old(i,j+1)+old(i,j-1)+old(i+1,j) - h*h*f(i,j))
  End Do
End Do
Return
End Subroutine sweep1d

END PROGRAM oneD
