! Homework 8: Parallel y = y + Ax, due March 1, 2007 ! Calculate y = y + Ax where x & y are 1-dimensional arrays of ! length n and A is an 2-dimensional array of dimension (n,n). ! Let p = the number of processors used for the execution of this ! program. Assume that p divides n. ! Assume A is column blocked, x is blocked and y is the ! on all processors. All processors are to have the ! Correct value of y after the calculation. ! Call dgemv when performing matrix times vector operations to ! get best performance. Do a google search on dgemv to find out ! how to call it. To get Intel's high performance dgemv you ! must link in their MKL Libraries at compile time, i.e. ! mpif90 hw8.f90 -lmkl ! Time only the y = y + Ax calculation and compute mflops ! using bmpirun for p = 2, 4, and 8 and bmpirun1 for p = 2 ! and 4. Note: bmpirun1 assigns one mpi process per node ! instead of 2. mflops = millions of floating point operations ! per second. use mpi implicit none integer,parameter :: nflush=1024*1024/8 ! nflush = # of 8 byte elements in the cache double precision :: flush(1024*1024/8) ! used to flush the cache integer,parameter :: n=1024, ntrial=10 double precision,dimension(ntrial) :: max_time, array_time, array_perf integer,parameter :: dp=mpi_double_precision, comm=mpi_comm_world integer :: i,j, ktrial, m, p, rank, status(mpi_status_size), ierror double precision :: B(n,n), y(n), sum(n), psum(n), u(n), v(n), w(n) double precision,allocatable,dimension(:,:) :: A double precision,allocatable,dimension(:) :: x double precision :: t call random_number(flush) call mpi_init(ierror) call mpi_comm_size(comm, p, ierror) m = n/p call mpi_comm_rank(comm, rank, ierror) allocate(A(n,m),x(m)) ! Check to determine if p divides n: if (rank .eq. 0) then print *, ' ' print *, 'this program assumes p divides n:' print *, 'p = ', p, ' and n = ', n,' m = n/p = ', m endif ! Initialize A and x on each processor: call random_number(A) A(n,m) = A(n,m) + dble(rank) call random_number(x) x(1:m) = x(1:m) + dble(rank) ! Initialize y on processor 0 and broadcast to other processors: if (rank == 0) then call random_number(y) endif call mpi_bcast(y, n, dp, 0, comm, ierror) u(1:n) = y(1:n) ! u stores original values of y for answer checking ! Initialize the flush array and begin timing: flush(1:nflush) = 0.0d0 do ktrial = 1, ntrial y(1:n) = u(1:n) ! preserve the initial value of y for each trial flush(1:nflush) = flush(1:nflush) + 0.1d0 ! flush the cache call mpi_barrier(comm, ierror) t = mpi_wtime() ! put your parallel y = y + Ax code here array_time(ktrial) = mpi_wtime() - t call mpi_barrier(mpi_comm_world,ierror) flush(ktrial) = flush(ktrial) + y(ktrial) ! prevent compiler from ! splitting flush enddo ! Take the max values of array_time and place in max_time on processor 0: call mpi_reduce(array_time, max_time, ntrial, dp, mpi_max, 0, comm, ierror) ! To check answers gather the A arrays to B, and gather the x arrays to v on processor 0: call mpi_gather( , , dp, , , dp, 0, comm, ierror) call mpi_gather( , , dp, , , dp, 0, comm, ierror) if(rank == 0) then print *, 'error = ', maxval(abs(y- -matmul( , ))) ! print *,' ' ! print *,' ' print *,'Time in Seconds MFLOPS' array_perf = dble( )*(1.0d-6)/max_time do ktrial = 1, ntrial print *,max_time(ktrial),' ',array_perf(ktrial) enddo print *, ' ' print *, ' ' endif ! To eliminate the possibility of an optimizing compiler removing code ! from the above timing loop. (The following barrier makes the following ! print statement execute after the above print statements.) call mpi_barrier(mpi_comm_world,ierror) if (rank == 0 ) then print *, 'flush(2)+array_time(5)+y(7) = ',flush(2)+array_time(5)+y(7) endif deallocate(A,x) call mpi_finalize(ierror) end