/*NAS Parallel Benchmarks 2.4 UPC versions - CG This benchmark is an UPC version of the NPB CG code. The UPC versions are developed by HPCL-GWU and are derived from the OpenMP version (developed by RWCP) and from the MPI version (developed by NAS). Permission to use, copy, distribute and modify this software for any purpose with or without fee is hereby granted. This software is provided "as is" without express or implied warranty. Information on the UPC project at GWU is available at: http://upc.gwu.edu Information on NAS Parallel Benchmarks 2.4 is available at: http://www.nas.nasa.gov/Software/NPB/ --------------------------------------------------------------------*/ /*---- FULLY OPTIMIZED VERSION (privatized&prefetching) - NOV 2004 -*/ /*-------------------------------------------------------------------- UPC version: F. Cantonnet - GWU - HPCL (fcantonn@gwu.edu) Y. Yao - GWU - HPCL (yyy@gwu.edu) T. El-Ghazawi - GWU - HPCL (tarek@gwu.edu) Authors(NAS): R. F. Van der Wijngaart M. Yarrow C. Kuszmaul --------------------------------------------------------------------*/ /*-------------------------------------------------------------------- This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. This program can be freely redistributed provided that you conspicuously and appropriately publish on each copy an appropriate referral notice to the authors and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --------------------------------------------------------------------*/ #include #include #include #include #include #include "npb-C.h" #include "npbparams.h" #include "upc_timers.h" #define MAX_TIMERS 4 #define TIMER_TOTALTIME 0 #define TIMER_ALLREDUCE 1 #define TIMER_ALLREDUCE2 2 #define TIMER_ALLREDUCEB 3 /* timers */ shared double timer[MAX_TIMERS][THREADS]; #define NZ (NA*(NONZER+1)*(NONZER+1)/THREADS)+(NA*(NONZER+2+(THREADS/256))\ /NUM_PROC_COLS) /* global variables */ /* common /partit_size/ */ int naa; int nzz; int nr_row; int npcols, nprows; /* thread relative */ int firstrow, lastrow; int firstcol, lastcol; int l2npcols; int exchange_thread; /* global view */ int firstrow_g[THREADS], lastrow_g[THREADS]; int firstcol_g[THREADS], lastcol_g[THREADS]; int proc_col_g[THREADS], proc_row_g[THREADS]; int l2npcols_g[THREADS]; int send_start_g[THREADS], send_len_g[THREADS]; int exchange_thread_g[THREADS], exchange_len_g[THREADS]; int reduce_threads_g[THREADS][NUM_PROC_COLS]; int reduce_send_start_g[THREADS][NUM_PROC_COLS]; int reduce_send_len_g[THREADS][NUM_PROC_COLS]; int reduce_recv_start_g[THREADS][NUM_PROC_COLS]; /* common /main_int_mem/ */ int colidx[NZ]; /* colidx[0:NZ-1] */ int rowstr[NA+1]; /* rowstr[0:NA] */ int iv[2*NA+1]; /* iv[0:2*NA] */ int arow[NZ]; /* arow[0:NZ-1] */ int acol[NZ]; /* acol[0:NZ-1] */ /* common /main_flt_mem/ */ double v[NA+1]; /* v[0:NA] */ double aelt[NZ]; /* aelt[0:NZ-1] */ double a[NZ]; /* a[0:NZ-1] */ double x[NA/NUM_PROC_ROWS+2]; /* x[0:NA+1] */ double z[NA/NUM_PROC_ROWS+2]; /* z[0:NA+1] */ double p[NA/NUM_PROC_ROWS+2]; /* p[0:NA+1] */ double q[NA/NUM_PROC_ROWS+2]; /* q[0:NA+1] */ double r[NA/NUM_PROC_ROWS+2]; /* r[0:NA+1] */ /* common /urando/ */ double amult; double tran; /* shared structures */ struct w_s { double arr[NA/NUM_PROC_ROWS+2]; }; typedef struct w_s w_t; shared w_t *w, *w_tmp; /* reduce stuff */ typedef struct cg_reduce_s cg_reduce_t; struct cg_reduce_s { double v[2][NUM_PROC_COLS]; }; shared cg_reduce_t sh_reduce[THREADS]; /* Private pointers to local shared areas */ double *sh_reduce_ptr[2]; /* function declarations */ double *init_prefetch( void ); void setup_submatrix_info( void ); void makea(int n, int nz, double a[], int colidx[], int rowstr[], int nonzer, int firstrow, int lastrow, int firstcol, int lastcol, double rcond, int arow[], int acol[], double aelt[], double v[], int iv[], double shift ); void sprnvc(int n, int nz, double v[], int iv[], int nzloc[], int mark[]); int icnvrt(double x, int ipwr2); void vecset(int n, double v[], int iv[], int *nzv, int i, double val); void sparse(double a[], int colidx[], int rowstr[], int n, int arow[], int acol[], double aelt[], int firstrow, int lastrow, double x[], boolean mark[], int nzloc[], int nnza); void conj_grad(int colidx[], int rowstr[], double x[], double z[], double a[], double p[], double q[], double r[], shared w_t *w, shared w_t *w1, double *rnorm, double *prefetch_buffer); double reduce_sum( double rs_a ); void reduce_sum_2( double rs_a, double rs_b ); void timers_reduce( double *max_t ) // TO BE RUN only by THREAD 0 { int i, j; for( j=0; j max_t[j] ) max_t[j] = timer[j][i]; } } } int main(int argc, char **argv) { int i, j, k, it; double zeta; double rnorm; double norm_temp11; double norm_temp12; double t, mflops; char class; boolean verified; double zeta_verify_value, epsilon; double max_timer[MAX_TIMERS]; double *pref_buffer; /* initialize the timers */ timer_clear(TIMER_TOTALTIME); timer_clear(TIMER_ALLREDUCE); timer_clear(TIMER_ALLREDUCE2); timer_clear(TIMER_ALLREDUCEB); /* initialize the verification constants */ if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { class = 'S'; zeta_verify_value = 8.5971775078648; } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { class = 'W'; zeta_verify_value = 10.362595087124; } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { class = 'A'; zeta_verify_value = 17.130235054029; } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { class = 'B'; zeta_verify_value = 22.712745482631; } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { class = 'C'; zeta_verify_value = 28.973605592845; } else if (NA == 1500000 && NONZER == 21 && NITER == 100 && SHIFT == 500.0) { class = 'D'; zeta_verify_value = 52.5145321058; } else { class = 'U'; } if( MYTHREAD == 0 ) { printf("\n\n NAS Parallel Benchmarks 2.4 UPC version" " - CG Benchmark - GWU/HPCL\n"); printf(" Size: %10d\n", NA); printf(" Iterations: %5d\n", NITER); printf(" Number of processors involved: %2d\n", THREADS); printf(" Number of nonzeroes per row: %8d\n", NONZER ); printf(" Eigenvalue shift: %8.3f\n", SHIFT ); } naa = NA; nzz = NZ; /* Initialize random number generator */ tran = 314159265.0; amult = 1220703125.0; zeta = randlc( &tran, amult ); /* Split the matrix into something smaller */ setup_submatrix_info(); /* Allocate the memory for the prefetching buffer */ if( l2npcols != 0 ) { /* BUG fix -- 04/16/04 -- against calloc(0) buggy implementations */ pref_buffer = init_prefetch(); MEM_OK( pref_buffer ); } else pref_buffer = NULL; upc_barrier; /* Allocate the big shared memory block for w[] and w_tmp[] */ w = (shared w_t *) upc_all_alloc( THREADS, sizeof( w_t )); MEM_OK( w ); w_tmp = (shared w_t *) upc_all_alloc( THREADS, sizeof( w_t )); MEM_OK( w_tmp ); /* Initialize the private pointers to local shared data */ sh_reduce_ptr[0] = ((double *) &sh_reduce[MYTHREAD].v[0][0]); sh_reduce_ptr[1] = ((double *) &sh_reduce[MYTHREAD].v[1][0]); nr_row = MYTHREAD / NUM_PROC_COLS; upc_barrier; makea(naa, nzz, a, colidx, rowstr, NONZER, firstrow, lastrow, firstcol, lastcol, RCOND, arow, acol, aelt, v, iv, SHIFT); /*Note: as a result of the above call to makea: values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 values of colidx which are col indexes go from firstcol --> lastcol So: Shift the col index vals from actual (firstcol --> lastcol ) to local, i.e., (1 --> lastcol-firstcol+1) */ for (j = 0; j <= lastrow - firstrow; j++) { for (k = rowstr[j]; k < rowstr[j+1]; k++) { colidx[k-1] = colidx[k-1] - firstcol + 1; } } /* set starting vector to (1, 1, .... 1) */ memset( x, 0, sizeof( x )); for (i = 0; i <= NA/NUM_PROC_ROWS; i++) { x[i] = 1.0; } zeta = 0.0; /* Do one iteration untimed to init all code and data page tables (then reinit, start timing, to niter its) */ /* The call to the conjugate gradient routine: */ upc_barrier; conj_grad(colidx, rowstr, x, z, a, p, q, r, w, w_tmp, &rnorm, pref_buffer); /* zeta = shift + 1/(x.z) So, first: (x.z) Also, find norm of z So, first: (z.z) */ upc_barrier; norm_temp11 = 0.0; norm_temp12 = 0.0; /* - compute - */ for (j = 0; j <= lastcol-firstcol; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } reduce_sum_2( norm_temp11, norm_temp12 ); /* - flush - */ norm_temp11 = 0.0; norm_temp12 = 0.0; for( i=0; i memsize ) memsize = reduce_send_len_g[reduce_threads_g[MYTHREAD][i]][i]; } /* and malloc it */ buf = calloc( memsize, sizeof( double )); return buf; } void setup_submatrix_info( void ) { int col_size, row_size; int t, i, j; int div_factor; npcols = NUM_PROC_COLS; nprows = NUM_PROC_ROWS; for( t=0; t 0 ) { l2npcols_g[t] ++; i /= 2; } div_factor = npcols; for( i=0; i=0; i-- ) { if( nprows == npcols ) { reduce_send_start_g[t][i] = send_start_g[t]; reduce_send_len_g[t][i] = send_len_g[t]; } else { if( i == (l2npcols_g[t]-1) ) { reduce_send_len_g[t][i] = lastrow_g[t]-firstrow_g[t]+1 - send_len_g[t]; if( (t/2*2) == t ) { reduce_send_start_g[t][i] = send_start_g[t] + send_len_g[t]; } else { reduce_send_start_g[t][i] = 1; } } else { reduce_send_len_g[t][i] = send_len_g[t]; reduce_send_start_g[t][i] = send_start_g[t]; } } reduce_recv_start_g[t][i] = send_start_g[t]; } exchange_len_g[t] = lastcol_g[t] - firstcol_g[t] + 1; } /* make local copy */ firstcol = firstcol_g[MYTHREAD]; firstrow = firstrow_g[MYTHREAD]; lastcol = lastcol_g[MYTHREAD]; lastrow = lastrow_g[MYTHREAD]; exchange_thread = exchange_thread_g[MYTHREAD]; l2npcols = l2npcols_g[MYTHREAD]; } /* generate the test problem for benchmark 6 makea generates a sparse matrix with a prescribed sparsity distribution parameter type usage input n i number of cols/rows of matrix nz i nonzeros as declared array size rcond r*8 condition number shift r*8 main diagonal shift output a r*8 array for nonzeros colidx i col indices rowstr i row pointers workspace iv, arow, acol i v, aelt r*8 */ void makea( int n, int nz, double a[], /* a[0:nz-1] */ int colidx[], /* colidx[0:nz-1] */ int rowstr[], /* rowstr[0:n] */ int nonzer, int firstrow, int lastrow, int firstcol, int lastcol, double rcond, int arow[], /* arow[0:nz-1] */ int acol[], /* acol[0:nz-1] */ double aelt[], /* aelt[0:nz-1] */ double v[], /* v[0:n] */ int iv[], /* iv[0:2*n] */ double shift ) { int i, nnza, iouter, ivelt, ivelt1, irow, nzv; /* nonzer is approximately (int(sqrt(nnza /n))); */ double size, ratio, scale; int jcol; size = 1.0; ratio = pow(rcond, (1.0 / (double)n)); nnza = 0; /* Initialize colidx(n .. 2n-1) to zero. Used by sprnvc to mark nonzero positions */ for (i = 1; i <= n; i++) { colidx[n+i-1] = 0; } for (iouter = 1; iouter <= n; iouter++) { nzv = nonzer; sprnvc(n, nzv, v, iv, &(colidx[0]), &(colidx[n])); vecset(n, v, iv, &nzv, iouter, 0.5); for (ivelt = 1; ivelt <= nzv; ivelt++) { jcol = iv[ivelt-1]; if (jcol >= firstcol && jcol <= lastcol) { scale = size * v[ivelt-1]; for (ivelt1 = 1; ivelt1 <= nzv; ivelt1++) { irow = iv[ivelt1-1]; if (irow >= firstrow && irow <= lastrow) { nnza = nnza + 1; if (nnza > nz) { printf("Space for matrix elements exceeded in" " makea\n"); printf("nnza, nzmax = %d, %d\n", nnza, nz); printf("iouter = %d\n", iouter); exit(1); } acol[nnza-1] = jcol; arow[nnza-1] = irow; aelt[nnza-1] = v[ivelt1-1] * scale; } } } } size = size * ratio; } /* ... add the identity * rcond to the generated matrix to bound the smallest eigenvalue from below by rcond */ for (i = firstrow; i <= lastrow; i++) { if (i >= firstcol && i <= lastcol) { iouter = n + i; nnza = nnza + 1; if (nnza > nz) { printf("Space for matrix elements exceeded in makea\n"); printf("nnza, nzmax = %d, %d\n", nnza, nz); printf("iouter = %d\n", iouter); exit(1); } acol[nnza-1] = i; arow[nnza-1] = i; aelt[nnza-1] = rcond - shift; } } /* ... make the sparse matrix from list of elements with duplicates (v and iv are used as workspace) */ sparse(a, colidx, rowstr, n, arow, acol, aelt, firstrow, lastrow, v, &(iv[0]), &(iv[n]), nnza); } /* generate a sparse n-vector (v, iv) having nzv nonzeros mark(i) is set to 1 if position i is nonzero. mark is all zero on entry and is reset to all zero before exit this corrects a performance bug found by John G. Lewis, caused by reinitialization of mark on every one of the n calls to sprnvc */ void sprnvc( int n, int nz, double v[], /* v[0:*] */ int iv[], /* iv[0:*] */ int nzloc[], /* nzloc[0:n-1] */ int mark[] ) /* mark[0:n-1] */ { int nn1; int nzrow, nzv, ii, i; double vecelt, vecloc; nzv = 0; nzrow = 0; nn1 = 1; do { nn1 = 2 * nn1; } while (nn1 < n); /* nn1 is the smallest power of two not less than n */ while (nzv < nz) { vecelt = randlc(&tran, amult); /* generate an integer between 1 and n in a portable manner */ vecloc = randlc(&tran, amult); i = icnvrt(vecloc, nn1) + 1; if (i > n) continue; /* was this integer generated already? */ if (mark[i-1] == 0) { mark[i-1] = 1; nzrow = nzrow + 1; nzloc[nzrow-1] = i; nzv = nzv + 1; v[nzv-1] = vecelt; iv[nzv-1] = i; } } for (ii = 1; ii <= nzrow; ii++) { i = nzloc[ii-1]; mark[i-1] = 0; } } /* scale a double precision number x in (0,1) by a power of 2 and chop it */ int icnvrt(double x, int ipwr2) { return ((int)(ipwr2 * x)); } /* set ith element of sparse vector (v, iv) with nzv nonzeros to val */ void vecset( int n, double v[], /* v[0:*] */ int iv[], /* iv[0:*] */ int *nzv, int i, double val) { int k; boolean set; set = FALSE; for (k = 1; k <= *nzv; k++) { if (iv[k-1] == i) { v[k-1] = val; set = TRUE; } } if (set == FALSE) { *nzv = *nzv + 1; v[*nzv-1] = val; iv[*nzv-1] = i; } } /* generate a sparse matrix from a list of [col, row, element] tri */ void sparse( double a[], /* a[0:*] */ int colidx[], /* colidx[0:*] */ int rowstr[], /* rowstr[0:*] */ int n, int arow[], /* arow[0:*] */ int acol[], /* acol[0:*] */ double aelt[], /* aelt[0:*] */ int firstrow, int lastrow, double x[], /* x[0:n-1] */ boolean mark[], /* mark[0:n-1] */ int nzloc[], /* nzloc[0:n-1] */ int nnza) { /* rows range from firstrow to lastrow the rowstr pointers are defined for nrows = lastrow-firstrow+1 values */ int nrows; int i, j, jajp1, nza, k, nzrow; double xi; /* how many rows of result ? */ nrows = lastrow - firstrow + 1; /* ...count the number of triples in each row */ for (j = 1; j <= n; j++) { rowstr[j-1] = 0; mark[j-1] = FALSE; } rowstr[n] = 0; for (nza = 1; nza <= nnza; nza++) { j = (arow[nza-1] - firstrow + 1) + 1; rowstr[j-1] = rowstr[j-1] + 1; } rowstr[0] = 1; for (j = 2; j <= nrows+1; j++) { rowstr[j-1] = rowstr[j-1] + rowstr[j-2]; } /* ... rowstr(j) now is the location of the first nonzero of row j of a */ /* ... do a bucket sort of the triples on the row index */ for (nza = 1; nza <= nnza; nza++) { j = arow[nza-1] - firstrow + 1; k = rowstr[j-1]; a[k-1] = aelt[nza-1]; colidx[k-1] = acol[nza-1]; rowstr[j-1] = rowstr[j-1] + 1; } /* ... rowstr(j) now points to the first element of row j+1 */ for (j = nrows; j >= 1; j--) { rowstr[j] = rowstr[j-1]; } rowstr[0] = 1; /* ... generate the actual output rows by adding elements */ nza = 0; for (i = 0; i < n; i++) { x[i] = 0.0; mark[i] = FALSE; } jajp1 = rowstr[0]; for (j = 1; j <= nrows; j++) { nzrow = 0; /* ...loop over the jth row of a */ for (k = jajp1; k < rowstr[j]; k++) { i = colidx[k-1]; x[i-1] = x[i-1] + a[k-1]; if ( mark[i-1] == FALSE && x[i-1] != 0.0) { mark[i-1] = TRUE; nzrow = nzrow + 1; nzloc[nzrow-1] = i; } } /* ... extract the nonzeros of this row */ for (k = 1; k <= nzrow; k++) { i = nzloc[k-1]; mark[i-1] = FALSE; xi = x[i-1]; x[i-1] = 0.0; if (xi != 0.0) { nza = nza + 1; a[nza-1] = xi; colidx[nza-1] = i; } } jajp1 = rowstr[j]; rowstr[j] = nza + rowstr[0]; } } void conj_grad ( int colidx[], /* colidx[0:nzz-1] */ int rowstr[], /* rowstr[0:naa] */ double x[], /* x[*] */ double z[], /* z[*] */ double a[], /* a[0:nzz-1] */ double p[], /* p[*] */ double q[], /* q[*] */ double r[], /* r[*] */ shared w_t *w, shared w_t *w1, double *rnorm, double *prefetch_buffer ) { /*Floating point arrays here are named as in NPB1 spec discussion of CG algorithm */ double d, sum, rho, rho0, alpha, beta; int i, j, k, l, m; int cgit, cgitmax = 25; double *w_ptr, *w1_ptr; /* Initialize the private pointers to access local shared data */ w_ptr = (double *) &w[MYTHREAD].arr[0]; w1_ptr = (double *) &w1[MYTHREAD].arr[0]; /* Initialize the CG algorithm */ memset( q, 0, sizeof(double) * ((naa/nprows)+1)); memset( z, 0, sizeof(double) * ((naa/nprows)+1)); memcpy( r, x, sizeof(double) * ((naa/nprows)+1)); memcpy( p, x, sizeof(double) * ((naa/nprows)+1)); memset( w_ptr, 0, sizeof( double ) * ((naa/nprows)+1)); memset( w1_ptr, 0, sizeof( double ) * ((naa/nprows)+1)); /* rho = r.r Now, obtain the norm of r: First, sum squares of r elements locally... */ sum = 0.0; /* - compute - */ for (j = 0; j <= lastcol-firstcol; j++) { sum = sum + r[j]*r[j]; } rho = reduce_sum( sum ); /* - flush - */ rho = 0.0; for( i=0; i=0; i-- ) { k = reduce_threads_g[MYTHREAD][i]; l = reduce_recv_start_g[MYTHREAD][i]-1; m = reduce_send_start_g[k][i]-1; upc_memget( prefetch_buffer, &w1[k].arr[m], sizeof( double ) * reduce_send_len_g[k][i] ); upc_notify; for( j=0; j=0; i-- ) { k = reduce_threads_g[MYTHREAD][i]; l = reduce_recv_start_g[MYTHREAD][i]-1; m = reduce_send_start_g[k][i]-1; upc_memget( prefetch_buffer, &w1[k].arr[m], sizeof( double ) * reduce_send_len_g[k][i] ); upc_notify; for( j=0; j