-
joeri.brackenhoff authoredjoeri.brackenhoff authored
3dfd.c 35.31 KiB
#include<mpi.h>
#include<stdlib.h>
#include<stdio.h>
#include<math.h>
#include<time.h>
#include<assert.h>
#include<sys/time.h>
#include"par.h"
#include<stdlib.h>
#include<stdio.h>
#include<math.h>
#include"fdelmodc3D.h"
#define MAX(x,y) ((x) > (y) ? (x) : (y))
#define MIN(x,y) ((x) < (y) ? (x) : (y))
#define NINT(x) ((int)((x)>0.0?(x)+0.5:(x)-0.5))
#define STRIPE_COUNT "4" /* must be an ascii string */
#define STRIPE_SIZE "1048576" /* 1 MB must be an ascii string */
//#define STRIPE_SIZE "268435456" /* 256 MB must be an ascii string */
#define C1 (9.0/8.0)
#define C2 (1.0/24.0)
#define Dx(f,ix,iy,iz,nz) C1*(f[iy+ix*nz+iz] - f[iy+(ix-1)*nz+iz]) - C2*(f[iy+(ix+1)*nz+iz] - f[iy+(ix-2)*nz+iz])
#define Dy(f,ix,iy,iz,nxz) C1*(f[iy*nxz+ix+iz] - f[(iy-1)*nxz+ix+iz]) - C2*(f[(iy+1)*nxz+ix+iz] - f[(iy-2)*nxz+ix+iz])
#define Dz(f,ix,iy,iz) C1*(f[iy+ix+iz] - f[iy+ix+iz-1]) - C2*(f[iy+ix+iz+1] - f[iy+ix+iz-2])
#define Dv(vx,vz,ix,iy,iz,nz,nxz) C1*((vx[iy*nxz+(ix+1)*nz+iz] - vx[iy*nxz+ix*nz+iz]) + \
(vy[(iy+1)*nxz+ix*nz+iz] - vy[iy*nxz+ix*nz+iz]) + \
(vz[iy*nxz+ix*nz+iz+1] - vz[iy*nxz+ix*nz+iz])) - \
C2*((vx[iy*nxz+(ix+2)*nz+iz] - vx[iy*nxz+(ix-1)*nz+iz]) + \
(vy[(iy+2)*nxz+ix*nz+iz] - vy[(iy-1)*nxz+ix*nz+iz]) + \
(vz[iy*nxz+ix*nz+iz+2] - vz[iy*nxz+ix*nz+iz-1]))
int getParameters3D(modPar *mod, recPar *rec, snaPar *sna, wavPar *wav, srcPar *src, shotPar *shot, bndPar *bnd, int verbose);
int readModel3D(modPar mod, bndPar bnd, float *rox, float *roz, float *l2m);
void vinit();
int updateVelocitiesHalo(float *vx, float *vy, float *vz, float *p, float *ro, int halo, int npx, int npy, int npz);
int updateVelocities(float *vx, float *vy, float *vz, float *p, float *ro, int halo, int npx, int npy, int npz);
int updatePressureHalo(float *vx, float *vy, float *vz, float *p, float *l2m, int halo, int npx, int npy, int npz);
int updatePressure(float *vx, float *vy, float *vz, float *p, float *l2m, int halo, int npx, int npy, int npz);
int exchangeHalo(float *leftRecv, float *leftSend, float *rightRecv, float *rightSend, int size, int leftrank, int rightrank, MPI_Request *reqRecv, MPI_Request *reqSend, int tag);
int newHaloVxVz(float *vx, float *vz, int npx, int npy, int npz, int halo, float *leftRecv, float *rightRecv, float *frontRecv, float *backRecv, float *topRecv, float *bottomRecv);
int newHaloP(float *p, int npx, int npy, int npz, int halo, float *leftRecv, float *rightRecv, float *frontRecv, float *backRecv, float *topRecv, float *bottomRecv);
int copyHaloVxVz(float *vx, float *vz, int npx, int npy, int npz, int halo, float *leftSend, float *rightSend, float *frontSend, float *backSend, float *topSend, float *bottomSend);
int copyHaloP(float *p, int npx, int npy, int npz, int halo, float *leftSend, float *rightSend, float *frontSend, float *backSend, float *topSend, float *bottomSend);
int waitForHalo(MPI_Request *reqRecv, MPI_Request *reqSend);
float gauss2time(float t, float f, float t0);
double wallclock_time(void);
void name_ext(char *filename, char *extension);
/* Self documentation */
char *sdoc[] = {
" ",
" file_rcv=recv.su .. base name for receiver files",
" file_snap=snap.su . base name for snapshot files",
" nx=256 ............ number of samples in x-direction",
" ny=nx ............. number of samples in y-direction",
" nz=nx ............. number of samples in z-direction",
" dx=5 .............. spatial sampling in x-direction",
" dy=5 .............. spatial sampling in y-direction",
" dz=5 .............. spatial sampling in z-direction",
"" ,
" verbose=0 ......... silent mode; =1: display info",
" ",
" Jan Thorbecke 2016",
" Cray / TU Delft",
" E-mail: janth@xs4all.nl ",
"",
NULL};
int main (int argc, char *argv[])
{
modPar mod;
recPar rec;
snaPar sna;
wavPar wav;
srcPar src;
bndPar bnd;
shotPar shot;
float *wavelet;
int nx, ny, nz, dims[3], period[3], reorder, coord[3], ndims=3;
int npx, npy, npz, halo, nt;
int my_rank, size, source, dest, snapwritten;
int left, right, front, back, top, bottom;
int direction, displ, halosizex, halosizey, halosizez;
int ioXx, ioXz, ioYx, ioYz, ioZz, ioZx, ioPx, ioPz;
int it, ix, iy, iz, iyp, ixp, izp, isrc, ixsrc, iysrc, izsrc, c1, c2;
int sizes[3], subsizes[3], starts[3];
int gsizes[3], gsubsizes[3], gstarts[3];
int error, rc, verbose;
float fx, fy, fz, dx, dy, dz, flx, fly, flz;
float *p, *vx, *vy, *vz, *rox, *roz, *roy, *l2m, hcp, hro, fac;
float *leftRecv, *leftSend, *rightRecv, *rightSend;
float *frontRecv, *frontSend, *backRecv, *backSend;
float *topRecv, *topSend, *bottomRecv, *bottomSend;
float dt, src_ampl, fmax, fpeaksrc, t0src, time, snaptime;
double t00, t0, t1, t2, tcomm, tcomp, thcomp, tcopy, ttot, tio;
char err_buffer[MPI_MAX_ERROR_STRING];
int resultlen;
MPI_Comm COMM_CART;
MPI_Request reqSend[6], reqRecv[6];
MPI_Status status[12];
MPI_Datatype local_array, global_array;
MPI_Offset disp;
MPI_Info fileinfo;
MPI_File fh;
char filename[1000], *file_snap, *file_rcv;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
vinit();
t0= wallclock_time();
initargs(argc,argv);
requestdoc(0);
if (!getparint("verbose",&verbose)) verbose=0;
if (!getparstring("file_snap",&file_snap)) file_snap="snap.su";
if (!getparstring("file_rcv",&file_rcv)) file_rcv="recv.su";
getParameters3D(&mod, &rec, &sna, &wav, &src, &shot, &bnd, verbose);
/* divide 3D cube among available processors */
dims[0]=0;
dims[1]=0;
dims[2]=0;
MPI_Dims_create(size, ndims, dims);
/* dims contains the number of MPI-tasks in each direction */
/* set number of grid points based on number of procs in dims */
if (!getparint("nx",&nx)) nx=256;
if (!getparint("ny",&ny)) ny=nx;
if (!getparint("nz",&nz)) nz=nx;
if (!getparfloat("dx",&dx)) dx=5.;
if (!getparfloat("dy",&dy)) dy=5.;
if (!getparfloat("dz",&dz)) dz=5.;
halo = 2;
/* new larger dimensions to fit with the domain-decomposition */
nz=dims[0]*ceil(mod.naz/dims[0]);
nx=dims[1]*ceil(mod.nax/dims[1]);
ny=dims[2]*ceil(mod.nay/dims[2]);
// dt=0.001;
nt=4096;
t0src=0.50;
hcp=2000.0;
hro=1800.0;
tcomm=tcomp=thcomp=tcopy=tio=0.0;
/* for stability 10 points per wavelenght */
fmax=hcp/(mod.dx*8);
dt=0.295*mod.dx/hcp;
fpeaksrc=0.2*fmax; /* Ricker wavelet has peak at 1/3 of fmax */
fac = mod.dt/mod.dx;
fx=-mod.dx*nx/2; fy=-mod.dy*ny/2; fz=0;
npz = 2*halo+nz/dims[0];
npx = 2*halo+nx/dims[1];
npy = 2*halo+ny/dims[2];
wavelet = (float *)calloc(nt,sizeof(float));
/* find which MPI-task has the source position */
snaptime = t0src+1.80*npx*dx*0.5/hcp;
snapwritten=0;
nt = (int) 1.1*(t0src+snaptime)/dt;
nt = (int) (t0src+1.5)/dt;
if (verbose && my_rank==0) {
fprintf(stderr,"fmax=%f fpeak=%f dt=%e\n", fmax, fpeaksrc, dt);
fprintf(stderr,"nx=%d nprocx=%d ny=%d nprocy=%d nz=%d nprocz=%d\n", nx, dims[1], ny, dims[2], nz, dims[0]);
fprintf(stderr,"npx=%d npy=%d npz=%d nt=%d time=%f\n", npx, npy, npz, nt, nt*dt);
fprintf(stderr,"source expected at local boundary at %f seconds\n", npx*dx*0.5/hcp);
fflush(stderr);
}
if (my_rank==0) fprintf(stderr,"memory per MPI task is %ld MB\n", (6*npx*npy*npz*4/(1024*1024)));
/* allocate wavefields and medium properties for local grid domains */
p = (float *)calloc(npx*npy*npz,sizeof(float));
vx = (float *)calloc(npx*npy*npz,sizeof(float));
vy = (float *)calloc(npx*npy*npz,sizeof(float));
vz = (float *)calloc(npx*npy*npz,sizeof(float));
/* read velocity and density files */
readModel3D(mod, bnd, rox, roz, l2m);
/* for 2.5 D model npy=1 */
rox = (float *)calloc(npx*npy*npz,sizeof(float));
roy= (float *)calloc(npx*npy*npz,sizeof(float));
roz= (float *)calloc(npx*npy*npz,sizeof(float));
l2m = (float *)calloc(npx*npy*npz,sizeof(float));
/* define homogeneus model */
for (ix=0; ix<npx*1*npz; ix++) {
rox[ix] = fac/hro;
roy[ix] = fac/hro;
roz[ix] = fac/hro;
l2m[ix] = fac*hcp*hcp*hro;
}
/* create cartesian domain decomposition */
period[0]=0;
period[1]=0;
period[2]=0;
reorder=0;
MPI_Cart_create(MPI_COMM_WORLD, 3, dims, period, reorder, &COMM_CART);
/* find out coordinates of the rank */
MPI_Cart_coords(COMM_CART, my_rank, 3, coord);
flz = fz+(dz*nz/dims[0])*coord[0];
flx = fx+(dx*nx/dims[1])*coord[1];
fly = fy+(dy*ny/dims[2])*coord[2];
if (verbose>=2) fprintf(stderr,"Rank %d coordinates are %d %d %d orig=(%5.2F, %5.2f, %5.2f) \n", my_rank, coord[0], coord[1], coord[2], flx, fly, flz);
fflush(stderr);
/* find out neighbours of the rank, MPI_PROC_NULL is a hard boundary of the model */
displ=1;
MPI_Cart_shift(COMM_CART, 1, 1, &left, &right);
MPI_Cart_shift(COMM_CART, 2, 1, &top, &bottom);
MPI_Cart_shift(COMM_CART, 0, 1, &front, &back);
if (verbose>=2) fprintf(stderr, "Rank %d in direction 0 has LR neighbours %d %d FB %d %d TB %d %d\n", my_rank, left, right, front, back, top, bottom);
fflush(stderr);
/* allocate of halo areas */
halosizex = npy*npz*halo;
leftRecv = (float *)calloc(3*halosizex,sizeof(float));
rightRecv = (float *)calloc(3*halosizex,sizeof(float));
leftSend = (float *)calloc(3*halosizex,sizeof(float));
rightSend = (float *)calloc(3*halosizex,sizeof(float));
halosizey = npx*npz*halo;
frontRecv = (float *)calloc(3*halosizey,sizeof(float));
backRecv = (float *)calloc(3*halosizey,sizeof(float));
frontSend = (float *)calloc(3*halosizey,sizeof(float));
backSend = (float *)calloc(3*halosizey,sizeof(float));
halosizez = npy*npx*halo;
topRecv = (float *)calloc(3*halosizez,sizeof(float));
bottomRecv = (float *)calloc(3*halosizez,sizeof(float));
topSend = (float *)calloc(3*halosizez,sizeof(float));
bottomSend = (float *)calloc(3*halosizez,sizeof(float));
if (my_rank==0) fprintf(stderr,"memory per MPI task for halo exchange is %ld MB\n", ((12*(halosizex+halosizey+halosizez))*4/(1024*1024)));
/* create subarrays(excluding halo areas) to write to file with MPI-IO */
/* data in the local array */
sizes[0]=npz;
sizes[1]=npx;
sizes[2]=npy;
subsizes[0]=sizes[0]-2*halo;
subsizes[1]=sizes[1]-2*halo;
subsizes[2]=sizes[2]-2*halo;
starts[0]=halo;
starts[1]=halo;
starts[2]=halo;
MPI_Type_create_subarray(3, sizes, subsizes, starts, MPI_ORDER_C,
MPI_FLOAT, &local_array);
MPI_Type_commit(&local_array);
/* data in the global array */
gsizes[0]=nz;
gsizes[1]=nx;
gsizes[2]=ny;
gsubsizes[0]=subsizes[0];
gsubsizes[1]=subsizes[1];
gsubsizes[2]=subsizes[2];
gstarts[0]=subsizes[0]*coord[0];
gstarts[1]=subsizes[1]*coord[1];
gstarts[2]=subsizes[2]*coord[2];
MPI_Type_create_subarray(3, gsizes, gsubsizes, gstarts, MPI_ORDER_C,
MPI_FLOAT, &global_array);
MPI_Type_commit(&global_array);
/* compute field of the inner grid excluding halo areas */
ioXx=2;
ioXz=ioXx-1;
ioYx=2;
ioYz=ioYx-1;
ioZz=2;
ioZx=ioZz-1;
ioPx=1;
ioPz=ioPx;
t00 = wallclock_time();
for (it=0; it<nt; it++) {
time = it*dt;
wavelet[it] = gauss2time(time,fpeaksrc,t0src);
}
if (my_rank==0) {
FILE *fp;
fp = fopen("src.bin", "w+");
fwrite( wavelet, sizeof(float), nt, fp);
fflush(fp);
fclose(fp);
}
/*
nt =1;
sprintf(filename,"snap_nz%d_nx%d_ny%d.bin",nz, nx, ny);
for (ix=0; ix<npx*npy*npz; ix++) {
p[ix] = my_rank;
}
MPI_Info_create(&fileinfo);
MPI_Info_set(fileinfo, "striping_factor", STRIPE_COUNT);
MPI_Info_set(fileinfo, "striping_unit", STRIPE_SIZE);
MPI_File_delete(filename, MPI_INFO_NULL);
rc = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDWR|MPI_MODE_CREATE, fileinfo, &fh);
if (rc != MPI_SUCCESS) {
fprintf(stderr, "could not open input file\n");
MPI_Abort(MPI_COMM_WORLD, 2);
}
disp = 0;
rc = MPI_File_set_view(fh, disp, MPI_FLOAT, global_array, "native", fileinfo);
if (rc != MPI_SUCCESS) {
fprintf(stderr, "error setting view on results file\n");
MPI_Abort(MPI_COMM_WORLD, 4);
}
rc = MPI_File_write_all(fh, p, 1, local_array, status);
if (rc != MPI_SUCCESS) {
MPI_Error_string(rc,err_buffer,&resultlen);
fprintf(stderr,err_buffer);
MPI_Abort(MPI_COMM_WORLD, 5);
}
MPI_File_close(&fh);
*/
/* Main loop over the number of time steps */
for (it=0; it<nt; it++) {
t0 = wallclock_time();
time = it*dt;
//fprintf(stderr,"modeling time step %d for time %f\n", it, time);
/* define source wavelet */
wavelet[it] = gauss2time(time,fpeaksrc,t0src);
/* update of grid values on halo areas */
updateVelocitiesHalo(vx, vy, vz, p, rox, halo, npx, npy, npz);
t1 = wallclock_time();
thcomp += t1-t0;
/* copy of halo areas */
copyHaloVxVz(vx, vz, npx, npy, npz, halo, leftSend, rightSend, frontSend, backSend, topSend, bottomSend);
t2 = wallclock_time();
tcopy += t2-t1;
/* start a-synchronous communication of halo areas to neighbours */
/* this is done first for Vx,Vz fields only */
exchangeHalo(leftRecv, leftSend, rightRecv, rightSend, 2*halosizex, left, right, &reqRecv[0], &reqSend[0], 0);
exchangeHalo(frontRecv, frontSend, backRecv, backSend, 2*halosizey, front, back, &reqRecv[2], &reqSend[2], 4);
exchangeHalo(topRecv, topSend, bottomRecv, bottomSend, 2*halosizez, top, bottom, &reqRecv[4], &reqSend[4], 8);
t1 = wallclock_time();
tcomm += t1-t2;
/* compute update on grid values excluding halo areas */
updateVelocities(vx, vy, vz, p, rox, halo, npx, npy, npz);
t2 = wallclock_time();
tcomp += t2-t1;
/* wait for Vx.Vz halo exchange */
waitForHalo(&reqRecv[0], &reqSend[0]);
t1 = wallclock_time();
tcomm += t1-t2;
/* copy of halo areas back to arrays */
newHaloVxVz(vx, vz, npx, npy, npz, halo, leftRecv, rightRecv, frontRecv, backRecv, topRecv, bottomRecv);
t2 = wallclock_time();
tcopy += t2-t1;
/* add Force source on the Vz grid */
src_ampl = wavelet[it];
/* check if source position is in local domain */
/* for the moment place a source in the middle of each domain */
ixsrc = npx/2;
iysrc = npy/2;
izsrc = npz/2;
isrc = iysrc*npx*npz+ixsrc*npz+izsrc;
// fprintf(stderr,"npz=%d npx=%d npy=%d isrc=%d\n", npz, npx, npy, isrc);
/* source scaling factor to compensate for discretisation */
src_ampl *= rox[isrc]*l2m[isrc]/(dt);
/* Force source */
//if (my_rank == 0) vz[isrc] += 0.25*src_ampl*ro[isrc]*dz;
vz[isrc] += 0.25*src_ampl*rox[isrc]*dz;
/* compute field on the grid of the halo areas */
updatePressureHalo(vx, vy, vz, p, l2m, halo, npx, npy, npz);
t1 = wallclock_time();
thcomp += t1-t2;
/* copy p-field and sent to neighbours */
copyHaloP(p, npx, npy, npz, halo, leftSend, rightSend, frontSend, backSend, topSend, bottomSend);
exchangeHalo(leftRecv, leftSend, rightRecv, rightSend, halosizex, left, right, &reqRecv[0], &reqSend[0], 0);
exchangeHalo(frontRecv, frontSend, backRecv, backSend, halosizey, front, back, &reqRecv[2], &reqSend[2], 4);
exchangeHalo(topRecv, topSend, bottomRecv, bottomSend, halosizez, top, bottom, &reqRecv[4], &reqSend[4], 8);
t2 = wallclock_time();
tcomm += t2-t1;
/* compute update on grid values excluding halo areas */
updatePressure(vx, vy, vz, p, l2m, halo, npx, npy, npz);
t1 = wallclock_time();
tcomp += t1-t2;
/* wait for P halo exchange */
waitForHalo(&reqRecv[0], &reqSend[0]);
t2 = wallclock_time();
tcomm += t2-t1;
newHaloP(p, npx, npy, npz, halo, leftRecv, rightRecv, frontRecv, backRecv, topRecv, bottomRecv);
t1 = wallclock_time();
tcopy += t1-t2;
// fprintf(stderr,"rank %d did time step %d in %f seconds\n", my_rank, it, t1-t0);
// fflush(stderr);
/* write snapshots to file */
// if (time >= snaptime && !snapwritten) {
if ((it+1)%100==0 ) {
t1 = wallclock_time();
sprintf(filename,"snap_nz%d_nx%d_ny%d_it%4d.bin",nz, nx, ny, it);
MPI_Info_create(&fileinfo);
MPI_Info_set(fileinfo, "striping_factor", STRIPE_COUNT);
MPI_Info_set(fileinfo, "striping_unit", STRIPE_SIZE);
MPI_File_delete(filename, MPI_INFO_NULL);
rc = MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDWR|MPI_MODE_CREATE, fileinfo, &fh);
if (rc != MPI_SUCCESS) {
fprintf(stderr, "could not open input file\n");
MPI_Abort(MPI_COMM_WORLD, 2);
}
disp = 0;
rc = MPI_File_set_view(fh, disp, MPI_FLOAT, global_array, "native", fileinfo);
if (rc != MPI_SUCCESS) {
fprintf(stderr, "error setting view on results file\n");
MPI_Abort(MPI_COMM_WORLD, 4);
}
rc = MPI_File_write_all(fh, p, 1, local_array, status);
if (rc != MPI_SUCCESS) {
MPI_Error_string(rc,err_buffer,&resultlen);
fprintf(stderr,err_buffer);
MPI_Abort(MPI_COMM_WORLD, 5);
}
MPI_File_close(&fh);
/* MPI_Info_create(&fileinfo);
MPI_File_delete(filename, MPI_INFO_NULL);
MPI_File_open(MPI_COMM_WORLD, filename, MPI_MODE_RDWR|MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_FLOAT, global_array, "native", MPI_INFO_NULL);
MPI_File_write_all(fh, p, npz*npx*npy, local_array, status);
MPI_File_close(&fh);
*/
snapwritten+=1;
t2 = wallclock_time();
tio += t2-t1;
}
}
ttot = wallclock_time() - t00;
if (my_rank == 0) {
fprintf(stderr,"rank %d total time in %f seconds\n", my_rank, ttot);
fprintf(stderr,"rank %d comm time in %f seconds\n", my_rank, tcomm);
fprintf(stderr,"rank %d comp time in %f seconds\n", my_rank, tcomp);
fprintf(stderr,"rank %d hcomp time in %f seconds\n", my_rank, thcomp);
fprintf(stderr,"rank %d copy time in %f seconds\n", my_rank, tcopy);
fprintf(stderr,"rank %d io time in %f seconds\n", my_rank, tio);
fprintf(stderr,"rank %d snaphsots written to file\n", snapwritten);
}
MPI_Finalize();
return 0;
}
int updateVelocities(float *vx, float *vy, float *vz, float *p, float *ro, int halo, int npx, int npy, int npz)
{
int ix, iy, iz, iyp, ixp, izp, c1, c2, nxz;
int ixs, ixe, iys, iye, izs, ize;
float DpDx, DpDy, DpDz;
nxz=npx*npz;
c1 = 9.0/8.0;
c2 = -1.0/24.0;
ixs=2*halo; ixe=npx-2*halo;
iys=2*halo; iye=npy-2*halo;
izs=2*halo; ize=npz-2*halo;
/* calculate vx,vy,vz for all grid points except on the virtual boundary*/
#pragma omp for private (iy, ix, iz) nowait
#pragma ivdep
for (iy=iys; iy<iye; iy++) {
iyp=iy*nxz;
for (ix=ixs; ix<ixe; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=izs; iz<ize; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
return 0;
}
int updateVelocitiesHalo(float *vx, float *vy, float *vz, float *p, float *ro, int halo, int npx, int npy, int npz)
{
int ix, iy, iz, iyp, ixp, izp, c1, c2, nxz;
float DpDx, DpDy, DpDz;
nxz=npx*npz;
c1 = 9.0/8.0;
c2 = -1.0/24.0;
/* calculate vx,vy,vz for all halo grid points */
/* compute halo areas at left side */
#pragma omp for private (iy, ix, iz) nowait
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=halo; ix<2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
/* compute halo areas at right side */
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=npx-2*halo; ix<npx-halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
/* compute halo areas at front side */
for (iy=halo; iy<2*halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
/* compute halo areas at back side */
for (iy=npy-2*halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
/* compute halo areas at top side */
for (iy=2*halo; iy<npy-2*halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<2*halo; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
/* compute halo areas at bottom side */
for (iy=2*halo; iy<npy-2*halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=npz-2*halo; iz<npz-halo; iz++) {
DpDx = Dx(p,ix,iyp,iz,npz);
DpDy = Dy(p,ixp,iy,iz,nxz);
DpDz = Dz(p,ixp,iyp,iz);
vz[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDz;
vx[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDx;
vy[iyp+ixp+iz] += ro[iyp+ixp+iz]*DpDy;
}
}
}
return 0;
}
int updatePressure(float *vx, float *vy, float *vz, float *p, float *l2m, int halo, int npx, int npy, int npz)
{
int ix, iy, iz, iyp, ixp, izp, c1, c2, nxz;
int ixs, ixe, iys, iye, izs, ize;
nxz=npx*npz;
c1 = 9.0/8.0;
c2 = -1.0/24.0;
ixs=2*halo; ixe=npx-2*halo;
iys=2*halo; iye=npy-2*halo;
izs=2*halo; ize=npz-2*halo;
/* calculate p/tzz for all grid points except on the virtual boundary */
#pragma omp for private (ix, iz)
#pragma ivdep
for (iy=iys; iy<iye; iy++) {
iyp=iy*nxz;
for (ix=ixs; ix<ixe; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=izs; iz<ize; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
return 0;
}
int updatePressureHalo(float *vx, float *vy, float *vz, float *p, float *l2m, int halo, int npx, int npy, int npz)
{
int ix, iy, iz, iyp, ixp, izp, c1, c2, nxz;
nxz=npx*npz;
c1 = 9.0/8.0;
c2 = -1.0/24.0;
/* calculate p/tzz for all grid points except on the virtual boundary */
/* compute halo areas at left side */
#pragma omp for private (iy, ix, iz) nowait
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=halo; ix<2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
/* compute halo areas at right side */
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=npx-2*halo; ix<npx-halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
/* compute halo areas at front side */
for (iy=halo; iy<2*halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
/* compute halo areas at back side */
for (iy=npy-2*halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
/* compute halo areas at top side */
for (iy=2*halo; iy<npy-2*halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<2*halo; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
/* compute halo areas at bottom side */
for (iy=2*halo; iy<npy-2*halo; iy++) {
iyp=iy*nxz;
for (ix=2*halo; ix<npx-2*halo; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=npz-2*halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] += l2m[iyp+ixp+iz]*(Dv(vx,vz,ix,iy,iz,npz,nxz));
}
}
}
return 0;
}
int exchangeHalo(float *leftRecv, float *leftSend, float *rightRecv, float *rightSend, int size, int leftrank, int rightrank, MPI_Request *reqRecv, MPI_Request *reqSend, int tag)
{
int error, my_rank, ltag;
MPI_Status status;
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
if (leftrank != MPI_PROC_NULL) {
ltag = tag;
error = MPI_Irecv(leftRecv, size, MPI_FLOAT, leftrank, ltag, MPI_COMM_WORLD, &reqRecv[0]);
assert (error == MPI_SUCCESS);
// fprintf(stderr,"rank %d recv data from %d left\n", my_rank, leftrank);
ltag = tag+1;
error = MPI_Isend(leftSend, size, MPI_FLOAT, leftrank, ltag, MPI_COMM_WORLD, &reqSend[0]);
assert (error == MPI_SUCCESS);
// fprintf(stderr,"rank %d send data to %d left\n", my_rank, leftrank);
}
else {
reqRecv[0] = MPI_REQUEST_NULL;
reqSend[0] = MPI_REQUEST_NULL;
}
if (rightrank != MPI_PROC_NULL) {
ltag = tag+1;
error = MPI_Irecv(rightRecv, size, MPI_FLOAT, rightrank, ltag, MPI_COMM_WORLD, &reqRecv[1]);
// fprintf(stderr,"rank %d recv data from %d right\n", my_rank, rightrank);
assert (error == MPI_SUCCESS);
ltag = tag;
error = MPI_Isend(rightSend, size, MPI_FLOAT, rightrank, ltag, MPI_COMM_WORLD, &reqSend[1]);
assert (error == MPI_SUCCESS);
// fprintf(stderr,"rank %d send data to %d right\n", my_rank, rightrank);
}
else {
reqRecv[1] = MPI_REQUEST_NULL;
reqSend[1] = MPI_REQUEST_NULL;
}
return 0;
}
int waitForHalo(MPI_Request *reqRecv, MPI_Request *reqSend)
{
int i;
MPI_Status status;
int error;
for (i=0; i<6; i++) {
error = MPI_Wait(&reqSend[i], &status);
assert (error == MPI_SUCCESS);
}
// MPI_Barrier(MPI_COMM_WORLD);
for (i=0; i<6; i++) {
error = MPI_Wait(&reqRecv[i], &status);
assert (error == MPI_SUCCESS);
}
return 0;
}
int copyHaloVxVz(float *vx, float *vz, int npx, int npy, int npz, int halo, float *leftSend, float *rightSend, float *frontSend, float *backSend, float *topSend, float *bottomSend)
{
int ix, iy, iz, ih, iyp, ixp, halosizex, halosizey, halosizez, nxz;
nxz = npx*npz;
/* copy halo areas at left side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=halo; ix<2*halo; ix++) {
ixp=ix*npz;
ih=(ix-halo)*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
leftSend[iy*npz*halo+ih+iz] = vx[iyp+ixp+iz];
leftSend[halosizex+iy*npz*halo+ih+iz] = vz[iyp+ixp+iz];
}
}
}
/* copy halo areas at right side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=npx-2*halo; ix<npx-halo; ix++) {
ixp=ix*npz;
ih=(ix-(npx-2*halo))*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
rightSend[iy*npz*halo+ih+iz] = vx[iyp+ixp+iz];
rightSend[halosizex+iy*npz*halo+ih+iz] = vz[iyp+ixp+iz];
}
}
}
/* copy halo areas at front side */
halosizey = npx*npz*halo;
for (iy=halo; iy<2*halo; iy++) {
iyp=iy*nxz;
ih=(iy-halo)*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
frontSend[ih+ixp+iz] = vx[iyp+ixp+iz];
frontSend[halosizey+ih+ixp+iz] = vz[iyp+ixp+iz];
}
}
}
/* copy halo areas at back side */
for (iy=npy-2*halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
ih=(iy-(npy-2*halo))*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
backSend[ih+ixp+iz] = vx[iyp+ixp+iz];
backSend[halosizey+ih+ixp+iz] = vz[iyp+ixp+iz];
}
}
}
/* copy halo areas at top side */
halosizez = npy*npx*halo;
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<2*halo; iz++) {
ih=iz-halo;
topSend[iy*npx*halo+ix*halo+ih] = vx[iyp+ixp+iz];
topSend[halosizez+iy*npx*halo+ix*halo+ih] = vz[iyp+ixp+iz];
}
}
}
/* copy halo areas at bottom side */
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=npz-2*halo; iz<npz-halo; iz++) {
ih=(iz-(npz-2*halo));
bottomSend[iy*npx*halo+ix*halo+ih] = vx[iyp+ixp+iz];
bottomSend[halosizez+iy*npx*halo+ix*halo+ih] = vz[iyp+ixp+iz];
}
}
}
return 0;
}
int copyHaloP(float *p, int npx, int npy, int npz, int halo, float *leftSend, float *rightSend, float *frontSend, float *backSend, float *topSend, float *bottomSend)
{
int ix, iy, iz, ih, iyp, ixp, halosizex, halosizey, halosizez, nxz;
nxz = npx*npz;
/* copy halo areas at left side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=halo; ix<2*halo; ix++) {
ixp=ix*npz;
ih=(ix-halo)*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
leftSend[iy*npz*halo+ih+iz] = p[iyp+ixp+iz];
}
}
}
/* copy halo areas at right side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=npx-2*halo; ix<npx-halo; ix++) {
ixp=ix*npz;
ih=(ix-(npx-2*halo))*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
rightSend[iy*npz*halo+ih+iz] = p[iyp+ixp+iz];
}
}
}
/* copy halo areas at front side */
halosizey = npx*npz*halo;
for (iy=halo; iy<2*halo; iy++) {
iyp=iy*nxz;
ih=(iy-halo)*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
frontSend[ih+ixp+iz] = p[iyp+ixp+iz];
}
}
}
/* copy halo areas at back side */
for (iy=npy-2*halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
ih=(iy-(npy-2*halo))*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
backSend[ih+ixp+iz] = p[iyp+ixp+iz];
}
}
}
/* copy halo areas at top side */
halosizez = npy*npx*halo;
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<2*halo; iz++) {
ih=iz-halo;
topSend[iy*npx*halo+ix*halo+ih] = p[iyp+ixp+iz];
}
}
}
/* copy halo areas at bottom side */
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=npz-2*halo; iz<npz-halo; iz++) {
ih=(iz-(npz-2*halo));
bottomSend[iy*npx*halo+ix*halo+ih] = p[iyp+ixp+iz];
}
}
}
return 0;
}
/* copy communicated halo areas back to compute grids */
int newHaloVxVz(float *vx, float *vz, int npx, int npy, int npz, int halo, float *leftRecv, float *rightRecv, float *frontRecv, float *backRecv, float *topRecv, float *bottomRecv)
{
int ix, iy, iz, ih, iyp, ixp, halosizex, halosizey, halosizez, nxz;
nxz = npx*npz;
/* copy halo areas at left side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=0; ix<halo; ix++) {
ixp=ix*npz;
ih=ixp;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
vx[iyp+ixp+iz] = leftRecv[iy*npz*halo+ih+iz];
vz[iyp+ixp+iz] = leftRecv[halosizex+iy*npz*halo+ih+iz];
}
}
}
/* copy halo areas at right side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=npx-halo; ix<npx; ix++) {
ixp=ix*npz;
ih=(ix-(npx-halo))*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
vx[iyp+ixp+iz] = rightRecv[iy*npz*halo+ih+iz];
vz[iyp+ixp+iz] = rightRecv[halosizex+iy*npz*halo+ih+iz];
}
}
}
/* copy halo areas at front side */
halosizey = npx*npz*halo;
for (iy=0; iy<halo; iy++) {
iyp=iy*nxz;
ih=iyp;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
vx[iyp+ixp+iz] = frontRecv[ih+ixp+iz];
vz[iyp+ixp+iz] = frontRecv[halosizey+ih+ixp+iz];
}
}
}
/* copy halo areas at back side */
for (iy=npy-halo; iy<npy; iy++) {
iyp=iy*nxz;
ih=(iy-(npy-halo))*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
vx[iyp+ixp+iz] = backRecv[ih+ixp+iz];
vz[iyp+ixp+iz] = backRecv[halosizey+ih+ixp+iz];
}
}
}
/* copy halo areas at top side */
halosizez = npy*npx*halo;
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=0; iz<halo; iz++) {
ih=iz;
vx[iyp+ixp+iz] = topRecv[iy*npx*halo+ix*halo+ih];
vz[iyp+ixp+iz] = topRecv[halosizez+iy*npx*halo+ix*halo+ih];
}
}
}
/* copy halo areas at bottom side */
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=npz-halo; iz<npz; iz++) {
ih=(iz-(npz-halo));
vx[iyp+ixp+iz] = bottomRecv[iy*npx*halo+ix*halo+ih];
vz[iyp+ixp+iz] = bottomRecv[halosizez+iy*npx*halo+ix*halo+ih];
}
}
}
return 0;
}
/* copy communicated halo areas back to compute grids */
int newHaloP(float *p, int npx, int npy, int npz, int halo, float *leftRecv, float *rightRecv, float *frontRecv, float *backRecv, float *topRecv, float *bottomRecv)
{
int ix, iy, iz, ih, iyp, ixp, halosizex, halosizey, halosizez, nxz;
nxz = npx*npz;
/* copy halo areas at left side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=0; ix<halo; ix++) {
ixp=ix*npz;
ih=ixp;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] = leftRecv[iy*npz*halo+ih+iz];
}
}
}
/* copy halo areas at right side */
halosizex = npy*npz*halo;
for (iy=halo; iy<npy-halo; iy++) {
iyp=iy*nxz;
for (ix=npx-halo; ix<npx; ix++) {
ixp=ix*npz;
ih=(ix-(npx-halo))*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] = rightRecv[iy*npz*halo+ih+iz];
}
}
}
/* copy halo areas at front side */
halosizey = npx*npz*halo;
for (iy=0; iy<halo; iy++) {
iyp=iy*nxz;
ih=iyp;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] = frontRecv[ih+ixp+iz];
}
}
}
/* copy halo areas at back side */
for (iy=npy-halo; iy<npy; iy++) {
iyp=iy*nxz;
ih=(iy-(npy-halo))*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=halo; iz<npz-halo; iz++) {
p[iyp+ixp+iz] = backRecv[ih+ixp+iz];
}
}
}
/* copy halo areas at top side */
halosizez = npy*npx*halo;
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=0; iz<halo; iz++) {
ih=iz;
p[iyp+ixp+iz] = topRecv[iy*npx*halo+ix*halo+ih];
}
}
}
/* copy halo areas at bottom side */
for (iy=0; iy<npy; iy++) {
iyp=iy*nxz;
for (ix=0; ix<npx; ix++) {
ixp=ix*npz;
#pragma ivdep
for (iz=npz-halo; iz<npz; iz++) {
ih=(iz-(npz-halo));
p[iyp+ixp+iz] = bottomRecv[iy*npx*halo+ix*halo+ih];
}
}
}
return 0;
}
float gauss2time(float t, float f, float t0)
{
float value, time;
time = t-t0;
value = ((1.0-2.0*M_PI*M_PI*f*f*time*time)*exp(-M_PI*M_PI*f*f*time*time));
return value;
}