From 609bb8456cca39788df40531f079d23dd8e61802 Mon Sep 17 00:00:00 2001
From: Jan at TU-Delft <J.W.Thorbecke@tudelft.nl>
Date: Thu, 22 Feb 2018 14:17:10 +0100
Subject: [PATCH] buffer for all data + OpenMP

---
 corrvir/corrvir.c | 402 ++++++++++++++++++++++++++++++----------------
 1 file changed, 268 insertions(+), 134 deletions(-)

diff --git a/corrvir/corrvir.c b/corrvir/corrvir.c
index 44a0025..7a7f50a 100644
--- a/corrvir/corrvir.c
+++ b/corrvir/corrvir.c
@@ -15,11 +15,12 @@ typedef struct { /* complex number */
         float r,i;
 } complex;
 
-void read_sutrace_at_position(FILE *fp, int itrace, complex *tracedata, complex *tracebuffer, int *trace_in_buffer, size_t *nbuffer, int ntfft, int nt, int nfreq, size_t nbufmax, size_t trace_sz, double *tread, double *tfft);
+void read_sutrace_at_position(FILE *fp, int itrace, int ircvnum, int isrcnum, complex *tracedata, complex *tracebuffer, int *trace_in_buffer, size_t *nbuffer, int ntfft, int nt, int nfreq, size_t nbufmax, size_t trace_sz, double *tread, double *tfft, double *tmemcpy);
+
+void get_sutrace_at_position(FILE *fp, size_t itrace, complex *tracedata, complex *tracebuffer, int ntfft, int nt, int nfreq, size_t nbufmax, size_t trace_sz, double *tread, double *tfft, double *tmemcpy);
 
 int optncr(int n);
-void cr1fft(complex *cdata, float *data, int n, int sign);
-void rc1fft(float *rdata, complex *cdata, int n, int sign);
+void cr1_fft(complex *cdata, float *data, int n, int sign);
 
 int getFileInfo(char *file_name, int *n1, int *n2, float *d1, float *d2, int verbose);
 
@@ -52,7 +53,7 @@ char *sdoc[] = {
 " ",
 " Optional parameters: ",
 " ",
-"   derr = 0 ............ maximum tolerated error distance (m) between defined virtual-source position and receiver positions",
+"   dtolerr = 0.1 ....... tolerated error distance (in m) between above requested virtual-source position and actual receiver positions",
 "   fmax = 70 ........... maximum frequency to use in correlation",
 "   nbm = 4 ............. amount of memory (in GB) to buffer traces for reusage",
 "   nsrc_ctr = 1 ...... . minumum number of sources to contribute to Fresnel stack",
@@ -85,7 +86,7 @@ NULL};
 int main (int argc, char **argv)
 {
 	int	i, j, k, ret, nshots, maxnfiles, nfiles;
-	int	size, n1, n2, ntfft, nf, ifreq, nfreq;
+	int	size, n1, n2, ntfft, nf, ifreq;
 	int     verbose, causal;
 	int     nt, nstation, ntc, nb, istation, jstation;
 	int     pgsz, istep, nsrc_ctr, nsrc, nsrcMaster, nsrcSlave;
@@ -103,25 +104,26 @@ int main (int argc, char **argv)
 	int	xmin, xmax, ymin, ymax;
 	int     *trace_in_buffer;
 	size_t  nbuffer, nbufmax, nbufmemory;
-	size_t ntrace;
+	size_t ntrace, bufsize, nsamp, ipos, nfreq;
 	int nbm;
 	crgPos *rcvSrc;
 	int ircv, *isrc, *nsrc_crg, *vsrc;
 	float	dx, fmax, df, d1, d2, scl, fscalco, sclfft;
 	float   dt, reps, epsmax;
-	float 	derr;
+	float 	dtolerr;
 	float	*r, *vtrace;
-	float 	diffx, diffy, dd, ddmin, ddtol;
+	float 	diffx, diffy, dd, ddmin;
 	float   *xsrcv, *ysrcv;
 	float   rms2, sclrms;
 	complex tmpcc; 
 	int     nxsrcv, nysrcv, nvsrc;
 	int	 	scalco;
 	complex *c, *cmaster, *cslaves, *tracebuffer;
-	double  t0, t1, t2, t3, tinit, tread, tfft, tcorr, twrite, tlogic, trdloc, tftloc, tcorrloc;
-	double  tread_ivs, tfft_ivs, tcorr_ivs, twrite_ivs, tlogic_ivs;
+	double  t0, t1, t2, t3, ttotal, tinit, tread, tfft, tcorr, twrite, tlogic, trdloc, tftloc, tcorrloc, tmemcpy, tselect, tmemset, tloop, troutine0, troutine;
+	double  tread_ivs, tfft_ivs, tcorr_ivs, twrite_ivs, tlogic_ivs, t1_ivs, t3_ivs, t1_ivs_ivr, t2_ivs_ivr, t3_ivs_ivr, t4_ivs_ivr, t5_ivs_ivr;
 	char	*file_shots, *file_out, filename[1024], basename[1200], base[1200], *pbase;
 	int     pe=0, root_pe=0, npes=1, ipe, size_s;
+	float *trace;
 	FILE *fpin, *fpout;
 	segy hdr;
 	segy *segy_hdrs;
@@ -140,8 +142,6 @@ int main (int argc, char **argv)
 	assert(nsources!=0);
 	if (!getparint("nreceivers", &nreceivers)) nreceivers = 0;
 	assert(nreceivers!=0);
-	if (!getparint("nbm", &nbm)) nbm = 4;
-	nbufmemory=nbm;
 	if (!getparint("nsrc_ctr", &nsrc_ctr)) nsrc_ctr = 1;
 	if (!getparint("normsrc", &normsrc)) normsrc = 0;
 	if (!getparint("causal", &causal)) causal = 1;
@@ -149,11 +149,12 @@ int main (int argc, char **argv)
 	if (!getparint("cohr", &cohr)) cohr = 0;
 	if (!getparint("src_sel", &src_sel)) src_sel = 0;
 	if (!getparint("bmin", &bmin)) bmin = 1;
+	if (!getparint("nbm", &nbm)) nbm = 0;
 	if (!getparint("bmax", &bmax)) bmax = 4;
 	if (!getparfloat("reps", &reps)) reps = 0.0;
 	if (!getparfloat("epsmax", &epsmax)) epsmax = 0.1;
 	if (!getparfloat("fmax", &fmax)) fmax = 70;
-	if (!getparfloat("derr", &derr)) derr = 0;
+	if (!getparfloat("dtolerr", &dtolerr)) dtolerr = 0.1;
 	nxsrcv = countparval("xsrcv");
 	nysrcv = countparval("ysrcv");
 	if (nxsrcv != nysrcv) {
@@ -176,20 +177,63 @@ int main (int argc, char **argv)
 	fpout = fopen(file_out, "w+");
 
 	fprintf(stderr,"nt=%d dt=%f ntrace=%ld\n",nt, dt, (long) ntrace);
-	t1 = wallclock_time();
-	tinit += t1-t0;
 
-	/*================ Read geometry of all traces in file_shots ================*/
-
-    	trace_sz = sizeof(float)*(n1)+TRCBYTES;
+    ntfft = optncr(2*nt);
+	nf    = ntfft/2+1;
+	df    = 1.0/(ntfft*dt);
+	nfreq = MIN(nf,(int)(fmax/df)+1);
+	sclfft= 1.0/((float)ntfft);
+	trace_sz = sizeof(float)*(n1)+TRCBYTES;
 	segy_hdrs = (segy *)calloc(ntrace,sizeof(segy));
 	assert(segy_hdrs != NULL);
-	for (i=0; i<ntrace; i++) {
-		offset = i*trace_sz;
-		ret = fseeko(fpin , offset, SEEK_SET);
-		if (ret<0) perror("fseeko");
-    		nread = fread( &segy_hdrs[i], 1, TRCBYTES, fpin );
-		assert(nread == TRCBYTES);
+	ret = fseeko(fpin , 0, SEEK_SET);
+
+	t1 = wallclock_time();
+	tinit += t1-t0;
+
+	/* check if data is fully buffered */
+	if (nbm==0) { /* read only headers */
+		nbufmax = 0;
+		for (i=0; i<ntrace; i++) {
+			if(i % 100000 == 0) fprintf(stderr,"i=%d out of %d\n", i, ntrace);
+			offset = i*trace_sz;
+			ret = fseeko(fpin , offset, SEEK_SET);
+			if (ret<0) perror("fseeko");
+   			nread = fread( &segy_hdrs[i], 1, TRCBYTES, fpin );
+		}
+	} 
+	else { /* buffer is allocated to store all data traces */
+        bufsize = (size_t)(ntrace*nfreq*sizeof(complex));
+		nbufmax = ntrace;
+		fprintf(stderr,"memory allocated to buffer all traces = %.2f GB\n", (float)(bufsize/(1024*1024*1024)));
+		tracebuffer = (complex *)malloc(bufsize);
+		assert (tracebuffer != NULL);
+
+		/*================ Read geometry of all traces in file_shots ================*/
+
+		//trace = (float *)calloc(nt, sizeof(float));
+		r = (float *)calloc(ntfft,sizeof(float));
+		c = (complex *)calloc((nf),sizeof(complex));
+		for (i=0; i<ntrace; i++) {
+			if(i % 100000 == 0) fprintf(stderr,"i=%d out of %d\n", i, ntrace);
+			offset = i*trace_sz;
+			ret = fseeko(fpin , offset, SEEK_SET);
+			if (ret<0) perror("fseeko");
+   			nread = fread( &segy_hdrs[i], 1, TRCBYTES, fpin );
+			assert(nread == TRCBYTES);
+			
+			nsamp =  segy_hdrs[i].ns;
+			memset(r,0,ntfft*sizeof(float));
+       		nread = fread( r, sizeof(float), nt, fpin );
+       		assert(nread == nt);
+        	//memcpy(r,&trace[0],nt*sizeof(float));
+        	rc1_fft(r,c,ntfft,-1);
+	//		for (j=0; j<nfreq; j++) tracebuffer[i*nfreq+j] = c[j];
+        	memcpy(&tracebuffer[i*nfreq],&c[0],nfreq*sizeof(complex));
+		}
+		//free(trace);
+		free(r);
+		free(c);
 	}
 	t2 = wallclock_time();
 	tread += t2-t1;
@@ -280,7 +324,6 @@ int main (int argc, char **argv)
 	if (nxsrcv) {
 		nvsrc = nxsrcv;
 		vsrc = (int *)malloc(nvsrc*sizeof(int));
-		ddtol= derr*derr;
 		for (j=0; j<nxsrcv; j++) {
 			diffx = xsrcv[j]- rcvSrc[0].gx*fscalco;
 			diffy = ysrcv[j]- rcvSrc[0].gy*fscalco;
@@ -296,13 +339,13 @@ int main (int argc, char **argv)
 				}
 			}
 			fprintf(stderr,"ddmin for requested vsrc position (%.2f,%.2f) = %.2f\n",xsrcv[j],ysrcv[j],ddmin);
-			if (ddmin <= ddtol) {
+			if (ddmin <= dtolerr) {
 				xsrcv[j]=rcvSrc[vsrc[j]].gx*fscalco;
 				ysrcv[j]=rcvSrc[vsrc[j]].gy*fscalco;
-				fprintf(stderr,"Found vsrc position: (%.2f,%.2f)\n",xsrcv[j],ysrcv[j]);		 	
+				fprintf(stderr,"Requested vsrc position coincide with rcv position (%.2f,%.2f)\n",xsrcv[j],ysrcv[j]);		 	
 			}
 			else {
-			fprintf(stderr,"Error: Requested vsrc position (%.2f,%.2f) do not coincide with a rcv position\n",xsrcv[j],ysrcv[j]);
+			fprintf(stderr,"Error: Requested vsrc position do not coincide with a rcv position\n");
 			}
 		}
 	}
@@ -314,41 +357,49 @@ int main (int argc, char **argv)
 		}
 	}
 
-	
 	/*================ initializations ================*/
 
-	ntfft = optncr(2*nt);
-	nf    = ntfft/2+1;
-	df    = 1.0/(ntfft*dt);
-	nfreq = MIN(nf,(int)(fmax/df)+1);
-	sclfft= 1.0/((float)ntfft);
+	
 
-//	for (j=0; j<MIN(rcvSrc[0].nsrc,rcvSrc[1].nsrc); j++) fprintf(stderr,"before allocs sx=%d sy=%d +1 sx=%d sx=%d\n",rcvSrc[0].src[j].x, rcvSrc[0].src[j].y, rcvSrc[1].src[j].x, rcvSrc[1].src[j].y);
+	//for (j=0; j<MIN(rcvSrc[0].nsrc,rcvSrc[1].nsrc); j++) fprintf(stderr,"before allocs sx=%d sy=%d +1 sx=%d sx=%d\n",rcvSrc[0].src[j].x, rcvSrc[0].src[j].y, rcvSrc[1].src[j].x, rcvSrc[1].src[j].y);
 
-	nbuffer = 0;
-	nbufmax = (nbufmemory*1024*1024*1024/(nfreq*sizeof(complex))); /* number of traces in buffer */
-	tracebuffer = (complex *)malloc(nbufmax*nfreq*sizeof(complex));
-	trace_in_buffer = (int *)malloc(nbufmax*sizeof(int));
-	for (i=0; i<nbufmax; i++) trace_in_buffer[i] = -1;
+	//nbuffer = 0;
+	////nbufmax = (1024*1024*1024/(nfreq*sizeof(complex)));
+	//if (nbufmax !=0 ) {
+	//	tracebuffer = (complex *)malloc(nbufmax*nfreq*sizeof(complex));
+	//	trace_in_buffer = (int *)malloc(nbufmax*sizeof(int));
+	//	for (i=0; i<nbufmax; i++) trace_in_buffer[i] = -1;
+	//}
+	//fprintf(stderr,"nfreq=%d, nbufmax=%d\n", nfreq, nbufmax);
 
 	
 	/*================ for nvirtual shots find suitable receivers ================*/
+	t1 = wallclock_time();
+	tinit += t1-t2;
+
+	#pragma omp parallel default(shared) \
+	private(cmaster,cslaves,vtrace,r,c) \
+ 	private(t1, tread_ivs, tfft_ivs, tcorr_ivs, twrite_ivs, tlogic_ivs, t1_ivs_ivr, t2_ivs_ivr, t3_ivs_ivr, t4_ivs_ivr, t5_ivs_ivr) \
+	private(ivrcv, gxv, gyv, vrpeg, gelev, nsrcSlave) \
+	private(xmin,ymin,xmax,ymax, cx1, cy1, cx2, cy2, cx3, cy3, distmc2, distsc2min, distsc2max, distsc2) \
+	private(nsrc, trdloc, tftloc, tcorrloc) \
+	private(isrcm, isrcs, sxm, sym, sxs, sys, read_trace) \
+	private(jsource, rms2, ifreq, tmpcc, sclrms, scl) \
+	private(j, outputhdr, itrace_out, nwrite)
+{
 
 	cmaster = (complex *)malloc(nsources*nfreq*sizeof(complex));
 	cslaves = (complex *)malloc(nsources*nfreq*sizeof(complex));
-
 	vtrace = (float *)malloc(nt*sizeof(float));
 	r = (float *)malloc(ntfft*sizeof(float));
-	c = (complex *)malloc(nf*sizeof(complex));
-
-	t1 = wallclock_time();
-	tinit += t1-t2;
-
+	c = (complex *)malloc((ntfft/2+1)*sizeof(complex));
 	itrace_out=0;
+	
 	for (ivs=0; ivs<nvsrc; ivs++) {/* loop over the number of virtual source positions to be created */
+		fprintf(stderr,"ivs = %d \n",ivs);
 
-		t1 = wallclock_time();
-		tread_ivs = tfft_ivs = tcorr_ivs = twrite_ivs = tlogic_ivs = 0.0;
+		t1_ivs = wallclock_time();
+		tread_ivs = tfft_ivs = tcorr_ivs = twrite_ivs = tlogic_ivs = tmemcpy = tselect = tmemset = tloop = troutine = 0.0;
 		
 		ivsrc=vsrc[ivs];
 		sxv = rcvSrc[ivsrc].gx;
@@ -357,7 +408,11 @@ int main (int argc, char **argv)
 		svelev = rcvSrc[ivsrc].gelev;
 		nsrcMaster = nsrc_crg[ivsrc];
 
+		#pragma omp for
 		for (ivrcv=0; ivrcv<nreceivers; ivrcv++) {
+			
+			//t1_ivs_ivr = wallclock_time();
+			
 			gxv = rcvSrc[ivrcv].gx;
 			gyv = rcvSrc[ivrcv].gy;
 			vrpeg = rcvSrc[ivrcv].rpeg;
@@ -388,7 +443,11 @@ int main (int argc, char **argv)
 			
 			nsrc = 0;
 			memset(cslaves,0,nfreq*nsources*sizeof(complex));
-			trdloc = tftloc = tcorrloc = 0.0;
+			memset(cmaster,0,nfreq*nsources*sizeof(complex));
+			//t0 = wallclock_time();
+			//tmemset += t0-t1_ivs_ivr;
+
+			//trdloc = tftloc = tcorrloc = 0.0;
 			for (isrcm=0; isrcm<nsrcMaster; isrcm++) { /* for all traces find which traces are recorded by both virtual source and receiver position */
 				sxm = rcvSrc[ivsrc].src[isrcm].x;
 				sym = rcvSrc[ivsrc].src[isrcm].y;
@@ -462,25 +521,33 @@ int main (int argc, char **argv)
 									read_trace = 1;
 								}
 							}
-							
 						}
-						/* read data from file for sources that passed the tests above */
-						if (read_trace) {
+						if (read_trace) { /* read data from file for sources that passed the tests above */
+//#pragma omp critical
+//{
+							//troutine0 = wallclock_time();
 							/* read master trace */
-							read_sutrace_at_position(fpin, rcvSrc[ivsrc].src[isrcm].fpos, &cmaster[nsrc*nfreq], tracebuffer, trace_in_buffer, &nbuffer, ntfft, nt, nfreq, nbufmax, trace_sz, &trdloc, &tftloc);
+							//memcpy(&cmaster[nsrc*nfreq], &tracebuffer[rcvSrc[ivsrc].src[isrcm].fpos*nfreq],nfreq*sizeof(complex));
+							get_sutrace_at_position(fpin, rcvSrc[ivsrc].src[isrcm].fpos, &cmaster[nsrc*nfreq], tracebuffer, ntfft, nt, nfreq, nbufmax, trace_sz, &tread, &tfft, &tmemcpy);
+							//read_sutrace_at_position(fpin, rcvSrc[ivsrc].src[isrcm].fpos, ivsrc, isrcm, &cmaster[nsrc*nfreq], tracebuffer, trace_in_buffer, &nbuffer, ntfft, nt, nfreq, nbufmax, trace_sz, &trdloc, &tftloc, &tmemcpy);
 
 							/* read slave trace */
-							read_sutrace_at_position(fpin, rcvSrc[ivrcv].src[isrcs].fpos, &cslaves[nsrc*nfreq], tracebuffer, trace_in_buffer, &nbuffer, ntfft, nt, nfreq, nbufmax, trace_sz, &trdloc, &tftloc);
+							//ipos = (rcvSrc[ivrcv].src[isrcs].fpos)*nfreq;
+							//memcpy(&cslaves[nsrc*nfreq], &tracebuffer[ipos],nfreq*sizeof(complex));
+							get_sutrace_at_position(fpin, rcvSrc[ivrcv].src[isrcs].fpos, &cslaves[nsrc*nfreq], tracebuffer, ntfft, nt, nfreq, nbufmax, trace_sz, &tread, &tfft, &tmemcpy);
+							//read_sutrace_at_position(fpin, rcvSrc[ivrcv].src[isrcs].fpos, ivrcv, isrcs, &cslaves[nsrc*nfreq], tracebuffer, trace_in_buffer, &nbuffer, ntfft, nt, nfreq, nbufmax, trace_sz, &trdloc, &tftloc, &tmemcpy);
+							//troutine += wallclock_time()-troutine0;
+//}
 							nsrc++;
 						}
-
 					} /* end of if test that slave and master have same contributing source */
 				} /* end of nsrcSlave loop */
 			} /* end of nsrcMaster loop */
-			tread_ivs += trdloc;
-			tread += trdloc;
-			tfft_ivs += tftloc;
-			tfft += tftloc;
+			//tread_ivs += trdloc;
+			//tfft_ivs += tftloc;
+			//tloop += wallclock_time()-t0;
+			t1 = wallclock_time();
+			fprintf(stderr,"ivrcv = %d out of %d\n",ivrcv, nreceivers);
 
 			if (nsrc < nsrc_ctr) nsrc = -1; /* only compute virtual receiver when there are sufficient active sources contributing */
 
@@ -491,8 +558,7 @@ int main (int argc, char **argv)
 			else {
 				correlate(cmaster, cslaves, nfreq, nsrc, &tcorrloc, verbose);
 			}
-			tcorr_ivs += tcorrloc;
-			tcorr += tcorrloc;
+			tcorr_ivs += wallclock_time()-t1;
 			
 			/* summation over the contributing sources */
 			memset(c,0,nf*sizeof(complex));
@@ -512,14 +578,13 @@ int main (int argc, char **argv)
 					c[ifreq].i += cslaves[jsource*nfreq+ifreq].i*sclrms;
 				}
 			}
-			
-			t2 = wallclock_time();
+		
+			//t2_ivs_ivr = wallclock_time();
 
-			cr1fft(&c[0],r,ntfft,1); /* transform virtual trace back to time */
+			cr1_fft(&c[0],r,ntfft,1); /* transform virtual trace back to time */
 			
-			t3 = wallclock_time();
-			tfft_ivs += t3-t2;
-			tfft += t3-t2;
+			//t3_ivs_ivr = wallclock_time();
+			//tfft_ivs += t3_ivs_ivr-t2_ivs_ivr;
 			
 			if (normalize && nsrc>0) scl=sclfft/nsrc; /* normalize by the number of contributing sources */
 			else scl = sclfft;
@@ -548,13 +613,15 @@ int main (int argc, char **argv)
 				for (j=1;j<nt; j++) {
 					vtrace[j] = 0.5*(r[ntfft-j] + r[j])*scl;
 				}
-			} /* one virtual source and virtual receiver are now calculated; write the common virtual shot gather to disk */
+			} /* one virtual trace (virtual source and virtualreceiver) is now calculated and is written to disk below */
 			
-			t2 = wallclock_time();
+			//t4_ivs_ivr = wallclock_time();
+			//tselect += t4_ivs_ivr-t3_ivs_ivr;
 			
 			memcpy(&outputhdr,&segy_hdrs[(rcvSrc[ivrcv].src[0].fpos)],TRCBYTES);
 			outputhdr.fldr = ivsrc+1;
-			outputhdr.tracl = itrace_out+1;
+			itrace_out=ivsrc*nreceivers+ivrcv+1;
+			outputhdr.tracl = itrace_out; // wrong value in principle
 			outputhdr.tracf = ivrcv+1;
 			outputhdr.sx = sxv;
 			outputhdr.sy = syv;
@@ -566,139 +633,206 @@ int main (int argc, char **argv)
 			outputhdr.gelev = gvelev;
 			outputhdr.ns = nt;
 			// outputhdr.offset = (gxv - sxv)/1000;
+			#pragma omp critical
+{
 			nwrite = fwrite( &outputhdr, 1, TRCBYTES, fpout );
 			assert (nwrite == TRCBYTES);
 			nwrite = fwrite( &vtrace[0], sizeof(float), nt, fpout );
 			assert (nwrite == nt);
 			fflush(fpout);
-
-			t3 = wallclock_time();
-			twrite_ivs += t3-t2;
-			twrite += t3-t2;
-			
-			itrace_out++;
+}
+			//t5_ivs_ivr = wallclock_time();
+			//twrite_ivs += t5_ivs_ivr-t4_ivs_ivr;
 		} /* end of virtual receiver loop */
-
+		
+			#pragma omp single
+{
 		if (verbose>=3) {
-			t3 = wallclock_time();
-			tlogic_ivs = ((t3-t1)-tread_ivs-tfft_ivs-tcorr_ivs-twrite_ivs);
-			fprintf(stderr,"************* Timings ************* vshot= %.d \n", vspeg);
-			fprintf(stderr,"CPU-time read data         = %.3f\n", tread_ivs);
-			fprintf(stderr,"CPU-time FFT's             = %.3f\n", tfft_ivs);
-			fprintf(stderr,"CPU-time correlation       = %.3f\n", tcorr_ivs);
-			fprintf(stderr,"CPU-time write data        = %.3f\n", twrite_ivs);
-			fprintf(stderr,"CPU-time logic             = %.3f\n", tlogic_ivs);
-			fprintf(stderr,"Total CPU-time this shot   = %.3f\n", t3-t1);
+			t3_ivs = wallclock_time();
+			//tlogic_ivs = ((t3_ivs-t1_ivs)-tread_ivs-tfft_ivs-tcorr_ivs-twrite_ivs-tmemcpy);
+			fprintf(stderr,"************* Timings ************* vshot= %d (ivs = %d)\n", vspeg, ivs);
+			//fprintf(stderr,"CPU-time read data         = %.3f\n", tread_ivs);
+			//fprintf(stderr,"CPU-time FFT's             = %.3f\n", tfft_ivs);
+			//fprintf(stderr,"CPU-time memcpy            = %.3f\n", tmemcpy);
+			//fprintf(stderr,"CPU-time memset            = %.3f\n", tmemset);
+			//fprintf(stderr,"CPU-time select            = %.3f\n", tselect);
+			//fprintf(stderr,"CPU-time loop              = %.3f\n", tloop);
+			//fprintf(stderr,"CPU-time routine           = %.3f\n", troutine);
+			//fprintf(stderr,"CPU-time correlation       = %.3f\n", tcorr_ivs);
+			//fprintf(stderr,"CPU-time write data        = %.3f\n", twrite_ivs);
+			//fprintf(stderr,"CPU-time logic             = %.3f\n", tlogic_ivs);
+			fprintf(stderr,"Total CPU-time this shot   = %.3f\n", t3_ivs-t1_ivs);
 		}
+}
+		//tread+=tread_ivs;
+		//tfft+=tfft_ivs;
+		//tcorr+=tcorr_ivs;
+		//twrite+=twrite_ivs;
+
 	} /* end of virtual source loop */
+	free(cmaster);
+	free(cslaves);
+	free(vtrace);
+	free(r);
+	free(c);
 
+} /* end of parallel region */
 	fclose(fpin);
 	fclose(fpout);
 
 	if (verbose) {
 		t3 = wallclock_time();
-		tlogic = ((t3-t0)-tread-tfft-tcorr-twrite);
+		//tlogic = ((t3-t1)-tread-tfft-tcorr-twrite);
+		ttotal = t3-t1;
 		fprintf(stderr,"************* Total Timings ************* \n");
-		fprintf(stderr,"CPU-time initilization     = %.3f\n", tinit);
-		fprintf(stderr,"CPU-time read data         = %.3f\n", tread);
-		fprintf(stderr,"CPU-time FFT's             = %.3f\n", tfft);
-		fprintf(stderr,"CPU-time correlation       = %.3f\n", tcorr);
-		fprintf(stderr,"CPU-time write data        = %.3f\n", twrite);
-		fprintf(stderr,"CPU-time logic             = %.3f\n", tlogic);
-		fprintf(stderr,"Total CPU-time             = %.3f\n", t3-t0);
+		//fprintf(stderr,"CPU-time initilization     = %.3f\n", tinit);
+		//fprintf(stderr,"CPU-time read data         = %.3f\n", tread);
+		//fprintf(stderr,"CPU-time FFT's             = %.3f\n", tfft);
+		//fprintf(stderr,"CPU-time correlation       = %.3f\n", tcorr);
+		//fprintf(stderr,"CPU-time write data        = %.3f\n", twrite);
+		//fprintf(stderr,"CPU-time logic             = %.3f\n", tlogic);
+		fprintf(stderr,"Total CPU-time             = %.3f\n", ttotal);
 	}
-
-	free(cmaster);
-	free(cslaves);
-	free(vtrace);
-	free(c);
-	free(r);
-
 /*================ end ================*/
-
 	return 0;
 }
 
+void get_sutrace_at_position(FILE *fp, size_t itrace, complex *tracedata, complex *tracebuffer, int ntfft, int nt, int nfreq, size_t nbufmax, size_t trace_sz, double *tread, double *tfft, double *tmemcpy)
+{
+	static size_t freebuffer=0;
+	size_t i, inbuffer, offset, nread, nwrite;
+	int ret;
+	float *trace;
+	float *r;
+	double t0, t1, t2, t3, t0_t, t1_t;
+	complex *c;
+	FILE *fpt;
+	char filename[1024];
+
+	if (nbufmax == 0) {
+		t0_t = wallclock_time();	
+		//trace = (float *)calloc(nt, sizeof(float));
+		r = (float *)malloc(ntfft*sizeof(float));
+		c = (complex *)malloc((ntfft/2+1)*sizeof(complex));
 
-void read_sutrace_at_position(FILE *fp, int itrace, complex *tracedata, complex *tracebuffer, int *trace_in_buffer, size_t *nbuffer,
-int ntfft, int nt, int nfreq, size_t nbufmax, size_t trace_sz, double *tread, double *tfft)
+		/***** read trace from file *****/
+       	offset = itrace*trace_sz+TRCBYTES;
+       	ret = fseeko(fp, offset, SEEK_SET);
+       	if (ret<0) perror("fseeko");
+		memset(r,0,ntfft*sizeof(float));
+       	nread = fread(r, sizeof(float), nt, fp);
+       	assert(nread == nt);
+		t1_t = wallclock_time();
+		*tread += t1_t-t0_t;
+
+		/***** transform to frequency domain *****/
+
+		//memcpy(r,&trace[0],nt*sizeof(float)); /* might not be needed if segy_read_traces_c is nice */
+		rc1_fft(r,c,ntfft,-1);
+		memcpy(&tracedata[0].r,&c[0].r,nfreq*sizeof(complex));
+
+		t2 = wallclock_time();
+		*tfft += t2-t1_t;
+
+		//free(trace);
+		free(r);
+		free(c);
+	}
+	else {  /**** point to trace from buffer *****/
+		t0 = wallclock_time();
+		//tracedata = (complex *)&tracebuffer[itrace*nfreq]; 
+		memcpy(tracedata,&tracebuffer[itrace*nfreq],nfreq*sizeof(complex));
+		t1 = wallclock_time();
+		*tmemcpy += t1-t0;
+	}
+	return;
+}
+
+void read_sutrace_at_position_old(FILE *fp, int itrace, int ircvnum, int isrcnum, complex *tracedata, complex *tracebuffer, int *trace_in_buffer, size_t *nbuffer, int ntfft, int nt, int nfreq, size_t nbufmax, size_t trace_sz, double *tread, double *tfft, double *tmemcpy)
 {
 	static size_t freebuffer=0;
 	size_t i, inbuffer, offset, nread, nwrite;
 	int ret;
 	float *trace;
-	float	*r;
-	double  t0, t1, t2, t3;
+	float *r;
+	double t0, t1, t2, t3, t0_t, t1_t;
 	complex *c;
 	FILE *fpt;
 	char filename[1024];
 
 	inbuffer = -1;
-//	fprintf(stderr,"reading trace %d\n", itrace);
+	//fprintf(stderr,"reading trace %d\n", itrace);
 
+	t0_t = wallclock_time();	
 	for (i=0; i<nbufmax; i++) {
 		if (itrace == trace_in_buffer[i]) {
 			inbuffer = i;
 			break;
 		}
 	}
+	t1_t = wallclock_time();
+	*tread += t1_t-t0_t;
+	//fprintf(stderr,"itrace=%8d inbuffer=%8d nbufmax=%d ircvnum=%8d isrcnum=%8d time=%f\n", itrace, inbuffer, nbufmax, ircvnum, isrcnum, t1_t-t0_t);
 
-//	fprintf(stderr,"inbuffer = %d bufmax=%d\n", inbuffer,nbufmax);
 	if (inbuffer == -1) {
 		t0 = wallclock_time();
 		trace = (float *)calloc(nt, sizeof(float));
 		r = (float *)malloc(ntfft*sizeof(float));
 		c = (complex *)malloc((ntfft/2+1)*sizeof(complex));
 
-	/* read trace from file */
-        offset = itrace*trace_sz+TRCBYTES;
-        ret = fseeko(fp , offset, SEEK_SET);
-        if (ret<0) perror("fseeko");
-        nread = fread( trace, sizeof(float), nt, fp );
-        assert(nread == nt);
+		/***** read trace from file *****/
+        	offset = itrace*trace_sz+TRCBYTES;
+        	ret = fseeko(fp, offset, SEEK_SET);
+        	if (ret<0) perror("fseeko");
+        	nread = fread(trace, sizeof(float), nt, fp);
+        	assert(nread == nt);
 
-		/* transform to frequency domain */
+		/***** transform to frequency domain *****/
 		memset(r,0,ntfft*sizeof(float));
 		memcpy(r,&trace[0],nt*sizeof(float)); /* might not be needed if segy_read_traces_c is nice */
 		t1 = wallclock_time();
 		*tread += t1-t0;
 
-		rc1fft(r,c,ntfft,-1);
+		rc1_fft(r,c,ntfft,-1);
 
 		t2 = wallclock_time();
 		*tfft += t2-t1;
 
-		/* reset buffer counter when buffer is full */
-		if (freebuffer<nbufmax-1) {
-			freebuffer++;
-		}
-		else freebuffer=0;
+		memcpy(&tracedata[0].r,&c[0].r,nfreq*sizeof(complex));
 
-//	fprintf(stderr,"freebuffer = %d\n", freebuffer);
-//	fprintf(stderr,"nbuffer = %d\n", *nbuffer);
-		memcpy(&tracebuffer[freebuffer*nfreq].r,&c[0].r,nfreq*sizeof(complex));
-		memcpy(&tracedata[0].r,&tracebuffer[freebuffer*nfreq].r,nfreq*sizeof(complex));
-		trace_in_buffer[freebuffer]=itrace;
+		/***** reset buffer counter when buffer is full *****/
+		if (nbufmax != 0) {
+			t1 = wallclock_time();
+			if (freebuffer<nbufmax-1) {
+				freebuffer++;
+			}
+			else freebuffer=0;
+		//fprintf(stderr,"freebuffer = %d\n", freebuffer);
+		//fprintf(stderr,"nbuffer = %d\n", *nbuffer);
+
+			memcpy(&tracebuffer[freebuffer*nfreq].r,&c[0].r,nfreq*sizeof(complex));
+			trace_in_buffer[freebuffer]=itrace;
+			t2 = wallclock_time();
+			*tmemcpy += t2-t1;
+		}
 
 		free(trace);
 		free(r);
 		free(c);
+
 		t1 = wallclock_time();
 		*tread += t1-t2;
-
 	}
-	else {
-		/* read trace from buffer */
+	else {  /**** read trace from buffer *****/
 		t0 = wallclock_time();
-//		tracedata = (complex *)&tracebuffer[inbuffer*nfreq]; /* this is better, but need some more changes TODO */
+		tracedata = (complex *)&tracebuffer[inbuffer*nfreq]; /* this is better, but need some more changes TODO */
+		//fprintf(stderr,"read from inbuffer=%d nbufmax=%d\n", inbuffer,nbufmax);
 
-//	fprintf(stderr,"read from inbuffer = %d bufmax=%d\n", inbuffer,nbufmax);
-
-		memcpy(&tracedata[0].r,&tracebuffer[inbuffer*nfreq].r,nfreq*sizeof(complex));
+		//memcpy(&tracedata[0].r,&tracebuffer[inbuffer*nfreq].r,nfreq*sizeof(complex));
+		
 		t1 = wallclock_time();
 		*tread += t1-t0;
 	}
-
 	return;
 }
 
-- 
GitLab