Originally posted by oleid
View Post
Last weekend; I actually spent some time optimizing himeno, and tested on my laptop which has an Intel core i7 4510u. I got almost 4x performance with easy optimizations and a bit of manual AVX2+FMA intrinsic coding.
It is small enough so that I paste it here: (can't attach files apparently). You can compile it with gcc -std=c99 himenobmtxpa.c -g -O3 -mavx2 -mfma -fopenmp -o himenobmtxpa if you want to test it, with openmp to test multithreaded performance as a bonus.
Code:
/******************************************************************** This benchmark test program is measuring a cpu performance of floating point operation by a Poisson equation solver. If you have any question, please ask me via email. written by Ryutaro HIMENO, November 26, 2001. Version 3.0 ---------------------------------------------- Ryutaro Himeno, Dr. of Eng. Head of Computer Information Division, RIKEN (The Institute of Physical and Chemical Research) Email : [email protected] --------------------------------------------------------------- You can adjust the size of this benchmark code to fit your target computer. In that case, please chose following sets of [mimax][mjmax][mkmax]: small : 33,33,65 small : 65,65,129 midium: 129,129,257 large : 257,257,513 ext.large: 513,513,1025 This program is to measure a computer performance in MFLOPS by using a kernel which appears in a linear solver of pressure Poisson eq. which appears in an incompressible Navier-Stokes solver. A point-Jacobi method is employed in this solver as this method can be easily vectorized and be parallelized. ------------------ Finite-difference method, curvilinear coodinate system Vectorizable and parallelizable on each grid point No. of grid points : imax x jmax x kmax including boundaries ------------------ A,B,C:coefficient matrix, wrk1: source term of Poisson equation wrk2 : working area, OMEGA : relaxation parameter BND:control variable for boundaries and objects ( = 0 or 1) P: pressure ********************************************************************/ /* * modified by Ziple, 19/03/2017 * * Optimize a bit the program to be more representative of real (optimized) * HPC workloads. * * Modifications: * - improve performance by specifying that there is no aliasing (restrict keyword) * - share index computations (because matrices are of the same sizes) * - AVX2+FMA intrinsics to optimize for recent x86_64 platforms * (just use -mavx2 and -mfma flags at compilation to enable it) * - OpenMP parallelization */ #include <stdio.h> #include <sys/time.h> #include <stdlib.h> #include <string.h> #if (defined(__AVX2__) || defined(__FMA__)) #include <immintrin.h> #endif #define MS(mt, s1, s2, s3, n, r, c, d) mt->m[(n) * (s1) + (r) * (s2) + (c) * (s3) + (d)] #define MSP(mt, s1, s2, s3, n, r, c, d) mt[(n) * (s1) + (r) * (s2) + (c) * (s3) + (d)] #define MR(mt,n,r,c,d) mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)] struct Mat { float *restrict m; int mnums; int mrows; int mcols; int mdeps; }; /* prototypes */ typedef struct Mat Matrix; int newMat(Matrix* Mat, int mnums, int mrows, int mcols, int mdeps); void clearMat(Matrix* Mat); void set_param(int i[],char *size); void mat_set(Matrix* Mat,int l,float z); void mat_set_init(Matrix* Mat); float jacobi(int n,Matrix* M1,Matrix* M2,Matrix* M3, Matrix* M4,Matrix* M5,Matrix* M6,Matrix* M7); double fflop(int,int,int); double mflops(int,double,double); double second(); float omega=0.8; Matrix a,b,c,p,bnd,wrk1,wrk2; int main(int argc, char *argv[]) { int i,j,k,nn; int imax,jmax,kmax,mimax,mjmax,mkmax,msize[3]; float gosa,target; double cpu0,cpu1,cpu,flop; char size[10]; if(argc == 2){ strcpy(size,argv[1]); } else { printf("For example: \n"); printf(" Grid-size= XS (32x32x64)\n"); printf("\t S (64x64x128)\n"); printf("\t M (128x128x256)\n"); printf("\t L (256x256x512)\n"); printf("\t XL (512x512x1024)\n\n"); printf("Grid-size = "); scanf("%s",size); printf("\n"); } set_param(msize,size); mimax= msize[0]; mjmax= msize[1]; mkmax= msize[2]; imax= mimax-1; jmax= mjmax-1; kmax= mkmax-1; target = 60.0; printf("mimax = %d mjmax = %d mkmax = %d\n",mimax,mjmax,mkmax); printf("imax = %d jmax = %d kmax =%d\n",imax,jmax,kmax); /* * Initializing matrixes */ newMat(&p,1,mimax,mjmax,mkmax); newMat(&bnd,1,mimax,mjmax,mkmax); newMat(&wrk1,1,mimax,mjmax,mkmax); newMat(&wrk2,1,mimax,mjmax,mkmax); newMat(&a,4,mimax,mjmax,mkmax); newMat(&b,3,mimax,mjmax,mkmax); newMat(&c,3,mimax,mjmax,mkmax); mat_set_init(&p); mat_set(&bnd,0,1.0); mat_set(&wrk1,0,0.0); mat_set(&wrk2,0,0.0); mat_set(&a,0,1.0); mat_set(&a,1,1.0); mat_set(&a,2,1.0); mat_set(&a,3,1.0/6.0); mat_set(&b,0,0.0); mat_set(&b,1,0.0); mat_set(&b,2,0.0); mat_set(&c,0,1.0); mat_set(&c,1,1.0); mat_set(&c,2,1.0); /* * Start measuring */ nn= 3; printf(" Start rehearsal measurement process.\n"); printf(" Measure the performance in %d times.\n\n",nn); cpu0= second(); gosa= jacobi(nn,&a,&b,&c,&p,&bnd,&wrk1,&wrk2); cpu1= second(); cpu= cpu1 - cpu0; flop= fflop(imax,jmax,kmax); printf(" MFLOPS: %f time(s): %f %e\n\n", mflops(nn,cpu,flop),cpu,gosa); nn= (int)(target/(cpu/3.0)); printf(" Now, start the actual measurement process.\n"); printf(" The loop will be excuted in %d times\n",nn); printf(" This will take about one minute.\n"); printf(" Wait for a while\n\n"); cpu0 = second(); gosa = jacobi(nn,&a,&b,&c,&p,&bnd,&wrk1,&wrk2); cpu1 = second(); cpu = cpu1 - cpu0; printf(" Loop executed for %d times\n",nn); printf(" Gosa : %e \n",gosa); printf(" MFLOPS measured : %f\tcpu : %f\n",mflops(nn,cpu,flop),cpu); printf(" Score based on Pentium III 600MHz using Fortran 77: %f\n", mflops(nn,cpu,flop)/82,84); /* * Matrix free */ clearMat(&p); clearMat(&bnd); clearMat(&wrk1); clearMat(&wrk2); clearMat(&a); clearMat(&b); clearMat(&c); return (0); } double fflop(int mx,int my, int mz) { return((double)(mz-2)*(double)(my-2)*(double)(mx-2)*34.0); } double mflops(int nn,double cpu,double flop) { return(flop/cpu*1.e-6*(double)nn); } void set_param(int is[],char *size) { if(!strcmp(size,"XS") || !strcmp(size,"xs")){ is[0]= 32; is[1]= 32; is[2]= 64; return; } if(!strcmp(size,"S") || !strcmp(size,"s")){ is[0]= 64; is[1]= 64; is[2]= 128; return; } if(!strcmp(size,"M") || !strcmp(size,"m")){ is[0]= 128; is[1]= 128; is[2]= 256; return; } if(!strcmp(size,"L") || !strcmp(size,"l")){ is[0]= 256; is[1]= 256; is[2]= 512; return; } if(!strcmp(size,"XL") || !strcmp(size,"xl")){ is[0]= 512; is[1]= 512; is[2]= 1024; return; } else { printf("Invalid input character !!\n"); exit(6); } } int newMat(Matrix* Mat, int mnums,int mrows, int mcols, int mdeps) { Mat->mnums= mnums; Mat->mrows= mrows; Mat->mcols= mcols; Mat->mdeps= mdeps; Mat->m= NULL; Mat->m= (float*) malloc(mnums * mrows * mcols * mdeps * sizeof(float)); return(Mat->m != NULL) ? 1:0; } void clearMat(Matrix* Mat) { if(Mat->m) free(Mat->m); Mat->m= NULL; Mat->mnums= 0; Mat->mcols= 0; Mat->mrows= 0; Mat->mdeps= 0; } void mat_set(Matrix* Mat, int l, float val) { int i,j,k; for(i=0; i<Mat->mrows; i++) for(j=0; j<Mat->mcols; j++) for(k=0; k<Mat->mdeps; k++) MR(Mat,l,i,j,k)= val; } void mat_set_init(Matrix* Mat) { int i,j,k,l; float tt; for(i=0; i<Mat->mrows; i++) for(j=0; j<Mat->mcols; j++) for(k=0; k<Mat->mdeps; k++) MR(Mat,0,i,j,k)= (float)(i*i) /(float)((Mat->mrows - 1)*(Mat->mrows - 1)); } inline int min(int a, int b) { return a < b ? a : b; } #define TILE_I 8 #define TILE_J 8 #define TILE_K 8 float jacobi_core( int imax, int jmax, int kmax, int i1, int i2, int i3, int nn, float *restrict a, float *restrict b, float *restrict c, float *restrict p, float *restrict bnd, float *restrict wrk1, float *restrict wrk2) { float gosa; __m256 gosas; __m256 omegas = _mm256_set1_ps(omega); for(int n = 0; n < nn; n++) { gosa = 0.0f; gosas = _mm256_set1_ps(0.0f); #pragma omp parallel for for(int i = 1; i< imax ; i++) { for(int j =1; j < jmax; j++) { int kk = 1; #if (defined(__AVX2__) && defined(__FMA__)) while( (kk + TILE_K) <= kmax ) { int k0 = kk; __m256 a0p = _mm256_mul_ps(_mm256_loadu_ps(&MSP(a, i1, i2, i3, 0, i, j, k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0, i+1, j, k0))); __m256 lt0p = _mm256_sub_ps(_mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i+1,j+1,k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i+1,j-1,k0))); __m256 rt0p = _mm256_sub_ps(_mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i-1,j-1,k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i-1,j+1,k0))); __m256 t0p = _mm256_add_ps( lt0p, rt0p); __m256 a0pb0p = _mm256_fmadd_ps(_mm256_loadu_ps(&MSP(b, i1, i2, i3, 0, i, j, k0)), t0p, a0p); __m256 a1p = _mm256_mul_ps(_mm256_loadu_ps(&MSP(a, i1, i2, i3, 1, i, j, k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0, i, j+1, k0))); __m256 lt1p = _mm256_sub_ps(_mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i,j+1,k0+1)), _mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i,j-1,k0+1))); __m256 rt1p = _mm256_sub_ps(_mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i,j-1,k0-1)), _mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i,j+1,k0-1))); __m256 t1p = _mm256_add_ps( lt1p, rt1p); __m256 a1pb1p = _mm256_fmadd_ps(_mm256_loadu_ps(&MSP(b, i1, i2, i3, 1, i, j, k0)), t1p, a1p); __m256 a2p = _mm256_mul_ps(_mm256_loadu_ps(&MSP(a, i1, i2, i3, 2, i, j, k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0, i, j, k0+1))); __m256 lt2p = _mm256_sub_ps(_mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i+1,j,k0+1)), _mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i-1,j,k0+1))); __m256 rt2p = _mm256_sub_ps(_mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i-1,j,k0-1)), _mm256_loadu_ps(&MSP(p, i1, i2, i3,0,i+1,j,k0-1))); __m256 t2p = _mm256_add_ps( lt2p, rt2p); __m256 a2pb2p = _mm256_fmadd_ps(_mm256_loadu_ps(&MSP(b, i1, i2, i3, 2, i, j, k0)), t2p, a2p); __m256 c0p = _mm256_mul_ps(_mm256_loadu_ps(&MSP(c, i1, i2, i3, 0, i, j, k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0, i-1, j, k0))); __m256 c1p = _mm256_mul_ps(_mm256_loadu_ps(&MSP(c, i1, i2, i3, 1, i, j, k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0, i, j-1, k0))); __m256 c2p = _mm256_mul_ps(_mm256_loadu_ps(&MSP(c, i1, i2, i3, 2, i, j, k0)), _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0, i, j, k0-1))); __m256 a0pb0pc0p = _mm256_add_ps(a0pb0p, c0p); __m256 a1pb1pc1p = _mm256_add_ps(a1pb1p, c1p); __m256 a2pb2pc2p = _mm256_add_ps(a2pb2p, c2p); __m256 a0pb0pc0pa1pb1pc1p = _mm256_add_ps(a0pb0pc0p, a1pb1pc1p); __m256 a2pb2pc2pwrk1 = _mm256_add_ps(a2pb2pc2p, _mm256_loadu_ps(&MSP(wrk1, i1, i2, i3, 0, i, j, k0))); __m256 s0s = _mm256_add_ps(a0pb0pc0pa1pb1pc1p, a2pb2pc2pwrk1); __m256 p0s = _mm256_loadu_ps(&MSP(p, i1, i2, i3, 0,i,j,k0)); __m256 sss = _mm256_mul_ps(_mm256_fmsub_ps(s0s, _mm256_loadu_ps(&MSP(a, i1, i2, i3, 3,i,j,k0)), p0s), _mm256_loadu_ps(&MSP(bnd, i1, i2, i3, 0,i,j,k0))); gosas = _mm256_fmadd_ps(sss, sss, gosas); _mm256_storeu_ps(&MSP(wrk2,i1, i2, i3, 0,i,j,k0), _mm256_fmadd_ps(omegas, sss, p0s)); kk += TILE_K; } #endif // AVX2 && FMA for(int k = kk; k < kmax; k++) { float s0= MSP(a, i1, i2, i3,0,i,j,k)*MSP(p, i1, i2, i3,0,i+1,j, k) + MSP(a, i1, i2, i3,1,i,j,k)*MSP(p, i1, i2, i3,0,i, j+1,k) + MSP(a, i1, i2, i3,2,i,j,k)*MSP(p, i1, i2, i3,0,i, j, k+1) + MSP(b, i1, i2, i3,0,i,j,k) *( MSP(p, i1, i2, i3,0,i+1,j+1,k) - MSP(p, i1, i2, i3,0,i+1,j-1,k) - MSP(p, i1, i2, i3,0,i-1,j+1,k) + MSP(p, i1, i2, i3,0,i-1,j-1,k) ) + MSP(b, i1, i2, i3,1,i,j,k) *( MSP(p, i1, i2, i3,0,i,j+1,k+1) - MSP(p, i1, i2, i3,0,i,j-1,k+1) - MSP(p, i1, i2, i3,0,i,j+1,k-1) + MSP(p, i1, i2, i3,0,i,j-1,k-1) ) + MSP(b, i1, i2, i3,2,i,j,k) *( MSP(p, i1, i2, i3,0,i+1,j,k+1) - MSP(p, i1, i2, i3,0,i-1,j,k+1) - MSP(p, i1, i2, i3,0,i+1,j,k-1) + MSP(p, i1, i2, i3,0,i-1,j,k-1) ) + MSP(c, i1, i2, i3,0,i,j,k) * MSP(p, i1, i2, i3,0,i-1,j, k) + MSP(c, i1, i2, i3,1,i,j,k) * MSP(p, i1, i2, i3,0,i, j-1,k) + MSP(c, i1, i2, i3,2,i,j,k) * MSP(p, i1, i2, i3,0,i, j, k-1) + MSP(wrk1, i1, i2, i3, 0,i,j,k); float ss= (s0*MSP(a, i1, i2, i3, 3,i,j,k) - MSP(p, i1, i2, i3, 0,i,j,k))*MSP(bnd, i1, i2, i3, 0,i,j,k); gosa+= ss*ss; MSP(wrk2,i1, i2, i3, 0,i,j,k) = MSP(p,i1, i2, i3, 0,i,j,k) + omega*ss; } } } for(int i = 1; i < imax; i++) for(int j = 1; j < jmax; j++) for(int k = 1; k < kmax; k++) MSP(p, i1, i2, i3, 0,i,j,k)= MSP(wrk2, i1, i2, i3, 0,i,j,k); } /* end n loop */ for(int i = 0; i < 8; i++) gosa += gosas[i]; return(gosa); } float jacobi(int nn, Matrix* a,Matrix* b,Matrix* c, Matrix* p,Matrix* bnd,Matrix* wrk1,Matrix* wrk2) { int imax= p->mrows-1; int jmax= p->mcols-1; int kmax= p->mdeps-1; int i1 = p->mrows * p->mcols * p->mdeps; int i2 = p->mcols * p->mdeps; int i3 = p->mdeps; return jacobi_core(imax, jmax, kmax, i1, i2, i3, nn, a->m, b->m, c->m, p->m, bnd->m, wrk1->m, wrk2->m); } double second() { struct timeval tm; double t ; static int base_sec = 0,base_usec = 0; gettimeofday(&tm, NULL); if(base_sec == 0 && base_usec == 0) { base_sec = tm.tv_sec; base_usec = tm.tv_usec; t = 0.0; } else { t = (double) (tm.tv_sec-base_sec) + ((double) (tm.tv_usec-base_usec))/1.0e6 ; } return t ; }
Edit: it won't compile without -mavx2 and -mfma flags because of stupid mistakes, can't really fix them on my phone.
Leave a comment: