Probably the simplest, notranspose GEMV kernel implementation is:
#ifdef BETA0 void ATL_dgemvN_a1_x1_b0_y1 #elif defined (BETA1) void ATL_dgemvN_a1_x1_b1_y1 #else void ATL_dgemvN_a1_x1_bX_y1 #endif (const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY) { int i, j; register double y0; for (i=0; i != M; i++) { #ifdef BETA0 y0 = 0.0; #elif defined(BETA1) y0 = Y[i]; #elif defined(BETAX) y0 = Y[i] * beta; #endif for (j=0; j != N; j++) y0 += A[i+j*lda] * X[j]; Y[i] = y0; } }
Saving this file to ATLAS/tune/blas/gemv/CASES/ATL_dgemvN_1x1_1.c, from the ATLAS/tune/blas/gemv/<arch> directory, we can test and time the implementation by:
make dmvtstcaseN mvrout=../CASES/ATL_dgemvN_1x1_1.c yu=1 xu=1 BEGINNING GEMV PRIMITIVE TESTING: TEST TA=N, M=997, N=177, lda=1004, beta=0.000000 STARTED TEST TA=N, M=997, N=177, lda=1004, beta=0.000000 PASSED TEST TA=N, M=997, N=177, lda=1004, beta=1.000000 STARTED TEST TA=N, M=997, N=177, lda=1004, beta=1.000000 PASSED TEST TA=N, M=997, N=177, lda=1004, beta=0.800000 STARTED TEST TA=N, M=997, N=177, lda=1004, beta=0.800000 PASSED GEMV PRIMITIVE PASSED ALL TESTS speedy. make dmvcaseN mvrout=../CASES/ATL_dgemvN_1x1_1.c yu=1 xu=1 gemvN_0 : 49.484536 MFLOPS gemvN_0 : 49.484536 MFLOPS gemvN_0 : 49.230769 MFLOPS gemvN_0 : 49.40 MFLOPS
In the above examples, we pass yu, the unrolling along the vector, and xu, the unrolling along the vector, so that when ATLAS blocks the operation, it knows the correct unrolling to use to avoid cleanup code.
A similarly sophisticated transpose primitive is:
#ifdef BETA0 void ATL_dgemvT_a1_x1_b0_y1 #elif defined (BETA1) void ATL_dgemvT_a1_x1_b1_y1 #else void ATL_dgemvT_a1_x1_bX_y1 #endif (const int M, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY) { int i, j; register double y0; for (j=0; j != M; j++) { #ifdef BETA0 y0 = 0.0; #elif defined(BETA1) y0 = Y[j]; #else y0 = Y[j] * beta; #endif for (i=0; i != N; i++) y0 += A[i+j*lda] * X[i]; Y[j] = y0; } }
Which we could test and time by:
speedy. make dmvtstcaseT mvrout=../CASES/ATL_dgemvT_1x1_1.c xu=1 yu=1 BEGINNING GEMV PRIMITIVE TESTING: TEST TA=T, M=997, N=177, lda=1004, beta=0.000000 STARTED TEST TA=T, M=997, N=177, lda=1004, beta=0.000000 PASSED TEST TA=T, M=997, N=177, lda=1004, beta=1.000000 STARTED TEST TA=T, M=997, N=177, lda=1004, beta=1.000000 PASSED TEST TA=T, M=997, N=177, lda=1004, beta=0.800000 STARTED TEST TA=T, M=997, N=177, lda=1004, beta=0.800000 PASSED GEMV PRIMITIVE PASSED ALL TESTS speedy. make dmvcaseT mvrout=../CASES/ATL_dgemvT_1x1_1.c xu=1 yu=1 gemvT_0 : 37.647059 MFLOPS gemvT_0 : 37.647059 MFLOPS gemvT_0 : 37.647059 MFLOPS gemvT_0 : 37.65 MFLOPS
An unsophisticated notranspose GEMV implementation for double precision complex would be:
#ifdef BETA0 #ifdef Conj_ void ATL_dgemvNc_a1_x1_b0_y1 #else void ATL_dgemvN_a1_x1_b0_y1 #endif #elif defined (BETA1) #ifdef Conj_ void ATL_dgemvNc_a1_x1_b1_y1 #else void ATL_dgemvN_a1_x1_b1_y1 #endif #elif defined (BETAXI0) #ifdef Conj_ void ATL_dgemvNc_a1_x1_bXi0_y1 #else void ATL_dgemvN_a1_x1_bXi0_y1 #endif #else #ifdef Conj_ void ATL_dgemvNc_a1_x1_bX_y1 #else void ATL_dgemvN_a1_x1_bX_y1 #endif #endif (const int M, const int N, const double *alpha, const double *A, const int lda, const double *X, const int incX, const double *beta, double *Y, const int incY) { int i, j; const int M2 = M<<1, N2 = N<<1; #if defined(BETAX) const double rbeta = *beta, ibeta = beta[1]; #elif defined(BETAXI0) const double rbeta = *beta; #endif register double ra, ia, rx, ix, ry, iy; for (i=0; i != M2; i += 2) { #ifdef BETA0 ry = iy = 0.0; #elif defined(BETAX) rx = rbeta; ix = ibeta; ra = Y[i]; ia = Y[i+1]; ry = ra * rx - ia * ix; iy = ra * ix + ia * rx; #else ry = Y[i]; iy = Y[i+1]; #ifdef BETAXI0 rx = rbeta; ry *= rx; iy *= rx; #endif #endif for(j=0; j != N2; j += 2) { ra = A[i+j*lda]; ia = A[i+j*lda+1]; rx = X[j]; ix = X[j+1]; ry += ra * rx; iy += ra * ix; #ifdef Conj_ ry += ia * ix; iy -= ia * rx; #else ry -= ia * ix; iy += ia * rx; #endif } Y[i] = ry; Y[i+1] = iy; } }
Which, when saved to ATLAS/tune/blas/gemv/CASES/ATL_zgemvN_1x1_1.c, we could test and time by:
make zmvtstcaseN mvrout=../CASES/ATL_zgemvN_1x1_1.c xu=1 yu=1 BEGINNING GEMV PRIMITIVE TESTING: TEST TA=N, M=997, N=177, lda=1004, beta=(0.000000,0.000000) STARTED TEST TA=N, M=997, N=177, lda=1004, beta=(0.000000,0.000000) PASSED TEST TA=-, M=997, N=177, lda=1004, beta=(0.000000,0.000000) STARTED TEST TA=-, M=997, N=177, lda=1004, beta=(0.000000,0.000000) PASSED TEST TA=N, M=997, N=177, lda=1004, beta=(1.000000,0.000000) STARTED TEST TA=N, M=997, N=177, lda=1004, beta=(1.000000,0.000000) PASSED TEST TA=-, M=997, N=177, lda=1004, beta=(1.000000,0.000000) STARTED TEST TA=-, M=997, N=177, lda=1004, beta=(1.000000,0.000000) PASSED TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.000000) STARTED TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.000000) PASSED TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.000000) STARTED TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.000000) PASSED TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.300000) STARTED TEST TA=N, M=997, N=177, lda=1004, beta=(0.800000,0.300000) PASSED TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.300000) STARTED TEST TA=-, M=997, N=177, lda=1004, beta=(0.800000,0.300000) PASSED GEMV PRIMITIVE PASSED ALL TESTS speedy. make zmvcaseN mvrout=../CASES/ATL_zgemvN_1x1_1.c xu=1 yu=1 gemvN_0 : 78.688525 MFLOPS gemvN_0 : 78.688525 MFLOPS gemvN_0 : 78.688525 MFLOPS gemvN_0 : 78.69 MFLOPS