
Re: sgemm questions



Hi Camm 

I am the Peter Soendergaard that Clint talked about.

I have implemented a code generator in Python that can generate gemm and
some gemv code for 3DNow! and SSE. It generates gcc inline assembly, and
all the macros I use are defined in the file I have attached.

I have included a file with the macros I currently use, but I only use
very basic instructions: pfadd, pfmul, pfacc and some of the MMX
instructions to move 32 bits in and out of the vectors.

Which added instructions were you thinking of? prefetch{nta,t0,t1,t2},
flip-the-vector?

I have not done any real tests for the SSE; I have more or less just
confirmed that I got working code, so I can't remember the exact
performance I got, but it was reasonable.

Cheers,

Peter


On Tue, 14 Nov 2000, R Clint Whaley wrote:

> Camm,
> 
> >Hello again!  Just looked at this stuff again today, and did a rather
> >simple change which makes the kernel work for the Athlon, with a
> >slightly higher percentage of peak than the Intel, it appears.  
> 
> Peter Soendergaard has been working on 3DNow! SGEMM; coincidentally,
> he made a rather simple change to his 3DNow! code to run SSE the other
> day, and got about the same performance as your SSE :)
> 
> Hopefully, Peter will reply more fully regarding what instructions he
> used, and why.  If he used Athlon-specific ones, having a K6x version
> would be nice as well, so long as it is trivial.  Last I knew, Peter was
> getting about 2.4 Gflops on our 1Ghz Athlon.  Anyway, I obviously don't have
> all the details you need . . .
> 
> Cheers,
> Clint
> 



/*  The meaning of the defined macros is as follows:
 *  VECLEN:         The length of a single-precision vector register
 *  NREGS:          Number of vector registers available
 *  prefetch:       Standard prefetch
 *  prefetchw:      Prefetch used for data to be overwritten soon.
 *  vec_add:        Add two single-precision vectors.
 *  vec_mul:        Multiply two single-precision vectors.
 *  vec_mov:        Moves data around
 *  vec_load_one:   Load one element in a vector and zero all other entries!
 *  vec_load_scal:  Load one element replicated in all positions in the vector.
 *  vec_load_apart: Load elements from different memory positions into a register.
 *  vec_sum:        Sums a register.
 *  vec_store_one:  Stores lowest element in vector to memory, no zero-extend!
 * Meaning of suffixes is as follows:
 * mr means memory to register
 * rr means register to register
 * rm means register to memory
 * a means that the instruction needs aligned data
 */
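
/* Example of the naming convention (comment added for illustration, not part
 * of the original attachment): vec_add_mr_a(mem,reg) adds a vector loaded from
 * the aligned address mem into reg, while vec_add_rr(reg1,reg2) adds register
 * reg1 into register reg2.
 */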




#define gen_vec_rr(op,reg1,reg2) \
        __asm__ __volatile__ (#op " %%" #reg1 ", %%" #reg2 \
                              :  /* nothing */ \
                              : /* nothing */)


/* The "m" constraint needs an lvalue, so pass the first element at mem. */
#define gen_prefetch(op,mem) \
        __asm__ __volatile__ (#op " %0" \
                              : /* nothing */ \
                              : "m" (((mem)[0])))

#define w(p) p

#define nop()             __asm__ __volatile__ ("nop")


#ifdef SSE

/* Peculiarities of SSE: Alignment is good, but not mandatory. It is possible to
 * load/store from misaligned addresses using movups at a cost of some cycles.
 * Memory operands of mul/add must always be aligned. Alignment is 16 bytes.
 * No muladd.
 */
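
/* Concretely (comment added for illustration): vec_mov_mr_a(p,reg0) and
 * vec_add_mr_a(p,reg0) assume ((unsigned long)(p)) % 16 == 0, while
 * vec_mov_mr(p,reg0), which uses movups, accepts any p; there is no unaligned
 * variant of the add/mul-from-memory macros.
 */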



/* The extra "m" operands in the SSE generators below describe the memory the
 * instruction reads or writes (one constraint per vector element); only %0
 * actually appears in the template. */
#define gen_vec_addmul(op,mem,reg) \
        __asm__ __volatile__ (#op " %0, %%" #reg \
                              :  /* nothing */ \
                              : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))


#define gen_vec_load(op,mem,reg) \
        __asm__ __volatile__ (#op " %0, %%" #reg \
                              :  /* nothing */ \
                              : "m" (((mem)[0])), "m" (((mem)[1])), "m" (((mem)[2])), "m" (((mem)[3])))


#define gen_vec_store(op,reg,mem) \
        __asm__ __volatile__ (#op " %%" #reg ", %0" \
                              : "=m" (((mem)[0])), "=m" (((mem)[1])), "=m" (((mem)[2])), "=m" (((mem)[3])) \
                              :  /* nothing */ )                          




#define VECLEN 4
#define NREGS 8

#define reg0 xmm0
#define reg1 xmm1
#define reg2 xmm2
#define reg3 xmm3
#define reg4 xmm4
#define reg5 xmm5
#define reg6 xmm6
#define reg7 xmm7


#define prefetch(mem)         gen_prefetch(prefetchnta,mem)
#define prefetchw(mem)        gen_prefetch(prefetchw,mem)
#define vec_add_mr_a(mem,reg) gen_vec_addmul(addps,mem,reg)
#define vec_mul_mr_a(mem,reg) gen_vec_addmul(mulps,mem,reg)
#define vec_add_rr(reg1,reg2) gen_vec_rr(addps,reg1,reg2)
#define vec_mul_rr(reg1,reg2) gen_vec_rr(mulps,reg1,reg2)
#define vec_mov_mr(mem,reg)   gen_vec_load(movups,mem,reg)
#define vec_mov_rm(reg,mem)   gen_vec_store(movups,reg,mem)
#define vec_mov_mr_a(mem,reg) gen_vec_load(movaps,mem,reg)
#define vec_mov_rm_a(reg,mem) gen_vec_store(movaps,reg,mem)
#define vec_mov_rr(reg1,reg2) gen_vec_rr(movaps,reg1,reg2)
#define vec_load_one(mem,reg) gen_vec_load(movss,mem,reg)
#define vec_store_one(reg,mem) gen_vec_store(movss,reg,mem)

#define vec_enter()           /*  vec_enter */
#define vec_exit()            /*  vec_exit */

/* To use this macro, be sure that register 7 (xmm7) is not otherwise in use! */
#define vec_sum(reg) vec_sum_wrap(reg)
#define vec_sum_wrap(reg) \
        __asm__ __volatile__ ("movhlps %%" #reg ", %%xmm7\n"\
			      "addps %%" #reg ", %%xmm7\n"\
			      "movaps %%xmm7, %%" #reg "\n"\
                              "shufps $1, %%" #reg ", %%xmm7\n"\
  			      "addss %%xmm7, %%" #reg "\n"\
			      : /* nothing */  \
                              : /* nothing */)
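
/* Step by step (comment added for illustration): with reg = {a0,a1,a2,a3},
 * movhlps/addps leave {a0+a2, a1+a3} in the low two elements of both xmm7
 * and reg, shufps $1 moves a1+a3 into the low element of xmm7, and the
 * final addss leaves the full sum a0+a1+a2+a3 in the low element of reg
 * (the upper three elements end up as garbage).
 */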

#endif



#ifdef THREEDNOW

/* Peculiarities of 3DNow!: Alignment is not an issue, all alignments are legal,
 * however I am not sure if alignment gives a speed increase.
 * The vec_acc instruction can be used to sum two registers at once more
 * efficiently than a series of vec_sum and vec_store_one.
 * No muladd.
 */
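
/* Illustration (comment added, not in the original attachment): with partial
 * sums in reg0 and reg1, and a hypothetical pointer cptr to two output floats,
 *
 *     vec_acc_rr(reg1,reg0);      pfacc: reg0 = {reg0[0]+reg0[1], reg1[0]+reg1[1]}
 *     vec_mov_rm(reg0,cptr);      one movq stores both sums
 *
 * replaces two vec_sum()/vec_store_one() pairs.
 */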


#define gen_vec_addmul(op,mem,reg) \
        __asm__ __volatile__ (#op " %0, %%" #reg \
                              :  /* nothing */ \
                              : "m" (((mem)[0])), "m" (((mem)[1])))


#define gen_vec_load(op,mem,reg) \
        __asm__ __volatile__ (#op " %0, %%" #reg \
                              :  /* nothing */ \
                              : "m" (((mem)[0])), "m" (((mem)[1])))


#define gen_vec_store(op,reg,mem) \
        __asm__ __volatile__ (#op " %%" #reg ", %0" \
                              : "=m" (((mem)[0])), "=m" (((mem)[1])) \
			      :  /* nothing */ )                            




#define VECLEN 2
#define NREGS 8

#define reg0 mm0
#define reg1 mm1
#define reg2 mm2
#define reg3 mm3
#define reg4 mm4
#define reg5 mm5
#define reg6 mm6
#define reg7 mm7

#define prefetch(mem)           gen_prefetch(prefetch,mem)
#define prefetchw(mem)          gen_prefetch(prefetchw,mem)
#define vec_add_mr(mem,reg)     gen_vec_addmul(pfadd,mem,reg)
#define vec_mul_mr(mem,reg)     gen_vec_addmul(pfmul,mem,reg)
#define vec_mov_mr(mem,reg)     gen_vec_load(movq,mem,reg)
#define vec_mov_rm(reg,mem)     gen_vec_store(movq,reg,mem)
#define vec_add_rr(reg1,reg2)   gen_vec_rr(pfadd,reg1,reg2)
#define vec_mul_rr(reg1,reg2)   gen_vec_rr(pfmul,reg1,reg2)
#define vec_acc_rr(reg1,reg2)   gen_vec_rr(pfacc,reg1,reg2)
#define vec_mov_rr(reg1,reg2)   gen_vec_rr(movq,reg1,reg2)
#define vec_load_one(mem,reg)   gen_vec_load(movd,mem,reg)
#define vec_sum(reg)            gen_vec_rr(pfacc,reg,reg)
#define vec_store_one(reg,mem)  gen_vec_store(movd,reg,mem)

#define vec_load_scal(mem,reg)  vec_load_scal_wrap(mem,reg)
#define vec_load_scal_wrap(mem,reg) \
        __asm__ __volatile__ ("movd %0, %%" #reg "\n"\
			      "punpckldq %%" #reg ", %%" #reg \
			      : /* nothing */ \
                              : "m" ((mem)[0]))


#define vec_load_apart(mem1,mem2,reg) vec_load_apart_wrap(mem1,mem2,reg)
#define vec_load_apart_wrap(mem1,mem2,reg) \
        __asm__ __volatile__ ("movd %0, %%" #reg "\n"\
			      "punpckldq %1, %%" #reg \
			      : /* nothing */ \
                              : "m" ((mem1)[0]), "m" (((mem2)[0])))


#define vec_zero(reg)           gen_vec_rr(pxor,reg,reg)     

#define vec_enter()             __asm__ __volatile__ ("femms")
#define vec_exit()              __asm__ __volatile__ ("femms")

#define align()                 __asm__ __volatile__ (".align 16")


#endif





#ifdef ALTIVEC

/* PowerPC assembly puts the destination register first, so regout leads. */
#define gen_alti3(op,reg1,reg2,regout) \
        __asm__ __volatile__ (#op " %%" #regout ", %%" #reg1 ", %%" #reg2 \
                              :  /* nothing */ \
                              : /* nothing */)

/* regout = reg1*reg2 + regout (vmaddfp takes the addend as the last operand) */
#define gen_alti_muladd(op,reg1,reg2,regout) \
        __asm__ __volatile__ (#op " %%" #regout ", %%" #reg1 ", %%" #reg2 ", %%" #regout \
                              :  /* nothing */ \
                              : /* nothing */)


#define vec_add_rr(reg1,reg2,regout)    gen_alti3(vaddfp,reg1,reg2,regout)
/* NOTE: AltiVec has no plain single-precision vector multiply; a working
 * vec_mul_rr would have to be vmaddfp with a zero-valued addend register,
 * so vmulfp below is only a placeholder. */
#define vec_mul_rr(reg1,reg2,regout)    gen_alti3(vmulfp,reg1,reg2,regout)
#define vec_muladd_rr(reg1,reg2,regout) gen_alti_muladd(vmaddfp,reg1,reg2,regout)





#endif


typedef float vector[VECLEN];
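
/* Usage sketch (added for illustration, not part of the original attachment):
 * a single-precision dot product written against the 3DNow! macro set above.
 * The function name example_sdot and the assumption that n is a multiple of
 * VECLEN are mine; the real code generator emits fully unrolled gemm/gemv
 * kernels rather than a loop like this.
 */
#ifdef THREEDNOW
static float example_sdot(const float *x, const float *y, int n)
{
    float result[VECLEN];
    int i;

    vec_enter();                   /* femms: enter MMX/3DNow! state          */
    vec_zero(reg0);                /* reg0 accumulates the partial sums      */
    for (i = 0; i < n; i += VECLEN) {
        vec_mov_mr(x + i, reg1);   /* reg1 = x[i..i+VECLEN-1]                */
        vec_mul_mr(y + i, reg1);   /* reg1 *= y[i..i+VECLEN-1]               */
        vec_add_rr(reg1, reg0);    /* reg0 += reg1                           */
    }
    vec_sum(reg0);                 /* pfacc: both halves now hold the sum    */
    vec_store_one(reg0, result);   /* write the low element to memory        */
    vec_exit();                    /* femms: leave MMX/3DNow! state          */
    return result[0];
}
#endif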