[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Altivec matmul kernel (attachment)



Greetings, and thanks for your reply!

Nicholas Coult <coult@augsburg.edu> writes:

> Hi,
> 
> 1 - no complex support (haven't had the time).
> 2 - altivec doesn't support double precision.

Too bad!

> 2.5 - not sure what you mean here.

Level2 or Level1 kernels?

> 3 - yes, I've successfully built the full lib.  With the Altivec "Java 
> mode" turned off (see another post of mine to atlas-comm for info on 
> this) on my 533 MHz G4 machine, I get about 1870 Mflops peak sgemm.
> 4 - I believe that gcc 3.0 has Altivec support also, although I could be 
> wrong.
> 

Thanks for the tip!  I hope you are right.  I'll see if I can verify.

BTW, here is one of your kernels in assembly:

--[[application/octet-stream
Content-Disposition: attachment; filename="altivec.s"][quoted-printable]]
	# ATL_sJIK80x80x80TN80x80x0_a1_b1 -- AltiVec single-precision multiply/accumulate
	# kernel, compiler output for "smm.c" (PowerPC GNU as syntax, bare register numbers;
	# the same number names a GPR, FPR or vector register depending on the opcode).
	#
	# What the code below reads on entry:
	#   r6, r8   base addresses of the two operand arrays (single lvx loads, so the
	#            effective addresses are truncated to 16-byte boundaries)
	#   r7       multiplier used to form element offsets into the r6 array
	#   r9       multiplier used to form element offsets into the r8 array (copied to r30)
	#   r10      base address of the array that is loaded, scaled, updated and stored back
	#   264(r1)  (after the frame is pushed) multiplier for element offsets into the r10 array
	#   f2       scalar that multiplies the values loaded from the r10 array
	# NOTE(review): this matches the usual ATLAS kernel argument order
	# (M,N,K,alpha,A,lda,B,ldb,beta,C,ldc) under the PowerPC SVR4 ABI, i.e. r6=A, r7=lda,
	# r8=B, r9=ldb, f2=beta, r10=C, stack word=ldc -- confirm against the C prototype.
	# r3, r4 and r5 are overwritten without being read; all three loop bounds are
	# the hard-coded constant 79 (counters run 0,4,...,76).
	.file	"smm.c"
gcc2_compiled.:
	.section	".text"
	.align 2
	.globl ATL_sJIK80x80x80TN80x80x0_a1_b1
	.type	 ATL_sJIK80x80x80TN80x80x0_a1_b1,@function
ATL_sJIK80x80x80TN80x80x0_a1_b1:
	.extern _savev21
	.extern _restv21
	# --- Prologue: push a 256-byte frame, save r21-r31 at 212..252(r1) and LR at 260(r1).
	stwu 1,-256(1)
	mflr 0
	stw 21,212(1)
	stw 22,216(1)
	stw 23,220(1)
	stw 24,224(1)
	stw 25,228(1)
	stw 26,232(1)
	stw 27,236(1)
	stw 28,240(1)
	stw 29,244(1)
	stw 30,248(1)
	stw 31,252(1)
	stw 0,260(1)
	# r0 = r1+208 is handed to the external helper _savev21.
	# NOTE(review): presumably it saves v21..v31 below that address -- verify against the runtime.
	addi 0,1,208
	bl _savev21
	# Keep the r9 multiplier in r30 (r9 is reused as scratch/counter below) and
	# fetch the r10-array multiplier from the caller's frame.
	mr 30,9
	lwz 25,264(1)
	# v21 = each word 0xFFFFFFFF shifted left by 31 = 0x80000000 (bit pattern of -0.0f);
	# used later as the initial value of every accumulator.
	vspltisw 0,-1
	vslw 21,0,0
	# Store f2 at 16(r1), load the quadword containing it, replicate word 0 into all
	# four lanes and store it back so it can be reloaded as a vector after the inner loop.
	stfs 2,16(1)
	addi 9,1,16
	lvx 0,0,9
	vspltw 0,0,0
	stvx 0,0,9
	# r31 = outer counter. v22 = all zero bits, v23 = all one bits (inputs for the store mask).
	li 31,0
	vspltisw 22,0
	vcmpequw 23,23,23
.L36:
	# --- Outer loop body (r31 = 0,4,...,76).
	# r26..r29 = (r31+0..3)*r25 : element offsets of four slices of the r10 array.
	# r21..r24 = (r31+0..3)*r30 : element offsets of four slices of the r8 array.
	li 5,0
	mullw 26,31,25
	addi 11,31,1
	mullw 27,11,25
	addi 9,31,2
	mullw 28,9,25
	addi 0,31,3
	mullw 29,0,25
	mullw 21,31,30
	mullw 22,11,30
	mullw 23,9,30
	mullw 24,0,30
.L40:
	# --- Middle loop body (r5 = 0,4,...,76).
	# Misaligned load of 4 floats from r10 + 4*(r5+r26) into v6: lvsl builds the
	# permute control, two lvx fetch the aligned quadwords at +0 and +16 (r9 = 16),
	# and vperm extracts the 16 bytes that start at the address.
	add 0,5,26
	slwi 0,0,2
	add 0,10,0
	lvsl 1,0,0
	lvx 12,0,0
	li 9,16
	lvx 13,9,0
	vperm 6,12,13,1
	# Same sequence for offset r27 -> v5.
	add 0,5,27
	slwi 0,0,2
	add 0,10,0
	lvsl 1,0,0
	lvx 12,0,0
	lvx 13,9,0
	vperm 5,12,13,1
	# Same sequence for offset r28 -> v4.
	add 0,5,28
	slwi 0,0,2
	add 0,10,0
	lvsl 1,0,0
	lvx 12,0,0
	lvx 13,9,0
	vperm 4,12,13,1
	# Same sequence for offset r29 -> v3.
	add 0,5,29
	slwi 0,0,2
	add 0,10,0
	lvsl 1,0,0
	lvx 12,0,0
	lvx 13,9,0
	vperm 3,12,13,1
	# 4x4 word transpose of (v6,v5,v4,v3) via two rounds of merge-high/merge-low:
	# afterwards v6 holds lane 0 of each loaded vector, v5 lane 1, v4 lane 2, v3 lane 3.
	vmrghw 13,6,4
	vmrghw 0,5,3
	vmrglw 12,6,4
	vmrglw 1,5,3
	vmrghw 6,13,0
	vmrglw 5,13,0
	vmrghw 4,12,1
	vmrglw 3,12,1
	# Initialise the 16 accumulators to v21 (-0.0 in every lane). "vor x,7,7" and
	# "vsldoi x,7,7,0" are both plain copies of v7.
	vor 7,21,21
	vsldoi 24,7,7,0
	vor 25,7,7
	vsldoi 31,7,7,0
	vor 26,7,7
	vsldoi 15,7,7,0
	vor 27,7,7
	vsldoi 16,7,7,0
	vor 28,7,7
	vsldoi 17,7,7,0
	vor 29,7,7
	vsldoi 18,7,7,0
	vor 30,7,7
	vsldoi 19,7,7,0
	vor 14,7,7
	vsldoi 2,7,7,0
	# r9 = inner counter. r11, r12, r3, r4 = (r5+0..3)*r7 : element offsets of four
	# slices of the r6 array (r3 and r4 are simply reused as scratch here).
	li 9,0
	mullw 11,5,7
	addi 0,5,1
	mullw 12,0,7
	addi 0,5,2
	mullw 3,0,7
	addi 0,5,3
	mullw 4,0,7
.L44:
	# --- Inner loop body (r9 = 0,4,...,76): 8 vector loads, 16 fused multiply-adds.
	# v11, v10, v9, v8 = 4 floats each from r6 + 4*(r9 + r11 / r12 / r3 / r4).
	# Single lvx per vector: the low four address bits are ignored by the hardware,
	# so these loads are only exact when the addresses are 16-byte aligned.
	add 0,9,11
	slwi 0,0,2
	add 0,6,0
	lvx 11,0,0
	add 0,9,12
	slwi 0,0,2
	add 0,6,0
	lvx 10,0,0
	add 0,9,3
	slwi 0,0,2
	add 0,6,0
	lvx 9,0,0
	add 0,9,4
	slwi 0,0,2
	add 0,6,0
	lvx 8,0,0
	# v12, v13, v1, v0 = 4 floats each from r8 + 4*(r9 + r21 / r22 / r23 / r24),
	# again with single (alignment-truncating) lvx loads.
	add 0,9,21
	slwi 0,0,2
	add 0,8,0
	lvx 12,0,0
	add 0,9,22
	slwi 0,0,2
	add 0,8,0
	lvx 13,0,0
	add 0,9,23
	slwi 0,0,2
	add 0,8,0
	lvx 1,0,0
	add 0,9,24
	slwi 0,0,2
	add 0,8,0
	lvx 0,0,0
	# Lane-wise acc += (r6 vector) * (r8 vector) for every one of the 4x4 pairings:
	#   v11 -> v2, v14, v19, v30     v10 -> v18, v29, v17, v28
	#   v9  -> v16, v27, v15, v26    v8  -> v31, v25, v7,  v24
	# (each row paired with v12, v13, v1, v0 in that order).
	vmaddfp 2,11,12,2
	vmaddfp 14,11,13,14
	vmaddfp 19,11,1,19
	vmaddfp 30,11,0,30
	vmaddfp 18,10,12,18
	vmaddfp 29,10,13,29
	vmaddfp 17,10,1,17
	vmaddfp 28,10,0,28
	vmaddfp 16,9,12,16
	vmaddfp 27,9,13,27
	vmaddfp 15,9,1,15
	vmaddfp 26,9,0,26
	vmaddfp 31,8,12,31
	vmaddfp 25,8,13,25
	vmaddfp 7,8,1,7
	vmaddfp 24,8,0,24
	# Advance by 4 elements; "bc 4,1" branches while CR0[GT] is clear, i.e. while r9 <= 79.
	addi 9,9,4
	cmpwi 0,9,79
	bc 4,1,.L44
	# --- Reduction. Each group of eight merges transposes four accumulators so that
	# adding the four results yields one vector of their four horizontal sums.
	# Group 1: accumulators v2, v14, v19, v30 (the v11 products).
	vmrghw 13,2,19
	vmrghw 0,14,30
	vmrglw 12,2,19
	vmrglw 1,14,30
	vmrghw 2,13,0
	vmrglw 14,13,0
	vmrghw 19,12,1
	vmrglw 30,12,1
	# Group 2: accumulators v18, v29, v17, v28 (the v10 products).
	vmrghw 13,18,17
	vmrghw 0,29,28
	vmrglw 12,18,17
	vmrglw 1,29,28
	vmrghw 18,13,0
	vmrglw 29,13,0
	vmrghw 17,12,1
	vmrglw 28,12,1
	# Group 3: accumulators v16, v27, v15, v26 (the v9 products).
	vmrghw 13,16,15
	vmrghw 0,27,26
	vmrglw 12,16,15
	vmrglw 1,27,26
	vmrghw 16,13,0
	vmrglw 27,13,0
	vmrghw 15,12,1
	vmrglw 26,12,1
	# Group 4: accumulators v31, v25, v7, v24 (the v8 products).
	vmrghw 13,31,7
	vmrghw 0,25,24
	vmrglw 12,31,7
	vmrglw 1,25,24
	vmrghw 31,13,0
	vmrglw 25,13,0
	vmrghw 7,12,1
	vmrglw 24,12,1
	# Pairwise partial sums of the transposed vectors (two partials per group).
	vaddfp 2,2,14
	vaddfp 19,19,30
	vaddfp 18,18,29
	vaddfp 17,17,28
	vaddfp 16,16,27
	vaddfp 15,15,26
	vaddfp 31,31,25
	vaddfp 7,7,24
	# Reload the replicated f2 scalar saved at 16(r1) in the prologue.
	addi 9,1,16
	lvx 0,0,9
	# result = loaded_value * scalar + first partial, then add the second partial.
	vmaddfp 6,6,0,2
	vmaddfp 5,5,0,18
	vmaddfp 4,4,0,16
	vmaddfp 3,3,0,31
	vaddfp 6,6,19
	vaddfp 5,5,17
	vaddfp 4,4,15
	vaddfp 3,3,7
	# Transpose (v6,v5,v4,v3) back to the layout in which they were loaded.
	vmrghw 13,6,4
	vmrghw 0,5,3
	vmrglw 12,6,4
	vmrglw 1,5,3
	vmrghw 6,13,0
	vmrglw 5,13,0
	vmrghw 4,12,1
	vmrglw 3,12,1
	# --- Misaligned store of v6 to r10 + 4*(r5+r26), done as a read-modify-write of
	# the two aligned quadwords at +0 and +16: lvsr gives the right-rotate control,
	# vperm of (zeros, ones) turns it into a byte mask, v6 is rotated into place and
	# vsel merges it with the existing bytes before both quadwords are stored.
	# Both quadwords are always loaded and stored, even when the address is aligned.
	add 0,5,26
	slwi 0,0,2
	add 0,10,0
	lvx 12,0,0
	li 9,16
	lvx 13,9,0
	lvsr 1,0,0
	vperm 0,22,23,1
	vperm 6,6,6,1
	vsel 12,12,6,0
	vsel 13,6,13,0
	stvx 12,0,0
	stvx 13,9,0
	# Same store sequence for v5 at offset r27.
	add 0,5,27
	slwi 0,0,2
	add 0,10,0
	lvx 12,0,0
	lvx 13,9,0
	lvsr 1,0,0
	vperm 0,22,23,1
	vperm 5,5,5,1
	vsel 12,12,5,0
	vsel 13,5,13,0
	stvx 12,0,0
	stvx 13,9,0
	# Same store sequence for v4 at offset r28.
	add 0,5,28
	slwi 0,0,2
	add 0,10,0
	lvx 12,0,0
	lvx 13,9,0
	lvsr 1,0,0
	vperm 0,22,23,1
	vperm 4,4,4,1
	vsel 12,12,4,0
	vsel 13,4,13,0
	stvx 12,0,0
	stvx 13,9,0
	# Same store sequence for v3 at offset r29.
	add 0,5,29
	slwi 0,0,2
	add 0,10,0
	lvx 12,0,0
	lvx 13,9,0
	lvsr 1,0,0
	vperm 0,22,23,1
	vperm 3,3,3,1
	vsel 12,12,3,0
	vsel 13,3,13,0
	stvx 12,0,0
	stvx 13,9,0
	# Middle loop: next 4 elements while r5 <= 79.
	addi 5,5,4
	cmpwi 0,5,79
	bc 4,1,.L40
	# Outer loop: next 4 slices while r31 <= 79.
	addi 31,31,4
	cmpwi 0,31,79
	bc 4,1,.L36
	# --- Epilogue: restore r21-r31, call _restv21 with the same r0 = r1+208 used for
	# _savev21, reload LR from 260(r1), pop the 256-byte frame and return.
	lwz 21,212(1)
	lwz 22,216(1)
	lwz 23,220(1)
	lwz 24,224(1)
	lwz 25,228(1)
	lwz 26,232(1)
	lwz 27,236(1)
	lwz 28,240(1)
	lwz 29,244(1)
	lwz 30,248(1)
	lwz 31,252(1)
	addi 0,1,208
	bl _restv21
	lwz 0,260(1)
	mtlr 0
	la 1,256(1)
	blr
.Lfe1:
	# End-of-function marker. The symbol size is the distance from the entry label
	# to .Lfe1. The directive must sit on one line: the mail archive had split it
	# with a quoted-printable soft line break ("=" at end of line), which left a
	# dangling "=" in the expression and a stray "b1" statement.
	.size	 ATL_sJIK80x80x80TN80x80x0_a1_b1,.Lfe1-ATL_sJIK80x80x80TN80x80x0_a1_b1
	# Compiler identification string emitted by GCC.
	.ident	"GCC: (GNU) 2.95.2 19991024 (moto-1.7 release)"


> -Nick
> 
> On Thursday, July 19, 2001, at 03:36  PM, Camm Maguire wrote:
> 
> > Greetings, and thanks for your reply.  That works!  A few questions:
> >
> > 1) Any complex support yet?
> > 2) Does altivec have any double precision support, a la SSE2?
> > 2.5) Any l2/l1 code?
> > 3) Have you successfully built the full lib using this?
> > 4) The gcc/as situation is indeed bad.  I've confirmed that the latest
> > standard binutils can't assemble these instructions yet.  Therefore, to
> > build on Debian, at least the patched binutils needs to be packaged
> > and accepted as an option for powerpc before atlas can be built there
> > with this kernel.
> >
> > Take care,
> >
> > Nicholas Coult <coult@augsburg.edu> writes:
> >
> >> Hi,
> >>
> >> Sorry I'm having some email troubles just now.  But I see your message
> >> regarding the patched gcc in the developer mail archive.   The problem
> >> is that you need to add the flag '-fvec' to the gcc command-line flags.
> >> This enables the altivec extensions in gcc.  (I might be wrong about
> >> -fvec; it might be -fvector or -faltivec, but in any case you need to
> >> explicitly turn on the altivec features).
> >>
> >> -Nick
> >>
> --
> Nicholas Coult, Ph.D.,  web: http://melby.augsburg.edu/~coult
> Assistant Professor, Department of Mathematics, Augsburg College
> coult@augsburg.edu, phone:  (612) 330-1064 office: Science Hall 137B
> >>
> >>
> >
> > --
> > Camm Maguire			     			camm@enhanced.com
> > =========================================================================
> > =
> > "The earth is but one country, and mankind its citizens."  --  
> > Baha'u'llah
> >
> 
> 

-- 
Camm Maguire			     			camm@enhanced.com
==========================================================================
"The earth is but one country, and mankind its citizens."  --  Baha'u'llah