
Re: altivec: how to load a scalar.



Peter,

>I'm trying to adapt my code generator to AltiVec, so I have taken your
>codes as a starting point to get a running start.

OK, I am planning on sending a complete deal later tonight or tomorrow, but
I include below my kernel, which I think is quite a bit faster.  It doesn't
really shine without atlas_prefetch.h supporting altivec, but you can see
all the tricks I'm pulling if you care to . . .

When I send out the complete thing, it will include the cases index file
and the atlas_prefetch, as well as cleanup for all precisions.  With java mode
turned on (and we need to do that), the kernel peaks at 2.035 Gflop.  So far,
my full sgemm looks like it is hovering around 1.9 Gflop.

>However, I need one thing to get my algorithm working: I need to load a
>scalar into a vector register and zero all other elements in the vector.
>It does not matter where in the vector the scalar element ends up.
>I looked at the lvewx instruction, which does exactly that; however, the
>values at the positions in the vector that are not occupied by the scalar
>are undefined after the load, not zero.
>
>Do you know how to solve this problem efficiently? Zeroing the vector
>first would still leave the other elements formally undefined, whatever
>that would amount to in practice.

Hmm.  I guess you don't want to load that puppy and then just use vec_sr
or vec_sl to shift off the undefined bits?  By the way, I've never heard of
vec_lvewx . . .
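
For what it's worth, here's an untested sketch of one way to get guaranteed
zeros out of that load (vec_lde is the C intrinsic for lvewx, and
vec_load_scalar is just a name I made up; p must be word-aligned, since
lvewx ignores the low address bits anyway):

   static vector float vec_load_scalar(const float *p)
   /* returns *p in element 0 and zeros in the other three elements */
   {
      const vector float vzero = (vector float)(0.0f, 0.0f, 0.0f, 0.0f);
      const vector unsigned int msk0 = (vector unsigned int)(0xFFFFFFFF,0,0,0);
      vector float v = vec_lde(0, p);      /* lvewx: scalar lands in its lane */
      v = vec_perm(v, v, vec_lvsl(0, p));  /* rotate that lane to element 0 */
      return vec_sel(vzero, v, msk0);      /* zero the other three lanes */
   }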

If you are doing this to load C, I found leaving C in regular regs a win,
and just zeroing the starting counters to be the best solution.  See the
enclosed code for how I zeroed . . .

>Another question: I thought about using vxor to zero a register instead of
>loading -0.0 into it. Do you have any thoughts on that?

When I first noticed Nick was initializing to -0.0, I wondered what he'd been
smoking.  Turns out it helps IEEE compliance in the case where the product
turns out to be a signed zero.  Can't quite recall the details, but that was
the basic idea.
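
If memory serves, the gist is that +0.0 is not a sign-preserving identity
under IEEE addition (round-to-nearest), while -0.0 is, so a -0.0 accumulator
lets the madd hand back -0.0 when the product is -0.0:

   float p  = -2.0f * 0.0f;   /* product is -0.0                 */
   float s1 = p + 0.0f;       /* -0.0 + +0.0 = +0.0 : sign lost  */
   float s2 = p + -0.0f;      /* -0.0 + -0.0 = -0.0 : sign kept  */

A vxor gives the all-zero bit pattern, which is +0.0, so it loses that
property; other than the signed-zero case, zeroing via vxor should be fine.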

Cheers,
Clint
#include "atlas_misc.h"
#define ATL_NoFakePF
#define ATL_AltiVec 
#include "atlas_prefetch.h"

#ifndef KB
   #error KB must be a compile-time constant!
#endif
#if (KB / 16)*16 != KB
   #error KB must be a multiple of 16!
#endif
#ifdef MB
   #if (MB/4)*4 != MB
      #error MB must be a multiple of 4!
   #endif
#endif
#ifdef NB
   #if (NB/4)*4 != NB
      #error NB must be a multiple of 4!
   #endif
#endif
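
/*
 * KB2..KB6 (small multiples of KB) are used throughout the kernel below;
 * guarded defines added here on the assumption that the ATLAS headers do
 * not already supply them
 */
#ifndef KB2
   #define KB2 (KB+KB)
   #define KB3 (KB2+KB)
   #define KB4 (KB3+KB)
   #define KB5 (KB4+KB)
   #define KB6 (KB5+KB)
#endif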

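/*
 * VecReorder: 4x4 transpose of the float block held in v0-v3, done with two
 * rounds of merge high/low; vA0-vA3 are scratch, and the columns land in
 * the order v0, v2, v1, v3
 */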
#define VecReorder(v0, v1, v2, v3) \
{ \
   vA0 = vec_mergeh(v0, v2); \
   vA2 = vec_mergel(v0, v2); \
   vA1 = vec_mergeh(v1, v3); \
   vA3 = vec_mergel(v1, v3); \
   v0  = vec_mergeh(vA0, vA1); \
   v2  = vec_mergel(vA0, vA1); \
   v1  = vec_mergeh(vA2, vA3); \
   v3  = vec_mergel(vA2, vA3); \
}
void ATL_USERMM
   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * matmul with muladd=1, TA=T, TB=N, mu=4, nu=4, ku=2, prefetching A and B
 */
{
   const TYPE *stM = A + KB*M;
   const TYPE *stN = B + KB*N;
   #ifdef ATL_AltiVec
      int blkstride, cwrdKB, cwrdC=ATL_MulBySize(8);
   #endif
   const int incAn = -KB*M;
   const int incBm = -KB;
   #define incAm KB3
   #define incBn KB4
   #ifdef TREAL
      #define incCm 4
      const int incCn = (((ldc) << 2)) - M;
   #else
      #define incCm 8
      const int incCn = (((ldc) << 3)) - (M+M);
   #endif
   const int kstart = (KB>>4)-1;
   void *vC;
   TYPE *tC;
   TYPE *pC0=C, *pC1=pC0+(ldc SHIFT), *pC2=pC1+(ldc SHIFT),*pC3=pC2+(ldc SHIFT);
   const TYPE *pA0=A;
   const TYPE *pB0=B;
   register int k;
   register TYPE rA0, rA1, rA2, rA3, ra0, ra1, ra2, ra3;
   register TYPE rB0, rB1, rB2, rB3, rb0, rb1, rb2, rb3;
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC0_1, rC1_1, rC2_1, rC3_1, 
                 rC0_2, rC1_2, rC2_2, rC3_2, rC0_3, rC1_3, rC2_3, rC3_3;
   vector float vA0, vA1, vA2, vA3, vB0, vB1, vB2, vB3;
   vector float  vC0_0, vC1_0, vC2_0, vC3_0, vC0_1, vC1_1, vC2_1, vC3_1, 
                 vC0_2, vC1_2, vC2_2, vC3_2, vC0_3, vC1_3, vC2_3, vC3_3;
   const vector float nzero =  (vector float)(-0.0f, -0.0f, -0.0f, -0.0f);

   #ifndef ATL_NoIEEE /* turn on java/ieee mode */
      vec_mtvscr((vector unsigned long)(0));
   #endif
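/*
 * 16 floats of aligned scratch (tC): vec_st needs 16-byte alignment, and
 * the vector accumulators are spilled here before the scalar beta/combine
 */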
   vC = malloc(ATL_Cachelen + sizeof(float)*16);
   ATL_assert( vC && ( ((M>>2)<<2) == M ) && ( ((N>>2)<<2) == N ) );
   tC = ATL_AlignPtr(vC);

   #ifdef ATL_AltiVec
/*
 *    k is blkcount, cwrdKB is block size
 */
      k = 1; /* blkcount set to 1 unless KB too large */
      cwrdKB = (ATL_MulBySize(KB)+15) >> 4;  /* # of 16-byte words in KB */
      while (cwrdKB > 32) 
      {
         cwrdKB >>= 1;
         k <<= 1;
      }
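      /* a block size of 32 vectors is encoded as 0 in the dst control word */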
      if (cwrdKB == 32) cwrdKB = 0;
      blkstride = (KB * sizeof(TYPE)) / k;
      ATL_pfavR(A, ATL_GetCtrl(blkstride, k*KB, cwrdKB), 3);
      cwrdKB = ATL_GetCtrl(blkstride, k, cwrdKB);
      if (cwrdC >= 16) cwrdC >>= 4;
      else cwrdC = 1;
      cwrdC = ATL_GetCtrl(0, 1, cwrdC);
   #endif
   do /* N-loop */
   {
      ATL_pfavR(pB0, cwrdKB, 0);
      ATL_pfavR(pB0+KB , cwrdKB, 1);
      ATL_pfavR(pB0+KB2, cwrdKB, 2);
      ATL_pfavR(pB0+KB3, cwrdKB, 3);
      do /* M-loop */
      {
         vC0_0 = vC1_0 = vC2_0 = vC3_0 =
         vC0_1 = vC1_1 = vC2_1 = vC3_1 =
         vC0_2 = vC1_2 = vC2_2 = vC3_2 =
         vC0_3 = vC1_3 = vC2_3 = vC3_3 = nzero;
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 =
            rC0_1 = rC1_1 = rC2_1 = rC3_1 =
            rC0_2 = rC1_2 = rC2_2 = rC3_2 =
            rC0_3 = rC1_3 = rC2_3 = rC3_3 = ATL_rzero;
         #else
            #ifdef TREAL
               rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
               rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];
               rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];
               rC0_3 = *pC3; rC1_3 = pC3[1]; rC2_3 = pC3[2]; rC3_3 = pC3[3];
            #else
               rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];
               rC0_1 = *pC1; rC1_1 = pC1[2]; rC2_1 = pC1[4]; rC3_1 = pC1[6];
               rC0_2 = *pC2; rC1_2 = pC2[2]; rC2_2 = pC2[4]; rC3_2 = pC2[6];
               rC0_3 = *pC3; rC1_3 = pC3[2]; rC2_3 = pC3[4]; rC3_3 = pC3[6];
            #endif
         #endif
         vA0 = vec_ld(0, pA0);
         vA1 = vec_ld(0, pA0+KB);
         vA2 = vec_ld(0, pA0+KB2);
         vA3 = vec_ld(0, pA0+KB3); pA0 += 4;

         vB0 = vec_ld(0, pB0);
         vB1 = vec_ld(0, pB0+KB);
         vB2 = vec_ld(0, pB0+KB2);
         vB3 = vec_ld(0, pB0+KB3); pB0 += 4;

         for (k=kstart; k; k--)
         {
            vC0_0 = vec_madd(vA0, vB0, vC0_0);
            vC1_0 = vec_madd(vA1, vB0, vC1_0);
            vC2_0 = vec_madd(vA2, vB0, vC2_0);
            vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0);
            vC0_1 = vec_madd(vA0, vB1, vC0_1);
            vC1_1 = vec_madd(vA1, vB1, vC1_1);
            vC2_1 = vec_madd(vA2, vB1, vC2_1);
            vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB);
            vC0_2 = vec_madd(vA0, vB2, vC0_2);
            vC1_2 = vec_madd(vA1, vB2, vC1_2);
            vC2_2 = vec_madd(vA2, vB2, vC2_2);
            vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2);
            vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0);
            vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB);
            vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2);
            vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3);
                   
            vC0_0 = vec_madd(vA0, vB0, vC0_0); vB3 = vec_ld(0, pB0+KB3);
            vC1_0 = vec_madd(vA1, vB0, vC1_0);
            vC2_0 = vec_madd(vA2, vB0, vC2_0);
            vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0+4);
            vC0_1 = vec_madd(vA0, vB1, vC0_1);
            vC1_1 = vec_madd(vA1, vB1, vC1_1);
            vC2_1 = vec_madd(vA2, vB1, vC2_1);
            vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB+4);
            vC0_2 = vec_madd(vA0, vB2, vC0_2);
            vC1_2 = vec_madd(vA1, vB2, vC1_2);
            vC2_2 = vec_madd(vA2, vB2, vC2_2);
            vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2+4);
            vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0+4);
            vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB+4);
            vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2+4);
            vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3+4);

            vC0_0 = vec_madd(vA0, vB0, vC0_0); vB3 = vec_ld(0, pB0+KB3+4);
            vC1_0 = vec_madd(vA1, vB0, vC1_0);
            vC2_0 = vec_madd(vA2, vB0, vC2_0);
            vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0+8);
            vC0_1 = vec_madd(vA0, vB1, vC0_1);
            vC1_1 = vec_madd(vA1, vB1, vC1_1);
            vC2_1 = vec_madd(vA2, vB1, vC2_1);
            vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB+8);
            vC0_2 = vec_madd(vA0, vB2, vC0_2);
            vC1_2 = vec_madd(vA1, vB2, vC1_2);
            vC2_2 = vec_madd(vA2, vB2, vC2_2);
            vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2+8);
            vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0+8);
            vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB+8);
            vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2+8);
            vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3+8);

            vC0_0 = vec_madd(vA0, vB0, vC0_0); vB3 = vec_ld(0, pB0+KB3+8);
            vC1_0 = vec_madd(vA1, vB0, vC1_0);
            vC2_0 = vec_madd(vA2, vB0, vC2_0);
            vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0+12);
            vC0_1 = vec_madd(vA0, vB1, vC0_1);
            vC1_1 = vec_madd(vA1, vB1, vC1_1);
            vC2_1 = vec_madd(vA2, vB1, vC2_1);
            vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB+12);
            vC0_2 = vec_madd(vA0, vB2, vC0_2);
            vC1_2 = vec_madd(vA1, vB2, vC1_2);
            vC2_2 = vec_madd(vA2, vB2, vC2_2);
            vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2+12);
            vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0+12);
            vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB+12);
            vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2+12);
            vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3+12);
            vB3 = vec_ld(0, pB0+KB3+12);
            pA0 += 16;
            pB0 += 16;
         }
         vC0_0 = vec_madd(vA0, vB0, vC0_0); ATL_pfavW(pC0, cwrdC, 0);
         vC1_0 = vec_madd(vA1, vB0, vC1_0); ATL_pfavW(pC1, cwrdC, 1);
         vC2_0 = vec_madd(vA2, vB0, vC2_0); ATL_pfavW(pC2, cwrdC, 2);
         vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0);
         vC0_1 = vec_madd(vA0, vB1, vC0_1); ATL_pfavW(pC3, cwrdC, 3);
         vC1_1 = vec_madd(vA1, vB1, vC1_1);
         vC2_1 = vec_madd(vA2, vB1, vC2_1);
         vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB);
         vC0_2 = vec_madd(vA0, vB2, vC0_2);
         vC1_2 = vec_madd(vA1, vB2, vC1_2);
         vC2_2 = vec_madd(vA2, vB2, vC2_2);
         vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2);
         vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0);
         vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB);
         vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2);
         vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3);
                   
         vC0_0 = vec_madd(vA0, vB0, vC0_0); vB3 = vec_ld(0, pB0+KB3);
         vC1_0 = vec_madd(vA1, vB0, vC1_0);
         vC2_0 = vec_madd(vA2, vB0, vC2_0);
         vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0+4);
         vC0_1 = vec_madd(vA0, vB1, vC0_1);
         vC1_1 = vec_madd(vA1, vB1, vC1_1);
         vC2_1 = vec_madd(vA2, vB1, vC2_1);
         vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB+4);
         vC0_2 = vec_madd(vA0, vB2, vC0_2);
         vC1_2 = vec_madd(vA1, vB2, vC1_2);
         vC2_2 = vec_madd(vA2, vB2, vC2_2);
         vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2+4);
         vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0+4);
         vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB+4);
         vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2+4);
         vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3+4);

         vC0_0 = vec_madd(vA0, vB0, vC0_0); vB3 = vec_ld(0, pB0+KB3+4);
         vC1_0 = vec_madd(vA1, vB0, vC1_0); ATL_pfavR(pA0+KB3+12, cwrdKB, 0);
         vC2_0 = vec_madd(vA2, vB0, vC2_0); ATL_pfavR(pA0+KB4+12, cwrdKB, 1);
         vC3_0 = vec_madd(vA3, vB0, vC3_0); vB0 = vec_ld(0, pB0+8);
         vC0_1 = vec_madd(vA0, vB1, vC0_1); ATL_pfavR(pA0+KB5+12, cwrdKB, 2);
         vC1_1 = vec_madd(vA1, vB1, vC1_1); ATL_pfavR(pA0+KB6+12, cwrdKB, 3);
         vC2_1 = vec_madd(vA2, vB1, vC2_1);
         vC3_1 = vec_madd(vA3, vB1, vC3_1); vB1 = vec_ld(0, pB0+KB+8);
         vC0_2 = vec_madd(vA0, vB2, vC0_2);
         vC1_2 = vec_madd(vA1, vB2, vC1_2);
         vC2_2 = vec_madd(vA2, vB2, vC2_2);
         vC3_2 = vec_madd(vA3, vB2, vC3_2); vB2 = vec_ld(0, pB0+KB2+8);
         vC0_3 = vec_madd(vA0, vB3, vC0_3); vA0 = vec_ld(0, pA0+8);
         vC1_3 = vec_madd(vA1, vB3, vC1_3); vA1 = vec_ld(0, pA0+KB+8);
         vC2_3 = vec_madd(vA2, vB3, vC2_3); vA2 = vec_ld(0, pA0+KB2+8);
         vC3_3 = vec_madd(vA3, vB3, vC3_3); vA3 = vec_ld(0, pA0+KB3+8);

         vC0_0 = vec_madd(vA0, vB0, vC0_0); vB3 = vec_ld(0, pB0+KB3+8);
         vC1_0 = vec_madd(vA1, vB0, vC1_0);
         vC2_0 = vec_madd(vA2, vB0, vC2_0);
         vC3_0 = vec_madd(vA3, vB0, vC3_0);
         vC0_1 = vec_madd(vA0, vB1, vC0_1);
         vC1_1 = vec_madd(vA1, vB1, vC1_1);
         vC2_1 = vec_madd(vA2, vB1, vC2_1);
         vC3_1 = vec_madd(vA3, vB1, vC3_1);
         vC0_2 = vec_madd(vA0, vB2, vC0_2);
         vC1_2 = vec_madd(vA1, vB2, vC1_2);
         vC2_2 = vec_madd(vA2, vB2, vC2_2);
         vC3_2 = vec_madd(vA3, vB2, vC3_2);
         vC0_3 = vec_madd(vA0, vB3, vC0_3);
         vC1_3 = vec_madd(vA1, vB3, vC1_3); pA0 += 12;
         vC2_3 = vec_madd(vA2, vB3, vC2_3);
         vC3_3 = vec_madd(vA3, vB3, vC3_3); pB0 += 12;
        
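/*
 *       transpose each group of four accumulators so the partial sums for a
 *       given C entry share a vector, then sum; vC0_j then holds the four
 *       finished dot products for column j of the 4x4 C block
 */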
         VecReorder(vC0_0, vC1_0, vC2_0, vC3_0);
         VecReorder(vC0_1, vC1_1, vC2_1, vC3_1);
         vC0_0 = vec_add(vC0_0, vC1_0);
         vC0_1 = vec_add(vC0_1, vC1_1);
         vC2_0 = vec_add(vC2_0, vC3_0); ATL_pfavR(pA0+KB3, cwrdKB, 0);
         vC2_1 = vec_add(vC2_1, vC3_1); ATL_pfavR(pA0+KB4, cwrdKB, 1);
         vC0_0 = vec_add(vC0_0, vC2_0); ATL_pfavR(pA0+KB5, cwrdKB, 2);
         vC0_1 = vec_add(vC0_1, vC2_1); ATL_pfavR(pA0+KB6, cwrdKB, 3);

         VecReorder(vC0_2, vC1_2, vC2_2, vC3_2);
         VecReorder(vC0_3, vC1_3, vC2_3, vC3_3);
         vC0_2 = vec_add(vC0_2, vC1_2);
         vC0_3 = vec_add(vC0_3, vC1_3);
         vC2_2 = vec_add(vC2_2, vC3_2);
         vC2_3 = vec_add(vC2_3, vC3_3);
         vC0_2 = vec_add(vC0_2, vC2_2);
         vC0_3 = vec_add(vC0_3, vC2_3);
         vec_st(vC0_0, 0, tC);
         vec_st(vC0_1, 0, tC+4);
         vec_st(vC0_2, 0, tC+8);
         vec_st(vC0_3, 0, tC+12);
         #ifdef BETAX
            rA0 = beta;
            rC0_0 = rC0_0*rA0 + *tC;
            rC1_0 = rC1_0*rA0 + tC[1];
            rC2_0 = rC2_0*rA0 + tC[2];
            rC3_0 = rC3_0*rA0 + tC[3];

            rC0_1 = rC0_1*rA0 + tC[4];
            rC1_1 = rC1_1*rA0 + tC[5];
            rC2_1 = rC2_1*rA0 + tC[6];
            rC3_1 = rC3_1*rA0 + tC[7];

            rC0_2 = rC0_2*rA0 + tC[8];
            rC1_2 = rC1_2*rA0 + tC[9];
            rC2_2 = rC2_2*rA0 + tC[10];
            rC3_2 = rC3_2*rA0 + tC[11];

            rC0_3 = rC0_3*rA0 + tC[12];
            rC1_3 = rC1_3*rA0 + tC[13];
            rC2_3 = rC2_3*rA0 + tC[14];
            rC3_3 = rC3_3*rA0 + tC[15];
         #else
            rC0_0 += *tC; rC1_0 += tC[1]; rC2_0 += tC[2]; rC3_0 += tC[3];
            rC0_1 += tC[4]; rC1_1 += tC[5]; rC2_1 += tC[6]; rC3_1 += tC[7];
            rC0_2 += tC[8]; rC1_2 += tC[9]; rC2_2 += tC[10]; rC3_2 += tC[11];
            rC0_3 += tC[12]; rC1_3 += tC[13]; rC2_3 += tC[14]; rC3_3 += tC[15];
         #endif

         #ifdef TREAL
            *pC0 = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0;
            *pC1 = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1;
            *pC2 = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2;
            *pC3 = rC0_3; pC3[1] = rC1_3; pC3[2] = rC2_3; pC3[3] = rC3_3;
         #else
            *pC0 = rC0_0; pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0;
            *pC1 = rC0_1; pC1[2] = rC1_1; pC1[4] = rC2_1; pC1[6] = rC3_1;
            *pC2 = rC0_2; pC2[2] = rC1_2; pC2[4] = rC2_2; pC2[6] = rC3_2;
            *pC3 = rC0_3; pC3[2] = rC1_3; pC3[4] = rC2_3; pC3[6] = rC3_3;
         #endif
         pC0 += incCm;
         pC1 += incCm;
         pC2 += incCm;
         pC3 += incCm;
         pA0 += incAm;
         pB0 += incBm;
      }
      while(pA0 != stM);
      pC0 += incCn;
      pC1 += incCn;
      pC2 += incCn;
      pC3 += incCn;
      pA0 += incAn;
      pB0 += incBn;
   }
   while(pB0 != stN);
   free(vC);
}
#ifdef incAm
   #undef incAm
#endif
#ifdef incBn
   #undef incBn
#endif
#ifdef incCm
   #undef incCm
#endif