/* ---------------------------------------------------------------------
*
*  Mark R. Fahey
*  August 2000
*  This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0
*  which fixes a bug in the incx=1 and incy=1 case.
*
*  ---------------------------------------------------------------------
*/
/*
*  Include files
*/
#include "pblas.h"

void pzaxpy_( n, alpha, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y,
              incy )
/*
*  .. Scalar Arguments ..
*/
   int         * incx, * incy, * ix, * iy, * jx, * jy, * n;
   complex16   * alpha;
/* ..
*  .. Array Arguments ..
*/
   int         desc_X[], desc_Y[];
   complex16   X[], Y[];
{
/*
*  Purpose
*  =======
*
*  PZAXPY adds one distributed vector to another,
*
*     sub( Y ) := sub( Y ) + alpha * sub( X )
*
*  where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
*                         X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X,
*
*        sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y,
*                         Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y.
*
*  Notes
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  Let A be a generic term for any 2D block cyclicly distributed array.
*  Such a global array has an associated description vector descA.
*  In the following comments, the character _ should be read as
*  "of the global array".
*
*  NOTATION        STORED IN      EXPLANATION
*  --------------- -------------- --------------------------------------
*  DT_A   (global) descA[ DT_ ]   The descriptor type.  In this case,
*                                 DT_A = 1.
*  CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating
*                                 the BLACS process grid A is distribu-
*                                 ted over. The context itself is glo-
*                                 bal, but the handle (the integer
*                                 value) may vary.
*  M_A    (global) descA[ M_ ]    The number of rows in the global
*                                 array A.
*  N_A    (global) descA[ N_ ]    The number of columns in the global
*                                 array A.
*  MB_A   (global) descA[ MB_ ]   The blocking factor used to distribu-
*                                 te the rows of the array.
*  NB_A   (global) descA[ NB_ ]   The blocking factor used to distribu-
*                                 te the columns of the array.
*  RSRC_A (global) descA[ RSRC_ ] The process row over which the first
*                                 row of the array A is distributed.
*  CSRC_A (global) descA[ CSRC_ ] The process column over which the
*                                 first column of the array A is
*                                 distributed.
*  LLD_A  (local)  descA[ LLD_ ]  The leading dimension of the local
*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
*
*  Let K be the number of rows or columns of a distributed matrix,
*  and assume that its process grid has dimension p x q.
*  LOCr( K ) denotes the number of elements of K that a process
*  would receive if K were distributed over the p processes of its
*  process column.
*  Similarly, LOCc( K ) denotes the number of elements of K that a
*  process would receive if K were distributed over the q processes of
*  its process row.
*  The values of LOCr() and LOCc() may be determined via a call to the
*  ScaLAPACK tool function, NUMROC:
*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
*  An upper bound for these quantities may be computed by:
*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
*
*  Because vectors may be seen as particular matrices, a distributed
*  vector is considered to be a distributed matrix.
*
*  If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the
*  process column having the first entries of sub( Y ) must also contain
*  the first entries of sub( X ). Moreover, the quantity
*  MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ).
*
*  If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y.
*  Moreover, the quantity MOD( JX-1, NB_X ) must be equal to
*  MOD( IY-1, MB_Y ).
*
*  If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y.
*  Moreover, the quantity MOD( IX-1, MB_X ) must be equal to
*  MOD( JY-1, NB_Y ).
*
*  If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be
*  equal to MB_Y, and the process row having the first entries of
*  sub( Y ) must also contain the first entries of sub( X ). Moreover,
*  the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ).
*
*  Parameters
*  ==========
*
*  N       (global input) pointer to INTEGER.
*          The length of the distributed vectors to be added. N >= 0.
*
*  ALPHA   (global input) pointer to COMPLEX*16
*          The scalar used to multiply each component of sub( X ).
*
*  X       (local input) COMPLEX*16 array containing the local
*          pieces of a distributed matrix of dimension of at least
*              ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) )
*          This array contains the entries of the distributed vector
*          sub( X ).
*
*  IX      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix X to operate on.
*
*  JX      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix X to operate on.
*
*  DESCX   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix X.
*
*  INCX    (global input) pointer to INTEGER
*          The global increment for the elements of X. Only two values
*          of INCX are supported in this version, namely 1 and M_X.
*
*  Y       (local input/local output) COMPLEX*16 array
*          containing the local pieces of a distributed matrix of
*          dimension of at least
*                ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) )
*          This array contains the entries of the distributed vector
*          sub( Y ).
*          On exit sub( Y ) is overwritten by sub( Y ) + alpha*sub( X ).
*
*  IY      (global input) pointer to INTEGER
*          The global row index of the submatrix of the distributed
*          matrix Y to operate on.
*
*  JY      (global input) pointer to INTEGER
*          The global column index of the submatrix of the distributed
*          matrix Y to operate on.
*
*  DESCY   (global and local input) INTEGER array of dimension 8.
*          The array descriptor of the distributed matrix Y.
*
*  INCY    (global input) pointer to INTEGER
*          The global increment for the elements of Y. Only two values
*          of INCY are supported in this version, namely 1 and M_Y.
*
*  =====================================================================
*
*  .. Local Scalars ..
*/
   int         ictxt, info, iix, iiy, ixcol, ixrow, iycol, iyrow, jjx,
               jjy, lcm, lcmp, lcmq, mycol, myrow, nn, np, np0, nprow,
               npcol, nq, nq0, nz, ione=1, tmp1, wksz;
   complex16   one, tmp, zero;
/* ..
*  .. PBLAS Buffer ..
*/
   complex16   * buff;
/* ..
*  .. External Functions ..
*/
   void        blacs_gridinfo_();
   void        zgerv2d_();
   void        zgesd2d_();
   void        pbchkvect();
   void        pberror_();
   char        * getpbbuf();
   F_VOID_FCT  zaxpy_();
   F_VOID_FCT  zcopy_();
   F_VOID_FCT  pbztrnv_();
   F_INTG_FCT  ilcm_();
   F_INTG_FCT  numroc_();
/* ..
*  .. Executable Statements ..
*
*  Get grid parameters
*/
   ictxt = desc_X[CTXT_];
   blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );
/*
*  Test the input parameters
*/
   info = 0;
   if( nprow == -1 )
      info = -(600+CTXT_+1);
   else
   {
      pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx,
                 &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info );
      pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy,
                 &iyrow, &iycol, nprow, npcol, myrow, mycol, &info );

      if( info == 0 )
      {
         if( *n != 1 )
         {
            if( *incx == desc_X[M_] )
            {                 /* X is distributed along a process row */
               if( *incy == desc_Y[M_] )
               {               /* Y is distributed over a process row */
                  if( ( ixcol != iycol ) ||
                      ( ( (*jx-1) % desc_X[NB_] ) !=
                        ( (*jy-1) % desc_Y[NB_] ) ) )
                     info = -10;
                  else if( desc_Y[NB_] != desc_X[NB_] )
                     info = -(1100+NB_+1);
               }
               else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) )
               {            /* Y is distributed over a process column */
                  if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) )
                     info = -9;
                  else if( desc_Y[MB_] != desc_X[NB_] )
                     info = -(1100+MB_+1);
               }
               else
               {
                  info = -12;
               }
            }
            else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) )
            {              /* X is distributed along a process column */
               if( *incy == desc_Y[M_] )
               {               /* Y is distributed over a process row */
                  if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) )
                     info = -10;
                  else if( desc_Y[NB_] != desc_X[MB_] )
                     info = -(1100+NB_+1);
               }
               else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) )
               {               /* Y is distributed over a process column */
                  if( ( ixrow != iyrow ) ||
                      ( ( (*ix-1) % desc_X[MB_] ) !=
                        ( (*iy-1) % desc_Y[MB_] ) ) )
                     info = -9;
                  else if( desc_Y[MB_] != desc_X[MB_] )
                     info = -(1100+MB_+1);
               }
               else
               {
                  info = -12;
               }
            }
            else
            {
               info = -7;
            }
         }
         if( ictxt != desc_Y[CTXT_] )
            info = -(1100+CTXT_+1);
      }
   }
   if( info )
   {
      pberror_( &ictxt, "PZAXPY", &info );
      return;
   }
/*
*  Quick return if possible.
*/
   if( *n == 0 )
      return;
/*
*  y <- y + alpha * x
*/
   if( *n == 1 )
   {
      if( ( myrow == iyrow ) && ( mycol == iycol ) )
      {
         if( ( myrow != ixrow ) || ( mycol != ixcol ) )
            zgerv2d_( &ictxt, n, n, &tmp, n, &ixrow, &ixcol );
         else
            tmp = X[iix-1+(jjx-1)*desc_X[LLD_]];
         zaxpy_( n, alpha, &tmp, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n );
      }
      else if( ( myrow == ixrow ) && ( mycol == ixcol ) )
         zgesd2d_( &ictxt, n, n, &X[iix-1+(jjx-1)*desc_X[LLD_]], n,
                   &iyrow, &iycol );
      return;
   }

   one.re = ONE; one.im = ZERO;
   zero.re = ZERO; zero.im = ZERO;
   if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) )
   {               /* X and Y are both distributed over a process row */
      nz = (*jx-1) % desc_Y[NB_];
      nn = *n + nz;
      nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol );
      if( mycol == ixcol )
         nq -= nz;
      if( ixrow == iyrow )
      {
         if( myrow == ixrow )
            zaxpy_( &nq, alpha,
                    &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_],
                    &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] );
      }
      else
      {
         if( myrow == ixrow )
            zgesd2d_( &ictxt, &ione, &nq,
                      &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_],
                      &iyrow, &mycol );
         else if( myrow == iyrow )
         {
            buff = (complex16 *)getpbbuf( "PZAXPY", nq*sizeof(complex16) );
            zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow,
                      &mycol );
            zaxpy_( &nq, alpha, buff, &ione,
                    &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] );
         }
      }
   }
   else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) &&
            ( *incy == 1 ) && ( *incy != desc_Y[M_] ) )
   {            /* X and Y are both distributed over a process column */
      nz = (*ix-1) % desc_X[MB_];
      nn = *n + nz;
      np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow );
      if( myrow == ixrow )
         np -= nz;
      if( ixcol == iycol )
      {
         if( mycol == ixcol )
            zaxpy_( &np, alpha,
                    &X[iix-1+(jjx-1)*desc_X[LLD_]], incx,
                    &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy );
      }
      else
      {
         if( mycol == ixcol )
            zgesd2d_( &ictxt, &np, &ione,
                      &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_],
                      &myrow, &iycol );
         else if( mycol == iycol )
         {
            buff = (complex16 *)getpbbuf( "PZAXPY", np*sizeof(complex16) );
            zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow,
                      &ixcol );
            zaxpy_( &np, alpha, buff, &ione,
                    &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy );
         }
      }
   }
   else       /* X and Y are not distributed along the same direction */
   {
      lcm = ilcm_( &nprow, &npcol );
      if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) )
      {                     /* X is distributed over a process column */
         lcmq = lcm / npcol;
         nz = (*ix-1) % desc_X[MB_];
         nn = *n + nz;
         np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow );
         nz = (*jy-1) % desc_Y[NB_];
         nn = *n + nz;
         tmp1 = nn / desc_Y[NB_];
         nq0 = MYROC0( tmp1, nn, desc_Y[NB_], npcol );
         tmp1 = nq0 / desc_Y[NB_];
         wksz = np + MYROC0( tmp1, nq0, desc_Y[NB_], lcmq );

         buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) );

         if( myrow == ixrow )
            np -= nz;

         if( mycol == ixcol )
         {
            zcopy_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx,
                    buff, incx );
            zscal_( &np, alpha, buff, incx );
         }
         pbztrnv_( &ictxt, C2F_CHAR( "C" ), C2F_CHAR( "T" ), n,
                   &desc_X[MB_], &nz, buff, incx, &one,
                   &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_],
                   &ixrow, &ixcol, &iyrow, &iycol, buff+np );
      }
      else                  /* Y is distributed over a process column */
      {
         lcmp = lcm / nprow;
         nz = (*iy-1) % desc_Y[MB_];
         nn = *n + nz;
         tmp1 = nn / desc_Y[MB_];
         np = numroc_( &nn, &desc_Y[MB_], &myrow, &iyrow, &nprow );
         np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow );
         tmp1 = np0 / desc_Y[MB_];
         wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp );
         wksz = np + wksz;

         buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) );

         pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n,
                   &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]],
                   &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol,
                   &iyrow, &iycol, buff+np );
         if( mycol == iycol )
         {
            if( myrow == iyrow )
               np -= nz;
            zaxpy_( &np, alpha, buff, &ione,
                    &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy );
         }
      }
   }
}