SCALAPACK 2.2.2
LAPACK: Linear Algebra PACKage
Loading...
Searching...
No Matches
PB_CptrsmAB.c
Go to the documentation of this file.
1/* ---------------------------------------------------------------------
2*
3* -- PBLAS auxiliary routine (version 2.0) --
4* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5* and University of California, Berkeley.
6* April 1, 1998
7*
8* ---------------------------------------------------------------------
9*/
10/*
11* Include files
12*/
13#include "../pblas.h"
14#include "../PBpblas.h"
15#include "../PBtools.h"
16#include "../PBblacs.h"
17#include "../PBblas.h"
18
19#ifdef __STDC__
20void PB_CptrsmAB( PBTYP_T * TYPE, char * VARIANT, char * SIDE, char * UPLO,
21 char * TRANSA, char * DIAG, Int M, Int N, char * ALPHA,
22 char * A, Int IA, Int JA, Int * DESCA, char * B, Int IB,
23 Int JB, Int * DESCB )
24#else
25void PB_CptrsmAB( TYPE, VARIANT, SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A,
26 IA, JA, DESCA, B, IB, JB, DESCB )
27/*
28* .. Scalar Arguments ..
29*/
30 char * DIAG, * SIDE, * TRANSA, * UPLO, * VARIANT;
31 Int IA, IB, JA, JB, M, N;
32 char * ALPHA;
33 PBTYP_T * TYPE;
34/*
35* .. Array Arguments ..
36*/
37 Int * DESCA, * DESCB;
38 char * A, * B;
39#endif
40{
41/*
42* Purpose
43* =======
44*
45* PB_CptrsmAB solves one of the matrix equations
46*
47* op( sub( A ) )*X = alpha*sub( B ), or
48*
49* X*op( sub( A ) ) = alpha*sub( B ),
50*
51* where
52*
53* sub( A ) denotes A(IA:IA+M-1,JA:JA+M-1) if SIDE = 'L',
54* A(IA:IA+N-1,JA:JA+N-1) if SIDE = 'R', and,
55*
56* sub( B ) denotes B(IB:IB+M-1,JB:JB+N-1).
57*
58* Alpha is a scalar, X and sub( B ) are m by n submatrices, sub( A ) is
59* a unit, or non-unit, upper or lower triangular submatrix and op( Y )
60* is one of
61*
62* op( Y ) = Y or op( Y ) = Y' or op( Y ) = conjg( Y' ).
63*
64* The submatrix X is overwritten on sub( B ).
65*
66* This is the outer-product algorithm using the logical aggregation
67* blocking technique.
68*
69* Notes
70* =====
71*
72* A description vector is associated with each 2D block-cyclicly dis-
73* tributed matrix. This vector stores the information required to
74* establish the mapping between a matrix entry and its corresponding
75* process and memory location.
76*
77* In the following comments, the character _ should be read as
78* "of the distributed matrix". Let A be a generic term for any 2D
79* block cyclicly distributed matrix. Its description vector is DESC_A:
80*
81* NOTATION STORED IN EXPLANATION
82* ---------------- --------------- ------------------------------------
83* DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
84* CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
85* the NPROW x NPCOL BLACS process grid
86* A is distributed over. The context
87* itself is global, but the handle
88* (the integer value) may vary.
89* M_A (global) DESCA[ M_ ] The number of rows in the distribu-
90* ted matrix A, M_A >= 0.
91* N_A (global) DESCA[ N_ ] The number of columns in the distri-
92* buted matrix A, N_A >= 0.
93* IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
94* block of the matrix A, IMB_A > 0.
95* INB_A (global) DESCA[ INB_ ] The number of columns of the upper
96* left block of the matrix A,
97* INB_A > 0.
98* MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
99* bute the last M_A-IMB_A rows of A,
100* MB_A > 0.
101* NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
102* bute the last N_A-INB_A columns of
103* A, NB_A > 0.
104* RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
105* row of the matrix A is distributed,
106* NPROW > RSRC_A >= 0.
107* CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
108* first column of A is distributed.
109* NPCOL > CSRC_A >= 0.
110* LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
111* array storing the local blocks of
112* the distributed matrix A,
113* IF( Lc( 1, N_A ) > 0 )
114* LLD_A >= MAX( 1, Lr( 1, M_A ) )
115* ELSE
116* LLD_A >= 1.
117*
118* Let K be the number of rows of a matrix A starting at the global in-
119* dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
120* that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
121* receive if these K rows were distributed over NPROW processes. If K
122* is the number of columns of a matrix A starting at the global index
123* JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
124* lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
125* these K columns were distributed over NPCOL processes.
126*
127* The values of Lr() and Lc() may be determined via a call to the func-
128* tion PB_Cnumroc:
129* Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
130* Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
131*
132* Arguments
133* =========
134*
135* TYPE (local input) pointer to a PBTYP_T structure
136* On entry, TYPE is a pointer to a structure of type PBTYP_T,
137* that contains type information (See pblas.h).
138*
139* VARIANT (global input) pointer to CHAR
140* On entry, VARIANT specifies whether the left- or right-loo-
141* king variant of the algorithm should be used for the transpo-
142* se cases only, that is TRANSA is not 'N' or 'n'. When VARIANT
143* is 'L' or 'l', the left-looking variant is used, otherwise
144* the right-looking algorithm is selected.
145*
146* SIDE (global input) pointer to CHAR
147* On entry, SIDE specifies whether op( sub( A ) ) appears on
148* the left or right of X as follows:
149*
150* SIDE = 'L' or 'l' op( sub( A ) )*X = alpha*sub( B ),
151*
152* SIDE = 'R' or 'r' X*op( sub( A ) ) = alpha*sub( B ).
153*
154* UPLO (global input) pointer to CHAR
155* On entry, UPLO specifies whether the submatrix sub( A ) is
156* an upper or lower triangular submatrix as follows:
157*
158* UPLO = 'U' or 'u' sub( A ) is an upper triangular
159* submatrix,
160*
161* UPLO = 'L' or 'l' sub( A ) is a lower triangular
162* submatrix.
163*
164* TRANSA (global input) pointer to CHAR
165* On entry, TRANSA specifies the form of op( sub( A ) ) to be
166* used in the matrix multiplication as follows:
167*
168* TRANSA = 'N' or 'n' op( sub( A ) ) = sub( A ),
169*
170* TRANSA = 'T' or 't' op( sub( A ) ) = sub( A )',
171*
172* TRANSA = 'C' or 'c' op( sub( A ) ) = conjg( sub( A )' ).
173*
174* DIAG (global input) pointer to CHAR
175* On entry, DIAG specifies whether or not sub( A ) is unit
176* triangular as follows:
177*
178* DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
179* gular,
180*
181* DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
182* angular.
183*
184* M (global input) INTEGER
185* On entry, M specifies the number of rows of the submatrix
186* sub( B ). M must be at least zero.
187*
188* N (global input) INTEGER
189* On entry, N specifies the number of columns of the submatrix
190* sub( B ). N must be at least zero.
191*
192* ALPHA (global input) pointer to CHAR
193* On entry, ALPHA specifies the scalar alpha. When ALPHA is
194* supplied as zero then the local entries of the array B
195* corresponding to the entries of the submatrix sub( B ) need
196* not be set on input.
197*
198* A (local input) pointer to CHAR
199* On entry, A is an array of dimension (LLD_A, Ka), where Ka is
200* at least Lc( 1, JA+M-1 ) when SIDE = 'L' or 'l' and is at
201* least Lc( 1, JA+N-1 ) otherwise. Before entry, this array
202* contains the local entries of the matrix A.
203* Before entry with UPLO = 'U' or 'u', this array contains the
204* local entries corresponding to the entries of the upper tri-
205* angular submatrix sub( A ), and the local entries correspon-
206* ding to the entries of the strictly lower triangular part of
207* the submatrix sub( A ) are not referenced.
208* Before entry with UPLO = 'L' or 'l', this array contains the
209* local entries corresponding to the entries of the lower tri-
210* angular submatrix sub( A ), and the local entries correspon-
211* ding to the entries of the strictly upper triangular part of
212* the submatrix sub( A ) are not referenced.
213* Note that when DIAG = 'U' or 'u', the local entries corres-
214* ponding to the diagonal elements of the submatrix sub( A )
215* are not referenced either, but are assumed to be unity.
216*
217* IA (global input) INTEGER
218* On entry, IA specifies A's global row index, which points to
219* the beginning of the submatrix sub( A ).
220*
221* JA (global input) INTEGER
222* On entry, JA specifies A's global column index, which points
223* to the beginning of the submatrix sub( A ).
224*
225* DESCA (global and local input) INTEGER array
226* On entry, DESCA is an integer array of dimension DLEN_. This
227* is the array descriptor for the matrix A.
228*
229* B (local input/local output) pointer to CHAR
230* On entry, B is an array of dimension (LLD_B, Kb), where Kb is
231* at least Lc( 1, JB+N-1 ). Before entry, this array contains
232* the local entries of the matrix B.
233* On exit, the local entries of this array corresponding to the
234* to the entries of the submatrix sub( B ) are overwritten by
235* the local entries of the m by n solution submatrix.
236*
237* IB (global input) INTEGER
238* On entry, IB specifies B's global row index, which points to
239* the beginning of the submatrix sub( B ).
240*
241* JB (global input) INTEGER
242* On entry, JB specifies B's global column index, which points
243* to the beginning of the submatrix sub( B ).
244*
245* DESCB (global and local input) INTEGER array
246* On entry, DESCB is an integer array of dimension DLEN_. This
247* is the array descriptor for the matrix B.
248*
249* -- Written on April 1, 1998 by
250* Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
251*
252* ---------------------------------------------------------------------
253*/
254/*
255* .. Local Scalars ..
256*/
257 char conjg, * negone, * one, * talph, top, * zero;
258 Int Afr, Bcol, Bcurcol, Bcurimb1, Bcurinb1, Bcurrow, Bfr, Bii,
259 Bimb, Bimb1, Binb, Binb1, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq,
260 Bnq0, Brow, WAfr, WAoff, WBfr, WBsum, ctxt, k, kb, kbb, kmax,
261 ktmp, lside, mn, mycol, myrow, notran, npcol, nprow, size,
262 upper;
263 GEMM_T gemm;
264 GSUM2D_T gsum2d;
265/*
266* .. Local Arrays ..
267*/
268 Int Bd0[DLEN_], DBUFA[DLEN_], DBUFB[DLEN_], WAd[DLEN_],
269 WBd[DLEN_];
270 char * Aptr = NULL, * Bptr = NULL, * Bptr0 = NULL, * WA = NULL,
271 * WB = NULL;
272/* ..
273* .. Executable Statements ..
274*
275*/
276 lside = ( Mupcase( SIDE [0] ) == CLEFT );
277 upper = ( Mupcase( UPLO [0] ) == CUPPER );
278 notran = ( Mupcase( TRANSA[0] ) == CNOTRAN );
279 size = TYPE->size; negone = TYPE->negone; one = TYPE->one;
280 zero = TYPE->zero; gsum2d = TYPE->Cgsum2d; gemm = TYPE->Fgemm;
281 talph = ALPHA;
282 kb = pilaenv_( &ctxt, C2F_CHAR( &TYPE->type ) );
283/*
284* Retrieve process grid information
285*/
286 Cblacs_gridinfo( ( ctxt = DESCA[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
287/*
288* Retrieve sub( B )'s local information: Bii, Bjj, Brow, Bcol, Bld ...
289*/
290 Bimb = DESCB[IMB_]; Binb = DESCB[INB_];
291 Bmb = DESCB[MB_ ]; Bnb = DESCB[NB_ ]; Bld = DESCB[LLD_];
292 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj, &Brow,
293 &Bcol );
294
295 Bimb1 = PB_Cfirstnb( M, IB, Bimb, Bmb );
296 Bmp0 = PB_Cnumroc( M, 0, Bimb1, Bmb, myrow, Brow, nprow );
297 Binb1 = PB_Cfirstnb( N, JB, Binb, Bnb );
298 Bnq0 = PB_Cnumroc( N, 0, Binb1, Bnb, mycol, Bcol, npcol );
299 if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 = Mptr( B, Bii, Bjj, Bld, size );
300
301 if( notran )
302 {
303 if( lside )
304 {
305 if( upper )
306 {
307 kmax = ( ( M - 1 ) / kb ) * kb;
308
309 for( k = kmax; k >= 0; k -= kb )
310 {
311 kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
312/*
313* Accumulate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )
314*/
315 PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA, JA+k,
316 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
317/*
318* Replicate A(IA:IA+ktmp-1, JA+k:JA+k+kbb-1) over B(IB:IB+ktmp-1, JB:JB+N-1)
319*/
320 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
321 ctxt, Bld );
322 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
323 DBUFA, COLUMN, &WA, WAd, &WAfr );
324/*
325* Solve B( IB+k:IB+ktmp-1, JB:JB+N-1 ) with talph
326*/
327 PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, kbb, N, talph, WA, k, 0,
328 WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr );
329/*
330* Update B( IB:IB+k-1, JB:JB+N-1 )
331*/
332 if( k > 0 )
333 {
334/*
335* Replicate B( IB+k:IB+ktmp-1, JB:JB+N-1 ) over B( IB:IB+k-1, JB:JB+N-1 )
336*/
337 PB_Cdescset( Bd0, k, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
338 ctxt, Bld );
339 PB_CInV( TYPE, NOCONJG, ROW, k, N, Bd0, kbb, Bptr, 0, 0,
340 DBUFB, ROW, &WB, WBd, &WBfr );
341/*
342* Local update
343*/
344 Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow );
345 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
346 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
347 &kbb, negone, WA, &WAd[LLD_], WB, &WBd[LLD_], talph,
348 Bptr0, &Bld );
349 if( WBfr ) free( WB );
350 talph = one;
351 }
352 if( WAfr ) free( WA );
353 if( Bfr ) free( Bptr );
354 if( Afr ) free( Aptr );
355 }
356 }
357 else
358 {
359 for( k = 0; k < M; k += kb )
360 {
361 ktmp = M - k; kbb = MIN( ktmp, kb );
362/*
363* Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
364*/
365 PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA+k, JA+k,
366 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
367/*
368* Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 )
369*/
370 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
371 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
372 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
373 Bcol, ctxt, Bld );
374 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
375 DBUFA, COLUMN, &WA, WAd, &WAfr );
376/*
377* Solve B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) with talph
378*/
379 PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, kbb, N, talph, WA, 0, 0,
380 WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB, &Bfr );
381/*
382* Update B( IB+k+kbb:IB+M-1, JB:JB+N-1 )
383*/
384 if( ( ktmp = ktmp - kbb ) > 0 )
385 {
386/*
387* Replicate B(IB+k:IB+k+kbb-1, JB:JB+N-1) over B(IB+k+kbb:IB+M-1, JB:JB+N-1)
388*/
389 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k+kbb, Bimb, Bmb );
390 Bcurrow = PB_Cindxg2p( k+kbb, Bimb1, Bmb, Brow, Brow,
391 nprow );
392 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
393 Bcol, ctxt, Bld );
394 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
395 DBUFB, ROW, &WB, WBd, &WBfr );
396/*
397* Local update
398*/
399 Bmp = PB_Cnumroc( ktmp, k+kbb, Bimb1, Bmb, myrow, Brow,
400 nprow );
401 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
402 {
403 WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow,
404 WAd[RSRC_], nprow );
405 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp, &Bnq0,
406 &kbb, negone, Mptr( WA, WAoff, 0, WAd[LLD_], size ),
407 &WAd[LLD_], WB, &WBd[LLD_], talph, Mptr( Bptr0,
408 Bmp0-Bmp, 0, Bld, size ), &Bld );
409 }
410 if( WBfr ) free( WB );
411 talph = one;
412 }
413 if( WAfr ) free( WA );
414 if( Bfr ) free( Bptr );
415 if( Afr ) free( Aptr );
416 }
417 }
418 }
419 else
420 {
421 if( upper )
422 {
423 for( k = 0; k < N; k += kb )
424 {
425 ktmp = N - k; kbb = MIN( ktmp, kb );
426/*
427* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
428*/
429 PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA+k,
430 DESCA, ROW, &Aptr, DBUFA, &Afr );
431/*
432* Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 )
433*/
434 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
435 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
436 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
437 Bcurcol, ctxt, Bld );
438 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
439 DBUFA, ROW, &WA, WAd, &WAfr );
440/*
441* Solve B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) with talph
442*/
443 PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, kbb, talph, WA, 0, 0,
444 WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr );
445/*
446* Update B( IB:IB+M-1, JB+k+kbb:JB+N-1 )
447*/
448 if( ( ktmp = ktmp - kbb ) > 0 )
449 {
450/*
451* Replicate B(IB:IB+M-1, JB+k:JB+k+kbb-1) over B(IB:IB+M-1, JB+k+kbb:JB+N-1)
452*/
453 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k+kbb, Binb, Bnb );
454 Bcurcol = PB_Cindxg2p( k+kbb, Binb1, Bnb, Bcol, Bcol,
455 npcol );
456 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
457 Bcurcol, ctxt, Bld );
458 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr, 0, 0,
459 DBUFB, COLUMN, &WB, WBd, &WBfr );
460/*
461* Local update
462*/
463 Bnq = PB_Cnumroc( ktmp, k+kbb, Binb1, Bnb, mycol, Bcol,
464 npcol );
465 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
466 {
467 WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol,
468 WAd[CSRC_], npcol );
469 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
470 &kbb, negone, WB, &WBd[LLD_], Mptr( WA, 0, WAoff,
471 WAd[LLD_], size ), &WAd[LLD_], talph, Mptr( Bptr0,
472 0, Bnq0-Bnq, Bld, size ), &Bld );
473 }
474 if( WBfr ) free( WB );
475 talph = one;
476 }
477 if( WAfr ) free( WA );
478 if( Bfr ) free( Bptr );
479 if( Afr ) free( Aptr );
480 }
481 }
482 else
483 {
484 kmax = ( ( N - 1 ) / kb ) * kb;
485
486 for( k = kmax; k >= 0; k -= kb )
487 {
488 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
489/*
490* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )
491*/
492 PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA,
493 DESCA, ROW, &Aptr, DBUFA, &Afr );
494/*
495* Replicate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 ) over B(IB:IB+M-1, JB:JB+ktmp-1)
496*/
497 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
498 ctxt, Bld );
499 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
500 DBUFA, ROW, &WA, WAd, &WAfr );
501/*
502* Solve B( IB:IB+M-1, JB+k:JB+ktmp-1 ) with talph
503*/
504 PB_CptrsmAB0( TYPE, SIDE, UPLO, DIAG, M, kbb, talph, WA, 0, k,
505 WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB, &Bfr );
506/*
507* Update B( IB:IB+M-1, JB:JB+k-1 )
508*/
509 if( k > 0 )
510 {
511/*
512* Replicate B( IB:IB+M-1, JB+k:JB+ktmp-1 ) over B( IB:IB+M-1, JB:JB+k-1 )
513*/
514 PB_Cdescset( Bd0, M, k, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
515 ctxt, Bld );
516 PB_CInV( TYPE, NOCONJG, COLUMN, M, k, Bd0, kbb, Bptr, 0, 0,
517 DBUFB, COLUMN, &WB, WBd, &WBfr );
518/*
519* Local update
520*/
521 Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol );
522 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
523 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0, &Bnq,
524 &kbb, negone, WB, &WBd[LLD_], WA, &WAd[LLD_], talph,
525 Bptr0, &Bld );
526 if( WBfr ) free( WB );
527 talph = one;
528 }
529 if( WAfr ) free( WA );
530 if( Bfr ) free( Bptr );
531 if( Afr ) free( Aptr );
532 }
533 }
534 }
535 }
536 else
537 {
538 if( Mupcase( VARIANT[0] ) == CRIGHT )
539 {
540/*
541* Right looking variant for the transpose cases
542*/
543 conjg = ( ( Mupcase( TRANSA[0] ) == CCOTRAN ) ? CCONJG : CNOCONJG );
544
545 if( lside )
546 {
547 if( !upper )
548 {
549/*
550* Left Lower (Conjugate) Transpose
551*/
552 kmax = ( ( M - 1 ) / kb ) * kb;
553
554 for( k = kmax; k >= 0; k -= kb )
555 {
556 kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
557/*
558* Accumulate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )
559*/
560 PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA,
561 DESCA, ROW, &Aptr, DBUFA, &Afr );
562/*
563* Replicate A( IA+k:IA+k+kbb-1, JA:JA+ktmp-1 )' over B(IB:IB+ktmp-1, JB:JB+N-1)
564*/
565 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
566 ctxt, Bld );
567 PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
568 DBUFA, ROW, &WA, WAd, &WAfr );
569/*
570* Solve B( IB+k:IB+ktmp-1, JB:JB+N-1 ) with talph
571*/
572 PB_CptrsmAB0( TYPE, SIDE, UPPER, DIAG, kbb, N, talph, WA, k,
573 0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB,
574 &Bfr );
575/*
576* Update B( IB:IB+k-1, JB:JB+N-1 )
577*/
578 if( k > 0 )
579 {
580/*
581* Replicate B( IB+k:IB+ktmp-1, JB:JB+N-1 ) over B( IB:IB+k-1, JB:JB+N-1 )
582*/
583 PB_Cdescset( Bd0, k, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
584 ctxt, Bld );
585 PB_CInV( TYPE, NOCONJG, ROW, k, N, Bd0, kbb, Bptr, 0, 0,
586 DBUFB, ROW, &WB, WBd, &WBfr );
587/*
588* Local update
589*/
590 Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow );
591 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
592 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp,
593 &Bnq0, &kbb, negone, WA, &WAd[LLD_], WB,
594 &WBd[LLD_], talph, Bptr0, &Bld );
595 if( WBfr ) free( WB );
596 talph = one;
597 }
598 if( WAfr ) free( WA );
599 if( Bfr ) free( Bptr );
600 if( Afr ) free( Aptr );
601 }
602 }
603 else
604 {
605/*
606* Left Upper (Conjugate) Transpose
607*/
608 for( k = 0; k < M; k += kb )
609 {
610 ktmp = M - k; kbb = MIN( ktmp, kb );
611/*
612* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )
613*/
614 PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA+k,
615 DESCA, ROW, &Aptr, DBUFA, &Afr );
616/*
617* Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+M-1 )' over B( IB+k:IB+M-1, JB:JB+N-1 )
618*/
619 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
620 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
621 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
622 Bcol, ctxt, Bld );
623 PB_CInV( TYPE, &conjg, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
624 DBUFA, ROW, &WA, WAd, &WAfr );
625/*
626* Solve B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) with talph
627*/
628 PB_CptrsmAB0( TYPE, SIDE, LOWER, DIAG, kbb, N, talph, WA, 0,
629 0, WAd, B, IB+k, JB, DESCB, &Bptr, DBUFB,
630 &Bfr );
631/*
632* Update B( IB+k+kbb:IB+M-1, JB:JB+N-1 )
633*/
634 if( ( ktmp = ktmp - kbb ) > 0 )
635 {
636/*
637* Replicate B(IB+k:IB+k+kbb-1, JB:JB+N-1) over B(IB+k+kbb:IB+M-1, JB:JB+N-1)
638*/
639 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k+kbb, Bimb, Bmb );
640 Bcurrow = PB_Cindxg2p( k+kbb, Bimb1, Bmb, Brow, Brow,
641 nprow );
642 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb,
643 Bcurrow, Bcol, ctxt, Bld );
644 PB_CInV( TYPE, NOCONJG, ROW, ktmp, N, Bd0, kbb, Bptr, 0, 0,
645 DBUFB, ROW, &WB, WBd, &WBfr );
646/*
647* Local update
648*/
649 Bmp = PB_Cnumroc( ktmp, k+kbb, Bimb1, Bmb, myrow, Brow,
650 nprow );
651 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
652 {
653 WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow,
654 WAd[RSRC_], nprow );
655 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp,
656 &Bnq0, &kbb, negone, Mptr( WA, WAoff, 0,
657 WAd[LLD_], size ), &WAd[LLD_], WB, &WBd[LLD_],
658 talph, Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ),
659 &Bld );
660 }
661 if( WBfr ) free( WB );
662 talph = one;
663 }
664 if( WAfr ) free( WA );
665 if( Bfr ) free( Bptr );
666 if( Afr ) free( Aptr );
667 }
668 }
669 }
670 else
671 {
672 if( !upper )
673 {
674/*
675* Right Lower (Conjugate) Transpose
676*/
677 for( k = 0; k < N; k += kb )
678 {
679 ktmp = N - k; kbb = MIN( ktmp, kb );
680/*
681* Accumulate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )
682*/
683 PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA+k, JA+k,
684 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
685/*
686* Replicate A( IA+k:IA+N-1, JA+k:JA+k+kbb-1 )' over B( IB:IB+M-1, JB+k:JB+N-1 )
687*/
688 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
689 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
690 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
691 Bcurcol, ctxt, Bld );
692 PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
693 DBUFA, COLUMN, &WA, WAd, &WAfr );
694/*
695* Solve B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) with talph
696*/
697 PB_CptrsmAB0( TYPE, SIDE, UPPER, DIAG, M, kbb, talph, WA, 0,
698 0, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB,
699 &Bfr );
700/*
701* Update B( IB:IB+M-1, JB+k+kbb:JB+N-1 )
702*/
703 if( ( ktmp = ktmp - kbb ) > 0 )
704 {
705/*
706* Replicate B(IB:IB+M-1, JB+k:JB+k+kbb-1) over B(IB:IB+M-1, JB+k+kbb:JB+N-1)
707*/
708 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k+kbb, Binb, Bnb );
709 Bcurcol = PB_Cindxg2p( k+kbb, Binb1, Bnb, Bcol, Bcol,
710 npcol );
711 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
712 Bcurcol, ctxt, Bld );
713 PB_CInV( TYPE, NOCONJG, COLUMN, M, ktmp, Bd0, kbb, Bptr,
714 0, 0, DBUFB, COLUMN, &WB, WBd, &WBfr );
715/*
716* Local update
717*/
718 Bnq = PB_Cnumroc( ktmp, k+kbb, Binb1, Bnb, mycol, Bcol,
719 npcol );
720 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
721 {
722 WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol,
723 WAd[CSRC_], npcol );
724 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0,
725 &Bnq, &kbb, negone, WB, &WBd[LLD_], Mptr( WA, 0,
726 WAoff, WAd[LLD_], size ), &WAd[LLD_], talph,
727 Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ), &Bld );
728 }
729 if( WBfr ) free( WB );
730 talph = one;
731 }
732 if( WAfr ) free( WA );
733 if( Bfr ) free( Bptr );
734 if( Afr ) free( Aptr );
735 }
736 }
737 else
738 {
739/*
740* Right Upper (Conjugate) Transpose
741*/
742 kmax = ( ( N - 1 ) / kb ) * kb;
743
744 for( k = kmax; k >= 0; k -= kb )
745 {
746 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
747/*
748* Accumulate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )
749*/
750 PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA, JA+k,
751 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
752/*
753* Replicate A( IA:IA+ktmp-1, JA+k:JA+k+kbb-1 )' over B(IB:IB+M-1, JB:JB+ktmp-1)
754*/
755 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
756 ctxt, Bld );
757 PB_CInV( TYPE, &conjg, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
758 DBUFA, COLUMN, &WA, WAd, &WAfr );
759/*
760* Solve B( IB:IB+M-1, JB+k:JB+ktmp-1 ) with talph
761*/
762 PB_CptrsmAB0( TYPE, SIDE, LOWER, DIAG, M, kbb, talph, WA, 0,
763 k, WAd, B, IB, JB+k, DESCB, &Bptr, DBUFB,
764 &Bfr );
765/*
766* Update B( IB:IB+M-1, JB:JB+k-1 )
767*/
768 if( k > 0 )
769 {
770/*
771* Replicate B( IB:IB+M-1, JB+k:JB+ktmp-1 ) over B( IB:IB+M-1, JB:JB+k-1 )
772*/
773 PB_Cdescset( Bd0, M, k, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
774 ctxt, Bld );
775 PB_CInV( TYPE, NOCONJG, COLUMN, M, k, Bd0, kbb, Bptr, 0, 0,
776 DBUFB, COLUMN, &WB, WBd, &WBfr );
777/*
778* Local update
779*/
780 Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol );
781 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
782 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( NOTRAN ), &Bmp0,
783 &Bnq, &kbb, negone, WB, &WBd[LLD_], WA,
784 &WAd[LLD_], talph, Bptr0, &Bld );
785 if( WBfr ) free( WB );
786 talph = one;
787 }
788 if( WAfr ) free( WA );
789 if( Bfr ) free( Bptr );
790 if( Afr ) free( Aptr );
791 }
792 }
793 }
794 }
795 else
796 {
797/*
798* Left looking variant for the transpose cases
799*/
800 if( lside )
801 {
802 top = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
803
804 if( upper )
805 {
806/*
807* Accumulate A( IA:IA+Bimb1-1, JA:JA+Bimb1-1 )
808*/
809 PB_CGatherV( TYPE, REUSE, FORWARD, Bimb1, Bimb1, A, IA, JA,
810 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
811/*
812* Replicate A( IA:IA+Bimb1-1, JA:JA+Bimb1-1 ) over B(IB:IB+Bimb1-1, JB:JB+N-1)
813*/
814 PB_Cdescset( Bd0, Bimb1, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
815 ctxt, Bld );
816 PB_CInV( TYPE, NOCONJG, COLUMN, Bimb1, N, Bd0, Bimb1, Aptr, 0, 0,
817 DBUFA, COLUMN, &WA, WAd, &WAfr );
818/*
819* Solve B( IB:IB+Bimb1-1, JB:JB+N-1 )
820*/
821 if( ( ( Brow < 0 ) || ( myrow == Brow ) ) && ( Bnq0 > 0 ) )
822 TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
823 C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
824 &Bimb1, &Bnq0, ALPHA, WA, &WAd[LLD_],
825 Bptr0, &Bld );
826 if( WAfr ) free( WA );
827 if( Afr ) free( Aptr );
828/*
829* Update and solve remaining rows of sub( B )
830*/
831 for( k = Bimb1; k < M; k += kb )
832 {
833 kbb = M - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
834/*
835* Accumulate A( IA:IA+ktmp-1, JA+k:JA+ktmp-1 )
836*/
837 PB_CGatherV( TYPE, REUSE, FORWARD, ktmp, kbb, A, IA, JA+k,
838 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
839/*
840* Replicate A( IA:IA+ktmp-1, JA+k:JA+ktmp-1 ) over B(IB:IB+ktmp-1, JB:JB+N-1)
841*/
842 PB_Cdescset( Bd0, ktmp, N, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
843 ctxt, Bld );
844 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
845 DBUFA, COLUMN, &WA, WAd, &WAfr );
846/*
847* WB := A( IA:IA+k-1, JA+k:JA+ktmp-1 )' * B( IB:IB+k-1, JB:JB+N-1 )
848*/
849 PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
850 &WBsum );
851 Bmp = PB_Cnumroc( k, 0, Bimb1, Bmb, myrow, Brow, nprow );
852 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
853 gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
854 &Bmp, one, WA, &WAd[LLD_], Bptr0, &Bld, zero, WB,
855 &WBd[LLD_] );
856 if( WBsum )
857 {
858 WBd[RSRC_] = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow,
859 nprow );
860 if( Bnq0 > 0 )
861 gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
862 WBd[RSRC_], mycol );
863 }
864/*
865* Add WB to B( IB+k:IB+ktmp-1, JB:JB+N-1 ) and solve it with
866* A( IA+k:IA+ktmp-1, JA+k:JA+ktmp-1 )
867*/
868 PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, kbb, N, ALPHA,
869 WA, k, 0, WAd, B, IB+k, JB, DESCB, WB, WBd );
870 if( WBfr ) free( WB );
871 if( WAfr ) free( WA );
872 if( Afr ) free( Aptr );
873 }
874 }
875 else
876 {
877/*
878* Solve last block of rows of sub( B )
879*/
880 Bcurimb1 = PB_Clastnb( M, IB, Bimb, Bmb );
881 k = M - Bcurimb1;
882/*
883* Accumulate A( IA+k:IA+M-1, JA+k:JA+M-1 )
884*/
885 PB_CGatherV( TYPE, REUSE, BACKWARD, Bcurimb1, Bcurimb1, A, IA+k,
886 JA+k, DESCA, COLUMN, &Aptr, DBUFA, &Afr );
887/*
888* Replicate A( IA+k:IA+M-1, JA+k:JA+M-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 )
889*/
890 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
891 PB_Cdescset( Bd0, Bcurimb1, N, Bcurimb1, Binb1, Bmb, Bnb,
892 Bcurrow, Bcol, ctxt, Bld );
893 PB_CInV( TYPE, NOCONJG, COLUMN, Bcurimb1, N, Bd0, Bcurimb1, Aptr,
894 0, 0, DBUFA, COLUMN, &WA, WAd, &WAfr );
895/*
896* Solve B( IB+k:IB+M-1, JB:JB+N-1 )
897*/
898 if( ( ( Brow < 0 ) || ( myrow == Bcurrow ) ) && ( Bnq0 > 0 ) )
899 TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
900 C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
901 &Bcurimb1, &Bnq0, ALPHA, WA, &WAd[LLD_],
902 Mptr( Bptr0, Bmp0-Bcurimb1, 0, Bld, size ),
903 &Bld );
904 if( WAfr ) free( WA );
905 if( Afr ) free( Aptr );
906 if( ( mn = M - Bcurimb1 ) <= 0 ) return;
907/*
908* Update and solve remaining rows of sub( B )
909*/
910 kmax = ( ( mn - 1 ) / kb ) * kb;
911
912 for( k = kmax; k >= 0; k -= kb )
913 {
914 ktmp = M - k; kbb = mn - k; kbb = MIN( kbb, kb );
915/*
916* Accumulate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 )
917*/
918 PB_CGatherV( TYPE, REUSE, BACKWARD, ktmp, kbb, A, IA+k, JA+k,
919 DESCA, COLUMN, &Aptr, DBUFA, &Afr );
920/*
921* Replicate A( IA+k:IA+M-1, JA+k:JA+k+kbb-1 ) over B( IB+k:IB+M-1, JB:JB+N-1 )
922*/
923 Bcurimb1 = PB_Cfirstnb( ktmp, IB+k, Bimb, Bmb );
924 Bcurrow = PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
925 PB_Cdescset( Bd0, ktmp, N, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
926 Bcol, ctxt, Bld );
927 PB_CInV( TYPE, NOCONJG, COLUMN, ktmp, N, Bd0, kbb, Aptr, 0, 0,
928 DBUFA, COLUMN, &WA, WAd, &WAfr );
929/*
930* WB := A( IA+k+kbb:IA+M-1, JA+k:JA+k+kbb-1 )'* B( IB+k+kbb:IB+M-1, JB:JB+N-1 )
931*/
932 PB_COutV( TYPE, ROW, INIT, ktmp, N, Bd0, kbb, &WB, WBd, &WBfr,
933 &WBsum );
934 Bmp = PB_Cnumroc( ktmp-kbb, k+kbb, Bimb1, Bmb, myrow, Brow,
935 nprow );
936 if( ( Bnq0 > 0 ) && ( Bmp > 0 ) )
937 {
938 WAoff = PB_Cnumroc( kbb, 0, WAd[IMB_], WAd[MB_], myrow,
939 WAd[RSRC_], nprow );
940 gemm( C2F_CHAR( TRANSA ), C2F_CHAR( NOTRAN ), &kbb, &Bnq0,
941 &Bmp, one, Mptr( WA, WAoff, 0, WAd[LLD_], size ),
942 &WAd[LLD_], Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ),
943 &Bld, zero, WB, &WBd[LLD_] );
944 }
945 if( WBsum )
946 {
947 WBd[RSRC_] = PB_Cindxg2p( k + kbb - 1, Bimb1, Bmb, Brow,
948 Brow, nprow );
949 if( Bnq0 > 0 )
950 gsum2d( ctxt, COLUMN, &top, kbb, Bnq0, WB, WBd[LLD_],
951 WBd[RSRC_], mycol );
952 }
953/*
954* Add WB to B( IB+k:IB+k+kbb-1, JB:JB+N-1 ) and solve it with
955* A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
956*/
957 PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, kbb, N, ALPHA,
958 WA, 0, 0, WAd, B, IB+k, JB, DESCB, WB, WBd );
959 if( WBfr ) free( WB );
960 if( WAfr ) free( WA );
961 if( Afr ) free( Aptr );
962 }
963 }
964 }
965 else
966 {
967 top = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
968
969 if( upper )
970 {
971/*
972* Solve last block of columns of sub( B )
973*/
974 Bcurinb1 = PB_Clastnb( N, JB, Binb, Bnb );
975 k = N - Bcurinb1;
976/*
977* Accumulate A( IA+k:IA+N-1, JA+k:JA+N-1 )
978*/
979 PB_CGatherV( TYPE, REUSE, BACKWARD, Bcurinb1, Bcurinb1, A, IA+k,
980 JA+k, DESCA, ROW, &Aptr, DBUFA, &Afr );
981/*
982* Replicate A( IA+k:IA+N-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 )
983*/
984 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
985 PB_Cdescset( Bd0, M, Bcurinb1, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
986 Bcurcol, ctxt, Bld );
987 PB_CInV( TYPE, NOCONJG, ROW, M, Bcurinb1, Bd0, Bcurinb1, Aptr,
988 0, 0, DBUFA, ROW, &WA, WAd, &WAfr );
989/*
990* Solve B( IB:IB+M-1, JB+k:JB+N-1 )
991*/
992 if( ( ( Bcol < 0 ) || ( mycol == Bcurcol ) ) && ( Bmp0 > 0 ) )
993 TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
994 C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
995 &Bmp0, &Bcurinb1, ALPHA, WA, &WAd[LLD_],
996 Mptr( Bptr0, 0, Bnq0-Bcurinb1, Bld, size ),
997 &Bld );
998 if( WAfr ) free( WA );
999 if( Afr ) free( Aptr );
1000 if( ( mn = N - Bcurinb1 ) <= 0 ) return;
1001/*
1002* Update and solve remaining columns of sub( B )
1003*/
1004 kmax = ( ( mn - 1 ) / kb ) * kb;
1005
1006 for( k = kmax; k >= 0; k -= kb )
1007 {
1008 ktmp = N - k; kbb = mn - k; kbb = MIN( kbb, kb );
1009/*
1010* Accumulate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 )
1011*/
1012 PB_CGatherV( TYPE, REUSE, BACKWARD, kbb, ktmp, A, IA+k, JA+k,
1013 DESCA, ROW, &Aptr, DBUFA, &Afr );
1014/*
1015* Replicate A( IA+k:IA+k+kbb-1, JA+k:JA+N-1 ) over B( IB:IB+M-1, JB+k:JB+N-1 )
1016*/
1017 Bcurinb1 = PB_Cfirstnb( ktmp, JB+k, Binb, Bnb );
1018 Bcurcol = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
1019 PB_Cdescset( Bd0, M, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
1020 Bcurcol, ctxt, Bld );
1021 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
1022 DBUFA, ROW, &WA, WAd, &WAfr );
1023/*
1024* WB := B( IB:IB+M-1, JB+k+kbb:JB+N-1 ) * A(IA+k:IA+k+kbb-1, JA+k+kbb:JA+N-1)'
1025*/
1026 PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
1027 &WBfr, &WBsum );
1028 Bnq = PB_Cnumroc( ktmp-kbb, k+kbb, Binb1, Bnb, mycol, Bcol,
1029 npcol );
1030 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
1031 {
1032 WAoff = PB_Cnumroc( kbb, 0, WAd[INB_], WAd[NB_], mycol,
1033 WAd[CSRC_], npcol );
1034 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
1035 &Bnq, one, Mptr( Bptr0, 0, Bnq0-Bnq, Bld, size ),
1036 &Bld, Mptr( WA, 0, WAoff, WAd[LLD_], size ),
1037 &WAd[LLD_], zero, WB, &WBd[LLD_] );
1038 }
1039 if( WBsum )
1040 {
1041 WBd[CSRC_] = PB_Cindxg2p( k + kbb - 1, Binb1, Bnb, Bcol,
1042 Bcol, npcol );
1043 if( Bmp0 > 0 )
1044 gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
1045 myrow, WBd[CSRC_] );
1046 }
1047/*
1048* Add WB to B( IB:IB+M-1, JB+k:JB+k+kbb-1 ) and solve it with
1049* A( IA+k:IA+k+kbb-1, JA+k:JA+k+kbb-1 )
1050*/
1051 PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, kbb, ALPHA,
1052 WA, 0, 0, WAd, B, IB, JB+k, DESCB, WB, WBd );
1053 if( WBfr ) free( WB );
1054 if( WAfr ) free( WA );
1055 if( Afr ) free( Aptr );
1056 }
1057 }
1058 else
1059 {
1060/*
1061* Accumulate A( IA:IA+Binb1-1, JA:JA+Binb1-1 )
1062*/
1063 PB_CGatherV( TYPE, REUSE, FORWARD, Binb1, Binb1, A, IA, JA,
1064 DESCA, ROW, &Aptr, DBUFA, &Afr );
1065/*
1066* Replicate A( IA:IA+Binb1-1, JA:JA+Binb1-1 ) over B(IB:IB+M-1, JB:JB+Binb1-1)
1067*/
1068 PB_Cdescset( Bd0, M, Binb1, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
1069 ctxt, Bld );
1070 PB_CInV( TYPE, NOCONJG, ROW, M, Binb1, Bd0, Binb1, Aptr, 0, 0,
1071 DBUFA, ROW, &WA, WAd, &WAfr );
1072/*
1073* Solve B( IB:IB+M-1, JB:JB+Binb1-1 )
1074*/
1075 if( ( ( Bcol < 0 ) || ( mycol == Bcol ) ) && ( Bmp0 > 0 ) )
1076 TYPE->Ftrsm( C2F_CHAR( SIDE ), C2F_CHAR( UPLO ),
1077 C2F_CHAR( TRANSA ), C2F_CHAR( DIAG ),
1078 &Bmp0, &Binb1, ALPHA, WA, &WAd[LLD_],
1079 Bptr0, &Bld );
1080 if( WAfr ) free( WA );
1081 if( Afr ) free( Aptr );
1082/*
1083* Update and solve remaining columns of sub( B )
1084*/
1085 for( k = Binb1; k < N; k += kb )
1086 {
1087 kbb = N - k; kbb = MIN( kbb, kb ); ktmp = k + kbb;
1088/*
1089* Accumulate A( IA+k:IA+ktmp-1, JA:JA+ktmp-1 )
1090*/
1091 PB_CGatherV( TYPE, REUSE, FORWARD, kbb, ktmp, A, IA+k, JA,
1092 DESCA, ROW, &Aptr, DBUFA, &Afr );
1093/*
1094* Replicate A( IA+k:IA+ktmp-1, JA:JA+ktmp-1 ) over B( IB:IB+M-1, JB:JB+ktmp-1 )
1095*/
1096 PB_Cdescset( Bd0, M, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
1097 ctxt, Bld );
1098 PB_CInV( TYPE, NOCONJG, ROW, M, ktmp, Bd0, kbb, Aptr, 0, 0,
1099 DBUFA, ROW, &WA, WAd, &WAfr );
1100/*
1101* WB := B( IB:IB+M-1, JB:JB+k-1 ) * A( IA+k:IA+ktmp-1, JA:JA+k-1 )'
1102*/
1103 PB_COutV( TYPE, COLUMN, INIT, M, ktmp, Bd0, kbb, &WB, WBd,
1104 &WBfr, &WBsum );
1105 Bnq = PB_Cnumroc( k, 0, Binb1, Bnb, mycol, Bcol, npcol );
1106 if( ( Bmp0 > 0 ) && ( Bnq > 0 ) )
1107 gemm( C2F_CHAR( NOTRAN ), C2F_CHAR( TRANSA ), &Bmp0, &kbb,
1108 &Bnq, one, Bptr0, &Bld, WA, &WAd[LLD_], zero, WB,
1109 &WBd[LLD_] );
1110 if( WBsum )
1111 {
1112 WBd[CSRC_] = PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol,
1113 npcol );
1114 if( Bmp0 > 0 )
1115 gsum2d( ctxt, ROW, &top, Bmp0, kbb, WB, WBd[LLD_],
1116 myrow, WBd[CSRC_] );
1117 }
1118/*
1119* Add WB to B( IB:IB+M-1, JB+k:JB+ktmp-1 ) and solve it with
1120* A( IA+k:IA+ktmp-1, JA+k:JA+ktmp-1 )
1121*/
1122 PB_CptrsmAB1( TYPE, SIDE, UPLO, TRANSA, DIAG, M, kbb, ALPHA,
1123 WA, 0, k, WAd, B, IB, JB+k, DESCB, WB, WBd );
1124 if( WAfr ) free( WA );
1125 if( Afr ) free( Aptr );
1126 if( WBfr ) free( WB );
1127 }
1128 }
1129 }
1130 }
1131 }
1132/*
1133* End of PB_CptrsmAB
1134*/
1135}
#define Int
Definition Bconfig.h:22
F_VOID_FCT(* GEMM_T)()
Definition pblas.h:317
void(* GSUM2D_T)()
Definition pblas.h:286
#define C2F_CHAR(a)
Definition pblas.h:125
#define TOP_GET
Definition PBblacs.h:50
#define COLUMN
Definition PBblacs.h:45
#define COMBINE
Definition PBblacs.h:49
#define ROW
Definition PBblacs.h:46
void Cblacs_gridinfo()
#define NOTRAN
Definition PBblas.h:44
#define CCONJG
Definition PBblas.h:21
#define CLEFT
Definition PBblas.h:29
#define CNOCONJG
Definition PBblas.h:19
#define NOCONJG
Definition PBblas.h:45
#define CUPPER
Definition PBblas.h:26
#define CNOTRAN
Definition PBblas.h:18
#define FORWARD
Definition PBblas.h:64
#define CRIGHT
Definition PBblas.h:30
#define LOWER
Definition PBblas.h:51
#define REUSE
Definition PBblas.h:67
#define CCOTRAN
Definition PBblas.h:22
#define INIT
Definition PBblas.h:61
#define UPPER
Definition PBblas.h:52
#define BACKWARD
Definition PBblas.h:65
#define pilaenv_
Definition PBpblas.h:44
#define CTXT_
Definition PBtools.h:38
Int PB_Cfirstnb()
#define MB_
Definition PBtools.h:43
void PB_Cinfog2l()
#define MIN(a_, b_)
Definition PBtools.h:76
void PB_CptrsmAB1()
#define Mptr(a_, i_, j_, lda_, siz_)
Definition PBtools.h:132
#define LLD_
Definition PBtools.h:47
void PB_CGatherV()
Int PB_Cnumroc()
Int PB_Clastnb()
char * PB_Ctop()
void PB_CInV()
#define RSRC_
Definition PBtools.h:45
void PB_Cdescset()
void PB_COutV()
#define INB_
Definition PBtools.h:42
void PB_CptrsmAB0()
#define CSRC_
Definition PBtools.h:46
#define IMB_
Definition PBtools.h:41
Int PB_Cindxg2p()
#define Mupcase(C)
Definition PBtools.h:83
#define DLEN_
Definition PBtools.h:48
#define NB_
Definition PBtools.h:44
void PB_CptrsmAB()
#define TYPE
Definition clamov.c:7
Int size
Definition pblas.h:333