42{
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261 char Aroc, Broc, TrA, TrB, * one, * tbeta, * zero;
262 Int ABrocs, Abufld, AcurrocR, Afr, Afwd, AiD, AiR, AiiD, AiiR,
263 AinbD, AinbR, Ainb1D, Ainb1R, AisR, AkkR, Ald, AmyprocD,
264 AmyprocR, AnbD, AnbR, AnpD, AnpR, AnprocsD, AnprocsR, Aoff,
265 ArocD, ArocR, AsrcR, Bbufld, BcurrocR, Bfr, Bfwd, BiD, BiR,
266 BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR, BkkR, Bld,
267 BmyprocD, BmyprocR, BnbD, BnbR, BnpD, BnpR, BnprocsD,
268 BnprocsR, Boff, BrocD, BrocR, BsrcR, Ccol, Cii, Cimb1, Cinb1,
269 Cjj, Cld, Cmb, Cmp, Cnb, Cnq, Crow, WAfr, WAsum, WBfr, WBsum,
270 Wkbb=0, ctxt, k, kb, kbb, lcmb, maxp, maxpm1, maxq, mycol,
271 myrow, ncpq, nota, notb, npcol, npq=0, nprow, nrpq, p=0, q=0,
272 size, tmp;
274
275
276
280 char * Abuf = NULL, * Bbuf = NULL, * Cptr = NULL, * WA = NULL,
281 * WB = NULL;
282
283
284
285
286
287
288
290
295
297
298
299
300 if( nota )
301 {
302 AiR = JA; Aroc =
CCOLUMN; AnprocsR = npcol;
303 AinbR = DESCA[
INB_]; AnbR = DESCA[
NB_ ]; AsrcR = DESCA[
CSRC_];
304 }
305 else
306 {
307 AiR = IA; Aroc =
CROW; AnprocsR = nprow;
308 AinbR = DESCA[
IMB_]; AnbR = DESCA[
MB_ ]; AsrcR = DESCA[
RSRC_];
309 }
310
311 if( notb )
312 {
313 BiR = IB; Broc =
CROW; BnprocsR = nprow;
314 BinbR = DESCB[
IMB_]; BnbR = DESCB[
MB_ ]; BsrcR = DESCB[
RSRC_];
315 }
316 else
317 {
318 BiR = JB; Broc =
CCOLUMN; BnprocsR = npcol;
319 BinbR = DESCB[
INB_]; BnbR = DESCB[
NB_ ]; BsrcR = DESCB[
CSRC_];
320 }
321
322
323
324 PB_Cdescribe( M, N, IC, JC, DESCC, nprow, npcol, myrow, mycol, &Cii, &Cjj,
325 &Cld, &Cimb1, &Cinb1, &Cmb, &Cnb, &Crow, &Ccol, Cd0 );
326
327 Cmp =
PB_Cnumroc( M, 0, Cimb1, Cmb, myrow, Crow, nprow );
328 Cnq =
PB_Cnumroc( N, 0, Cinb1, Cnb, mycol, Ccol, npcol );
329
330
331
332
333 if( !(
PB_Cspan( K, AiR, AinbR, AnbR, AsrcR, AnprocsR ) ) &&
334 !(
PB_Cspan( K, BiR, BinbR, BnbR, BsrcR, BnprocsR ) ) )
335 {
336 PB_CInV(
TYPE, &TrA,
COLUMN, M, N, Cd0, K, A, IA, JA, DESCA, &Aroc, &WA,
337 WAd0, &WAfr );
338 PB_CInV(
TYPE, &TrB,
ROW, M, N, Cd0, K, B, IB, JB, DESCB, &Broc, &WB,
339 WBd0, &WBfr );
340 if( ( Cmp > 0 ) && ( Cnq > 0 ) )
341 {
342
343
344
347 Cii, Cjj, Cld, size ), &Cld );
348 }
349 if( WAfr ) free( WA );
350 if( WBfr ) free( WB );
351 return;
352 }
353
354
355
358
359 one =
TYPE->one; zero =
TYPE->zero; tbeta = BETA; gemm =
TYPE->Fgemm;
361
362
363
364 if( nota )
365 {
366 AiD = IA; AinbD = DESCA[
IMB_]; AnbD = DESCA[
MB_];
367 Ald = DESCA[
LLD_]; AmyprocD = myrow; AmyprocR = mycol;
368 AnprocsD = nprow;
369 PB_Cinfog2l( IA, JA, DESCA, AnprocsD, AnprocsR, AmyprocD, AmyprocR,
370 &AiiD, &AiiR, &ArocD, &ArocR );
371 }
372 else
373 {
374 AiD = JA; AinbD = DESCA[
INB_]; AnbD = DESCA[
NB_];
375 Ald = DESCA[
LLD_]; AmyprocD = mycol; AmyprocR = myrow;
376 AnprocsD = npcol;
377 PB_Cinfog2l( IA, JA, DESCA, AnprocsR, AnprocsD, AmyprocR, AmyprocD,
378 &AiiR, &AiiD, &ArocR, &ArocD );
379 }
381 AnpD =
PB_Cnumroc( M, 0, Ainb1D, AnbD, AmyprocD, ArocD, AnprocsD );
383 AisR = ( ( AsrcR < 0 ) || ( AnprocsR == 1 ) );
384
385 if( notb )
386 {
387 BiD = JB; BinbD = DESCB[
INB_]; BnbD = DESCB[
NB_];
388 Bld = DESCB[
LLD_]; BmyprocD = mycol; BmyprocR = myrow;
389 BnprocsD = npcol;
390 PB_Cinfog2l( IB, JB, DESCB, BnprocsR, BnprocsD, BmyprocR, BmyprocD,
391 &BiiR, &BiiD, &BrocR, &BrocD );
392 }
393 else
394 {
395 BiD = IB; BinbD = DESCB[
IMB_]; BnbD = DESCB[
MB_];
396 Bld = DESCB[
LLD_]; BmyprocD = myrow; BmyprocR = mycol;
397 BnprocsD = nprow;
398 PB_Cinfog2l( IB, JB, DESCB, BnprocsD, BnprocsR, BmyprocD, BmyprocR,
399 &BiiD, &BiiR, &BrocD, &BrocR );
400 }
402 BnpD =
PB_Cnumroc( N, 0, Binb1D, BnbD, BmyprocD, BrocD, BnprocsD );
404 BisR = ( ( BsrcR < 0 ) || ( BnprocsR == 1 ) );
405
406
407
408
409 if( !( AisR ) && !( Afwd ) )
410 {
411 tmp =
PB_Cindxg2p( K - 1, Ainb1R, AnbR, ArocR, ArocR, AnprocsR );
412 q =
MModSub( tmp, ArocR, AnprocsR );
413 }
414
415
416
417
418 if( !( BisR ) && !( Bfwd ) )
419 {
420 tmp =
PB_Cindxg2p( K - 1, Binb1R, BnbR, BrocR, BrocR, BnprocsR );
421 p =
MModSub( tmp, BrocR, BnprocsR );
422 }
423
424 if( Cmp > 0 && Cnq > 0 ) Cptr =
Mptr( C, Cii, Cjj, Cld, size );
425
426
427
428 PB_COutV(
TYPE,
COLUMN,
NOINIT, M, N, Cd0, kb, &WA, WAd0, &WAfr, &WAsum );
429 PB_COutV(
TYPE,
ROW,
NOINIT, M, N, Cd0, kb, &WB, WBd0, &WBfr, &WBsum );
430
431
432
433 lcmb =
PB_Clcm( ( maxp = ( BisR ? 1 : BnprocsR ) ) * BnbR,
434 ( maxq = ( AisR ? 1 : AnprocsR ) ) * AnbR );
435 maxpm1 = maxp - 1;
436
437
438
439 AcurrocR = ( AisR ? -1 :
MModAdd( ArocR, q, AnprocsR ) );
440 AkkR =
PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR, AnprocsR );
441 AnpR =
PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR, AnprocsR );
442
443 BcurrocR = ( BisR ? -1 :
MModAdd( BrocR, p, BnprocsR ) );
444 BkkR =
PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BnprocsR );
445 BnpR =
PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR, BnprocsR );
446
447
448
449 PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR, p, q,
450 maxp, maxq, lcmb );
452
453 for( k = 0; k < K; k += kb )
454 {
455 kbb = K - k; kbb =
MIN( kbb, kb );
456
457 while( Wkbb != kbb )
458 {
459
460
461
462
463 while( npq == 0 )
464 {
465 if( ( Bfwd && ( p == maxpm1 ) ) ||
466 ( !( Bfwd ) && ( p == 0 ) ) )
469
470 AcurrocR = ( AisR ? -1 :
MModAdd( ArocR, q, AnprocsR ) );
471 AkkR =
PB_Cg2lrem( AiR, AinbR, AnbR, AcurrocR, AsrcR,
472 AnprocsR );
473 AnpR =
PB_Cnumroc( K, 0, Ainb1R, AnbR, AcurrocR, ArocR,
474 AnprocsR );
475
476 BcurrocR = ( BisR ? -1 :
MModAdd( BrocR, p, BnprocsR ) );
477 BkkR =
PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR,
478 BnprocsR );
479 BnpR =
PB_Cnumroc( K, 0, Binb1R, BnbR, BcurrocR, BrocR,
480 BnprocsR );
481
482 PB_CVMinit( &VM, 0, BnpR, AnpR, Binb1R, Ainb1R, BnbR, AnbR,
483 p, q, maxp, maxq, lcmb );
485 }
486
487
488
489
490 if( Wkbb == 0 ) { ABrocs = ( npq < kbb ? npq : kbb ); }
491 else { ABrocs = kbb - Wkbb; ABrocs =
MIN( ABrocs, npq ); }
492
493
494
496
497 if( nota )
498 {
499
500
501
502
503 if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
504 {
505
506
507
508
509 Abufld =
MAX( 1, AnpD );
510 if( AisR || ( AmyprocR == AcurrocR ) )
511 {
514 ABrocs, AnpD, one,
Mptr( A, AiiD, AkkR, Ald,
515 size ), Ald, zero, Abuf, Abufld );
516 }
517 }
518 else
519 {
520
521
522
523 Abufld = Ald;
524 if( AisR || ( AmyprocR == AcurrocR ) )
525 Abuf =
Mptr( A, AiiD, AkkR + Aoff, Ald, size );
526 }
527 PB_Cdescset( DBUFA, M, ABrocs, Ainb1D, ABrocs, AnbD, ABrocs,
528 ArocD, AcurrocR, ctxt, Abufld );
529 }
530 else
531 {
532
533
534
535
536 if( ( Afr = ( ncpq < ABrocs ) ) != 0 )
537 {
538
539
540
541
542 Abufld = ABrocs;
543 if( AisR || ( AmyprocR == AcurrocR ) )
544 {
547 ABrocs, AnpD, one,
Mptr( A, AkkR, AiiD, Ald,
548 size ), Ald, zero, Abuf, Abufld );
549 }
550 }
551 else
552 {
553
554
555
556 Abufld = Ald;
557 if( AisR || ( AmyprocR == AcurrocR ) )
558 Abuf =
Mptr( A, AkkR + Aoff, AiiD, Ald, size );
559 }
560 PB_Cdescset( DBUFA, ABrocs, M, ABrocs, Ainb1D, ABrocs, AnbD,
561 AcurrocR, ArocD, ctxt, Abufld );
562 }
563
564 if( notb )
565 {
566
567
568
569
570 if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
571 {
572
573
574
575
576 Bbufld = ABrocs;
577 if( BisR || ( BmyprocR == BcurrocR ) )
578 {
581 ABrocs, BnpD, one,
Mptr( B, BkkR, BiiD, Bld,
582 size ), Bld, zero, Bbuf, Bbufld );
583 }
584 }
585 else
586 {
587
588
589
590 Bbufld = Bld;
591 if( BisR || ( BmyprocR == BcurrocR ) )
592 Bbuf =
Mptr( B, BkkR + Boff, BiiD, Bld, size );
593 }
594 PB_Cdescset( DBUFB, ABrocs, N, ABrocs, Binb1D, ABrocs, BnbD,
595 BcurrocR, BrocD, ctxt, Bbufld );
596 }
597 else
598 {
599
600
601
602
603 if( ( Bfr = ( nrpq < ABrocs ) ) != 0 )
604 {
605
606
607
608
609 Bbufld =
MAX( 1, BnpD );
610 if( BisR || ( BmyprocR == BcurrocR ) )
611 {
614 ABrocs, BnpD, one,
Mptr( B, BiiD, BkkR, Bld,
615 size ), Bld, zero, Bbuf, Bbufld );
616 }
617 }
618 else
619 {
620
621
622
623 Bbufld = Bld;
624 if( BisR || ( BmyprocR == BcurrocR ) )
625 Bbuf =
Mptr( B, BiiD, BkkR + Boff, Bld, size );
626 }
627 PB_Cdescset( DBUFB, N, ABrocs, Binb1D, ABrocs, BnbD, ABrocs,
628 BrocD, BcurrocR, ctxt, Bbufld );
629 }
630
631
632
634
635
636
637
638 PB_CInV2(
TYPE, &TrA,
COLUMN, M, N, Cd0, ABrocs, Abuf, 0, 0,
639 DBUFA, &Aroc, WA, Wkbb, WAd0 );
640 PB_CInV2(
TYPE, &TrB,
ROW, M, N, Cd0, ABrocs, Bbuf, 0, 0,
641 DBUFB, &Broc, WB, Wkbb, WBd0 );
642
643 if( Afr & ( AisR || ( AmyprocR == AcurrocR ) ) )
644 if( Abuf ) free( Abuf );
645 if( Bfr & ( BisR || ( BmyprocR == BcurrocR ) ) )
646 if( Bbuf ) free( Bbuf );
647
648
649
650
651
652 npq -= ABrocs;
653 Wkbb += ABrocs;
654 }
655
656
657
658 if( Cmp > 0 && Cnq > 0 )
659 {
661 ALPHA, WA, &WAd0[
LLD_], WB, &WBd0[
LLD_], tbeta, Cptr, &Cld );
662 tbeta = one;
663 }
664
665 Wkbb = 0;
666 }
667
668 if( WAfr ) free( WA );
669 if( WBfr ) free( WB );
670
671
672
673}