Actual source code: baij2.c


  2: #include <../src/mat/impls/baij/seq/baij.h>
  3: #include <../src/mat/impls/dense/seq/dense.h>
  4: #include <petsc/private/kernels/blockinvert.h>
  5: #include <petscbt.h>
  6: #include <petscblaslapack.h>

  8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  9: #include <immintrin.h>
 10: #endif

 12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
 13: {
 14:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
 16:   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
 17:   const PetscInt *idx;
 18:   PetscInt       start,end,*ai,*aj,bs,*nidx2;
 19:   PetscBT        table;

 22:   m  = a->mbs;
 23:   ai = a->i;
 24:   aj = a->j;
 25:   bs = A->rmap->bs;

 27:   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");

 29:   PetscBTCreate(m,&table);
 30:   PetscMalloc1(m+1,&nidx);
 31:   PetscMalloc1(A->rmap->N+1,&nidx2);

 33:   for (i=0; i<is_max; i++) {
 34:     /* Initialise the two local arrays */
 35:     isz  = 0;
 36:     PetscBTMemzero(m,table);

 38:     /* Extract the indices, assume there can be duplicate entries */
 39:     ISGetIndices(is[i],&idx);
 40:     ISGetLocalSize(is[i],&n);

 42:     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
 43:     for (j=0; j<n; ++j) {
 44:       ival = idx[j]/bs; /* convert the indices into block indices */
 45:       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
 46:       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
 47:     }
 48:     ISRestoreIndices(is[i],&idx);
 49:     ISDestroy(&is[i]);

 51:     k = 0;
 52:     for (j=0; j<ov; j++) { /* for each overlap*/
 53:       n = isz;
 54:       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
 55:         row   = nidx[k];
 56:         start = ai[row];
 57:         end   = ai[row+1];
 58:         for (l = start; l<end; l++) {
 59:           val = aj[l];
 60:           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
 61:         }
 62:       }
 63:     }
 64:     /* expand the Index Set */
 65:     for (j=0; j<isz; j++) {
 66:       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
 67:     }
 68:     ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
 69:   }
 70:   PetscBTDestroy(&table);
 71:   PetscFree(nidx);
 72:   PetscFree(nidx2);
 73:   return(0);
 74: }

 76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
 77: {
 78:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
 80:   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
 81:   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
 82:   const PetscInt *irow,*icol;
 83:   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
 84:   PetscInt       *aj = a->j,*ai = a->i;
 85:   MatScalar      *mat_a;
 86:   Mat            C;
 87:   PetscBool      flag;

 90:   ISGetIndices(isrow,&irow);
 91:   ISGetIndices(iscol,&icol);
 92:   ISGetLocalSize(isrow,&nrows);
 93:   ISGetLocalSize(iscol,&ncols);

 95:   PetscCalloc1(1+oldcols,&smap);
 96:   ssmap = smap;
 97:   PetscMalloc1(1+nrows,&lens);
 98:   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
 99:   /* determine lens of each row */
100:   for (i=0; i<nrows; i++) {
101:     kstart  = ai[irow[i]];
102:     kend    = kstart + a->ilen[irow[i]];
103:     lens[i] = 0;
104:     for (k=kstart; k<kend; k++) {
105:       if (ssmap[aj[k]]) lens[i]++;
106:     }
107:   }
108:   /* Create and fill new matrix */
109:   if (scall == MAT_REUSE_MATRIX) {
110:     c = (Mat_SeqBAIJ*)((*B)->data);

112:     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113:     PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114:     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115:     PetscArrayzero(c->ilen,c->mbs);
116:     C    = *B;
117:   } else {
118:     MatCreate(PetscObjectComm((PetscObject)A),&C);
119:     MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120:     MatSetType(C,((PetscObject)A)->type_name);
121:     MatSeqBAIJSetPreallocation(C,bs,0,lens);
122:   }
123:   c = (Mat_SeqBAIJ*)(C->data);
124:   for (i=0; i<nrows; i++) {
125:     row      = irow[i];
126:     kstart   = ai[row];
127:     kend     = kstart + a->ilen[row];
128:     mat_i    = c->i[i];
129:     mat_j    = c->j + mat_i;
130:     mat_a    = c->a + mat_i*bs2;
131:     mat_ilen = c->ilen + i;
132:     for (k=kstart; k<kend; k++) {
133:       if ((tcol=ssmap[a->j[k]])) {
134:         *mat_j++ = tcol - 1;
135:         PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136:         mat_a   += bs2;
137:         (*mat_ilen)++;
138:       }
139:     }
140:   }
141:   /* sort */
142:   {
143:     MatScalar *work;
144:     PetscMalloc1(bs2,&work);
145:     for (i=0; i<nrows; i++) {
146:       PetscInt ilen;
147:       mat_i = c->i[i];
148:       mat_j = c->j + mat_i;
149:       mat_a = c->a + mat_i*bs2;
150:       ilen  = c->ilen[i];
151:       PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152:     }
153:     PetscFree(work);
154:   }

156:   /* Free work space */
157:   ISRestoreIndices(iscol,&icol);
158:   PetscFree(smap);
159:   PetscFree(lens);
160:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

163:   ISRestoreIndices(isrow,&irow);
164:   *B   = C;
165:   return(0);
166: }

168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171:   IS             is1,is2;
173:   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174:   const PetscInt *irow,*icol;

177:   ISGetIndices(isrow,&irow);
178:   ISGetIndices(iscol,&icol);
179:   ISGetLocalSize(isrow,&nrows);
180:   ISGetLocalSize(iscol,&ncols);

182:   /* Verify if the indices corespond to each element in a block
183:    and form the IS with compressed IS */
184:   maxmnbs = PetscMax(a->mbs,a->nbs);
185:   PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186:   PetscArrayzero(vary,a->mbs);
187:   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188:   for (i=0; i<a->mbs; i++) {
189:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190:   }
191:   count = 0;
192:   for (i=0; i<nrows; i++) {
193:     j = irow[i] / bs;
194:     if ((vary[j]--)==bs) iary[count++] = j;
195:   }
196:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);

198:   PetscArrayzero(vary,a->nbs);
199:   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200:   for (i=0; i<a->nbs; i++) {
201:     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202:   }
203:   count = 0;
204:   for (i=0; i<ncols; i++) {
205:     j = icol[i] / bs;
206:     if ((vary[j]--)==bs) iary[count++] = j;
207:   }
208:   ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209:   ISRestoreIndices(isrow,&irow);
210:   ISRestoreIndices(iscol,&icol);
211:   PetscFree2(vary,iary);

213:   MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214:   ISDestroy(&is1);
215:   ISDestroy(&is2);
216:   return(0);
217: }

219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222:   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223:   Mat_SubSppt    *submatj = c->submatis1;

226:   (*submatj->destroy)(C);
227:   MatDestroySubMatrix_Private(submatj);
228:   return(0);
229: }

231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234:   PetscInt       i;
235:   Mat            C;
236:   Mat_SeqBAIJ    *c;
237:   Mat_SubSppt    *submatj;

240:   for (i=0; i<n; i++) {
241:     C       = (*mat)[i];
242:     c       = (Mat_SeqBAIJ*)C->data;
243:     submatj = c->submatis1;
244:     if (submatj) {
245:       if (--((PetscObject)C)->refct <= 0) {
246:         (*submatj->destroy)(C);
247:         MatDestroySubMatrix_Private(submatj);
248:         PetscFree(C->defaultvectype);
249:         PetscLayoutDestroy(&C->rmap);
250:         PetscLayoutDestroy(&C->cmap);
251:         PetscHeaderDestroy(&C);
252:       }
253:     } else {
254:       MatDestroy(&C);
255:     }
256:   }

258:   /* Destroy Dummy submatrices created for reuse */
259:   MatDestroySubMatrices_Dummy(n,mat);

261:   PetscFree(*mat);
262:   return(0);
263: }

265: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
266: {
268:   PetscInt       i;

271:   if (scall == MAT_INITIAL_MATRIX) {
272:     PetscCalloc1(n+1,B);
273:   }

275:   for (i=0; i<n; i++) {
276:     MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
277:   }
278:   return(0);
279: }

281: /* -------------------------------------------------------*/
282: /* Should check that shapes of vectors and matrices match */
283: /* -------------------------------------------------------*/

285: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
286: {
287:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
288:   PetscScalar       *z,sum;
289:   const PetscScalar *x;
290:   const MatScalar   *v;
291:   PetscErrorCode    ierr;
292:   PetscInt          mbs,i,n;
293:   const PetscInt    *idx,*ii,*ridx=NULL;
294:   PetscBool         usecprow=a->compressedrow.use;

297:   VecGetArrayRead(xx,&x);
298:   VecGetArrayWrite(zz,&z);

300:   if (usecprow) {
301:     mbs  = a->compressedrow.nrows;
302:     ii   = a->compressedrow.i;
303:     ridx = a->compressedrow.rindex;
304:     PetscArrayzero(z,a->mbs);
305:   } else {
306:     mbs = a->mbs;
307:     ii  = a->i;
308:   }

310:   for (i=0; i<mbs; i++) {
311:     n   = ii[1] - ii[0];
312:     v   = a->a + ii[0];
313:     idx = a->j + ii[0];
314:     ii++;
315:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
316:     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
317:     sum = 0.0;
318:     PetscSparseDensePlusDot(sum,x,v,idx,n);
319:     if (usecprow) {
320:       z[ridx[i]] = sum;
321:     } else {
322:       z[i]       = sum;
323:     }
324:   }
325:   VecRestoreArrayRead(xx,&x);
326:   VecRestoreArrayWrite(zz,&z);
327:   PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
328:   return(0);
329: }

331: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
332: {
333:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
334:   PetscScalar       *z = NULL,sum1,sum2,*zarray;
335:   const PetscScalar *x,*xb;
336:   PetscScalar       x1,x2;
337:   const MatScalar   *v;
338:   PetscErrorCode    ierr;
339:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
340:   PetscBool         usecprow=a->compressedrow.use;

343:   VecGetArrayRead(xx,&x);
344:   VecGetArrayWrite(zz,&zarray);

346:   idx = a->j;
347:   v   = a->a;
348:   if (usecprow) {
349:     mbs  = a->compressedrow.nrows;
350:     ii   = a->compressedrow.i;
351:     ridx = a->compressedrow.rindex;
352:     PetscArrayzero(zarray,2*a->mbs);
353:   } else {
354:     mbs = a->mbs;
355:     ii  = a->i;
356:     z   = zarray;
357:   }

359:   for (i=0; i<mbs; i++) {
360:     n           = ii[1] - ii[0]; ii++;
361:     sum1        = 0.0; sum2 = 0.0;
362:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
363:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
364:     for (j=0; j<n; j++) {
365:       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
366:       sum1 += v[0]*x1 + v[2]*x2;
367:       sum2 += v[1]*x1 + v[3]*x2;
368:       v    += 4;
369:     }
370:     if (usecprow) z = zarray + 2*ridx[i];
371:     z[0] = sum1; z[1] = sum2;
372:     if (!usecprow) z += 2;
373:   }
374:   VecRestoreArrayRead(xx,&x);
375:   VecRestoreArrayWrite(zz,&zarray);
376:   PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
377:   return(0);
378: }

380: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
381: {
382:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
383:   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
384:   const PetscScalar *x,*xb;
385:   const MatScalar   *v;
386:   PetscErrorCode    ierr;
387:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
388:   PetscBool         usecprow=a->compressedrow.use;

390: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
391: #pragma disjoint(*v,*z,*xb)
392: #endif

395:   VecGetArrayRead(xx,&x);
396:   VecGetArrayWrite(zz,&zarray);

398:   idx = a->j;
399:   v   = a->a;
400:   if (usecprow) {
401:     mbs  = a->compressedrow.nrows;
402:     ii   = a->compressedrow.i;
403:     ridx = a->compressedrow.rindex;
404:     PetscArrayzero(zarray,3*a->mbs);
405:   } else {
406:     mbs = a->mbs;
407:     ii  = a->i;
408:     z   = zarray;
409:   }

411:   for (i=0; i<mbs; i++) {
412:     n           = ii[1] - ii[0]; ii++;
413:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
414:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
415:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
416:     for (j=0; j<n; j++) {
417:       xb = x + 3*(*idx++);
418:       x1 = xb[0];
419:       x2 = xb[1];
420:       x3 = xb[2];

422:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
423:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
424:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
425:       v    += 9;
426:     }
427:     if (usecprow) z = zarray + 3*ridx[i];
428:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
429:     if (!usecprow) z += 3;
430:   }
431:   VecRestoreArrayRead(xx,&x);
432:   VecRestoreArrayWrite(zz,&zarray);
433:   PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
434:   return(0);
435: }

437: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
438: {
439:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
440:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
441:   const PetscScalar *x,*xb;
442:   const MatScalar   *v;
443:   PetscErrorCode    ierr;
444:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
445:   PetscBool         usecprow=a->compressedrow.use;

448:   VecGetArrayRead(xx,&x);
449:   VecGetArrayWrite(zz,&zarray);

451:   idx = a->j;
452:   v   = a->a;
453:   if (usecprow) {
454:     mbs  = a->compressedrow.nrows;
455:     ii   = a->compressedrow.i;
456:     ridx = a->compressedrow.rindex;
457:     PetscArrayzero(zarray,4*a->mbs);
458:   } else {
459:     mbs = a->mbs;
460:     ii  = a->i;
461:     z   = zarray;
462:   }

464:   for (i=0; i<mbs; i++) {
465:     n = ii[1] - ii[0];
466:     ii++;
467:     sum1 = 0.0;
468:     sum2 = 0.0;
469:     sum3 = 0.0;
470:     sum4 = 0.0;

472:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
473:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
474:     for (j=0; j<n; j++) {
475:       xb    = x + 4*(*idx++);
476:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
477:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
478:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
479:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
480:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
481:       v    += 16;
482:     }
483:     if (usecprow) z = zarray + 4*ridx[i];
484:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
485:     if (!usecprow) z += 4;
486:   }
487:   VecRestoreArrayRead(xx,&x);
488:   VecRestoreArrayWrite(zz,&zarray);
489:   PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
490:   return(0);
491: }

493: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
494: {
495:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
496:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
497:   const PetscScalar *xb,*x;
498:   const MatScalar   *v;
499:   PetscErrorCode    ierr;
500:   const PetscInt    *idx,*ii,*ridx=NULL;
501:   PetscInt          mbs,i,j,n;
502:   PetscBool         usecprow=a->compressedrow.use;

505:   VecGetArrayRead(xx,&x);
506:   VecGetArrayWrite(zz,&zarray);

508:   idx = a->j;
509:   v   = a->a;
510:   if (usecprow) {
511:     mbs  = a->compressedrow.nrows;
512:     ii   = a->compressedrow.i;
513:     ridx = a->compressedrow.rindex;
514:     PetscArrayzero(zarray,5*a->mbs);
515:   } else {
516:     mbs = a->mbs;
517:     ii  = a->i;
518:     z   = zarray;
519:   }

521:   for (i=0; i<mbs; i++) {
522:     n           = ii[1] - ii[0]; ii++;
523:     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
524:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
525:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
526:     for (j=0; j<n; j++) {
527:       xb    = x + 5*(*idx++);
528:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
529:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
530:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
531:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
532:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
533:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
534:       v    += 25;
535:     }
536:     if (usecprow) z = zarray + 5*ridx[i];
537:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
538:     if (!usecprow) z += 5;
539:   }
540:   VecRestoreArrayRead(xx,&x);
541:   VecRestoreArrayWrite(zz,&zarray);
542:   PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
543:   return(0);
544: }

546: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
547: {
548:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
549:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
550:   const PetscScalar *x,*xb;
551:   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
552:   const MatScalar   *v;
553:   PetscErrorCode    ierr;
554:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
555:   PetscBool         usecprow=a->compressedrow.use;

558:   VecGetArrayRead(xx,&x);
559:   VecGetArrayWrite(zz,&zarray);

561:   idx = a->j;
562:   v   = a->a;
563:   if (usecprow) {
564:     mbs  = a->compressedrow.nrows;
565:     ii   = a->compressedrow.i;
566:     ridx = a->compressedrow.rindex;
567:     PetscArrayzero(zarray,6*a->mbs);
568:   } else {
569:     mbs = a->mbs;
570:     ii  = a->i;
571:     z   = zarray;
572:   }

574:   for (i=0; i<mbs; i++) {
575:     n  = ii[1] - ii[0];
576:     ii++;
577:     sum1 = 0.0;
578:     sum2 = 0.0;
579:     sum3 = 0.0;
580:     sum4 = 0.0;
581:     sum5 = 0.0;
582:     sum6 = 0.0;

584:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
585:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
586:     for (j=0; j<n; j++) {
587:       xb    = x + 6*(*idx++);
588:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
589:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
590:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
591:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
592:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
593:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
594:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
595:       v    += 36;
596:     }
597:     if (usecprow) z = zarray + 6*ridx[i];
598:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
599:     if (!usecprow) z += 6;
600:   }

602:   VecRestoreArrayRead(xx,&x);
603:   VecRestoreArrayWrite(zz,&zarray);
604:   PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
605:   return(0);
606: }

608: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
609: {
610:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
611:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
612:   const PetscScalar *x,*xb;
613:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
614:   const MatScalar   *v;
615:   PetscErrorCode    ierr;
616:   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
617:   PetscBool         usecprow=a->compressedrow.use;

620:   VecGetArrayRead(xx,&x);
621:   VecGetArrayWrite(zz,&zarray);

623:   idx = a->j;
624:   v   = a->a;
625:   if (usecprow) {
626:     mbs  = a->compressedrow.nrows;
627:     ii   = a->compressedrow.i;
628:     ridx = a->compressedrow.rindex;
629:     PetscArrayzero(zarray,7*a->mbs);
630:   } else {
631:     mbs = a->mbs;
632:     ii  = a->i;
633:     z   = zarray;
634:   }

636:   for (i=0; i<mbs; i++) {
637:     n  = ii[1] - ii[0];
638:     ii++;
639:     sum1 = 0.0;
640:     sum2 = 0.0;
641:     sum3 = 0.0;
642:     sum4 = 0.0;
643:     sum5 = 0.0;
644:     sum6 = 0.0;
645:     sum7 = 0.0;

647:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
648:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
649:     for (j=0; j<n; j++) {
650:       xb    = x + 7*(*idx++);
651:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
652:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
653:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
654:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
655:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
656:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
657:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
658:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
659:       v    += 49;
660:     }
661:     if (usecprow) z = zarray + 7*ridx[i];
662:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
663:     if (!usecprow) z += 7;
664:   }

666:   VecRestoreArrayRead(xx,&x);
667:   VecRestoreArrayWrite(zz,&zarray);
668:   PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
669:   return(0);
670: }

672: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
673: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
674: {
675:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
676:   PetscScalar       *z = NULL,*work,*workt,*zarray;
677:   const PetscScalar *x,*xb;
678:   const MatScalar   *v;
679:   PetscErrorCode    ierr;
680:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
681:   const PetscInt    *idx,*ii,*ridx=NULL;
682:   PetscInt          k;
683:   PetscBool         usecprow=a->compressedrow.use;

685:   __m256d a0,a1,a2,a3,a4,a5;
686:   __m256d w0,w1,w2,w3;
687:   __m256d z0,z1,z2;
688:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

691:   VecGetArrayRead(xx,&x);
692:   VecGetArrayWrite(zz,&zarray);

694:   idx = a->j;
695:   v   = a->a;
696:   if (usecprow) {
697:     mbs  = a->compressedrow.nrows;
698:     ii   = a->compressedrow.i;
699:     ridx = a->compressedrow.rindex;
700:     PetscArrayzero(zarray,bs*a->mbs);
701:   } else {
702:     mbs = a->mbs;
703:     ii  = a->i;
704:     z   = zarray;
705:   }

707:   if (!a->mult_work) {
708:     k    = PetscMax(A->rmap->n,A->cmap->n);
709:     PetscMalloc1(k+1,&a->mult_work);
710:   }

712:   work = a->mult_work;
713:   for (i=0; i<mbs; i++) {
714:     n           = ii[1] - ii[0]; ii++;
715:     workt       = work;
716:     for (j=0; j<n; j++) {
717:       xb = x + bs*(*idx++);
718:       for (k=0; k<bs; k++) workt[k] = xb[k];
719:       workt += bs;
720:     }
721:     if (usecprow) z = zarray + bs*ridx[i];

723:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

725:     for (j=0; j<n; j++) {
726:       /* first column of a */
727:       w0 = _mm256_set1_pd(work[j*9  ]);
728:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
729:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
730:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

732:       /* second column of a */
733:       w1 = _mm256_set1_pd(work[j*9+ 1]);
734:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
735:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
736:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

738:       /* third column of a */
739:       w2 = _mm256_set1_pd(work[j*9 +2]);
740:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
741:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
742:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

744:       /* fourth column of a */
745:       w3 = _mm256_set1_pd(work[j*9+ 3]);
746:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
747:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
748:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

750:       /* fifth column of a */
751:       w0 = _mm256_set1_pd(work[j*9+ 4]);
752:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
753:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
754:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

756:       /* sixth column of a */
757:       w1 = _mm256_set1_pd(work[j*9+ 5]);
758:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
759:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
760:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

762:       /* seventh column of a */
763:       w2 = _mm256_set1_pd(work[j*9+ 6]);
764:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
765:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
766:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

768:       /* eigth column of a */
769:       w3 = _mm256_set1_pd(work[j*9+ 7]);
770:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
771:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
772:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

774:       /* ninth column of a */
775:       w0 = _mm256_set1_pd(work[j*9+ 8]);
776:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
777:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
778:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
779:     }

781:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

783:     v += n*bs2;
784:     if (!usecprow) z += bs;
785:   }
786:   VecRestoreArrayRead(xx,&x);
787:   VecRestoreArrayWrite(zz,&zarray);
788:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
789:   return(0);
790: }
791: #endif

793: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
794: {
795:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
796:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
797:   const PetscScalar *x,*xb;
798:   PetscScalar       *zarray,xv;
799:   const MatScalar   *v;
800:   PetscErrorCode    ierr;
801:   const PetscInt    *ii,*ij=a->j,*idx;
802:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
803:   PetscBool         usecprow=a->compressedrow.use;

806:   VecGetArrayRead(xx,&x);
807:   VecGetArrayWrite(zz,&zarray);

809:   v = a->a;
810:   if (usecprow) {
811:     mbs  = a->compressedrow.nrows;
812:     ii   = a->compressedrow.i;
813:     ridx = a->compressedrow.rindex;
814:     PetscArrayzero(zarray,11*a->mbs);
815:   } else {
816:     mbs = a->mbs;
817:     ii  = a->i;
818:     z   = zarray;
819:   }

821:   for (i=0; i<mbs; i++) {
822:     n    = ii[i+1] - ii[i];
823:     idx  = ij + ii[i];
824:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
825:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;

827:     for (j=0; j<n; j++) {
828:       xb = x + 11*(idx[j]);

830:       for (k=0; k<11; k++) {
831:         xv     =  xb[k];
832:         sum1  += v[0]*xv;
833:         sum2  += v[1]*xv;
834:         sum3  += v[2]*xv;
835:         sum4  += v[3]*xv;
836:         sum5  += v[4]*xv;
837:         sum6  += v[5]*xv;
838:         sum7  += v[6]*xv;
839:         sum8  += v[7]*xv;
840:         sum9  += v[8]*xv;
841:         sum10 += v[9]*xv;
842:         sum11 += v[10]*xv;
843:         v     += 11;
844:       }
845:     }
846:     if (usecprow) z = zarray + 11*ridx[i];
847:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
848:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;

850:     if (!usecprow) z += 11;
851:   }

853:   VecRestoreArrayRead(xx,&x);
854:   VecRestoreArrayWrite(zz,&zarray);
855:   PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
856:   return(0);
857: }

859: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
860: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
861: {
862:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
863:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
864:   const PetscScalar *x,*xb;
865:   PetscScalar       *zarray,xv;
866:   const MatScalar   *v;
867:   PetscErrorCode    ierr;
868:   const PetscInt    *ii,*ij=a->j,*idx;
869:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
870:   PetscBool         usecprow=a->compressedrow.use;

873:   VecGetArrayRead(xx,&x);
874:   VecGetArrayWrite(zz,&zarray);

876:   v = a->a;
877:   if (usecprow) {
878:     mbs  = a->compressedrow.nrows;
879:     ii   = a->compressedrow.i;
880:     ridx = a->compressedrow.rindex;
881:     PetscArrayzero(zarray,12*a->mbs);
882:   } else {
883:     mbs = a->mbs;
884:     ii  = a->i;
885:     z   = zarray;
886:   }

888:   for (i=0; i<mbs; i++) {
889:     n    = ii[i+1] - ii[i];
890:     idx  = ij + ii[i];
891:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
892:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;

894:     for (j=0; j<n; j++) {
895:       xb = x + 12*(idx[j]);

897:       for (k=0; k<12; k++) {
898:         xv     =  xb[k];
899:         sum1  += v[0]*xv;
900:         sum2  += v[1]*xv;
901:         sum3  += v[2]*xv;
902:         sum4  += v[3]*xv;
903:         sum5  += v[4]*xv;
904:         sum6  += v[5]*xv;
905:         sum7  += v[6]*xv;
906:         sum8  += v[7]*xv;
907:         sum9  += v[8]*xv;
908:         sum10 += v[9]*xv;
909:         sum11 += v[10]*xv;
910:         sum12 += v[11]*xv;
911:         v     += 12;
912:       }
913:     }
914:     if (usecprow) z = zarray + 12*ridx[i];
915:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
916:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
917:     if (!usecprow) z += 12;
918:   }
919:   VecRestoreArrayRead(xx,&x);
920:   VecRestoreArrayWrite(zz,&zarray);
921:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
922:   return(0);
923: }

925: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
926: {
927:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
928:   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
929:   const PetscScalar *x,*xb;
930:   PetscScalar       *zarray,*yarray,xv;
931:   const MatScalar   *v;
932:   PetscErrorCode    ierr;
933:   const PetscInt    *ii,*ij=a->j,*idx;
934:   PetscInt          mbs = a->mbs,i,j,k,n,*ridx=NULL;
935:   PetscBool         usecprow=a->compressedrow.use;

938:   VecGetArrayRead(xx,&x);
939:   VecGetArrayPair(yy,zz,&yarray,&zarray);

941:   v = a->a;
942:   if (usecprow) {
943:    if (zz != yy) {
944:      PetscArraycpy(zarray,yarray,12*mbs);
945:     }
946:     mbs  = a->compressedrow.nrows;
947:     ii   = a->compressedrow.i;
948:     ridx = a->compressedrow.rindex;
949:   } else {
950:     ii  = a->i;
951:     y   = yarray;
952:     z   = zarray;
953:   }

955:   for (i=0; i<mbs; i++) {
956:     n    = ii[i+1] - ii[i];
957:     idx  = ij + ii[i];

959:     if (usecprow){
960:       y = yarray + 12*ridx[i];
961:       z = zarray + 12*ridx[i];
962:     }
963:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
964:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];

966:     for (j=0; j<n; j++) {
967:       xb = x + 12*(idx[j]);

969:       for (k=0; k<12; k++) {
970:         xv     =  xb[k];
971:         sum1  += v[0]*xv;
972:         sum2  += v[1]*xv;
973:         sum3  += v[2]*xv;
974:         sum4  += v[3]*xv;
975:         sum5  += v[4]*xv;
976:         sum6  += v[5]*xv;
977:         sum7  += v[6]*xv;
978:         sum8  += v[7]*xv;
979:         sum9  += v[8]*xv;
980:         sum10 += v[9]*xv;
981:         sum11 += v[10]*xv;
982:         sum12 += v[11]*xv;
983:         v     += 12;
984:       }
985:     }

987:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
988:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
989:     if (!usecprow) {
990:       y += 12;
991:       z += 12;
992:     }
993:   }
994:   VecRestoreArrayRead(xx,&x);
995:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
996:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
997:   return(0);
998: }

1000: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1001: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
1002: {
1003:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1004:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1005:   const PetscScalar *x,*xb;
1006:   PetscScalar       x1,x2,x3,x4,*zarray;
1007:   const MatScalar   *v;
1008:   PetscErrorCode    ierr;
1009:   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1010:   PetscInt          mbs,i,j,n;
1011:   PetscBool         usecprow=a->compressedrow.use;

1014:   VecGetArrayRead(xx,&x);
1015:   VecGetArrayWrite(zz,&zarray);

1017:   v = a->a;
1018:   if (usecprow) {
1019:     mbs  = a->compressedrow.nrows;
1020:     ii   = a->compressedrow.i;
1021:     ridx = a->compressedrow.rindex;
1022:     PetscArrayzero(zarray,12*a->mbs);
1023:   } else {
1024:     mbs = a->mbs;
1025:     ii  = a->i;
1026:     z   = zarray;
1027:   }

1029:   for (i=0; i<mbs; i++) {
1030:     n    = ii[i+1] - ii[i];
1031:     idx  = ij + ii[i];

1033:     sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1034:     for (j=0; j<n; j++) {
1035:       xb = x + 12*(idx[j]);
1036:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1038:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1039:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1040:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1041:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1042:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1043:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1044:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1045:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1046:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1047:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1048:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1049:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1050:       v += 48;

1052:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1054:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1055:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1056:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1057:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1058:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1059:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1060:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1061:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1062:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1063:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1064:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1065:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1066:       v     += 48;

1068:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1069:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1070:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1071:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1072:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1073:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1074:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1075:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1076:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1077:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1078:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1079:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1080:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1081:       v     += 48;

1083:     }
1084:     if (usecprow) z = zarray + 12*ridx[i];
1085:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1086:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1087:     if (!usecprow) z += 12;
1088:   }
1089:   VecRestoreArrayRead(xx,&x);
1090:   VecRestoreArrayWrite(zz,&zarray);
1091:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1092:   return(0);
1093: }

1095: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1096: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1097: {
1098:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1099:   PetscScalar       *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1100:   const PetscScalar *x,*xb;
1101:   PetscScalar       x1,x2,x3,x4,*zarray,*yarray;
1102:   const MatScalar   *v;
1103:   PetscErrorCode    ierr;
1104:   const PetscInt    *ii,*ij=a->j,*idx,*ridx=NULL;
1105:   PetscInt          mbs = a->mbs,i,j,n;
1106:   PetscBool         usecprow=a->compressedrow.use;

1109:   VecGetArrayRead(xx,&x);
1110:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1112:   v = a->a;
1113:   if (usecprow) {
1114:     if (zz != yy) {
1115:       PetscArraycpy(zarray,yarray,12*mbs);
1116:     }
1117:     mbs  = a->compressedrow.nrows;
1118:     ii   = a->compressedrow.i;
1119:     ridx = a->compressedrow.rindex;
1120:   } else {
1121:     ii  = a->i;
1122:     y   = yarray;
1123:     z   = zarray;
1124:   }

1126:   for (i=0; i<mbs; i++) {
1127:     n    = ii[i+1] - ii[i];
1128:     idx  = ij + ii[i];

1130:     if (usecprow) {
1131:       y = yarray + 12*ridx[i];
1132:       z = zarray + 12*ridx[i];
1133:     }
1134:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];  sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1135:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];

1137:     for (j=0; j<n; j++) {
1138:       xb = x + 12*(idx[j]);
1139:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1141:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1142:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1143:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1144:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1145:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1146:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1147:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1148:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1149:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1150:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1151:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1152:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1153:       v += 48;

1155:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1157:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1158:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1159:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1160:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1161:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1162:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1163:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1164:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1165:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1166:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1167:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1168:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1169:       v     += 48;

1171:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1172:       sum1  += v[0]*x1 + v[12]*x2 + v[24]*x3   + v[36]*x4;
1173:       sum2  += v[1]*x1 + v[13]*x2 + v[25]*x3   + v[37]*x4;
1174:       sum3  += v[2]*x1 + v[14]*x2 + v[26]*x3  + v[38]*x4;
1175:       sum4  += v[3]*x1 + v[15]*x2 + v[27]*x3  + v[39]*x4;
1176:       sum5  += v[4]*x1 + v[16]*x2 + v[28]*x3   + v[40]*x4;
1177:       sum6  += v[5]*x1 + v[17]*x2 + v[29]*x3   + v[41]*x4;
1178:       sum7  += v[6]*x1 + v[18]*x2 + v[30]*x3  + v[42]*x4;
1179:       sum8  += v[7]*x1 + v[19]*x2 + v[31]*x3  + v[43]*x4;
1180:       sum9  += v[8]*x1 + v[20]*x2 + v[32]*x3   + v[44]*x4;
1181:       sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3   + v[45]*x4;
1182:       sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3  + v[46]*x4;
1183:       sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3  + v[47]*x4;
1184:       v     += 48;

1186:     }
1187:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1188:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1189:     if (!usecprow) {
1190:       y += 12;
1191:       z += 12;
1192:     }
1193:   }
1194:   VecRestoreArrayRead(xx,&x);
1195:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1196:   PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1197:   return(0);
1198: }

1200: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1201: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1202: {
1203:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1204:   PetscScalar       *z = NULL,*zarray;
1205:   const PetscScalar *x,*work;
1206:   const MatScalar   *v = a->a;
1207:   PetscErrorCode    ierr;
1208:   PetscInt          mbs,i,j,n;
1209:   const PetscInt    *idx = a->j,*ii,*ridx=NULL;
1210:   PetscBool         usecprow=a->compressedrow.use;
1211:   const PetscInt    bs = 12, bs2 = 144;

1213:   __m256d a0,a1,a2,a3,a4,a5;
1214:   __m256d w0,w1,w2,w3;
1215:   __m256d z0,z1,z2;

1218:   VecGetArrayRead(xx,&x);
1219:   VecGetArrayWrite(zz,&zarray);

1221:   if (usecprow) {
1222:     mbs  = a->compressedrow.nrows;
1223:     ii   = a->compressedrow.i;
1224:     ridx = a->compressedrow.rindex;
1225:     PetscArrayzero(zarray,bs*a->mbs);
1226:   } else {
1227:     mbs = a->mbs;
1228:     ii  = a->i;
1229:     z   = zarray;
1230:   }

1232:   for (i=0; i<mbs; i++) {
1233:     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();

1235:     n  = ii[1] - ii[0]; ii++;
1236:     for (j=0; j<n; j++) {
1237:       work = x + bs*(*idx++);

1239:       /* first column of a */
1240:       w0 = _mm256_set1_pd(work[0]);
1241:       a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1242:       a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1243:       a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);

1245:       /* second column of a */
1246:       w1 = _mm256_set1_pd(work[1]);
1247:       a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1248:       a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1249:       a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);

1251:       /* third column of a */
1252:       w2 = _mm256_set1_pd(work[2]);
1253:       a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1254:       a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1255:       a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);

1257:       /* fourth column of a */
1258:       w3 = _mm256_set1_pd(work[3]);
1259:       a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1260:       a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1261:       a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);

1263:       /* fifth column of a */
1264:       w0 = _mm256_set1_pd(work[4]);
1265:       a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1266:       a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1267:       a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);

1269:       /* sixth column of a */
1270:       w1 = _mm256_set1_pd(work[5]);
1271:       a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1272:       a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1273:       a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);

1275:       /* seventh column of a */
1276:       w2 = _mm256_set1_pd(work[6]);
1277:       a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1278:       a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1279:       a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);

1281:       /* eigth column of a */
1282:       w3 = _mm256_set1_pd(work[7]);
1283:       a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1284:       a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1285:       a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);

1287:       /* ninth column of a */
1288:       w0 = _mm256_set1_pd(work[8]);
1289:       a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1290:       a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1291:       a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);

1293:       /* tenth column of a */
1294:       w1 = _mm256_set1_pd(work[9]);
1295:       a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1296:       a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1297:       a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);

1299:       /* eleventh column of a */
1300:       w2 = _mm256_set1_pd(work[10]);
1301:       a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1302:       a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1303:       a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);

1305:       /* twelveth column of a */
1306:       w3 = _mm256_set1_pd(work[11]);
1307:       a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1308:       a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1309:       a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);

1311:       v += bs2;
1312:     }
1313:     if (usecprow) z = zarray + bs*ridx[i];
1314:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1315:     if (!usecprow) z += bs;
1316:   }
1317:   VecRestoreArrayRead(xx,&x);
1318:   VecRestoreArrayWrite(zz,&zarray);
1319:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1320:   return(0);
1321: }
1322: #endif

1324: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1325: /* Default MatMult for block size 15 */
1326: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1327: {
1328:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1329:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1330:   const PetscScalar *x,*xb;
1331:   PetscScalar       *zarray,xv;
1332:   const MatScalar   *v;
1333:   PetscErrorCode    ierr;
1334:   const PetscInt    *ii,*ij=a->j,*idx;
1335:   PetscInt          mbs,i,j,k,n,*ridx=NULL;
1336:   PetscBool         usecprow=a->compressedrow.use;

1339:   VecGetArrayRead(xx,&x);
1340:   VecGetArrayWrite(zz,&zarray);

1342:   v = a->a;
1343:   if (usecprow) {
1344:     mbs  = a->compressedrow.nrows;
1345:     ii   = a->compressedrow.i;
1346:     ridx = a->compressedrow.rindex;
1347:     PetscArrayzero(zarray,15*a->mbs);
1348:   } else {
1349:     mbs = a->mbs;
1350:     ii  = a->i;
1351:     z   = zarray;
1352:   }

1354:   for (i=0; i<mbs; i++) {
1355:     n    = ii[i+1] - ii[i];
1356:     idx  = ij + ii[i];
1357:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1358:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1360:     for (j=0; j<n; j++) {
1361:       xb = x + 15*(idx[j]);

1363:       for (k=0; k<15; k++) {
1364:         xv     =  xb[k];
1365:         sum1  += v[0]*xv;
1366:         sum2  += v[1]*xv;
1367:         sum3  += v[2]*xv;
1368:         sum4  += v[3]*xv;
1369:         sum5  += v[4]*xv;
1370:         sum6  += v[5]*xv;
1371:         sum7  += v[6]*xv;
1372:         sum8  += v[7]*xv;
1373:         sum9  += v[8]*xv;
1374:         sum10 += v[9]*xv;
1375:         sum11 += v[10]*xv;
1376:         sum12 += v[11]*xv;
1377:         sum13 += v[12]*xv;
1378:         sum14 += v[13]*xv;
1379:         sum15 += v[14]*xv;
1380:         v     += 15;
1381:       }
1382:     }
1383:     if (usecprow) z = zarray + 15*ridx[i];
1384:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1385:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1387:     if (!usecprow) z += 15;
1388:   }

1390:   VecRestoreArrayRead(xx,&x);
1391:   VecRestoreArrayWrite(zz,&zarray);
1392:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1393:   return(0);
1394: }

1396: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1397: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1398: {
1399:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1400:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1401:   const PetscScalar *x,*xb;
1402:   PetscScalar       x1,x2,x3,x4,*zarray;
1403:   const MatScalar   *v;
1404:   PetscErrorCode    ierr;
1405:   const PetscInt    *ii,*ij=a->j,*idx;
1406:   PetscInt          mbs,i,j,n,*ridx=NULL;
1407:   PetscBool         usecprow=a->compressedrow.use;

1410:   VecGetArrayRead(xx,&x);
1411:   VecGetArrayWrite(zz,&zarray);

1413:   v = a->a;
1414:   if (usecprow) {
1415:     mbs  = a->compressedrow.nrows;
1416:     ii   = a->compressedrow.i;
1417:     ridx = a->compressedrow.rindex;
1418:     PetscArrayzero(zarray,15*a->mbs);
1419:   } else {
1420:     mbs = a->mbs;
1421:     ii  = a->i;
1422:     z   = zarray;
1423:   }

1425:   for (i=0; i<mbs; i++) {
1426:     n    = ii[i+1] - ii[i];
1427:     idx  = ij + ii[i];
1428:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1429:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1431:     for (j=0; j<n; j++) {
1432:       xb = x + 15*(idx[j]);
1433:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];

1435:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1436:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1437:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1438:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1439:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1440:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1441:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1442:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1443:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1444:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1445:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1446:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1447:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1448:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1449:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;

1451:       v += 60;

1453:       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];

1455:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1456:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1457:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1458:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1459:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1460:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1461:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1462:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1463:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1464:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1465:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1466:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1467:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1468:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1469:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1470:       v     += 60;

1472:       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1473:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1474:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1475:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1476:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1477:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1478:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1479:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1480:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1481:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1482:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1483:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1484:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1485:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1486:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1487:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1488:       v     += 60;

1490:       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1491:       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1492:       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1493:       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1494:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1495:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1496:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1497:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1498:       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1499:       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1500:       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1501:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1502:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1503:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1504:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1505:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1506:       v     += 45;
1507:     }
1508:     if (usecprow) z = zarray + 15*ridx[i];
1509:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1510:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1512:     if (!usecprow) z += 15;
1513:   }

1515:   VecRestoreArrayRead(xx,&x);
1516:   VecRestoreArrayWrite(zz,&zarray);
1517:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1518:   return(0);
1519: }

1521: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1522: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1523: {
1524:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1525:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1526:   const PetscScalar *x,*xb;
1527:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1528:   const MatScalar   *v;
1529:   PetscErrorCode    ierr;
1530:   const PetscInt    *ii,*ij=a->j,*idx;
1531:   PetscInt          mbs,i,j,n,*ridx=NULL;
1532:   PetscBool         usecprow=a->compressedrow.use;

1535:   VecGetArrayRead(xx,&x);
1536:   VecGetArrayWrite(zz,&zarray);

1538:   v = a->a;
1539:   if (usecprow) {
1540:     mbs  = a->compressedrow.nrows;
1541:     ii   = a->compressedrow.i;
1542:     ridx = a->compressedrow.rindex;
1543:     PetscArrayzero(zarray,15*a->mbs);
1544:   } else {
1545:     mbs = a->mbs;
1546:     ii  = a->i;
1547:     z   = zarray;
1548:   }

1550:   for (i=0; i<mbs; i++) {
1551:     n    = ii[i+1] - ii[i];
1552:     idx  = ij + ii[i];
1553:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1554:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1556:     for (j=0; j<n; j++) {
1557:       xb = x + 15*(idx[j]);
1558:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1559:       x8 = xb[7];

1561:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1562:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1563:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1564:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1565:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1566:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1567:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1568:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1569:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1570:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1571:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1572:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1573:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1574:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1575:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1576:       v     += 120;

1578:       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];

1580:       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1581:       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1582:       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1583:       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1584:       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1585:       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1586:       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1587:       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1588:       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1589:       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1590:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1591:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1592:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1593:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1594:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1595:       v     += 105;
1596:     }
1597:     if (usecprow) z = zarray + 15*ridx[i];
1598:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1599:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1601:     if (!usecprow) z += 15;
1602:   }

1604:   VecRestoreArrayRead(xx,&x);
1605:   VecRestoreArrayWrite(zz,&zarray);
1606:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1607:   return(0);
1608: }

1610: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1611: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1612: {
1613:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1614:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1615:   const PetscScalar *x,*xb;
1616:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1617:   const MatScalar   *v;
1618:   PetscErrorCode    ierr;
1619:   const PetscInt    *ii,*ij=a->j,*idx;
1620:   PetscInt          mbs,i,j,n,*ridx=NULL;
1621:   PetscBool         usecprow=a->compressedrow.use;

1624:   VecGetArrayRead(xx,&x);
1625:   VecGetArrayWrite(zz,&zarray);

1627:   v = a->a;
1628:   if (usecprow) {
1629:     mbs  = a->compressedrow.nrows;
1630:     ii   = a->compressedrow.i;
1631:     ridx = a->compressedrow.rindex;
1632:     PetscArrayzero(zarray,15*a->mbs);
1633:   } else {
1634:     mbs = a->mbs;
1635:     ii  = a->i;
1636:     z   = zarray;
1637:   }

1639:   for (i=0; i<mbs; i++) {
1640:     n    = ii[i+1] - ii[i];
1641:     idx  = ij + ii[i];
1642:     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1643:     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;

1645:     for (j=0; j<n; j++) {
1646:       xb = x + 15*(idx[j]);
1647:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1648:       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];

1650:       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1651:       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1652:       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1653:       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1654:       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1655:       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1656:       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1657:       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1658:       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1659:       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1660:       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1661:       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1662:       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1663:       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1664:       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1665:       v     += 225;
1666:     }
1667:     if (usecprow) z = zarray + 15*ridx[i];
1668:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1669:     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;

1671:     if (!usecprow) z += 15;
1672:   }

1674:   VecRestoreArrayRead(xx,&x);
1675:   VecRestoreArrayWrite(zz,&zarray);
1676:   PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1677:   return(0);
1678: }

1680: /*
1681:     This will not work with MatScalar == float because it calls the BLAS
1682: */
1683: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1684: {
1685:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1686:   PetscScalar       *z = NULL,*work,*workt,*zarray;
1687:   const PetscScalar *x,*xb;
1688:   const MatScalar   *v;
1689:   PetscErrorCode    ierr;
1690:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691:   const PetscInt    *idx,*ii,*ridx=NULL;
1692:   PetscInt          ncols,k;
1693:   PetscBool         usecprow=a->compressedrow.use;

1696:   VecGetArrayRead(xx,&x);
1697:   VecGetArrayWrite(zz,&zarray);

1699:   idx = a->j;
1700:   v   = a->a;
1701:   if (usecprow) {
1702:     mbs  = a->compressedrow.nrows;
1703:     ii   = a->compressedrow.i;
1704:     ridx = a->compressedrow.rindex;
1705:     PetscArrayzero(zarray,bs*a->mbs);
1706:   } else {
1707:     mbs = a->mbs;
1708:     ii  = a->i;
1709:     z   = zarray;
1710:   }

1712:   if (!a->mult_work) {
1713:     k    = PetscMax(A->rmap->n,A->cmap->n);
1714:     PetscMalloc1(k+1,&a->mult_work);
1715:   }
1716:   work = a->mult_work;
1717:   for (i=0; i<mbs; i++) {
1718:     n           = ii[1] - ii[0]; ii++;
1719:     ncols       = n*bs;
1720:     workt       = work;
1721:     for (j=0; j<n; j++) {
1722:       xb = x + bs*(*idx++);
1723:       for (k=0; k<bs; k++) workt[k] = xb[k];
1724:       workt += bs;
1725:     }
1726:     if (usecprow) z = zarray + bs*ridx[i];
1727:     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1728:     v += n*bs2;
1729:     if (!usecprow) z += bs;
1730:   }
1731:   VecRestoreArrayRead(xx,&x);
1732:   VecRestoreArrayWrite(zz,&zarray);
1733:   PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1734:   return(0);
1735: }

1737: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1738: {
1739:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1740:   const PetscScalar *x;
1741:   PetscScalar       *y,*z,sum;
1742:   const MatScalar   *v;
1743:   PetscErrorCode    ierr;
1744:   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1745:   const PetscInt    *idx,*ii;
1746:   PetscBool         usecprow=a->compressedrow.use;

1749:   VecGetArrayRead(xx,&x);
1750:   VecGetArrayPair(yy,zz,&y,&z);

1752:   idx = a->j;
1753:   v   = a->a;
1754:   if (usecprow) {
1755:     if (zz != yy) {
1756:       PetscArraycpy(z,y,mbs);
1757:     }
1758:     mbs  = a->compressedrow.nrows;
1759:     ii   = a->compressedrow.i;
1760:     ridx = a->compressedrow.rindex;
1761:   } else {
1762:     ii = a->i;
1763:   }

1765:   for (i=0; i<mbs; i++) {
1766:     n = ii[1] - ii[0];
1767:     ii++;
1768:     if (!usecprow) {
1769:       sum = y[i];
1770:     } else {
1771:       sum = y[ridx[i]];
1772:     }
1773:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1774:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1775:     PetscSparseDensePlusDot(sum,x,v,idx,n);
1776:     v   += n;
1777:     idx += n;
1778:     if (usecprow) {
1779:       z[ridx[i]] = sum;
1780:     } else {
1781:       z[i] = sum;
1782:     }
1783:   }
1784:   VecRestoreArrayRead(xx,&x);
1785:   VecRestoreArrayPair(yy,zz,&y,&z);
1786:   PetscLogFlops(2.0*a->nz);
1787:   return(0);
1788: }

1790: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1791: {
1792:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1793:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1794:   const PetscScalar *x,*xb;
1795:   PetscScalar       x1,x2,*yarray,*zarray;
1796:   const MatScalar   *v;
1797:   PetscErrorCode    ierr;
1798:   PetscInt          mbs = a->mbs,i,n,j;
1799:   const PetscInt    *idx,*ii,*ridx = NULL;
1800:   PetscBool         usecprow = a->compressedrow.use;

1803:   VecGetArrayRead(xx,&x);
1804:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1806:   idx = a->j;
1807:   v   = a->a;
1808:   if (usecprow) {
1809:     if (zz != yy) {
1810:       PetscArraycpy(zarray,yarray,2*mbs);
1811:     }
1812:     mbs  = a->compressedrow.nrows;
1813:     ii   = a->compressedrow.i;
1814:     ridx = a->compressedrow.rindex;
1815:   } else {
1816:     ii = a->i;
1817:     y  = yarray;
1818:     z  = zarray;
1819:   }

1821:   for (i=0; i<mbs; i++) {
1822:     n = ii[1] - ii[0]; ii++;
1823:     if (usecprow) {
1824:       z = zarray + 2*ridx[i];
1825:       y = yarray + 2*ridx[i];
1826:     }
1827:     sum1 = y[0]; sum2 = y[1];
1828:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1829:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1830:     for (j=0; j<n; j++) {
1831:       xb = x + 2*(*idx++);
1832:       x1 = xb[0];
1833:       x2 = xb[1];

1835:       sum1 += v[0]*x1 + v[2]*x2;
1836:       sum2 += v[1]*x1 + v[3]*x2;
1837:       v    += 4;
1838:     }
1839:     z[0] = sum1; z[1] = sum2;
1840:     if (!usecprow) {
1841:       z += 2; y += 2;
1842:     }
1843:   }
1844:   VecRestoreArrayRead(xx,&x);
1845:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1846:   PetscLogFlops(4.0*a->nz);
1847:   return(0);
1848: }

1850: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1851: {
1852:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1853:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1854:   const PetscScalar *x,*xb;
1855:   const MatScalar   *v;
1856:   PetscErrorCode    ierr;
1857:   PetscInt          mbs = a->mbs,i,j,n;
1858:   const PetscInt    *idx,*ii,*ridx = NULL;
1859:   PetscBool         usecprow = a->compressedrow.use;

1862:   VecGetArrayRead(xx,&x);
1863:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1865:   idx = a->j;
1866:   v   = a->a;
1867:   if (usecprow) {
1868:     if (zz != yy) {
1869:       PetscArraycpy(zarray,yarray,3*mbs);
1870:     }
1871:     mbs  = a->compressedrow.nrows;
1872:     ii   = a->compressedrow.i;
1873:     ridx = a->compressedrow.rindex;
1874:   } else {
1875:     ii = a->i;
1876:     y  = yarray;
1877:     z  = zarray;
1878:   }

1880:   for (i=0; i<mbs; i++) {
1881:     n = ii[1] - ii[0]; ii++;
1882:     if (usecprow) {
1883:       z = zarray + 3*ridx[i];
1884:       y = yarray + 3*ridx[i];
1885:     }
1886:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1887:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1888:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1889:     for (j=0; j<n; j++) {
1890:       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1891:       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1892:       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1893:       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1894:       v    += 9;
1895:     }
1896:     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1897:     if (!usecprow) {
1898:       z += 3; y += 3;
1899:     }
1900:   }
1901:   VecRestoreArrayRead(xx,&x);
1902:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1903:   PetscLogFlops(18.0*a->nz);
1904:   return(0);
1905: }

1907: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1908: {
1909:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1910:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1911:   const PetscScalar *x,*xb;
1912:   const MatScalar   *v;
1913:   PetscErrorCode    ierr;
1914:   PetscInt          mbs = a->mbs,i,j,n;
1915:   const PetscInt    *idx,*ii,*ridx=NULL;
1916:   PetscBool         usecprow=a->compressedrow.use;

1919:   VecGetArrayRead(xx,&x);
1920:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1922:   idx = a->j;
1923:   v   = a->a;
1924:   if (usecprow) {
1925:     if (zz != yy) {
1926:       PetscArraycpy(zarray,yarray,4*mbs);
1927:     }
1928:     mbs  = a->compressedrow.nrows;
1929:     ii   = a->compressedrow.i;
1930:     ridx = a->compressedrow.rindex;
1931:   } else {
1932:     ii = a->i;
1933:     y  = yarray;
1934:     z  = zarray;
1935:   }

1937:   for (i=0; i<mbs; i++) {
1938:     n = ii[1] - ii[0]; ii++;
1939:     if (usecprow) {
1940:       z = zarray + 4*ridx[i];
1941:       y = yarray + 4*ridx[i];
1942:     }
1943:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1944:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1945:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1946:     for (j=0; j<n; j++) {
1947:       xb    = x + 4*(*idx++);
1948:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1949:       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1950:       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1951:       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1952:       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1953:       v    += 16;
1954:     }
1955:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1956:     if (!usecprow) {
1957:       z += 4; y += 4;
1958:     }
1959:   }
1960:   VecRestoreArrayRead(xx,&x);
1961:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1962:   PetscLogFlops(32.0*a->nz);
1963:   return(0);
1964: }

1966: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1967: {
1968:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1969:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1970:   const PetscScalar *x,*xb;
1971:   PetscScalar       *yarray,*zarray;
1972:   const MatScalar   *v;
1973:   PetscErrorCode    ierr;
1974:   PetscInt          mbs = a->mbs,i,j,n;
1975:   const PetscInt    *idx,*ii,*ridx = NULL;
1976:   PetscBool         usecprow=a->compressedrow.use;

1979:   VecGetArrayRead(xx,&x);
1980:   VecGetArrayPair(yy,zz,&yarray,&zarray);

1982:   idx = a->j;
1983:   v   = a->a;
1984:   if (usecprow) {
1985:     if (zz != yy) {
1986:       PetscArraycpy(zarray,yarray,5*mbs);
1987:     }
1988:     mbs  = a->compressedrow.nrows;
1989:     ii   = a->compressedrow.i;
1990:     ridx = a->compressedrow.rindex;
1991:   } else {
1992:     ii = a->i;
1993:     y  = yarray;
1994:     z  = zarray;
1995:   }

1997:   for (i=0; i<mbs; i++) {
1998:     n = ii[1] - ii[0]; ii++;
1999:     if (usecprow) {
2000:       z = zarray + 5*ridx[i];
2001:       y = yarray + 5*ridx[i];
2002:     }
2003:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
2004:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2005:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2006:     for (j=0; j<n; j++) {
2007:       xb    = x + 5*(*idx++);
2008:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2009:       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2010:       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2011:       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2012:       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2013:       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2014:       v    += 25;
2015:     }
2016:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2017:     if (!usecprow) {
2018:       z += 5; y += 5;
2019:     }
2020:   }
2021:   VecRestoreArrayRead(xx,&x);
2022:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2023:   PetscLogFlops(50.0*a->nz);
2024:   return(0);
2025: }

2027: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2028: {
2029:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2030:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2031:   const PetscScalar *x,*xb;
2032:   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2033:   const MatScalar   *v;
2034:   PetscErrorCode    ierr;
2035:   PetscInt          mbs = a->mbs,i,j,n;
2036:   const PetscInt    *idx,*ii,*ridx=NULL;
2037:   PetscBool         usecprow=a->compressedrow.use;

2040:   VecGetArrayRead(xx,&x);
2041:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2043:   idx = a->j;
2044:   v   = a->a;
2045:   if (usecprow) {
2046:     if (zz != yy) {
2047:       PetscArraycpy(zarray,yarray,6*mbs);
2048:     }
2049:     mbs  = a->compressedrow.nrows;
2050:     ii   = a->compressedrow.i;
2051:     ridx = a->compressedrow.rindex;
2052:   } else {
2053:     ii = a->i;
2054:     y  = yarray;
2055:     z  = zarray;
2056:   }

2058:   for (i=0; i<mbs; i++) {
2059:     n = ii[1] - ii[0]; ii++;
2060:     if (usecprow) {
2061:       z = zarray + 6*ridx[i];
2062:       y = yarray + 6*ridx[i];
2063:     }
2064:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2065:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2066:     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2067:     for (j=0; j<n; j++) {
2068:       xb    = x + 6*(*idx++);
2069:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2070:       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
2071:       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
2072:       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
2073:       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
2074:       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
2075:       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
2076:       v    += 36;
2077:     }
2078:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2079:     if (!usecprow) {
2080:       z += 6; y += 6;
2081:     }
2082:   }
2083:   VecRestoreArrayRead(xx,&x);
2084:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2085:   PetscLogFlops(72.0*a->nz);
2086:   return(0);
2087: }

2089: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2090: {
2091:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2092:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2093:   const PetscScalar *x,*xb;
2094:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2095:   const MatScalar   *v;
2096:   PetscErrorCode    ierr;
2097:   PetscInt          mbs = a->mbs,i,j,n;
2098:   const PetscInt    *idx,*ii,*ridx = NULL;
2099:   PetscBool         usecprow=a->compressedrow.use;

2102:   VecGetArrayRead(xx,&x);
2103:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2105:   idx = a->j;
2106:   v   = a->a;
2107:   if (usecprow) {
2108:     if (zz != yy) {
2109:       PetscArraycpy(zarray,yarray,7*mbs);
2110:     }
2111:     mbs  = a->compressedrow.nrows;
2112:     ii   = a->compressedrow.i;
2113:     ridx = a->compressedrow.rindex;
2114:   } else {
2115:     ii = a->i;
2116:     y  = yarray;
2117:     z  = zarray;
2118:   }

2120:   for (i=0; i<mbs; i++) {
2121:     n = ii[1] - ii[0]; ii++;
2122:     if (usecprow) {
2123:       z = zarray + 7*ridx[i];
2124:       y = yarray + 7*ridx[i];
2125:     }
2126:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2127:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2128:     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2129:     for (j=0; j<n; j++) {
2130:       xb    = x + 7*(*idx++);
2131:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2132:       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2133:       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2134:       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2135:       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2136:       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2137:       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2138:       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2139:       v    += 49;
2140:     }
2141:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2142:     if (!usecprow) {
2143:       z += 7; y += 7;
2144:     }
2145:   }
2146:   VecRestoreArrayRead(xx,&x);
2147:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2148:   PetscLogFlops(98.0*a->nz);
2149:   return(0);
2150: }

2152: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2153: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2154: {
2155:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2156:   PetscScalar       *z = NULL,*work,*workt,*zarray;
2157:   const PetscScalar *x,*xb;
2158:   const MatScalar   *v;
2159:   PetscErrorCode    ierr;
2160:   PetscInt          mbs,i,j,n;
2161:   PetscInt          k;
2162:   PetscBool         usecprow=a->compressedrow.use;
2163:   const PetscInt    *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;

2165:   __m256d a0,a1,a2,a3,a4,a5;
2166:   __m256d w0,w1,w2,w3;
2167:   __m256d z0,z1,z2;
2168:   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);

2171:   VecCopy(yy,zz);
2172:   VecGetArrayRead(xx,&x);
2173:   VecGetArray(zz,&zarray);

2175:   idx = a->j;
2176:   v   = a->a;
2177:   if (usecprow) {
2178:     mbs  = a->compressedrow.nrows;
2179:     ii   = a->compressedrow.i;
2180:     ridx = a->compressedrow.rindex;
2181:   } else {
2182:     mbs = a->mbs;
2183:     ii  = a->i;
2184:     z   = zarray;
2185:   }

2187:   if (!a->mult_work) {
2188:     k    = PetscMax(A->rmap->n,A->cmap->n);
2189:     PetscMalloc1(k+1,&a->mult_work);
2190:   }

2192:   work = a->mult_work;
2193:   for (i=0; i<mbs; i++) {
2194:     n           = ii[1] - ii[0]; ii++;
2195:     workt       = work;
2196:     for (j=0; j<n; j++) {
2197:       xb = x + bs*(*idx++);
2198:       for (k=0; k<bs; k++) workt[k] = xb[k];
2199:       workt += bs;
2200:     }
2201:     if (usecprow) z = zarray + bs*ridx[i];

2203:     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);

2205:     for (j=0; j<n; j++) {
2206:       /* first column of a */
2207:       w0 = _mm256_set1_pd(work[j*9  ]);
2208:       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2209:       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2210:       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);

2212:       /* second column of a */
2213:       w1 = _mm256_set1_pd(work[j*9+ 1]);
2214:       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2215:       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2216:       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);

2218:       /* third column of a */
2219:       w2 = _mm256_set1_pd(work[j*9+ 2]);
2220:       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2221:       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2222:       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);

2224:       /* fourth column of a */
2225:       w3 = _mm256_set1_pd(work[j*9+ 3]);
2226:       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2227:       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2228:       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);

2230:       /* fifth column of a */
2231:       w0 = _mm256_set1_pd(work[j*9+ 4]);
2232:       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2233:       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2234:       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);

2236:       /* sixth column of a */
2237:       w1 = _mm256_set1_pd(work[j*9+ 5]);
2238:       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2239:       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2240:       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);

2242:       /* seventh column of a */
2243:       w2 = _mm256_set1_pd(work[j*9+ 6]);
2244:       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2245:       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2246:       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);

2248:       /* eigth column of a */
2249:       w3 = _mm256_set1_pd(work[j*9+ 7]);
2250:       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2251:       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2252:       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);

2254:       /* ninth column of a */
2255:       w0 = _mm256_set1_pd(work[j*9+ 8]);
2256:       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2257:       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2258:       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2259:     }

2261:     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);

2263:     v += n*bs2;
2264:     if (!usecprow) z += bs;
2265:   }
2266:   VecRestoreArrayRead(xx,&x);
2267:   VecRestoreArray(zz,&zarray);
2268:   PetscLogFlops(162.0*a->nz);
2269:   return(0);
2270: }
2271: #endif

2273: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2274: {
2275:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2276:   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2277:   const PetscScalar *x,*xb;
2278:   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2279:   const MatScalar   *v;
2280:   PetscErrorCode    ierr;
2281:   PetscInt          mbs = a->mbs,i,j,n;
2282:   const PetscInt    *idx,*ii,*ridx = NULL;
2283:   PetscBool         usecprow=a->compressedrow.use;

2286:   VecGetArrayRead(xx,&x);
2287:   VecGetArrayPair(yy,zz,&yarray,&zarray);

2289:   idx = a->j;
2290:   v   = a->a;
2291:   if (usecprow) {
2292:     if (zz != yy) {
2293:       PetscArraycpy(zarray,yarray,7*mbs);
2294:     }
2295:     mbs  = a->compressedrow.nrows;
2296:     ii   = a->compressedrow.i;
2297:     ridx = a->compressedrow.rindex;
2298:   } else {
2299:     ii = a->i;
2300:     y  = yarray;
2301:     z  = zarray;
2302:   }

2304:   for (i=0; i<mbs; i++) {
2305:     n = ii[1] - ii[0]; ii++;
2306:     if (usecprow) {
2307:       z = zarray + 11*ridx[i];
2308:       y = yarray + 11*ridx[i];
2309:     }
2310:     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2311:     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2312:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
2313:     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2314:     for (j=0; j<n; j++) {
2315:       xb    = x + 11*(*idx++);
2316:       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2317:       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
2318:       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
2319:       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
2320:       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
2321:       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
2322:       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
2323:       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
2324:       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
2325:       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
2326:       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
2327:       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
2328:       v    += 121;
2329:     }
2330:     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2331:     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
2332:     if (!usecprow) {
2333:       z += 11; y += 11;
2334:     }
2335:   }
2336:   VecRestoreArrayRead(xx,&x);
2337:   VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2338:   PetscLogFlops(242.0*a->nz);
2339:   return(0);
2340: }

2342: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2343: {
2344:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2345:   PetscScalar       *z = NULL,*work,*workt,*zarray;
2346:   const PetscScalar *x,*xb;
2347:   const MatScalar   *v;
2348:   PetscErrorCode    ierr;
2349:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2350:   PetscInt          ncols,k;
2351:   const PetscInt    *ridx = NULL,*idx,*ii;
2352:   PetscBool         usecprow = a->compressedrow.use;

2355:   VecCopy(yy,zz);
2356:   VecGetArrayRead(xx,&x);
2357:   VecGetArray(zz,&zarray);

2359:   idx = a->j;
2360:   v   = a->a;
2361:   if (usecprow) {
2362:     mbs  = a->compressedrow.nrows;
2363:     ii   = a->compressedrow.i;
2364:     ridx = a->compressedrow.rindex;
2365:   } else {
2366:     mbs = a->mbs;
2367:     ii  = a->i;
2368:     z   = zarray;
2369:   }

2371:   if (!a->mult_work) {
2372:     k    = PetscMax(A->rmap->n,A->cmap->n);
2373:     PetscMalloc1(k+1,&a->mult_work);
2374:   }
2375:   work = a->mult_work;
2376:   for (i=0; i<mbs; i++) {
2377:     n     = ii[1] - ii[0]; ii++;
2378:     ncols = n*bs;
2379:     workt = work;
2380:     for (j=0; j<n; j++) {
2381:       xb = x + bs*(*idx++);
2382:       for (k=0; k<bs; k++) workt[k] = xb[k];
2383:       workt += bs;
2384:     }
2385:     if (usecprow) z = zarray + bs*ridx[i];
2386:     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2387:     v += n*bs2;
2388:     if (!usecprow) z += bs;
2389:   }
2390:   VecRestoreArrayRead(xx,&x);
2391:   VecRestoreArray(zz,&zarray);
2392:   PetscLogFlops(2.0*a->nz*bs2);
2393:   return(0);
2394: }

2396: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2397: {
2398:   PetscScalar    zero = 0.0;

2402:   VecSet(zz,zero);
2403:   MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2404:   return(0);
2405: }

2407: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2408: {
2409:   PetscScalar    zero = 0.0;

2413:   VecSet(zz,zero);
2414:   MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2415:   return(0);
2416: }

2418: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2419: {
2420:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2421:   PetscScalar       *z,x1,x2,x3,x4,x5;
2422:   const PetscScalar *x,*xb = NULL;
2423:   const MatScalar   *v;
2424:   PetscErrorCode    ierr;
2425:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
2426:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2427:   Mat_CompressedRow cprow = a->compressedrow;
2428:   PetscBool         usecprow = cprow.use;

2431:   if (yy != zz) { VecCopy(yy,zz); }
2432:   VecGetArrayRead(xx,&x);
2433:   VecGetArray(zz,&z);

2435:   idx = a->j;
2436:   v   = a->a;
2437:   if (usecprow) {
2438:     mbs  = cprow.nrows;
2439:     ii   = cprow.i;
2440:     ridx = cprow.rindex;
2441:   } else {
2442:     mbs=a->mbs;
2443:     ii = a->i;
2444:     xb = x;
2445:   }

2447:   switch (bs) {
2448:   case 1:
2449:     for (i=0; i<mbs; i++) {
2450:       if (usecprow) xb = x + ridx[i];
2451:       x1 = xb[0];
2452:       ib = idx + ii[0];
2453:       n  = ii[1] - ii[0]; ii++;
2454:       for (j=0; j<n; j++) {
2455:         rval     = ib[j];
2456:         z[rval] += PetscConj(*v) * x1;
2457:         v++;
2458:       }
2459:       if (!usecprow) xb++;
2460:     }
2461:     break;
2462:   case 2:
2463:     for (i=0; i<mbs; i++) {
2464:       if (usecprow) xb = x + 2*ridx[i];
2465:       x1 = xb[0]; x2 = xb[1];
2466:       ib = idx + ii[0];
2467:       n  = ii[1] - ii[0]; ii++;
2468:       for (j=0; j<n; j++) {
2469:         rval       = ib[j]*2;
2470:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2471:         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2472:         v         += 4;
2473:       }
2474:       if (!usecprow) xb += 2;
2475:     }
2476:     break;
2477:   case 3:
2478:     for (i=0; i<mbs; i++) {
2479:       if (usecprow) xb = x + 3*ridx[i];
2480:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2481:       ib = idx + ii[0];
2482:       n  = ii[1] - ii[0]; ii++;
2483:       for (j=0; j<n; j++) {
2484:         rval       = ib[j]*3;
2485:         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2486:         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2487:         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2488:         v         += 9;
2489:       }
2490:       if (!usecprow) xb += 3;
2491:     }
2492:     break;
2493:   case 4:
2494:     for (i=0; i<mbs; i++) {
2495:       if (usecprow) xb = x + 4*ridx[i];
2496:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2497:       ib = idx + ii[0];
2498:       n  = ii[1] - ii[0]; ii++;
2499:       for (j=0; j<n; j++) {
2500:         rval       = ib[j]*4;
2501:         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2502:         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2503:         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2504:         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2505:         v         += 16;
2506:       }
2507:       if (!usecprow) xb += 4;
2508:     }
2509:     break;
2510:   case 5:
2511:     for (i=0; i<mbs; i++) {
2512:       if (usecprow) xb = x + 5*ridx[i];
2513:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2514:       x4 = xb[3]; x5 = xb[4];
2515:       ib = idx + ii[0];
2516:       n  = ii[1] - ii[0]; ii++;
2517:       for (j=0; j<n; j++) {
2518:         rval       = ib[j]*5;
2519:         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2520:         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2521:         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2522:         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2523:         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2524:         v         += 25;
2525:       }
2526:       if (!usecprow) xb += 5;
2527:     }
2528:     break;
2529:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2530:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2531: #if 0
2532:     {
2533:       PetscInt          ncols,k,bs2=a->bs2;
2534:       PetscScalar       *work,*workt,zb;
2535:       const PetscScalar *xtmp;
2536:       if (!a->mult_work) {
2537:         k    = PetscMax(A->rmap->n,A->cmap->n);
2538:         PetscMalloc1(k+1,&a->mult_work);
2539:       }
2540:       work = a->mult_work;
2541:       xtmp = x;
2542:       for (i=0; i<mbs; i++) {
2543:         n     = ii[1] - ii[0]; ii++;
2544:         ncols = n*bs;
2545:         PetscArrayzero(work,ncols);
2546:         if (usecprow) xtmp = x + bs*ridx[i];
2547:         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2548:         v += n*bs2;
2549:         if (!usecprow) xtmp += bs;
2550:         workt = work;
2551:         for (j=0; j<n; j++) {
2552:           zb = z + bs*(*idx++);
2553:           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2554:           workt += bs;
2555:         }
2556:       }
2557:     }
2558: #endif
2559:   }
2560:   VecRestoreArrayRead(xx,&x);
2561:   VecRestoreArray(zz,&z);
2562:   PetscLogFlops(2.0*a->nz*a->bs2);
2563:   return(0);
2564: }

2566: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2567: {
2568:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2569:   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2570:   const PetscScalar *x,*xb = NULL;
2571:   const MatScalar   *v;
2572:   PetscErrorCode    ierr;
2573:   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2574:   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2575:   Mat_CompressedRow cprow   = a->compressedrow;
2576:   PetscBool         usecprow=cprow.use;

2579:   if (yy != zz) { VecCopy(yy,zz); }
2580:   VecGetArrayRead(xx,&x);
2581:   VecGetArray(zz,&z);

2583:   idx = a->j;
2584:   v   = a->a;
2585:   if (usecprow) {
2586:     mbs  = cprow.nrows;
2587:     ii   = cprow.i;
2588:     ridx = cprow.rindex;
2589:   } else {
2590:     mbs=a->mbs;
2591:     ii = a->i;
2592:     xb = x;
2593:   }

2595:   switch (bs) {
2596:   case 1:
2597:     for (i=0; i<mbs; i++) {
2598:       if (usecprow) xb = x + ridx[i];
2599:       x1 = xb[0];
2600:       ib = idx + ii[0];
2601:       n  = ii[1] - ii[0]; ii++;
2602:       for (j=0; j<n; j++) {
2603:         rval     = ib[j];
2604:         z[rval] += *v * x1;
2605:         v++;
2606:       }
2607:       if (!usecprow) xb++;
2608:     }
2609:     break;
2610:   case 2:
2611:     for (i=0; i<mbs; i++) {
2612:       if (usecprow) xb = x + 2*ridx[i];
2613:       x1 = xb[0]; x2 = xb[1];
2614:       ib = idx + ii[0];
2615:       n  = ii[1] - ii[0]; ii++;
2616:       for (j=0; j<n; j++) {
2617:         rval       = ib[j]*2;
2618:         z[rval++] += v[0]*x1 + v[1]*x2;
2619:         z[rval++] += v[2]*x1 + v[3]*x2;
2620:         v         += 4;
2621:       }
2622:       if (!usecprow) xb += 2;
2623:     }
2624:     break;
2625:   case 3:
2626:     for (i=0; i<mbs; i++) {
2627:       if (usecprow) xb = x + 3*ridx[i];
2628:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2629:       ib = idx + ii[0];
2630:       n  = ii[1] - ii[0]; ii++;
2631:       for (j=0; j<n; j++) {
2632:         rval       = ib[j]*3;
2633:         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2634:         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2635:         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2636:         v         += 9;
2637:       }
2638:       if (!usecprow) xb += 3;
2639:     }
2640:     break;
2641:   case 4:
2642:     for (i=0; i<mbs; i++) {
2643:       if (usecprow) xb = x + 4*ridx[i];
2644:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2645:       ib = idx + ii[0];
2646:       n  = ii[1] - ii[0]; ii++;
2647:       for (j=0; j<n; j++) {
2648:         rval       = ib[j]*4;
2649:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2650:         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2651:         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2652:         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2653:         v         += 16;
2654:       }
2655:       if (!usecprow) xb += 4;
2656:     }
2657:     break;
2658:   case 5:
2659:     for (i=0; i<mbs; i++) {
2660:       if (usecprow) xb = x + 5*ridx[i];
2661:       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2662:       x4 = xb[3]; x5 = xb[4];
2663:       ib = idx + ii[0];
2664:       n  = ii[1] - ii[0]; ii++;
2665:       for (j=0; j<n; j++) {
2666:         rval       = ib[j]*5;
2667:         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2668:         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2669:         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2670:         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2671:         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2672:         v         += 25;
2673:       }
2674:       if (!usecprow) xb += 5;
2675:     }
2676:     break;
2677:   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2678:     PetscInt          ncols,k;
2679:     PetscScalar       *work,*workt;
2680:     const PetscScalar *xtmp;
2681:     if (!a->mult_work) {
2682:       k    = PetscMax(A->rmap->n,A->cmap->n);
2683:       PetscMalloc1(k+1,&a->mult_work);
2684:     }
2685:     work = a->mult_work;
2686:     xtmp = x;
2687:     for (i=0; i<mbs; i++) {
2688:       n     = ii[1] - ii[0]; ii++;
2689:       ncols = n*bs;
2690:       PetscArrayzero(work,ncols);
2691:       if (usecprow) xtmp = x + bs*ridx[i];
2692:       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2693:       v += n*bs2;
2694:       if (!usecprow) xtmp += bs;
2695:       workt = work;
2696:       for (j=0; j<n; j++) {
2697:         zb = z + bs*(*idx++);
2698:         for (k=0; k<bs; k++) zb[k] += workt[k];
2699:         workt += bs;
2700:       }
2701:     }
2702:     }
2703:   }
2704:   VecRestoreArrayRead(xx,&x);
2705:   VecRestoreArray(zz,&z);
2706:   PetscLogFlops(2.0*a->nz*a->bs2);
2707:   return(0);
2708: }

2710: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2711: {
2712:   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2713:   PetscInt       totalnz = a->bs2*a->nz;
2714:   PetscScalar    oalpha  = alpha;
2716:   PetscBLASInt   one = 1,tnz;

2719:   PetscBLASIntCast(totalnz,&tnz);
2720:   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2721:   PetscLogFlops(totalnz);
2722:   return(0);
2723: }

2725: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2726: {
2728:   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2729:   MatScalar      *v  = a->a;
2730:   PetscReal      sum = 0.0;
2731:   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;

2734:   if (type == NORM_FROBENIUS) {
2735: #if defined(PETSC_USE_REAL___FP16)
2736:     PetscBLASInt one = 1,cnt = bs2*nz;
2737:     PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2738: #else
2739:     for (i=0; i<bs2*nz; i++) {
2740:       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2741:     }
2742: #endif
2743:     *norm = PetscSqrtReal(sum);
2744:     PetscLogFlops(2.0*bs2*nz);
2745:   } else if (type == NORM_1) { /* maximum column sum */
2746:     PetscReal *tmp;
2747:     PetscInt  *bcol = a->j;
2748:     PetscCalloc1(A->cmap->n+1,&tmp);
2749:     for (i=0; i<nz; i++) {
2750:       for (j=0; j<bs; j++) {
2751:         k1 = bs*(*bcol) + j; /* column index */
2752:         for (k=0; k<bs; k++) {
2753:           tmp[k1] += PetscAbsScalar(*v); v++;
2754:         }
2755:       }
2756:       bcol++;
2757:     }
2758:     *norm = 0.0;
2759:     for (j=0; j<A->cmap->n; j++) {
2760:       if (tmp[j] > *norm) *norm = tmp[j];
2761:     }
2762:     PetscFree(tmp);
2763:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2764:   } else if (type == NORM_INFINITY) { /* maximum row sum */
2765:     *norm = 0.0;
2766:     for (k=0; k<bs; k++) {
2767:       for (j=0; j<a->mbs; j++) {
2768:         v   = a->a + bs2*a->i[j] + k;
2769:         sum = 0.0;
2770:         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2771:           for (k1=0; k1<bs; k1++) {
2772:             sum += PetscAbsScalar(*v);
2773:             v   += bs;
2774:           }
2775:         }
2776:         if (sum > *norm) *norm = sum;
2777:       }
2778:     }
2779:     PetscLogFlops(PetscMax(bs2*nz-1,0));
2780:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2781:   return(0);
2782: }


2785: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2786: {
2787:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;

2791:   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2792:   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2793:     *flg = PETSC_FALSE;
2794:     return(0);
2795:   }

2797:   /* if the a->i are the same */
2798:   PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2799:   if (!*flg) return(0);

2801:   /* if a->j are the same */
2802:   PetscArraycmp(a->j,b->j,a->nz,flg);
2803:   if (!*flg) return(0);

2805:   /* if a->a are the same */
2806:   PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2807:   return(0);

2809: }

2811: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2812: {
2813:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2815:   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2816:   PetscScalar    *x,zero = 0.0;
2817:   MatScalar      *aa,*aa_j;

2820:   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2821:   bs   = A->rmap->bs;
2822:   aa   = a->a;
2823:   ai   = a->i;
2824:   aj   = a->j;
2825:   ambs = a->mbs;
2826:   bs2  = a->bs2;

2828:   VecSet(v,zero);
2829:   VecGetArray(v,&x);
2830:   VecGetLocalSize(v,&n);
2831:   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2832:   for (i=0; i<ambs; i++) {
2833:     for (j=ai[i]; j<ai[i+1]; j++) {
2834:       if (aj[j] == i) {
2835:         row  = i*bs;
2836:         aa_j = aa+j*bs2;
2837:         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2838:         break;
2839:       }
2840:     }
2841:   }
2842:   VecRestoreArray(v,&x);
2843:   return(0);
2844: }

2846: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2847: {
2848:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2849:   const PetscScalar *l,*r,*li,*ri;
2850:   PetscScalar       x;
2851:   MatScalar         *aa, *v;
2852:   PetscErrorCode    ierr;
2853:   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2854:   const PetscInt    *ai,*aj;

2857:   ai  = a->i;
2858:   aj  = a->j;
2859:   aa  = a->a;
2860:   m   = A->rmap->n;
2861:   n   = A->cmap->n;
2862:   bs  = A->rmap->bs;
2863:   mbs = a->mbs;
2864:   bs2 = a->bs2;
2865:   if (ll) {
2866:     VecGetArrayRead(ll,&l);
2867:     VecGetLocalSize(ll,&lm);
2868:     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2869:     for (i=0; i<mbs; i++) { /* for each block row */
2870:       M  = ai[i+1] - ai[i];
2871:       li = l + i*bs;
2872:       v  = aa + bs2*ai[i];
2873:       for (j=0; j<M; j++) { /* for each block */
2874:         for (k=0; k<bs2; k++) {
2875:           (*v++) *= li[k%bs];
2876:         }
2877:       }
2878:     }
2879:     VecRestoreArrayRead(ll,&l);
2880:     PetscLogFlops(a->nz);
2881:   }

2883:   if (rr) {
2884:     VecGetArrayRead(rr,&r);
2885:     VecGetLocalSize(rr,&rn);
2886:     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2887:     for (i=0; i<mbs; i++) { /* for each block row */
2888:       iai = ai[i];
2889:       M   = ai[i+1] - iai;
2890:       v   = aa + bs2*iai;
2891:       for (j=0; j<M; j++) { /* for each block */
2892:         ri = r + bs*aj[iai+j];
2893:         for (k=0; k<bs; k++) {
2894:           x = ri[k];
2895:           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2896:           v += bs;
2897:         }
2898:       }
2899:     }
2900:     VecRestoreArrayRead(rr,&r);
2901:     PetscLogFlops(a->nz);
2902:   }
2903:   return(0);
2904: }


2907: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2908: {
2909:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;

2912:   info->block_size   = a->bs2;
2913:   info->nz_allocated = a->bs2*a->maxnz;
2914:   info->nz_used      = a->bs2*a->nz;
2915:   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2916:   info->assemblies   = A->num_ass;
2917:   info->mallocs      = A->info.mallocs;
2918:   info->memory       = ((PetscObject)A)->mem;
2919:   if (A->factortype) {
2920:     info->fill_ratio_given  = A->info.fill_ratio_given;
2921:     info->fill_ratio_needed = A->info.fill_ratio_needed;
2922:     info->factor_mallocs    = A->info.factor_mallocs;
2923:   } else {
2924:     info->fill_ratio_given  = 0;
2925:     info->fill_ratio_needed = 0;
2926:     info->factor_mallocs    = 0;
2927:   }
2928:   return(0);
2929: }

2931: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2932: {
2933:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;

2937:   PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2938:   return(0);
2939: }

2941: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2942: {

2946:   MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2947:   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2948:   return(0);
2949: }

2951: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2952: {
2953:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2954:   PetscScalar       *z = NULL,sum1;
2955:   const PetscScalar *xb;
2956:   PetscScalar       x1;
2957:   const MatScalar   *v,*vv;
2958:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2959:   PetscBool         usecprow=a->compressedrow.use;

2962:   idx = a->j;
2963:   v   = a->a;
2964:   if (usecprow) {
2965:     mbs  = a->compressedrow.nrows;
2966:     ii   = a->compressedrow.i;
2967:     ridx = a->compressedrow.rindex;
2968:   } else {
2969:     mbs = a->mbs;
2970:     ii  = a->i;
2971:     z   = c;
2972:   }

2974:   for (i=0; i<mbs; i++) {
2975:     n           = ii[1] - ii[0]; ii++;
2976:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2977:     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2978:     if (usecprow) z = c + ridx[i];
2979:     jj = idx;
2980:     vv = v;
2981:     for (k=0; k<cn; k++) {
2982:       idx = jj;
2983:       v = vv;
2984:       sum1    = 0.0;
2985:       for (j=0; j<n; j++) {
2986:         xb    = b + (*idx++); x1 = xb[0+k*bm];
2987:         sum1 += v[0]*x1;
2988:         v    += 1;
2989:       }
2990:       z[0+k*cm] = sum1;
2991:     }
2992:     if (!usecprow) z += 1;
2993:   }
2994:   return(0);
2995: }

2997: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2998: {
2999:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3000:   PetscScalar       *z = NULL,sum1,sum2;
3001:   const PetscScalar *xb;
3002:   PetscScalar       x1,x2;
3003:   const MatScalar   *v,*vv;
3004:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3005:   PetscBool         usecprow=a->compressedrow.use;

3008:   idx = a->j;
3009:   v   = a->a;
3010:   if (usecprow) {
3011:     mbs  = a->compressedrow.nrows;
3012:     ii   = a->compressedrow.i;
3013:     ridx = a->compressedrow.rindex;
3014:   } else {
3015:     mbs = a->mbs;
3016:     ii  = a->i;
3017:     z   = c;
3018:   }

3020:   for (i=0; i<mbs; i++) {
3021:     n           = ii[1] - ii[0]; ii++;
3022:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3023:     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3024:     if (usecprow) z = c + 2*ridx[i];
3025:     jj = idx;
3026:     vv = v;
3027:     for (k=0; k<cn; k++) {
3028:       idx = jj;
3029:       v = vv;
3030:       sum1    = 0.0; sum2 = 0.0;
3031:       for (j=0; j<n; j++) {
3032:         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3033:         sum1 += v[0]*x1 + v[2]*x2;
3034:         sum2 += v[1]*x1 + v[3]*x2;
3035:         v    += 4;
3036:       }
3037:       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3038:     }
3039:     if (!usecprow) z += 2;
3040:   }
3041:   return(0);
3042: }

3044: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3045: {
3046:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3047:   PetscScalar       *z = NULL,sum1,sum2,sum3;
3048:   const PetscScalar *xb;
3049:   PetscScalar       x1,x2,x3;
3050:   const MatScalar   *v,*vv;
3051:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3052:   PetscBool         usecprow=a->compressedrow.use;

3055:   idx = a->j;
3056:   v   = a->a;
3057:   if (usecprow) {
3058:     mbs  = a->compressedrow.nrows;
3059:     ii   = a->compressedrow.i;
3060:     ridx = a->compressedrow.rindex;
3061:   } else {
3062:     mbs = a->mbs;
3063:     ii  = a->i;
3064:     z   = c;
3065:   }

3067:   for (i=0; i<mbs; i++) {
3068:     n           = ii[1] - ii[0]; ii++;
3069:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3070:     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3071:     if (usecprow) z = c + 3*ridx[i];
3072:     jj = idx;
3073:     vv = v;
3074:     for (k=0; k<cn; k++) {
3075:       idx = jj;
3076:       v = vv;
3077:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
3078:       for (j=0; j<n; j++) {
3079:         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3080:         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3081:         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3082:         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3083:         v    += 9;
3084:       }
3085:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3086:     }
3087:     if (!usecprow) z += 3;
3088:   }
3089:   return(0);
3090: }

3092: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3093: {
3094:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3095:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
3096:   const PetscScalar *xb;
3097:   PetscScalar       x1,x2,x3,x4;
3098:   const MatScalar   *v,*vv;
3099:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3100:   PetscBool         usecprow=a->compressedrow.use;

3103:   idx = a->j;
3104:   v   = a->a;
3105:   if (usecprow) {
3106:     mbs  = a->compressedrow.nrows;
3107:     ii   = a->compressedrow.i;
3108:     ridx = a->compressedrow.rindex;
3109:   } else {
3110:     mbs = a->mbs;
3111:     ii  = a->i;
3112:     z   = c;
3113:   }

3115:   for (i=0; i<mbs; i++) {
3116:     n           = ii[1] - ii[0]; ii++;
3117:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3118:     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3119:     if (usecprow) z = c + 4*ridx[i];
3120:     jj = idx;
3121:     vv = v;
3122:     for (k=0; k<cn; k++) {
3123:       idx = jj;
3124:       v = vv;
3125:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3126:       for (j=0; j<n; j++) {
3127:         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3128:         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3129:         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3130:         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3131:         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3132:         v    += 16;
3133:       }
3134:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3135:     }
3136:     if (!usecprow) z += 4;
3137:   }
3138:   return(0);
3139: }

3141: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3142: {
3143:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3144:   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
3145:   const PetscScalar *xb;
3146:   PetscScalar       x1,x2,x3,x4,x5;
3147:   const MatScalar   *v,*vv;
3148:   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3149:   PetscBool         usecprow=a->compressedrow.use;

3152:   idx = a->j;
3153:   v   = a->a;
3154:   if (usecprow) {
3155:     mbs  = a->compressedrow.nrows;
3156:     ii   = a->compressedrow.i;
3157:     ridx = a->compressedrow.rindex;
3158:   } else {
3159:     mbs = a->mbs;
3160:     ii  = a->i;
3161:     z   = c;
3162:   }

3164:   for (i=0; i<mbs; i++) {
3165:     n           = ii[1] - ii[0]; ii++;
3166:     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
3167:     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3168:     if (usecprow) z = c + 5*ridx[i];
3169:     jj = idx;
3170:     vv = v;
3171:     for (k=0; k<cn; k++) {
3172:       idx = jj;
3173:       v = vv;
3174:       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3175:       for (j=0; j<n; j++) {
3176:         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3177:         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3178:         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3179:         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3180:         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3181:         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3182:         v    += 25;
3183:       }
3184:       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3185:     }
3186:     if (!usecprow) z += 5;
3187:   }
3188:   return(0);
3189: }

3191: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3192: {
3193:   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
3194:   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
3195:   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
3196:   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3197:   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3198:   PetscBLASInt      bbs,bcn,bbm,bcm;
3199:   PetscScalar       *z = NULL;
3200:   PetscScalar       *c,*b;
3201:   const MatScalar   *v;
3202:   const PetscInt    *idx,*ii,*ridx=NULL;
3203:   PetscScalar       _DZero=0.0,_DOne=1.0;
3204:   PetscBool         usecprow=a->compressedrow.use;
3205:   PetscErrorCode    ierr;

3208:   if (!cm || !cn) return(0);
3209:   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3210:   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3211:   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3212:   b = bd->v;
3213:   if (a->nonzerorowcnt != A->rmap->n) {
3214:     MatZeroEntries(C);
3215:   }
3216:   MatDenseGetArray(C,&c);
3217:   switch (bs) {
3218:   case 1:
3219:     MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3220:     break;
3221:   case 2:
3222:     MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3223:     break;
3224:   case 3:
3225:     MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3226:     break;
3227:   case 4:
3228:     MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3229:     break;
3230:   case 5:
3231:     MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3232:     break;
3233:   default: /* block sizes larger than 5 by 5 are handled by BLAS */
3234:     PetscBLASIntCast(bs,&bbs);
3235:     PetscBLASIntCast(cn,&bcn);
3236:     PetscBLASIntCast(bm,&bbm);
3237:     PetscBLASIntCast(cm,&bcm);
3238:     idx = a->j;
3239:     v   = a->a;
3240:     if (usecprow) {
3241:       mbs  = a->compressedrow.nrows;
3242:       ii   = a->compressedrow.i;
3243:       ridx = a->compressedrow.rindex;
3244:     } else {
3245:       mbs = a->mbs;
3246:       ii  = a->i;
3247:       z   = c;
3248:     }
3249:     for (i=0; i<mbs; i++) {
3250:       n = ii[1] - ii[0]; ii++;
3251:       if (usecprow) z = c + bs*ridx[i];
3252:       if (n) {
3253:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3254:         v += bs2;
3255:       }
3256:       for (j=1; j<n; j++) {
3257:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3258:         v += bs2;
3259:       }
3260:       if (!usecprow) z += bs;
3261:     }
3262:   }
3263:   MatDenseRestoreArray(C,&c);
3264:   PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3265:   return(0);
3266: }