Actual source code: baij2.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <petsc/private/kernels/blockinvert.h>
5: #include <petscbt.h>
6: #include <petscblaslapack.h>
8: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9: #include <immintrin.h>
10: #endif
12: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
16: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
17: const PetscInt *idx;
18: PetscInt start,end,*ai,*aj,bs,*nidx2;
19: PetscBT table;
22: m = a->mbs;
23: ai = a->i;
24: aj = a->j;
25: bs = A->rmap->bs;
27: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
29: PetscBTCreate(m,&table);
30: PetscMalloc1(m+1,&nidx);
31: PetscMalloc1(A->rmap->N+1,&nidx2);
33: for (i=0; i<is_max; i++) {
34: /* Initialise the two local arrays */
35: isz = 0;
36: PetscBTMemzero(m,table);
38: /* Extract the indices, assume there can be duplicate entries */
39: ISGetIndices(is[i],&idx);
40: ISGetLocalSize(is[i],&n);
42: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43: for (j=0; j<n; ++j) {
44: ival = idx[j]/bs; /* convert the indices into block indices */
45: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47: }
48: ISRestoreIndices(is[i],&idx);
49: ISDestroy(&is[i]);
51: k = 0;
52: for (j=0; j<ov; j++) { /* for each overlap*/
53: n = isz;
54: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
55: row = nidx[k];
56: start = ai[row];
57: end = ai[row+1];
58: for (l = start; l<end; l++) {
59: val = aj[l];
60: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61: }
62: }
63: }
64: /* expand the Index Set */
65: for (j=0; j<isz; j++) {
66: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67: }
68: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
69: }
70: PetscBTDestroy(&table);
71: PetscFree(nidx);
72: PetscFree(nidx2);
73: return(0);
74: }
76: PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77: {
78: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
80: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82: const PetscInt *irow,*icol;
83: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84: PetscInt *aj = a->j,*ai = a->i;
85: MatScalar *mat_a;
86: Mat C;
87: PetscBool flag;
90: ISGetIndices(isrow,&irow);
91: ISGetIndices(iscol,&icol);
92: ISGetLocalSize(isrow,&nrows);
93: ISGetLocalSize(iscol,&ncols);
95: PetscCalloc1(1+oldcols,&smap);
96: ssmap = smap;
97: PetscMalloc1(1+nrows,&lens);
98: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99: /* determine lens of each row */
100: for (i=0; i<nrows; i++) {
101: kstart = ai[irow[i]];
102: kend = kstart + a->ilen[irow[i]];
103: lens[i] = 0;
104: for (k=kstart; k<kend; k++) {
105: if (ssmap[aj[k]]) lens[i]++;
106: }
107: }
108: /* Create and fill new matrix */
109: if (scall == MAT_REUSE_MATRIX) {
110: c = (Mat_SeqBAIJ*)((*B)->data);
112: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113: PetscArraycmp(c->ilen,lens,c->mbs,&flag);
114: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115: PetscArrayzero(c->ilen,c->mbs);
116: C = *B;
117: } else {
118: MatCreate(PetscObjectComm((PetscObject)A),&C);
119: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
120: MatSetType(C,((PetscObject)A)->type_name);
121: MatSeqBAIJSetPreallocation(C,bs,0,lens);
122: }
123: c = (Mat_SeqBAIJ*)(C->data);
124: for (i=0; i<nrows; i++) {
125: row = irow[i];
126: kstart = ai[row];
127: kend = kstart + a->ilen[row];
128: mat_i = c->i[i];
129: mat_j = c->j + mat_i;
130: mat_a = c->a + mat_i*bs2;
131: mat_ilen = c->ilen + i;
132: for (k=kstart; k<kend; k++) {
133: if ((tcol=ssmap[a->j[k]])) {
134: *mat_j++ = tcol - 1;
135: PetscArraycpy(mat_a,a->a+k*bs2,bs2);
136: mat_a += bs2;
137: (*mat_ilen)++;
138: }
139: }
140: }
141: /* sort */
142: {
143: MatScalar *work;
144: PetscMalloc1(bs2,&work);
145: for (i=0; i<nrows; i++) {
146: PetscInt ilen;
147: mat_i = c->i[i];
148: mat_j = c->j + mat_i;
149: mat_a = c->a + mat_i*bs2;
150: ilen = c->ilen[i];
151: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
152: }
153: PetscFree(work);
154: }
156: /* Free work space */
157: ISRestoreIndices(iscol,&icol);
158: PetscFree(smap);
159: PetscFree(lens);
160: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
161: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
163: ISRestoreIndices(isrow,&irow);
164: *B = C;
165: return(0);
166: }
168: PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169: {
170: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
171: IS is1,is2;
173: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174: const PetscInt *irow,*icol;
177: ISGetIndices(isrow,&irow);
178: ISGetIndices(iscol,&icol);
179: ISGetLocalSize(isrow,&nrows);
180: ISGetLocalSize(iscol,&ncols);
182: /* Verify if the indices corespond to each element in a block
183: and form the IS with compressed IS */
184: maxmnbs = PetscMax(a->mbs,a->nbs);
185: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
186: PetscArrayzero(vary,a->mbs);
187: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188: for (i=0; i<a->mbs; i++) {
189: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190: }
191: count = 0;
192: for (i=0; i<nrows; i++) {
193: j = irow[i] / bs;
194: if ((vary[j]--)==bs) iary[count++] = j;
195: }
196: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
198: PetscArrayzero(vary,a->nbs);
199: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200: for (i=0; i<a->nbs; i++) {
201: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202: }
203: count = 0;
204: for (i=0; i<ncols; i++) {
205: j = icol[i] / bs;
206: if ((vary[j]--)==bs) iary[count++] = j;
207: }
208: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
209: ISRestoreIndices(isrow,&irow);
210: ISRestoreIndices(iscol,&icol);
211: PetscFree2(vary,iary);
213: MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
214: ISDestroy(&is1);
215: ISDestroy(&is2);
216: return(0);
217: }
219: PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220: {
222: Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
223: Mat_SubSppt *submatj = c->submatis1;
226: (*submatj->destroy)(C);
227: MatDestroySubMatrix_Private(submatj);
228: return(0);
229: }
231: PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232: {
234: PetscInt i;
235: Mat C;
236: Mat_SeqBAIJ *c;
237: Mat_SubSppt *submatj;
240: for (i=0; i<n; i++) {
241: C = (*mat)[i];
242: c = (Mat_SeqBAIJ*)C->data;
243: submatj = c->submatis1;
244: if (submatj) {
245: if (--((PetscObject)C)->refct <= 0) {
246: (*submatj->destroy)(C);
247: MatDestroySubMatrix_Private(submatj);
248: PetscFree(C->defaultvectype);
249: PetscLayoutDestroy(&C->rmap);
250: PetscLayoutDestroy(&C->cmap);
251: PetscHeaderDestroy(&C);
252: }
253: } else {
254: MatDestroy(&C);
255: }
256: }
258: /* Destroy Dummy submatrices created for reuse */
259: MatDestroySubMatrices_Dummy(n,mat);
261: PetscFree(*mat);
262: return(0);
263: }
265: PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
266: {
268: PetscInt i;
271: if (scall == MAT_INITIAL_MATRIX) {
272: PetscCalloc1(n+1,B);
273: }
275: for (i=0; i<n; i++) {
276: MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
277: }
278: return(0);
279: }
281: /* -------------------------------------------------------*/
282: /* Should check that shapes of vectors and matrices match */
283: /* -------------------------------------------------------*/
285: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
286: {
287: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
288: PetscScalar *z,sum;
289: const PetscScalar *x;
290: const MatScalar *v;
291: PetscErrorCode ierr;
292: PetscInt mbs,i,n;
293: const PetscInt *idx,*ii,*ridx=NULL;
294: PetscBool usecprow=a->compressedrow.use;
297: VecGetArrayRead(xx,&x);
298: VecGetArrayWrite(zz,&z);
300: if (usecprow) {
301: mbs = a->compressedrow.nrows;
302: ii = a->compressedrow.i;
303: ridx = a->compressedrow.rindex;
304: PetscArrayzero(z,a->mbs);
305: } else {
306: mbs = a->mbs;
307: ii = a->i;
308: }
310: for (i=0; i<mbs; i++) {
311: n = ii[1] - ii[0];
312: v = a->a + ii[0];
313: idx = a->j + ii[0];
314: ii++;
315: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
316: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
317: sum = 0.0;
318: PetscSparseDensePlusDot(sum,x,v,idx,n);
319: if (usecprow) {
320: z[ridx[i]] = sum;
321: } else {
322: z[i] = sum;
323: }
324: }
325: VecRestoreArrayRead(xx,&x);
326: VecRestoreArrayWrite(zz,&z);
327: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
328: return(0);
329: }
331: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
332: {
333: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
334: PetscScalar *z = NULL,sum1,sum2,*zarray;
335: const PetscScalar *x,*xb;
336: PetscScalar x1,x2;
337: const MatScalar *v;
338: PetscErrorCode ierr;
339: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
340: PetscBool usecprow=a->compressedrow.use;
343: VecGetArrayRead(xx,&x);
344: VecGetArrayWrite(zz,&zarray);
346: idx = a->j;
347: v = a->a;
348: if (usecprow) {
349: mbs = a->compressedrow.nrows;
350: ii = a->compressedrow.i;
351: ridx = a->compressedrow.rindex;
352: PetscArrayzero(zarray,2*a->mbs);
353: } else {
354: mbs = a->mbs;
355: ii = a->i;
356: z = zarray;
357: }
359: for (i=0; i<mbs; i++) {
360: n = ii[1] - ii[0]; ii++;
361: sum1 = 0.0; sum2 = 0.0;
362: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
363: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
364: for (j=0; j<n; j++) {
365: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
366: sum1 += v[0]*x1 + v[2]*x2;
367: sum2 += v[1]*x1 + v[3]*x2;
368: v += 4;
369: }
370: if (usecprow) z = zarray + 2*ridx[i];
371: z[0] = sum1; z[1] = sum2;
372: if (!usecprow) z += 2;
373: }
374: VecRestoreArrayRead(xx,&x);
375: VecRestoreArrayWrite(zz,&zarray);
376: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
377: return(0);
378: }
380: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
381: {
382: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
383: PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
384: const PetscScalar *x,*xb;
385: const MatScalar *v;
386: PetscErrorCode ierr;
387: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
388: PetscBool usecprow=a->compressedrow.use;
390: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
391: #pragma disjoint(*v,*z,*xb)
392: #endif
395: VecGetArrayRead(xx,&x);
396: VecGetArrayWrite(zz,&zarray);
398: idx = a->j;
399: v = a->a;
400: if (usecprow) {
401: mbs = a->compressedrow.nrows;
402: ii = a->compressedrow.i;
403: ridx = a->compressedrow.rindex;
404: PetscArrayzero(zarray,3*a->mbs);
405: } else {
406: mbs = a->mbs;
407: ii = a->i;
408: z = zarray;
409: }
411: for (i=0; i<mbs; i++) {
412: n = ii[1] - ii[0]; ii++;
413: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
414: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
415: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
416: for (j=0; j<n; j++) {
417: xb = x + 3*(*idx++);
418: x1 = xb[0];
419: x2 = xb[1];
420: x3 = xb[2];
422: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
423: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
424: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
425: v += 9;
426: }
427: if (usecprow) z = zarray + 3*ridx[i];
428: z[0] = sum1; z[1] = sum2; z[2] = sum3;
429: if (!usecprow) z += 3;
430: }
431: VecRestoreArrayRead(xx,&x);
432: VecRestoreArrayWrite(zz,&zarray);
433: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
434: return(0);
435: }
437: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
438: {
439: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
440: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
441: const PetscScalar *x,*xb;
442: const MatScalar *v;
443: PetscErrorCode ierr;
444: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
445: PetscBool usecprow=a->compressedrow.use;
448: VecGetArrayRead(xx,&x);
449: VecGetArrayWrite(zz,&zarray);
451: idx = a->j;
452: v = a->a;
453: if (usecprow) {
454: mbs = a->compressedrow.nrows;
455: ii = a->compressedrow.i;
456: ridx = a->compressedrow.rindex;
457: PetscArrayzero(zarray,4*a->mbs);
458: } else {
459: mbs = a->mbs;
460: ii = a->i;
461: z = zarray;
462: }
464: for (i=0; i<mbs; i++) {
465: n = ii[1] - ii[0];
466: ii++;
467: sum1 = 0.0;
468: sum2 = 0.0;
469: sum3 = 0.0;
470: sum4 = 0.0;
472: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
473: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
474: for (j=0; j<n; j++) {
475: xb = x + 4*(*idx++);
476: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
477: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
478: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
479: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
480: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
481: v += 16;
482: }
483: if (usecprow) z = zarray + 4*ridx[i];
484: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
485: if (!usecprow) z += 4;
486: }
487: VecRestoreArrayRead(xx,&x);
488: VecRestoreArrayWrite(zz,&zarray);
489: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
490: return(0);
491: }
493: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
494: {
495: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
496: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
497: const PetscScalar *xb,*x;
498: const MatScalar *v;
499: PetscErrorCode ierr;
500: const PetscInt *idx,*ii,*ridx=NULL;
501: PetscInt mbs,i,j,n;
502: PetscBool usecprow=a->compressedrow.use;
505: VecGetArrayRead(xx,&x);
506: VecGetArrayWrite(zz,&zarray);
508: idx = a->j;
509: v = a->a;
510: if (usecprow) {
511: mbs = a->compressedrow.nrows;
512: ii = a->compressedrow.i;
513: ridx = a->compressedrow.rindex;
514: PetscArrayzero(zarray,5*a->mbs);
515: } else {
516: mbs = a->mbs;
517: ii = a->i;
518: z = zarray;
519: }
521: for (i=0; i<mbs; i++) {
522: n = ii[1] - ii[0]; ii++;
523: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
524: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
525: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
526: for (j=0; j<n; j++) {
527: xb = x + 5*(*idx++);
528: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
529: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
530: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
531: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
532: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
533: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
534: v += 25;
535: }
536: if (usecprow) z = zarray + 5*ridx[i];
537: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
538: if (!usecprow) z += 5;
539: }
540: VecRestoreArrayRead(xx,&x);
541: VecRestoreArrayWrite(zz,&zarray);
542: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
543: return(0);
544: }
546: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
547: {
548: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
549: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
550: const PetscScalar *x,*xb;
551: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
552: const MatScalar *v;
553: PetscErrorCode ierr;
554: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
555: PetscBool usecprow=a->compressedrow.use;
558: VecGetArrayRead(xx,&x);
559: VecGetArrayWrite(zz,&zarray);
561: idx = a->j;
562: v = a->a;
563: if (usecprow) {
564: mbs = a->compressedrow.nrows;
565: ii = a->compressedrow.i;
566: ridx = a->compressedrow.rindex;
567: PetscArrayzero(zarray,6*a->mbs);
568: } else {
569: mbs = a->mbs;
570: ii = a->i;
571: z = zarray;
572: }
574: for (i=0; i<mbs; i++) {
575: n = ii[1] - ii[0];
576: ii++;
577: sum1 = 0.0;
578: sum2 = 0.0;
579: sum3 = 0.0;
580: sum4 = 0.0;
581: sum5 = 0.0;
582: sum6 = 0.0;
584: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
585: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
586: for (j=0; j<n; j++) {
587: xb = x + 6*(*idx++);
588: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
589: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
590: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
591: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
592: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
593: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
594: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
595: v += 36;
596: }
597: if (usecprow) z = zarray + 6*ridx[i];
598: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
599: if (!usecprow) z += 6;
600: }
602: VecRestoreArrayRead(xx,&x);
603: VecRestoreArrayWrite(zz,&zarray);
604: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
605: return(0);
606: }
608: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
609: {
610: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
611: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
612: const PetscScalar *x,*xb;
613: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
614: const MatScalar *v;
615: PetscErrorCode ierr;
616: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
617: PetscBool usecprow=a->compressedrow.use;
620: VecGetArrayRead(xx,&x);
621: VecGetArrayWrite(zz,&zarray);
623: idx = a->j;
624: v = a->a;
625: if (usecprow) {
626: mbs = a->compressedrow.nrows;
627: ii = a->compressedrow.i;
628: ridx = a->compressedrow.rindex;
629: PetscArrayzero(zarray,7*a->mbs);
630: } else {
631: mbs = a->mbs;
632: ii = a->i;
633: z = zarray;
634: }
636: for (i=0; i<mbs; i++) {
637: n = ii[1] - ii[0];
638: ii++;
639: sum1 = 0.0;
640: sum2 = 0.0;
641: sum3 = 0.0;
642: sum4 = 0.0;
643: sum5 = 0.0;
644: sum6 = 0.0;
645: sum7 = 0.0;
647: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
648: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
649: for (j=0; j<n; j++) {
650: xb = x + 7*(*idx++);
651: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
652: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
653: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
654: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
655: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
656: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
657: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
658: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
659: v += 49;
660: }
661: if (usecprow) z = zarray + 7*ridx[i];
662: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
663: if (!usecprow) z += 7;
664: }
666: VecRestoreArrayRead(xx,&x);
667: VecRestoreArrayWrite(zz,&zarray);
668: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
669: return(0);
670: }
672: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
673: PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
674: {
675: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
676: PetscScalar *z = NULL,*work,*workt,*zarray;
677: const PetscScalar *x,*xb;
678: const MatScalar *v;
679: PetscErrorCode ierr;
680: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
681: const PetscInt *idx,*ii,*ridx=NULL;
682: PetscInt k;
683: PetscBool usecprow=a->compressedrow.use;
685: __m256d a0,a1,a2,a3,a4,a5;
686: __m256d w0,w1,w2,w3;
687: __m256d z0,z1,z2;
688: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
691: VecGetArrayRead(xx,&x);
692: VecGetArrayWrite(zz,&zarray);
694: idx = a->j;
695: v = a->a;
696: if (usecprow) {
697: mbs = a->compressedrow.nrows;
698: ii = a->compressedrow.i;
699: ridx = a->compressedrow.rindex;
700: PetscArrayzero(zarray,bs*a->mbs);
701: } else {
702: mbs = a->mbs;
703: ii = a->i;
704: z = zarray;
705: }
707: if (!a->mult_work) {
708: k = PetscMax(A->rmap->n,A->cmap->n);
709: PetscMalloc1(k+1,&a->mult_work);
710: }
712: work = a->mult_work;
713: for (i=0; i<mbs; i++) {
714: n = ii[1] - ii[0]; ii++;
715: workt = work;
716: for (j=0; j<n; j++) {
717: xb = x + bs*(*idx++);
718: for (k=0; k<bs; k++) workt[k] = xb[k];
719: workt += bs;
720: }
721: if (usecprow) z = zarray + bs*ridx[i];
723: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
725: for (j=0; j<n; j++) {
726: /* first column of a */
727: w0 = _mm256_set1_pd(work[j*9 ]);
728: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
729: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
730: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
732: /* second column of a */
733: w1 = _mm256_set1_pd(work[j*9+ 1]);
734: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
735: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
736: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
738: /* third column of a */
739: w2 = _mm256_set1_pd(work[j*9 +2]);
740: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
741: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
742: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
744: /* fourth column of a */
745: w3 = _mm256_set1_pd(work[j*9+ 3]);
746: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
747: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
748: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
750: /* fifth column of a */
751: w0 = _mm256_set1_pd(work[j*9+ 4]);
752: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
753: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
754: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
756: /* sixth column of a */
757: w1 = _mm256_set1_pd(work[j*9+ 5]);
758: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
759: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
760: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
762: /* seventh column of a */
763: w2 = _mm256_set1_pd(work[j*9+ 6]);
764: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
765: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
766: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
768: /* eigth column of a */
769: w3 = _mm256_set1_pd(work[j*9+ 7]);
770: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
771: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
772: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
774: /* ninth column of a */
775: w0 = _mm256_set1_pd(work[j*9+ 8]);
776: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
777: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
778: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
779: }
781: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
783: v += n*bs2;
784: if (!usecprow) z += bs;
785: }
786: VecRestoreArrayRead(xx,&x);
787: VecRestoreArrayWrite(zz,&zarray);
788: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
789: return(0);
790: }
791: #endif
793: PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
794: {
795: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
796: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
797: const PetscScalar *x,*xb;
798: PetscScalar *zarray,xv;
799: const MatScalar *v;
800: PetscErrorCode ierr;
801: const PetscInt *ii,*ij=a->j,*idx;
802: PetscInt mbs,i,j,k,n,*ridx=NULL;
803: PetscBool usecprow=a->compressedrow.use;
806: VecGetArrayRead(xx,&x);
807: VecGetArrayWrite(zz,&zarray);
809: v = a->a;
810: if (usecprow) {
811: mbs = a->compressedrow.nrows;
812: ii = a->compressedrow.i;
813: ridx = a->compressedrow.rindex;
814: PetscArrayzero(zarray,11*a->mbs);
815: } else {
816: mbs = a->mbs;
817: ii = a->i;
818: z = zarray;
819: }
821: for (i=0; i<mbs; i++) {
822: n = ii[i+1] - ii[i];
823: idx = ij + ii[i];
824: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
825: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
827: for (j=0; j<n; j++) {
828: xb = x + 11*(idx[j]);
830: for (k=0; k<11; k++) {
831: xv = xb[k];
832: sum1 += v[0]*xv;
833: sum2 += v[1]*xv;
834: sum3 += v[2]*xv;
835: sum4 += v[3]*xv;
836: sum5 += v[4]*xv;
837: sum6 += v[5]*xv;
838: sum7 += v[6]*xv;
839: sum8 += v[7]*xv;
840: sum9 += v[8]*xv;
841: sum10 += v[9]*xv;
842: sum11 += v[10]*xv;
843: v += 11;
844: }
845: }
846: if (usecprow) z = zarray + 11*ridx[i];
847: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
848: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
850: if (!usecprow) z += 11;
851: }
853: VecRestoreArrayRead(xx,&x);
854: VecRestoreArrayWrite(zz,&zarray);
855: PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);
856: return(0);
857: }
859: /* MatMult_SeqBAIJ_12 version 1: Columns in the block are accessed one at a time */
860: PetscErrorCode MatMult_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec zz)
861: {
862: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
863: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
864: const PetscScalar *x,*xb;
865: PetscScalar *zarray,xv;
866: const MatScalar *v;
867: PetscErrorCode ierr;
868: const PetscInt *ii,*ij=a->j,*idx;
869: PetscInt mbs,i,j,k,n,*ridx=NULL;
870: PetscBool usecprow=a->compressedrow.use;
873: VecGetArrayRead(xx,&x);
874: VecGetArrayWrite(zz,&zarray);
876: v = a->a;
877: if (usecprow) {
878: mbs = a->compressedrow.nrows;
879: ii = a->compressedrow.i;
880: ridx = a->compressedrow.rindex;
881: PetscArrayzero(zarray,12*a->mbs);
882: } else {
883: mbs = a->mbs;
884: ii = a->i;
885: z = zarray;
886: }
888: for (i=0; i<mbs; i++) {
889: n = ii[i+1] - ii[i];
890: idx = ij + ii[i];
891: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
892: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0;
894: for (j=0; j<n; j++) {
895: xb = x + 12*(idx[j]);
897: for (k=0; k<12; k++) {
898: xv = xb[k];
899: sum1 += v[0]*xv;
900: sum2 += v[1]*xv;
901: sum3 += v[2]*xv;
902: sum4 += v[3]*xv;
903: sum5 += v[4]*xv;
904: sum6 += v[5]*xv;
905: sum7 += v[6]*xv;
906: sum8 += v[7]*xv;
907: sum9 += v[8]*xv;
908: sum10 += v[9]*xv;
909: sum11 += v[10]*xv;
910: sum12 += v[11]*xv;
911: v += 12;
912: }
913: }
914: if (usecprow) z = zarray + 12*ridx[i];
915: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
916: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
917: if (!usecprow) z += 12;
918: }
919: VecRestoreArrayRead(xx,&x);
920: VecRestoreArrayWrite(zz,&zarray);
921: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
922: return(0);
923: }
925: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver1(Mat A,Vec xx,Vec yy,Vec zz)
926: {
927: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
928: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
929: const PetscScalar *x,*xb;
930: PetscScalar *zarray,*yarray,xv;
931: const MatScalar *v;
932: PetscErrorCode ierr;
933: const PetscInt *ii,*ij=a->j,*idx;
934: PetscInt mbs = a->mbs,i,j,k,n,*ridx=NULL;
935: PetscBool usecprow=a->compressedrow.use;
938: VecGetArrayRead(xx,&x);
939: VecGetArrayPair(yy,zz,&yarray,&zarray);
941: v = a->a;
942: if (usecprow) {
943: if (zz != yy) {
944: PetscArraycpy(zarray,yarray,12*mbs);
945: }
946: mbs = a->compressedrow.nrows;
947: ii = a->compressedrow.i;
948: ridx = a->compressedrow.rindex;
949: } else {
950: ii = a->i;
951: y = yarray;
952: z = zarray;
953: }
955: for (i=0; i<mbs; i++) {
956: n = ii[i+1] - ii[i];
957: idx = ij + ii[i];
959: if (usecprow){
960: y = yarray + 12*ridx[i];
961: z = zarray + 12*ridx[i];
962: }
963: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
964: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
966: for (j=0; j<n; j++) {
967: xb = x + 12*(idx[j]);
969: for (k=0; k<12; k++) {
970: xv = xb[k];
971: sum1 += v[0]*xv;
972: sum2 += v[1]*xv;
973: sum3 += v[2]*xv;
974: sum4 += v[3]*xv;
975: sum5 += v[4]*xv;
976: sum6 += v[5]*xv;
977: sum7 += v[6]*xv;
978: sum8 += v[7]*xv;
979: sum9 += v[8]*xv;
980: sum10 += v[9]*xv;
981: sum11 += v[10]*xv;
982: sum12 += v[11]*xv;
983: v += 12;
984: }
985: }
987: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
988: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
989: if (!usecprow) {
990: y += 12;
991: z += 12;
992: }
993: }
994: VecRestoreArrayRead(xx,&x);
995: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
996: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
997: return(0);
998: }
1000: /* MatMult_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1001: PetscErrorCode MatMult_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec zz)
1002: {
1003: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1004: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1005: const PetscScalar *x,*xb;
1006: PetscScalar x1,x2,x3,x4,*zarray;
1007: const MatScalar *v;
1008: PetscErrorCode ierr;
1009: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1010: PetscInt mbs,i,j,n;
1011: PetscBool usecprow=a->compressedrow.use;
1014: VecGetArrayRead(xx,&x);
1015: VecGetArrayWrite(zz,&zarray);
1017: v = a->a;
1018: if (usecprow) {
1019: mbs = a->compressedrow.nrows;
1020: ii = a->compressedrow.i;
1021: ridx = a->compressedrow.rindex;
1022: PetscArrayzero(zarray,12*a->mbs);
1023: } else {
1024: mbs = a->mbs;
1025: ii = a->i;
1026: z = zarray;
1027: }
1029: for (i=0; i<mbs; i++) {
1030: n = ii[i+1] - ii[i];
1031: idx = ij + ii[i];
1033: sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = sum7 = sum8 = sum9 = sum10 = sum11 = sum12 = 0;
1034: for (j=0; j<n; j++) {
1035: xb = x + 12*(idx[j]);
1036: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1038: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1039: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1040: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1041: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1042: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1043: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1044: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1045: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1046: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1047: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1048: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1049: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1050: v += 48;
1052: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1054: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1055: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1056: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1057: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1058: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1059: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1060: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1061: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1062: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1063: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1064: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1065: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1066: v += 48;
1068: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1069: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1070: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1071: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1072: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1073: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1074: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1075: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1076: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1077: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1078: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1079: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1080: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1081: v += 48;
1083: }
1084: if (usecprow) z = zarray + 12*ridx[i];
1085: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1086: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1087: if (!usecprow) z += 12;
1088: }
1089: VecRestoreArrayRead(xx,&x);
1090: VecRestoreArrayWrite(zz,&zarray);
1091: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1092: return(0);
1093: }
1095: /* MatMultAdd_SeqBAIJ_12_ver2 : Columns in the block are accessed in sets of 4,4,4 */
1096: PetscErrorCode MatMultAdd_SeqBAIJ_12_ver2(Mat A,Vec xx,Vec yy,Vec zz)
1097: {
1098: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1099: PetscScalar *z = NULL,*y = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12;
1100: const PetscScalar *x,*xb;
1101: PetscScalar x1,x2,x3,x4,*zarray,*yarray;
1102: const MatScalar *v;
1103: PetscErrorCode ierr;
1104: const PetscInt *ii,*ij=a->j,*idx,*ridx=NULL;
1105: PetscInt mbs = a->mbs,i,j,n;
1106: PetscBool usecprow=a->compressedrow.use;
1109: VecGetArrayRead(xx,&x);
1110: VecGetArrayPair(yy,zz,&yarray,&zarray);
1112: v = a->a;
1113: if (usecprow) {
1114: if (zz != yy) {
1115: PetscArraycpy(zarray,yarray,12*mbs);
1116: }
1117: mbs = a->compressedrow.nrows;
1118: ii = a->compressedrow.i;
1119: ridx = a->compressedrow.rindex;
1120: } else {
1121: ii = a->i;
1122: y = yarray;
1123: z = zarray;
1124: }
1126: for (i=0; i<mbs; i++) {
1127: n = ii[i+1] - ii[i];
1128: idx = ij + ii[i];
1130: if (usecprow) {
1131: y = yarray + 12*ridx[i];
1132: z = zarray + 12*ridx[i];
1133: }
1134: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1135: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10]; sum12 = y[11];
1137: for (j=0; j<n; j++) {
1138: xb = x + 12*(idx[j]);
1139: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1141: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1142: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1143: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1144: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1145: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1146: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1147: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1148: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1149: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1150: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1151: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1152: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1153: v += 48;
1155: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1157: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1158: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1159: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1160: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1161: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1162: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1163: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1164: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1165: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1166: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1167: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1168: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1169: v += 48;
1171: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1172: sum1 += v[0]*x1 + v[12]*x2 + v[24]*x3 + v[36]*x4;
1173: sum2 += v[1]*x1 + v[13]*x2 + v[25]*x3 + v[37]*x4;
1174: sum3 += v[2]*x1 + v[14]*x2 + v[26]*x3 + v[38]*x4;
1175: sum4 += v[3]*x1 + v[15]*x2 + v[27]*x3 + v[39]*x4;
1176: sum5 += v[4]*x1 + v[16]*x2 + v[28]*x3 + v[40]*x4;
1177: sum6 += v[5]*x1 + v[17]*x2 + v[29]*x3 + v[41]*x4;
1178: sum7 += v[6]*x1 + v[18]*x2 + v[30]*x3 + v[42]*x4;
1179: sum8 += v[7]*x1 + v[19]*x2 + v[31]*x3 + v[43]*x4;
1180: sum9 += v[8]*x1 + v[20]*x2 + v[32]*x3 + v[44]*x4;
1181: sum10 += v[9]*x1 + v[21]*x2 + v[33]*x3 + v[45]*x4;
1182: sum11 += v[10]*x1 + v[22]*x2 + v[34]*x3 + v[46]*x4;
1183: sum12 += v[11]*x1 + v[23]*x2 + v[35]*x3 + v[47]*x4;
1184: v += 48;
1186: }
1187: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1188: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12;
1189: if (!usecprow) {
1190: y += 12;
1191: z += 12;
1192: }
1193: }
1194: VecRestoreArrayRead(xx,&x);
1195: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1196: PetscLogFlops(288.0*a->nz - 12.0*a->nonzerorowcnt);
1197: return(0);
1198: }
1200: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
1201: PetscErrorCode MatMult_SeqBAIJ_12_AVX2(Mat A,Vec xx,Vec zz)
1202: {
1203: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1204: PetscScalar *z = NULL,*zarray;
1205: const PetscScalar *x,*work;
1206: const MatScalar *v = a->a;
1207: PetscErrorCode ierr;
1208: PetscInt mbs,i,j,n;
1209: const PetscInt *idx = a->j,*ii,*ridx=NULL;
1210: PetscBool usecprow=a->compressedrow.use;
1211: const PetscInt bs = 12, bs2 = 144;
1213: __m256d a0,a1,a2,a3,a4,a5;
1214: __m256d w0,w1,w2,w3;
1215: __m256d z0,z1,z2;
1218: VecGetArrayRead(xx,&x);
1219: VecGetArrayWrite(zz,&zarray);
1221: if (usecprow) {
1222: mbs = a->compressedrow.nrows;
1223: ii = a->compressedrow.i;
1224: ridx = a->compressedrow.rindex;
1225: PetscArrayzero(zarray,bs*a->mbs);
1226: } else {
1227: mbs = a->mbs;
1228: ii = a->i;
1229: z = zarray;
1230: }
1232: for (i=0; i<mbs; i++) {
1233: z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
1235: n = ii[1] - ii[0]; ii++;
1236: for (j=0; j<n; j++) {
1237: work = x + bs*(*idx++);
1239: /* first column of a */
1240: w0 = _mm256_set1_pd(work[0]);
1241: a0 = _mm256_loadu_pd(v+0); z0 = _mm256_fmadd_pd(a0,w0,z0);
1242: a1 = _mm256_loadu_pd(v+4); z1 = _mm256_fmadd_pd(a1,w0,z1);
1243: a2 = _mm256_loadu_pd(v+8); z2 = _mm256_fmadd_pd(a2,w0,z2);
1245: /* second column of a */
1246: w1 = _mm256_set1_pd(work[1]);
1247: a3 = _mm256_loadu_pd(v+12); z0 = _mm256_fmadd_pd(a3,w1,z0);
1248: a4 = _mm256_loadu_pd(v+16); z1 = _mm256_fmadd_pd(a4,w1,z1);
1249: a5 = _mm256_loadu_pd(v+20); z2 = _mm256_fmadd_pd(a5,w1,z2);
1251: /* third column of a */
1252: w2 = _mm256_set1_pd(work[2]);
1253: a0 = _mm256_loadu_pd(v+24); z0 = _mm256_fmadd_pd(a0,w2,z0);
1254: a1 = _mm256_loadu_pd(v+28); z1 = _mm256_fmadd_pd(a1,w2,z1);
1255: a2 = _mm256_loadu_pd(v+32); z2 = _mm256_fmadd_pd(a2,w2,z2);
1257: /* fourth column of a */
1258: w3 = _mm256_set1_pd(work[3]);
1259: a3 = _mm256_loadu_pd(v+36); z0 = _mm256_fmadd_pd(a3,w3,z0);
1260: a4 = _mm256_loadu_pd(v+40); z1 = _mm256_fmadd_pd(a4,w3,z1);
1261: a5 = _mm256_loadu_pd(v+44); z2 = _mm256_fmadd_pd(a5,w3,z2);
1263: /* fifth column of a */
1264: w0 = _mm256_set1_pd(work[4]);
1265: a0 = _mm256_loadu_pd(v+48); z0 = _mm256_fmadd_pd(a0,w0,z0);
1266: a1 = _mm256_loadu_pd(v+52); z1 = _mm256_fmadd_pd(a1,w0,z1);
1267: a2 = _mm256_loadu_pd(v+56); z2 = _mm256_fmadd_pd(a2,w0,z2);
1269: /* sixth column of a */
1270: w1 = _mm256_set1_pd(work[5]);
1271: a3 = _mm256_loadu_pd(v+60); z0 = _mm256_fmadd_pd(a3,w1,z0);
1272: a4 = _mm256_loadu_pd(v+64); z1 = _mm256_fmadd_pd(a4,w1,z1);
1273: a5 = _mm256_loadu_pd(v+68); z2 = _mm256_fmadd_pd(a5,w1,z2);
1275: /* seventh column of a */
1276: w2 = _mm256_set1_pd(work[6]);
1277: a0 = _mm256_loadu_pd(v+72); z0 = _mm256_fmadd_pd(a0,w2,z0);
1278: a1 = _mm256_loadu_pd(v+76); z1 = _mm256_fmadd_pd(a1,w2,z1);
1279: a2 = _mm256_loadu_pd(v+80); z2 = _mm256_fmadd_pd(a2,w2,z2);
1281: /* eigth column of a */
1282: w3 = _mm256_set1_pd(work[7]);
1283: a3 = _mm256_loadu_pd(v+84); z0 = _mm256_fmadd_pd(a3,w3,z0);
1284: a4 = _mm256_loadu_pd(v+88); z1 = _mm256_fmadd_pd(a4,w3,z1);
1285: a5 = _mm256_loadu_pd(v+92); z2 = _mm256_fmadd_pd(a5,w3,z2);
1287: /* ninth column of a */
1288: w0 = _mm256_set1_pd(work[8]);
1289: a0 = _mm256_loadu_pd(v+96); z0 = _mm256_fmadd_pd(a0,w0,z0);
1290: a1 = _mm256_loadu_pd(v+100); z1 = _mm256_fmadd_pd(a1,w0,z1);
1291: a2 = _mm256_loadu_pd(v+104); z2 = _mm256_fmadd_pd(a2,w0,z2);
1293: /* tenth column of a */
1294: w1 = _mm256_set1_pd(work[9]);
1295: a3 = _mm256_loadu_pd(v+108); z0 = _mm256_fmadd_pd(a3,w1,z0);
1296: a4 = _mm256_loadu_pd(v+112); z1 = _mm256_fmadd_pd(a4,w1,z1);
1297: a5 = _mm256_loadu_pd(v+116); z2 = _mm256_fmadd_pd(a5,w1,z2);
1299: /* eleventh column of a */
1300: w2 = _mm256_set1_pd(work[10]);
1301: a0 = _mm256_loadu_pd(v+120); z0 = _mm256_fmadd_pd(a0,w2,z0);
1302: a1 = _mm256_loadu_pd(v+124); z1 = _mm256_fmadd_pd(a1,w2,z1);
1303: a2 = _mm256_loadu_pd(v+128); z2 = _mm256_fmadd_pd(a2,w2,z2);
1305: /* twelveth column of a */
1306: w3 = _mm256_set1_pd(work[11]);
1307: a3 = _mm256_loadu_pd(v+132); z0 = _mm256_fmadd_pd(a3,w3,z0);
1308: a4 = _mm256_loadu_pd(v+136); z1 = _mm256_fmadd_pd(a4,w3,z1);
1309: a5 = _mm256_loadu_pd(v+140); z2 = _mm256_fmadd_pd(a5,w3,z2);
1311: v += bs2;
1312: }
1313: if (usecprow) z = zarray + bs*ridx[i];
1314: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_storeu_pd(&z[ 8], z2);
1315: if (!usecprow) z += bs;
1316: }
1317: VecRestoreArrayRead(xx,&x);
1318: VecRestoreArrayWrite(zz,&zarray);
1319: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1320: return(0);
1321: }
1322: #endif
1324: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
1325: /* Default MatMult for block size 15 */
1326: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
1327: {
1328: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1329: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1330: const PetscScalar *x,*xb;
1331: PetscScalar *zarray,xv;
1332: const MatScalar *v;
1333: PetscErrorCode ierr;
1334: const PetscInt *ii,*ij=a->j,*idx;
1335: PetscInt mbs,i,j,k,n,*ridx=NULL;
1336: PetscBool usecprow=a->compressedrow.use;
1339: VecGetArrayRead(xx,&x);
1340: VecGetArrayWrite(zz,&zarray);
1342: v = a->a;
1343: if (usecprow) {
1344: mbs = a->compressedrow.nrows;
1345: ii = a->compressedrow.i;
1346: ridx = a->compressedrow.rindex;
1347: PetscArrayzero(zarray,15*a->mbs);
1348: } else {
1349: mbs = a->mbs;
1350: ii = a->i;
1351: z = zarray;
1352: }
1354: for (i=0; i<mbs; i++) {
1355: n = ii[i+1] - ii[i];
1356: idx = ij + ii[i];
1357: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1358: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1360: for (j=0; j<n; j++) {
1361: xb = x + 15*(idx[j]);
1363: for (k=0; k<15; k++) {
1364: xv = xb[k];
1365: sum1 += v[0]*xv;
1366: sum2 += v[1]*xv;
1367: sum3 += v[2]*xv;
1368: sum4 += v[3]*xv;
1369: sum5 += v[4]*xv;
1370: sum6 += v[5]*xv;
1371: sum7 += v[6]*xv;
1372: sum8 += v[7]*xv;
1373: sum9 += v[8]*xv;
1374: sum10 += v[9]*xv;
1375: sum11 += v[10]*xv;
1376: sum12 += v[11]*xv;
1377: sum13 += v[12]*xv;
1378: sum14 += v[13]*xv;
1379: sum15 += v[14]*xv;
1380: v += 15;
1381: }
1382: }
1383: if (usecprow) z = zarray + 15*ridx[i];
1384: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1385: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1387: if (!usecprow) z += 15;
1388: }
1390: VecRestoreArrayRead(xx,&x);
1391: VecRestoreArrayWrite(zz,&zarray);
1392: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1393: return(0);
1394: }
1396: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
1397: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
1398: {
1399: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1400: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1401: const PetscScalar *x,*xb;
1402: PetscScalar x1,x2,x3,x4,*zarray;
1403: const MatScalar *v;
1404: PetscErrorCode ierr;
1405: const PetscInt *ii,*ij=a->j,*idx;
1406: PetscInt mbs,i,j,n,*ridx=NULL;
1407: PetscBool usecprow=a->compressedrow.use;
1410: VecGetArrayRead(xx,&x);
1411: VecGetArrayWrite(zz,&zarray);
1413: v = a->a;
1414: if (usecprow) {
1415: mbs = a->compressedrow.nrows;
1416: ii = a->compressedrow.i;
1417: ridx = a->compressedrow.rindex;
1418: PetscArrayzero(zarray,15*a->mbs);
1419: } else {
1420: mbs = a->mbs;
1421: ii = a->i;
1422: z = zarray;
1423: }
1425: for (i=0; i<mbs; i++) {
1426: n = ii[i+1] - ii[i];
1427: idx = ij + ii[i];
1428: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1429: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1431: for (j=0; j<n; j++) {
1432: xb = x + 15*(idx[j]);
1433: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1435: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1436: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1437: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1438: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1439: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1440: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1441: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1442: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1443: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1444: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1445: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1446: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1447: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1448: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1449: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1451: v += 60;
1453: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
1455: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1456: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1457: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1458: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1459: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1460: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1461: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1462: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1463: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1464: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1465: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1466: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1467: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1468: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1469: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1470: v += 60;
1472: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1473: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1474: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1475: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1476: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1477: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1478: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1479: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1480: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1481: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1482: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1483: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1484: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1485: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1486: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1487: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1488: v += 60;
1490: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1491: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1492: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1493: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1494: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1495: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1496: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1497: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1498: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1499: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1500: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1501: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1502: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1503: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1504: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1505: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1506: v += 45;
1507: }
1508: if (usecprow) z = zarray + 15*ridx[i];
1509: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1510: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1512: if (!usecprow) z += 15;
1513: }
1515: VecRestoreArrayRead(xx,&x);
1516: VecRestoreArrayWrite(zz,&zarray);
1517: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1518: return(0);
1519: }
1521: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
1522: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1523: {
1524: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1525: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1526: const PetscScalar *x,*xb;
1527: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1528: const MatScalar *v;
1529: PetscErrorCode ierr;
1530: const PetscInt *ii,*ij=a->j,*idx;
1531: PetscInt mbs,i,j,n,*ridx=NULL;
1532: PetscBool usecprow=a->compressedrow.use;
1535: VecGetArrayRead(xx,&x);
1536: VecGetArrayWrite(zz,&zarray);
1538: v = a->a;
1539: if (usecprow) {
1540: mbs = a->compressedrow.nrows;
1541: ii = a->compressedrow.i;
1542: ridx = a->compressedrow.rindex;
1543: PetscArrayzero(zarray,15*a->mbs);
1544: } else {
1545: mbs = a->mbs;
1546: ii = a->i;
1547: z = zarray;
1548: }
1550: for (i=0; i<mbs; i++) {
1551: n = ii[i+1] - ii[i];
1552: idx = ij + ii[i];
1553: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1554: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1556: for (j=0; j<n; j++) {
1557: xb = x + 15*(idx[j]);
1558: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1559: x8 = xb[7];
1561: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1562: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1563: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1564: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1565: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1566: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1567: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1568: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1569: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1570: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1571: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1572: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1573: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1574: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1575: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1576: v += 120;
1578: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1580: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1581: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1582: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1583: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1584: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1585: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1586: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1587: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1588: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1589: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1590: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1591: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1592: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1593: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1594: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1595: v += 105;
1596: }
1597: if (usecprow) z = zarray + 15*ridx[i];
1598: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1599: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1601: if (!usecprow) z += 15;
1602: }
1604: VecRestoreArrayRead(xx,&x);
1605: VecRestoreArrayWrite(zz,&zarray);
1606: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1607: return(0);
1608: }
1610: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
1611: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1612: {
1613: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1614: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1615: const PetscScalar *x,*xb;
1616: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1617: const MatScalar *v;
1618: PetscErrorCode ierr;
1619: const PetscInt *ii,*ij=a->j,*idx;
1620: PetscInt mbs,i,j,n,*ridx=NULL;
1621: PetscBool usecprow=a->compressedrow.use;
1624: VecGetArrayRead(xx,&x);
1625: VecGetArrayWrite(zz,&zarray);
1627: v = a->a;
1628: if (usecprow) {
1629: mbs = a->compressedrow.nrows;
1630: ii = a->compressedrow.i;
1631: ridx = a->compressedrow.rindex;
1632: PetscArrayzero(zarray,15*a->mbs);
1633: } else {
1634: mbs = a->mbs;
1635: ii = a->i;
1636: z = zarray;
1637: }
1639: for (i=0; i<mbs; i++) {
1640: n = ii[i+1] - ii[i];
1641: idx = ij + ii[i];
1642: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1643: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1645: for (j=0; j<n; j++) {
1646: xb = x + 15*(idx[j]);
1647: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1648: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1650: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1651: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1652: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1653: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1654: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1655: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1656: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1657: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1658: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1659: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1660: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1661: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1662: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1663: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1664: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1665: v += 225;
1666: }
1667: if (usecprow) z = zarray + 15*ridx[i];
1668: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1669: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1671: if (!usecprow) z += 15;
1672: }
1674: VecRestoreArrayRead(xx,&x);
1675: VecRestoreArrayWrite(zz,&zarray);
1676: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1677: return(0);
1678: }
1680: /*
1681: This will not work with MatScalar == float because it calls the BLAS
1682: */
1683: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1684: {
1685: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1686: PetscScalar *z = NULL,*work,*workt,*zarray;
1687: const PetscScalar *x,*xb;
1688: const MatScalar *v;
1689: PetscErrorCode ierr;
1690: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691: const PetscInt *idx,*ii,*ridx=NULL;
1692: PetscInt ncols,k;
1693: PetscBool usecprow=a->compressedrow.use;
1696: VecGetArrayRead(xx,&x);
1697: VecGetArrayWrite(zz,&zarray);
1699: idx = a->j;
1700: v = a->a;
1701: if (usecprow) {
1702: mbs = a->compressedrow.nrows;
1703: ii = a->compressedrow.i;
1704: ridx = a->compressedrow.rindex;
1705: PetscArrayzero(zarray,bs*a->mbs);
1706: } else {
1707: mbs = a->mbs;
1708: ii = a->i;
1709: z = zarray;
1710: }
1712: if (!a->mult_work) {
1713: k = PetscMax(A->rmap->n,A->cmap->n);
1714: PetscMalloc1(k+1,&a->mult_work);
1715: }
1716: work = a->mult_work;
1717: for (i=0; i<mbs; i++) {
1718: n = ii[1] - ii[0]; ii++;
1719: ncols = n*bs;
1720: workt = work;
1721: for (j=0; j<n; j++) {
1722: xb = x + bs*(*idx++);
1723: for (k=0; k<bs; k++) workt[k] = xb[k];
1724: workt += bs;
1725: }
1726: if (usecprow) z = zarray + bs*ridx[i];
1727: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1728: v += n*bs2;
1729: if (!usecprow) z += bs;
1730: }
1731: VecRestoreArrayRead(xx,&x);
1732: VecRestoreArrayWrite(zz,&zarray);
1733: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1734: return(0);
1735: }
1737: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1738: {
1739: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1740: const PetscScalar *x;
1741: PetscScalar *y,*z,sum;
1742: const MatScalar *v;
1743: PetscErrorCode ierr;
1744: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1745: const PetscInt *idx,*ii;
1746: PetscBool usecprow=a->compressedrow.use;
1749: VecGetArrayRead(xx,&x);
1750: VecGetArrayPair(yy,zz,&y,&z);
1752: idx = a->j;
1753: v = a->a;
1754: if (usecprow) {
1755: if (zz != yy) {
1756: PetscArraycpy(z,y,mbs);
1757: }
1758: mbs = a->compressedrow.nrows;
1759: ii = a->compressedrow.i;
1760: ridx = a->compressedrow.rindex;
1761: } else {
1762: ii = a->i;
1763: }
1765: for (i=0; i<mbs; i++) {
1766: n = ii[1] - ii[0];
1767: ii++;
1768: if (!usecprow) {
1769: sum = y[i];
1770: } else {
1771: sum = y[ridx[i]];
1772: }
1773: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1774: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1775: PetscSparseDensePlusDot(sum,x,v,idx,n);
1776: v += n;
1777: idx += n;
1778: if (usecprow) {
1779: z[ridx[i]] = sum;
1780: } else {
1781: z[i] = sum;
1782: }
1783: }
1784: VecRestoreArrayRead(xx,&x);
1785: VecRestoreArrayPair(yy,zz,&y,&z);
1786: PetscLogFlops(2.0*a->nz);
1787: return(0);
1788: }
1790: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1791: {
1792: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1793: PetscScalar *y = NULL,*z = NULL,sum1,sum2;
1794: const PetscScalar *x,*xb;
1795: PetscScalar x1,x2,*yarray,*zarray;
1796: const MatScalar *v;
1797: PetscErrorCode ierr;
1798: PetscInt mbs = a->mbs,i,n,j;
1799: const PetscInt *idx,*ii,*ridx = NULL;
1800: PetscBool usecprow = a->compressedrow.use;
1803: VecGetArrayRead(xx,&x);
1804: VecGetArrayPair(yy,zz,&yarray,&zarray);
1806: idx = a->j;
1807: v = a->a;
1808: if (usecprow) {
1809: if (zz != yy) {
1810: PetscArraycpy(zarray,yarray,2*mbs);
1811: }
1812: mbs = a->compressedrow.nrows;
1813: ii = a->compressedrow.i;
1814: ridx = a->compressedrow.rindex;
1815: } else {
1816: ii = a->i;
1817: y = yarray;
1818: z = zarray;
1819: }
1821: for (i=0; i<mbs; i++) {
1822: n = ii[1] - ii[0]; ii++;
1823: if (usecprow) {
1824: z = zarray + 2*ridx[i];
1825: y = yarray + 2*ridx[i];
1826: }
1827: sum1 = y[0]; sum2 = y[1];
1828: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1829: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1830: for (j=0; j<n; j++) {
1831: xb = x + 2*(*idx++);
1832: x1 = xb[0];
1833: x2 = xb[1];
1835: sum1 += v[0]*x1 + v[2]*x2;
1836: sum2 += v[1]*x1 + v[3]*x2;
1837: v += 4;
1838: }
1839: z[0] = sum1; z[1] = sum2;
1840: if (!usecprow) {
1841: z += 2; y += 2;
1842: }
1843: }
1844: VecRestoreArrayRead(xx,&x);
1845: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1846: PetscLogFlops(4.0*a->nz);
1847: return(0);
1848: }
1850: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1851: {
1852: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1853: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1854: const PetscScalar *x,*xb;
1855: const MatScalar *v;
1856: PetscErrorCode ierr;
1857: PetscInt mbs = a->mbs,i,j,n;
1858: const PetscInt *idx,*ii,*ridx = NULL;
1859: PetscBool usecprow = a->compressedrow.use;
1862: VecGetArrayRead(xx,&x);
1863: VecGetArrayPair(yy,zz,&yarray,&zarray);
1865: idx = a->j;
1866: v = a->a;
1867: if (usecprow) {
1868: if (zz != yy) {
1869: PetscArraycpy(zarray,yarray,3*mbs);
1870: }
1871: mbs = a->compressedrow.nrows;
1872: ii = a->compressedrow.i;
1873: ridx = a->compressedrow.rindex;
1874: } else {
1875: ii = a->i;
1876: y = yarray;
1877: z = zarray;
1878: }
1880: for (i=0; i<mbs; i++) {
1881: n = ii[1] - ii[0]; ii++;
1882: if (usecprow) {
1883: z = zarray + 3*ridx[i];
1884: y = yarray + 3*ridx[i];
1885: }
1886: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1887: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1888: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1889: for (j=0; j<n; j++) {
1890: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1891: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1892: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1893: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1894: v += 9;
1895: }
1896: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1897: if (!usecprow) {
1898: z += 3; y += 3;
1899: }
1900: }
1901: VecRestoreArrayRead(xx,&x);
1902: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1903: PetscLogFlops(18.0*a->nz);
1904: return(0);
1905: }
1907: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1908: {
1909: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1910: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1911: const PetscScalar *x,*xb;
1912: const MatScalar *v;
1913: PetscErrorCode ierr;
1914: PetscInt mbs = a->mbs,i,j,n;
1915: const PetscInt *idx,*ii,*ridx=NULL;
1916: PetscBool usecprow=a->compressedrow.use;
1919: VecGetArrayRead(xx,&x);
1920: VecGetArrayPair(yy,zz,&yarray,&zarray);
1922: idx = a->j;
1923: v = a->a;
1924: if (usecprow) {
1925: if (zz != yy) {
1926: PetscArraycpy(zarray,yarray,4*mbs);
1927: }
1928: mbs = a->compressedrow.nrows;
1929: ii = a->compressedrow.i;
1930: ridx = a->compressedrow.rindex;
1931: } else {
1932: ii = a->i;
1933: y = yarray;
1934: z = zarray;
1935: }
1937: for (i=0; i<mbs; i++) {
1938: n = ii[1] - ii[0]; ii++;
1939: if (usecprow) {
1940: z = zarray + 4*ridx[i];
1941: y = yarray + 4*ridx[i];
1942: }
1943: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1944: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1945: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1946: for (j=0; j<n; j++) {
1947: xb = x + 4*(*idx++);
1948: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1949: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1950: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1951: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1952: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1953: v += 16;
1954: }
1955: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1956: if (!usecprow) {
1957: z += 4; y += 4;
1958: }
1959: }
1960: VecRestoreArrayRead(xx,&x);
1961: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1962: PetscLogFlops(32.0*a->nz);
1963: return(0);
1964: }
1966: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1967: {
1968: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1969: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1970: const PetscScalar *x,*xb;
1971: PetscScalar *yarray,*zarray;
1972: const MatScalar *v;
1973: PetscErrorCode ierr;
1974: PetscInt mbs = a->mbs,i,j,n;
1975: const PetscInt *idx,*ii,*ridx = NULL;
1976: PetscBool usecprow=a->compressedrow.use;
1979: VecGetArrayRead(xx,&x);
1980: VecGetArrayPair(yy,zz,&yarray,&zarray);
1982: idx = a->j;
1983: v = a->a;
1984: if (usecprow) {
1985: if (zz != yy) {
1986: PetscArraycpy(zarray,yarray,5*mbs);
1987: }
1988: mbs = a->compressedrow.nrows;
1989: ii = a->compressedrow.i;
1990: ridx = a->compressedrow.rindex;
1991: } else {
1992: ii = a->i;
1993: y = yarray;
1994: z = zarray;
1995: }
1997: for (i=0; i<mbs; i++) {
1998: n = ii[1] - ii[0]; ii++;
1999: if (usecprow) {
2000: z = zarray + 5*ridx[i];
2001: y = yarray + 5*ridx[i];
2002: }
2003: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
2004: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2005: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2006: for (j=0; j<n; j++) {
2007: xb = x + 5*(*idx++);
2008: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
2009: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2010: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2011: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2012: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2013: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2014: v += 25;
2015: }
2016: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
2017: if (!usecprow) {
2018: z += 5; y += 5;
2019: }
2020: }
2021: VecRestoreArrayRead(xx,&x);
2022: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2023: PetscLogFlops(50.0*a->nz);
2024: return(0);
2025: }
2027: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
2028: {
2029: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2030: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
2031: const PetscScalar *x,*xb;
2032: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
2033: const MatScalar *v;
2034: PetscErrorCode ierr;
2035: PetscInt mbs = a->mbs,i,j,n;
2036: const PetscInt *idx,*ii,*ridx=NULL;
2037: PetscBool usecprow=a->compressedrow.use;
2040: VecGetArrayRead(xx,&x);
2041: VecGetArrayPair(yy,zz,&yarray,&zarray);
2043: idx = a->j;
2044: v = a->a;
2045: if (usecprow) {
2046: if (zz != yy) {
2047: PetscArraycpy(zarray,yarray,6*mbs);
2048: }
2049: mbs = a->compressedrow.nrows;
2050: ii = a->compressedrow.i;
2051: ridx = a->compressedrow.rindex;
2052: } else {
2053: ii = a->i;
2054: y = yarray;
2055: z = zarray;
2056: }
2058: for (i=0; i<mbs; i++) {
2059: n = ii[1] - ii[0]; ii++;
2060: if (usecprow) {
2061: z = zarray + 6*ridx[i];
2062: y = yarray + 6*ridx[i];
2063: }
2064: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
2065: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2066: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2067: for (j=0; j<n; j++) {
2068: xb = x + 6*(*idx++);
2069: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
2070: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2071: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2072: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2073: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2074: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2075: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2076: v += 36;
2077: }
2078: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
2079: if (!usecprow) {
2080: z += 6; y += 6;
2081: }
2082: }
2083: VecRestoreArrayRead(xx,&x);
2084: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2085: PetscLogFlops(72.0*a->nz);
2086: return(0);
2087: }
2089: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
2090: {
2091: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2092: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
2093: const PetscScalar *x,*xb;
2094: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
2095: const MatScalar *v;
2096: PetscErrorCode ierr;
2097: PetscInt mbs = a->mbs,i,j,n;
2098: const PetscInt *idx,*ii,*ridx = NULL;
2099: PetscBool usecprow=a->compressedrow.use;
2102: VecGetArrayRead(xx,&x);
2103: VecGetArrayPair(yy,zz,&yarray,&zarray);
2105: idx = a->j;
2106: v = a->a;
2107: if (usecprow) {
2108: if (zz != yy) {
2109: PetscArraycpy(zarray,yarray,7*mbs);
2110: }
2111: mbs = a->compressedrow.nrows;
2112: ii = a->compressedrow.i;
2113: ridx = a->compressedrow.rindex;
2114: } else {
2115: ii = a->i;
2116: y = yarray;
2117: z = zarray;
2118: }
2120: for (i=0; i<mbs; i++) {
2121: n = ii[1] - ii[0]; ii++;
2122: if (usecprow) {
2123: z = zarray + 7*ridx[i];
2124: y = yarray + 7*ridx[i];
2125: }
2126: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2127: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2128: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2129: for (j=0; j<n; j++) {
2130: xb = x + 7*(*idx++);
2131: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
2132: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2133: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2134: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2135: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2136: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2137: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2138: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2139: v += 49;
2140: }
2141: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2142: if (!usecprow) {
2143: z += 7; y += 7;
2144: }
2145: }
2146: VecRestoreArrayRead(xx,&x);
2147: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2148: PetscLogFlops(98.0*a->nz);
2149: return(0);
2150: }
2152: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
2153: PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
2154: {
2155: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2156: PetscScalar *z = NULL,*work,*workt,*zarray;
2157: const PetscScalar *x,*xb;
2158: const MatScalar *v;
2159: PetscErrorCode ierr;
2160: PetscInt mbs,i,j,n;
2161: PetscInt k;
2162: PetscBool usecprow=a->compressedrow.use;
2163: const PetscInt *idx,*ii,*ridx=NULL,bs = 9, bs2 = 81;
2165: __m256d a0,a1,a2,a3,a4,a5;
2166: __m256d w0,w1,w2,w3;
2167: __m256d z0,z1,z2;
2168: __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
2171: VecCopy(yy,zz);
2172: VecGetArrayRead(xx,&x);
2173: VecGetArray(zz,&zarray);
2175: idx = a->j;
2176: v = a->a;
2177: if (usecprow) {
2178: mbs = a->compressedrow.nrows;
2179: ii = a->compressedrow.i;
2180: ridx = a->compressedrow.rindex;
2181: } else {
2182: mbs = a->mbs;
2183: ii = a->i;
2184: z = zarray;
2185: }
2187: if (!a->mult_work) {
2188: k = PetscMax(A->rmap->n,A->cmap->n);
2189: PetscMalloc1(k+1,&a->mult_work);
2190: }
2192: work = a->mult_work;
2193: for (i=0; i<mbs; i++) {
2194: n = ii[1] - ii[0]; ii++;
2195: workt = work;
2196: for (j=0; j<n; j++) {
2197: xb = x + bs*(*idx++);
2198: for (k=0; k<bs; k++) workt[k] = xb[k];
2199: workt += bs;
2200: }
2201: if (usecprow) z = zarray + bs*ridx[i];
2203: z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
2205: for (j=0; j<n; j++) {
2206: /* first column of a */
2207: w0 = _mm256_set1_pd(work[j*9 ]);
2208: a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2209: a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2210: a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
2212: /* second column of a */
2213: w1 = _mm256_set1_pd(work[j*9+ 1]);
2214: a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2215: a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2216: a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2218: /* third column of a */
2219: w2 = _mm256_set1_pd(work[j*9+ 2]);
2220: a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
2221: a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
2222: a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
2224: /* fourth column of a */
2225: w3 = _mm256_set1_pd(work[j*9+ 3]);
2226: a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
2227: a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
2228: a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
2230: /* fifth column of a */
2231: w0 = _mm256_set1_pd(work[j*9+ 4]);
2232: a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
2233: a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
2234: a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
2236: /* sixth column of a */
2237: w1 = _mm256_set1_pd(work[j*9+ 5]);
2238: a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
2239: a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
2240: a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
2242: /* seventh column of a */
2243: w2 = _mm256_set1_pd(work[j*9+ 6]);
2244: a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
2245: a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
2246: a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
2248: /* eigth column of a */
2249: w3 = _mm256_set1_pd(work[j*9+ 7]);
2250: a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
2251: a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
2252: a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
2254: /* ninth column of a */
2255: w0 = _mm256_set1_pd(work[j*9+ 8]);
2256: a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
2257: a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
2258: a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
2259: }
2261: _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
2263: v += n*bs2;
2264: if (!usecprow) z += bs;
2265: }
2266: VecRestoreArrayRead(xx,&x);
2267: VecRestoreArray(zz,&zarray);
2268: PetscLogFlops(162.0*a->nz);
2269: return(0);
2270: }
2271: #endif
2273: PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
2274: {
2275: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2276: PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
2277: const PetscScalar *x,*xb;
2278: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
2279: const MatScalar *v;
2280: PetscErrorCode ierr;
2281: PetscInt mbs = a->mbs,i,j,n;
2282: const PetscInt *idx,*ii,*ridx = NULL;
2283: PetscBool usecprow=a->compressedrow.use;
2286: VecGetArrayRead(xx,&x);
2287: VecGetArrayPair(yy,zz,&yarray,&zarray);
2289: idx = a->j;
2290: v = a->a;
2291: if (usecprow) {
2292: if (zz != yy) {
2293: PetscArraycpy(zarray,yarray,7*mbs);
2294: }
2295: mbs = a->compressedrow.nrows;
2296: ii = a->compressedrow.i;
2297: ridx = a->compressedrow.rindex;
2298: } else {
2299: ii = a->i;
2300: y = yarray;
2301: z = zarray;
2302: }
2304: for (i=0; i<mbs; i++) {
2305: n = ii[1] - ii[0]; ii++;
2306: if (usecprow) {
2307: z = zarray + 11*ridx[i];
2308: y = yarray + 11*ridx[i];
2309: }
2310: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
2311: sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
2312: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2313: PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2314: for (j=0; j<n; j++) {
2315: xb = x + 11*(*idx++);
2316: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
2317: sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
2318: sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
2319: sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
2320: sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
2321: sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
2322: sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
2323: sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
2324: sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
2325: sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
2326: sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
2327: sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
2328: v += 121;
2329: }
2330: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
2331: z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
2332: if (!usecprow) {
2333: z += 11; y += 11;
2334: }
2335: }
2336: VecRestoreArrayRead(xx,&x);
2337: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
2338: PetscLogFlops(242.0*a->nz);
2339: return(0);
2340: }
2342: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
2343: {
2344: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2345: PetscScalar *z = NULL,*work,*workt,*zarray;
2346: const PetscScalar *x,*xb;
2347: const MatScalar *v;
2348: PetscErrorCode ierr;
2349: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2350: PetscInt ncols,k;
2351: const PetscInt *ridx = NULL,*idx,*ii;
2352: PetscBool usecprow = a->compressedrow.use;
2355: VecCopy(yy,zz);
2356: VecGetArrayRead(xx,&x);
2357: VecGetArray(zz,&zarray);
2359: idx = a->j;
2360: v = a->a;
2361: if (usecprow) {
2362: mbs = a->compressedrow.nrows;
2363: ii = a->compressedrow.i;
2364: ridx = a->compressedrow.rindex;
2365: } else {
2366: mbs = a->mbs;
2367: ii = a->i;
2368: z = zarray;
2369: }
2371: if (!a->mult_work) {
2372: k = PetscMax(A->rmap->n,A->cmap->n);
2373: PetscMalloc1(k+1,&a->mult_work);
2374: }
2375: work = a->mult_work;
2376: for (i=0; i<mbs; i++) {
2377: n = ii[1] - ii[0]; ii++;
2378: ncols = n*bs;
2379: workt = work;
2380: for (j=0; j<n; j++) {
2381: xb = x + bs*(*idx++);
2382: for (k=0; k<bs; k++) workt[k] = xb[k];
2383: workt += bs;
2384: }
2385: if (usecprow) z = zarray + bs*ridx[i];
2386: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
2387: v += n*bs2;
2388: if (!usecprow) z += bs;
2389: }
2390: VecRestoreArrayRead(xx,&x);
2391: VecRestoreArray(zz,&zarray);
2392: PetscLogFlops(2.0*a->nz*bs2);
2393: return(0);
2394: }
2396: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2397: {
2398: PetscScalar zero = 0.0;
2402: VecSet(zz,zero);
2403: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2404: return(0);
2405: }
2407: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
2408: {
2409: PetscScalar zero = 0.0;
2413: VecSet(zz,zero);
2414: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
2415: return(0);
2416: }
2418: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2419: {
2420: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2421: PetscScalar *z,x1,x2,x3,x4,x5;
2422: const PetscScalar *x,*xb = NULL;
2423: const MatScalar *v;
2424: PetscErrorCode ierr;
2425: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
2426: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2427: Mat_CompressedRow cprow = a->compressedrow;
2428: PetscBool usecprow = cprow.use;
2431: if (yy != zz) { VecCopy(yy,zz); }
2432: VecGetArrayRead(xx,&x);
2433: VecGetArray(zz,&z);
2435: idx = a->j;
2436: v = a->a;
2437: if (usecprow) {
2438: mbs = cprow.nrows;
2439: ii = cprow.i;
2440: ridx = cprow.rindex;
2441: } else {
2442: mbs=a->mbs;
2443: ii = a->i;
2444: xb = x;
2445: }
2447: switch (bs) {
2448: case 1:
2449: for (i=0; i<mbs; i++) {
2450: if (usecprow) xb = x + ridx[i];
2451: x1 = xb[0];
2452: ib = idx + ii[0];
2453: n = ii[1] - ii[0]; ii++;
2454: for (j=0; j<n; j++) {
2455: rval = ib[j];
2456: z[rval] += PetscConj(*v) * x1;
2457: v++;
2458: }
2459: if (!usecprow) xb++;
2460: }
2461: break;
2462: case 2:
2463: for (i=0; i<mbs; i++) {
2464: if (usecprow) xb = x + 2*ridx[i];
2465: x1 = xb[0]; x2 = xb[1];
2466: ib = idx + ii[0];
2467: n = ii[1] - ii[0]; ii++;
2468: for (j=0; j<n; j++) {
2469: rval = ib[j]*2;
2470: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2471: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2472: v += 4;
2473: }
2474: if (!usecprow) xb += 2;
2475: }
2476: break;
2477: case 3:
2478: for (i=0; i<mbs; i++) {
2479: if (usecprow) xb = x + 3*ridx[i];
2480: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2481: ib = idx + ii[0];
2482: n = ii[1] - ii[0]; ii++;
2483: for (j=0; j<n; j++) {
2484: rval = ib[j]*3;
2485: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2486: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2487: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2488: v += 9;
2489: }
2490: if (!usecprow) xb += 3;
2491: }
2492: break;
2493: case 4:
2494: for (i=0; i<mbs; i++) {
2495: if (usecprow) xb = x + 4*ridx[i];
2496: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2497: ib = idx + ii[0];
2498: n = ii[1] - ii[0]; ii++;
2499: for (j=0; j<n; j++) {
2500: rval = ib[j]*4;
2501: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2502: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2503: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2504: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2505: v += 16;
2506: }
2507: if (!usecprow) xb += 4;
2508: }
2509: break;
2510: case 5:
2511: for (i=0; i<mbs; i++) {
2512: if (usecprow) xb = x + 5*ridx[i];
2513: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2514: x4 = xb[3]; x5 = xb[4];
2515: ib = idx + ii[0];
2516: n = ii[1] - ii[0]; ii++;
2517: for (j=0; j<n; j++) {
2518: rval = ib[j]*5;
2519: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2520: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2521: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2522: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2523: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2524: v += 25;
2525: }
2526: if (!usecprow) xb += 5;
2527: }
2528: break;
2529: default: /* block sizes larger than 5 by 5 are handled by BLAS */
2530: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2531: #if 0
2532: {
2533: PetscInt ncols,k,bs2=a->bs2;
2534: PetscScalar *work,*workt,zb;
2535: const PetscScalar *xtmp;
2536: if (!a->mult_work) {
2537: k = PetscMax(A->rmap->n,A->cmap->n);
2538: PetscMalloc1(k+1,&a->mult_work);
2539: }
2540: work = a->mult_work;
2541: xtmp = x;
2542: for (i=0; i<mbs; i++) {
2543: n = ii[1] - ii[0]; ii++;
2544: ncols = n*bs;
2545: PetscArrayzero(work,ncols);
2546: if (usecprow) xtmp = x + bs*ridx[i];
2547: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2548: v += n*bs2;
2549: if (!usecprow) xtmp += bs;
2550: workt = work;
2551: for (j=0; j<n; j++) {
2552: zb = z + bs*(*idx++);
2553: for (k=0; k<bs; k++) zb[k] += workt[k] ;
2554: workt += bs;
2555: }
2556: }
2557: }
2558: #endif
2559: }
2560: VecRestoreArrayRead(xx,&x);
2561: VecRestoreArray(zz,&z);
2562: PetscLogFlops(2.0*a->nz*a->bs2);
2563: return(0);
2564: }
2566: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2567: {
2568: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2569: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2570: const PetscScalar *x,*xb = NULL;
2571: const MatScalar *v;
2572: PetscErrorCode ierr;
2573: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2574: const PetscInt *idx,*ii,*ib,*ridx = NULL;
2575: Mat_CompressedRow cprow = a->compressedrow;
2576: PetscBool usecprow=cprow.use;
2579: if (yy != zz) { VecCopy(yy,zz); }
2580: VecGetArrayRead(xx,&x);
2581: VecGetArray(zz,&z);
2583: idx = a->j;
2584: v = a->a;
2585: if (usecprow) {
2586: mbs = cprow.nrows;
2587: ii = cprow.i;
2588: ridx = cprow.rindex;
2589: } else {
2590: mbs=a->mbs;
2591: ii = a->i;
2592: xb = x;
2593: }
2595: switch (bs) {
2596: case 1:
2597: for (i=0; i<mbs; i++) {
2598: if (usecprow) xb = x + ridx[i];
2599: x1 = xb[0];
2600: ib = idx + ii[0];
2601: n = ii[1] - ii[0]; ii++;
2602: for (j=0; j<n; j++) {
2603: rval = ib[j];
2604: z[rval] += *v * x1;
2605: v++;
2606: }
2607: if (!usecprow) xb++;
2608: }
2609: break;
2610: case 2:
2611: for (i=0; i<mbs; i++) {
2612: if (usecprow) xb = x + 2*ridx[i];
2613: x1 = xb[0]; x2 = xb[1];
2614: ib = idx + ii[0];
2615: n = ii[1] - ii[0]; ii++;
2616: for (j=0; j<n; j++) {
2617: rval = ib[j]*2;
2618: z[rval++] += v[0]*x1 + v[1]*x2;
2619: z[rval++] += v[2]*x1 + v[3]*x2;
2620: v += 4;
2621: }
2622: if (!usecprow) xb += 2;
2623: }
2624: break;
2625: case 3:
2626: for (i=0; i<mbs; i++) {
2627: if (usecprow) xb = x + 3*ridx[i];
2628: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2629: ib = idx + ii[0];
2630: n = ii[1] - ii[0]; ii++;
2631: for (j=0; j<n; j++) {
2632: rval = ib[j]*3;
2633: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2634: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2635: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2636: v += 9;
2637: }
2638: if (!usecprow) xb += 3;
2639: }
2640: break;
2641: case 4:
2642: for (i=0; i<mbs; i++) {
2643: if (usecprow) xb = x + 4*ridx[i];
2644: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2645: ib = idx + ii[0];
2646: n = ii[1] - ii[0]; ii++;
2647: for (j=0; j<n; j++) {
2648: rval = ib[j]*4;
2649: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2650: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2651: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2652: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2653: v += 16;
2654: }
2655: if (!usecprow) xb += 4;
2656: }
2657: break;
2658: case 5:
2659: for (i=0; i<mbs; i++) {
2660: if (usecprow) xb = x + 5*ridx[i];
2661: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2662: x4 = xb[3]; x5 = xb[4];
2663: ib = idx + ii[0];
2664: n = ii[1] - ii[0]; ii++;
2665: for (j=0; j<n; j++) {
2666: rval = ib[j]*5;
2667: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2668: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2669: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2670: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2671: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2672: v += 25;
2673: }
2674: if (!usecprow) xb += 5;
2675: }
2676: break;
2677: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2678: PetscInt ncols,k;
2679: PetscScalar *work,*workt;
2680: const PetscScalar *xtmp;
2681: if (!a->mult_work) {
2682: k = PetscMax(A->rmap->n,A->cmap->n);
2683: PetscMalloc1(k+1,&a->mult_work);
2684: }
2685: work = a->mult_work;
2686: xtmp = x;
2687: for (i=0; i<mbs; i++) {
2688: n = ii[1] - ii[0]; ii++;
2689: ncols = n*bs;
2690: PetscArrayzero(work,ncols);
2691: if (usecprow) xtmp = x + bs*ridx[i];
2692: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2693: v += n*bs2;
2694: if (!usecprow) xtmp += bs;
2695: workt = work;
2696: for (j=0; j<n; j++) {
2697: zb = z + bs*(*idx++);
2698: for (k=0; k<bs; k++) zb[k] += workt[k];
2699: workt += bs;
2700: }
2701: }
2702: }
2703: }
2704: VecRestoreArrayRead(xx,&x);
2705: VecRestoreArray(zz,&z);
2706: PetscLogFlops(2.0*a->nz*a->bs2);
2707: return(0);
2708: }
2710: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2711: {
2712: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2713: PetscInt totalnz = a->bs2*a->nz;
2714: PetscScalar oalpha = alpha;
2716: PetscBLASInt one = 1,tnz;
2719: PetscBLASIntCast(totalnz,&tnz);
2720: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2721: PetscLogFlops(totalnz);
2722: return(0);
2723: }
2725: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2726: {
2728: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2729: MatScalar *v = a->a;
2730: PetscReal sum = 0.0;
2731: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2734: if (type == NORM_FROBENIUS) {
2735: #if defined(PETSC_USE_REAL___FP16)
2736: PetscBLASInt one = 1,cnt = bs2*nz;
2737: PetscStackCallBLAS("BLASnrm2",*norm = BLASnrm2_(&cnt,v,&one));
2738: #else
2739: for (i=0; i<bs2*nz; i++) {
2740: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2741: }
2742: #endif
2743: *norm = PetscSqrtReal(sum);
2744: PetscLogFlops(2.0*bs2*nz);
2745: } else if (type == NORM_1) { /* maximum column sum */
2746: PetscReal *tmp;
2747: PetscInt *bcol = a->j;
2748: PetscCalloc1(A->cmap->n+1,&tmp);
2749: for (i=0; i<nz; i++) {
2750: for (j=0; j<bs; j++) {
2751: k1 = bs*(*bcol) + j; /* column index */
2752: for (k=0; k<bs; k++) {
2753: tmp[k1] += PetscAbsScalar(*v); v++;
2754: }
2755: }
2756: bcol++;
2757: }
2758: *norm = 0.0;
2759: for (j=0; j<A->cmap->n; j++) {
2760: if (tmp[j] > *norm) *norm = tmp[j];
2761: }
2762: PetscFree(tmp);
2763: PetscLogFlops(PetscMax(bs2*nz-1,0));
2764: } else if (type == NORM_INFINITY) { /* maximum row sum */
2765: *norm = 0.0;
2766: for (k=0; k<bs; k++) {
2767: for (j=0; j<a->mbs; j++) {
2768: v = a->a + bs2*a->i[j] + k;
2769: sum = 0.0;
2770: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2771: for (k1=0; k1<bs; k1++) {
2772: sum += PetscAbsScalar(*v);
2773: v += bs;
2774: }
2775: }
2776: if (sum > *norm) *norm = sum;
2777: }
2778: }
2779: PetscLogFlops(PetscMax(bs2*nz-1,0));
2780: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2781: return(0);
2782: }
2785: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2786: {
2787: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2791: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2792: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2793: *flg = PETSC_FALSE;
2794: return(0);
2795: }
2797: /* if the a->i are the same */
2798: PetscArraycmp(a->i,b->i,a->mbs+1,flg);
2799: if (!*flg) return(0);
2801: /* if a->j are the same */
2802: PetscArraycmp(a->j,b->j,a->nz,flg);
2803: if (!*flg) return(0);
2805: /* if a->a are the same */
2806: PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);
2807: return(0);
2809: }
2811: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2812: {
2813: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2815: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2816: PetscScalar *x,zero = 0.0;
2817: MatScalar *aa,*aa_j;
2820: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2821: bs = A->rmap->bs;
2822: aa = a->a;
2823: ai = a->i;
2824: aj = a->j;
2825: ambs = a->mbs;
2826: bs2 = a->bs2;
2828: VecSet(v,zero);
2829: VecGetArray(v,&x);
2830: VecGetLocalSize(v,&n);
2831: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2832: for (i=0; i<ambs; i++) {
2833: for (j=ai[i]; j<ai[i+1]; j++) {
2834: if (aj[j] == i) {
2835: row = i*bs;
2836: aa_j = aa+j*bs2;
2837: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2838: break;
2839: }
2840: }
2841: }
2842: VecRestoreArray(v,&x);
2843: return(0);
2844: }
2846: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2847: {
2848: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2849: const PetscScalar *l,*r,*li,*ri;
2850: PetscScalar x;
2851: MatScalar *aa, *v;
2852: PetscErrorCode ierr;
2853: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2854: const PetscInt *ai,*aj;
2857: ai = a->i;
2858: aj = a->j;
2859: aa = a->a;
2860: m = A->rmap->n;
2861: n = A->cmap->n;
2862: bs = A->rmap->bs;
2863: mbs = a->mbs;
2864: bs2 = a->bs2;
2865: if (ll) {
2866: VecGetArrayRead(ll,&l);
2867: VecGetLocalSize(ll,&lm);
2868: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2869: for (i=0; i<mbs; i++) { /* for each block row */
2870: M = ai[i+1] - ai[i];
2871: li = l + i*bs;
2872: v = aa + bs2*ai[i];
2873: for (j=0; j<M; j++) { /* for each block */
2874: for (k=0; k<bs2; k++) {
2875: (*v++) *= li[k%bs];
2876: }
2877: }
2878: }
2879: VecRestoreArrayRead(ll,&l);
2880: PetscLogFlops(a->nz);
2881: }
2883: if (rr) {
2884: VecGetArrayRead(rr,&r);
2885: VecGetLocalSize(rr,&rn);
2886: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2887: for (i=0; i<mbs; i++) { /* for each block row */
2888: iai = ai[i];
2889: M = ai[i+1] - iai;
2890: v = aa + bs2*iai;
2891: for (j=0; j<M; j++) { /* for each block */
2892: ri = r + bs*aj[iai+j];
2893: for (k=0; k<bs; k++) {
2894: x = ri[k];
2895: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2896: v += bs;
2897: }
2898: }
2899: }
2900: VecRestoreArrayRead(rr,&r);
2901: PetscLogFlops(a->nz);
2902: }
2903: return(0);
2904: }
2907: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2908: {
2909: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2912: info->block_size = a->bs2;
2913: info->nz_allocated = a->bs2*a->maxnz;
2914: info->nz_used = a->bs2*a->nz;
2915: info->nz_unneeded = info->nz_allocated - info->nz_used;
2916: info->assemblies = A->num_ass;
2917: info->mallocs = A->info.mallocs;
2918: info->memory = ((PetscObject)A)->mem;
2919: if (A->factortype) {
2920: info->fill_ratio_given = A->info.fill_ratio_given;
2921: info->fill_ratio_needed = A->info.fill_ratio_needed;
2922: info->factor_mallocs = A->info.factor_mallocs;
2923: } else {
2924: info->fill_ratio_given = 0;
2925: info->fill_ratio_needed = 0;
2926: info->factor_mallocs = 0;
2927: }
2928: return(0);
2929: }
2931: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2932: {
2933: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2937: PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);
2938: return(0);
2939: }
2941: PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2942: {
2946: MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);
2947: C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2948: return(0);
2949: }
2951: PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2952: {
2953: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2954: PetscScalar *z = NULL,sum1;
2955: const PetscScalar *xb;
2956: PetscScalar x1;
2957: const MatScalar *v,*vv;
2958: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2959: PetscBool usecprow=a->compressedrow.use;
2962: idx = a->j;
2963: v = a->a;
2964: if (usecprow) {
2965: mbs = a->compressedrow.nrows;
2966: ii = a->compressedrow.i;
2967: ridx = a->compressedrow.rindex;
2968: } else {
2969: mbs = a->mbs;
2970: ii = a->i;
2971: z = c;
2972: }
2974: for (i=0; i<mbs; i++) {
2975: n = ii[1] - ii[0]; ii++;
2976: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2977: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2978: if (usecprow) z = c + ridx[i];
2979: jj = idx;
2980: vv = v;
2981: for (k=0; k<cn; k++) {
2982: idx = jj;
2983: v = vv;
2984: sum1 = 0.0;
2985: for (j=0; j<n; j++) {
2986: xb = b + (*idx++); x1 = xb[0+k*bm];
2987: sum1 += v[0]*x1;
2988: v += 1;
2989: }
2990: z[0+k*cm] = sum1;
2991: }
2992: if (!usecprow) z += 1;
2993: }
2994: return(0);
2995: }
2997: PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2998: {
2999: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3000: PetscScalar *z = NULL,sum1,sum2;
3001: const PetscScalar *xb;
3002: PetscScalar x1,x2;
3003: const MatScalar *v,*vv;
3004: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3005: PetscBool usecprow=a->compressedrow.use;
3008: idx = a->j;
3009: v = a->a;
3010: if (usecprow) {
3011: mbs = a->compressedrow.nrows;
3012: ii = a->compressedrow.i;
3013: ridx = a->compressedrow.rindex;
3014: } else {
3015: mbs = a->mbs;
3016: ii = a->i;
3017: z = c;
3018: }
3020: for (i=0; i<mbs; i++) {
3021: n = ii[1] - ii[0]; ii++;
3022: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3023: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3024: if (usecprow) z = c + 2*ridx[i];
3025: jj = idx;
3026: vv = v;
3027: for (k=0; k<cn; k++) {
3028: idx = jj;
3029: v = vv;
3030: sum1 = 0.0; sum2 = 0.0;
3031: for (j=0; j<n; j++) {
3032: xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
3033: sum1 += v[0]*x1 + v[2]*x2;
3034: sum2 += v[1]*x1 + v[3]*x2;
3035: v += 4;
3036: }
3037: z[0+k*cm] = sum1; z[1+k*cm] = sum2;
3038: }
3039: if (!usecprow) z += 2;
3040: }
3041: return(0);
3042: }
3044: PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3045: {
3046: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3047: PetscScalar *z = NULL,sum1,sum2,sum3;
3048: const PetscScalar *xb;
3049: PetscScalar x1,x2,x3;
3050: const MatScalar *v,*vv;
3051: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3052: PetscBool usecprow=a->compressedrow.use;
3055: idx = a->j;
3056: v = a->a;
3057: if (usecprow) {
3058: mbs = a->compressedrow.nrows;
3059: ii = a->compressedrow.i;
3060: ridx = a->compressedrow.rindex;
3061: } else {
3062: mbs = a->mbs;
3063: ii = a->i;
3064: z = c;
3065: }
3067: for (i=0; i<mbs; i++) {
3068: n = ii[1] - ii[0]; ii++;
3069: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3070: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3071: if (usecprow) z = c + 3*ridx[i];
3072: jj = idx;
3073: vv = v;
3074: for (k=0; k<cn; k++) {
3075: idx = jj;
3076: v = vv;
3077: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
3078: for (j=0; j<n; j++) {
3079: xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
3080: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
3081: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
3082: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
3083: v += 9;
3084: }
3085: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
3086: }
3087: if (!usecprow) z += 3;
3088: }
3089: return(0);
3090: }
3092: PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3093: {
3094: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3095: PetscScalar *z = NULL,sum1,sum2,sum3,sum4;
3096: const PetscScalar *xb;
3097: PetscScalar x1,x2,x3,x4;
3098: const MatScalar *v,*vv;
3099: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3100: PetscBool usecprow=a->compressedrow.use;
3103: idx = a->j;
3104: v = a->a;
3105: if (usecprow) {
3106: mbs = a->compressedrow.nrows;
3107: ii = a->compressedrow.i;
3108: ridx = a->compressedrow.rindex;
3109: } else {
3110: mbs = a->mbs;
3111: ii = a->i;
3112: z = c;
3113: }
3115: for (i=0; i<mbs; i++) {
3116: n = ii[1] - ii[0]; ii++;
3117: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3118: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3119: if (usecprow) z = c + 4*ridx[i];
3120: jj = idx;
3121: vv = v;
3122: for (k=0; k<cn; k++) {
3123: idx = jj;
3124: v = vv;
3125: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
3126: for (j=0; j<n; j++) {
3127: xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
3128: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3129: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3130: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3131: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3132: v += 16;
3133: }
3134: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
3135: }
3136: if (!usecprow) z += 4;
3137: }
3138: return(0);
3139: }
3141: PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
3142: {
3143: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3144: PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5;
3145: const PetscScalar *xb;
3146: PetscScalar x1,x2,x3,x4,x5;
3147: const MatScalar *v,*vv;
3148: PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
3149: PetscBool usecprow=a->compressedrow.use;
3152: idx = a->j;
3153: v = a->a;
3154: if (usecprow) {
3155: mbs = a->compressedrow.nrows;
3156: ii = a->compressedrow.i;
3157: ridx = a->compressedrow.rindex;
3158: } else {
3159: mbs = a->mbs;
3160: ii = a->i;
3161: z = c;
3162: }
3164: for (i=0; i<mbs; i++) {
3165: n = ii[1] - ii[0]; ii++;
3166: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
3167: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
3168: if (usecprow) z = c + 5*ridx[i];
3169: jj = idx;
3170: vv = v;
3171: for (k=0; k<cn; k++) {
3172: idx = jj;
3173: v = vv;
3174: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
3175: for (j=0; j<n; j++) {
3176: xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
3177: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3178: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3179: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3180: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3181: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3182: v += 25;
3183: }
3184: z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
3185: }
3186: if (!usecprow) z += 5;
3187: }
3188: return(0);
3189: }
3191: PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
3192: {
3193: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
3194: Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
3195: Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
3196: PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
3197: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
3198: PetscBLASInt bbs,bcn,bbm,bcm;
3199: PetscScalar *z = NULL;
3200: PetscScalar *c,*b;
3201: const MatScalar *v;
3202: const PetscInt *idx,*ii,*ridx=NULL;
3203: PetscScalar _DZero=0.0,_DOne=1.0;
3204: PetscBool usecprow=a->compressedrow.use;
3205: PetscErrorCode ierr;
3208: if (!cm || !cn) return(0);
3209: if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
3210: if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
3211: if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
3212: b = bd->v;
3213: if (a->nonzerorowcnt != A->rmap->n) {
3214: MatZeroEntries(C);
3215: }
3216: MatDenseGetArray(C,&c);
3217: switch (bs) {
3218: case 1:
3219: MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
3220: break;
3221: case 2:
3222: MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
3223: break;
3224: case 3:
3225: MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
3226: break;
3227: case 4:
3228: MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
3229: break;
3230: case 5:
3231: MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
3232: break;
3233: default: /* block sizes larger than 5 by 5 are handled by BLAS */
3234: PetscBLASIntCast(bs,&bbs);
3235: PetscBLASIntCast(cn,&bcn);
3236: PetscBLASIntCast(bm,&bbm);
3237: PetscBLASIntCast(cm,&bcm);
3238: idx = a->j;
3239: v = a->a;
3240: if (usecprow) {
3241: mbs = a->compressedrow.nrows;
3242: ii = a->compressedrow.i;
3243: ridx = a->compressedrow.rindex;
3244: } else {
3245: mbs = a->mbs;
3246: ii = a->i;
3247: z = c;
3248: }
3249: for (i=0; i<mbs; i++) {
3250: n = ii[1] - ii[0]; ii++;
3251: if (usecprow) z = c + bs*ridx[i];
3252: if (n) {
3253: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
3254: v += bs2;
3255: }
3256: for (j=1; j<n; j++) {
3257: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
3258: v += bs2;
3259: }
3260: if (!usecprow) z += bs;
3261: }
3262: }
3263: MatDenseRestoreArray(C,&c);
3264: PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);
3265: return(0);
3266: }