Actual source code: sfpack.c
1: #include "petsc/private/sfimpl.h"
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
7: #if defined(PETSC_HAVE_CUDA)
8: #include <cuda_runtime.h>
9: #include <petsccublas.h>
10: #endif
11: #if defined(PETSC_HAVE_HIP)
12: #include <hip/hip_runtime.h>
13: #include <petschipblas.h>
14: #endif
15: /*
16: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
17: * therefore we pack data types manually. This file defines packing routines for the standard data types.
18: */
20: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
22: /* Operations working like s += t */
23: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */
24: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
25: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
26: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0)
27: /* Ref MPI MAXLOC */
28: #define OP_XLOC(op,s,t) \
29: do { \
30: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
31: else if (!((s).u op (t).u)) s = t; \
32: } while (0)
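/* Illustration (editorial sketch, not part of the library source): with op '>' OP_XLOC implements
   MPI_MAXLOC semantics on a {u,i} pair, where ties keep the smaller index. Using the
   PairType(int,int) struct typedef'ed further below:

     PairType(int,int) s = {5, 7}, t = {5, 3};
     OP_XLOC(>,s,t);          // equal values: s.i = PetscMin(7,3), so s becomes {5,3}
     t.u = 9; t.i = 1;
     OP_XLOC(>,s,t);          // !(5 > 9), so s is overwritten by t: s becomes {9,1}
*/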
34: /* DEF_PackFunc - macro defining a Pack routine
36: Arguments of the macro:
37: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
38: .BS Block size for vectorization. It is a factor of bs.
39: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
41: Arguments of the Pack routine:
42: +count Number of indices in idx[].
43: .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
44: .opt Per-pack optimization plan. NULL means no such plan.
45: .idx Indices of entries to be packed.
46: .link Provides a context for the current call, such as link->bs, the number of basic types in an entry. E.g., if unit is MPI_2INT, then bs=2 and the basic type is int.
47: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
48: -packed Address of the packed data.
49: */
50: #define DEF_PackFunc(Type,BS,EQ) \
51: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
52: { \
54: const Type *u = (const Type*)unpacked,*u2; \
55: Type *p = (Type*)packed,*p2; \
56: PetscInt i,j,k,X,Y,r,bs=link->bs; \
57: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
58: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
60: if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);}/* indices are contiguous */ \
61: else if (opt) { /* has optimizations available */ \
62: p2 = p; \
63: for (r=0; r<opt->n; r++) { \
64: u2 = u + opt->start[r]*MBS; \
65: X = opt->X[r]; \
66: Y = opt->Y[r]; \
67: for (k=0; k<opt->dz[r]; k++) \
68: for (j=0; j<opt->dy[r]; j++) { \
69: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
70: p2 += opt->dx[r]*MBS; \
71: } \
72: } \
73: } else { \
74: for (i=0; i<count; i++) \
75: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
76: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
77: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
78: } \
79: return(0); \
80: }
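/* Illustration (editorial sketch): DEF_PackFunc(PetscInt,2,1) generates Pack_PetscInt_2_1. With EQ=1
   the compiler sees M=1 and MBS=2 as compile-time constants, so the generic branch reduces to

     for (i=0; i<count; i++)
       for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];

   i.e. a fixed-trip-count inner loop it can unroll or vectorize, instead of loops bounded by the
   runtime value link->bs as in the EQ=0 variants. */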
82: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
83: and inserts into a sparse array.
85: Arguments:
86: +Type Type of the data
87: .BS Block size for vectorization
88: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
90: Notes:
91: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
92: */
93: #define DEF_UnpackFunc(Type,BS,EQ) \
94: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
95: { \
97: Type *u = (Type*)unpacked,*u2; \
98: const Type *p = (const Type*)packed; \
99: PetscInt i,j,k,X,Y,r,bs=link->bs; \
100: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
101: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
103: if (!idx) { \
104: u += start*MBS; \
105: if (u != p) {PetscArraycpy(u,p,count*MBS);} \
106: } else if (opt) { /* has optimizations available */ \
107: for (r=0; r<opt->n; r++) { \
108: u2 = u + opt->start[r]*MBS; \
109: X = opt->X[r]; \
110: Y = opt->Y[r]; \
111: for (k=0; k<opt->dz[r]; k++) \
112: for (j=0; j<opt->dy[r]; j++) { \
113: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
114: p += opt->dx[r]*MBS; \
115: } \
116: } \
117: } else { \
118: for (i=0; i<count; i++) \
119: for (j=0; j<M; j++) \
120: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
121: } \
122: return(0); \
123: }
125: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
127: Arguments:
128: +Opname Name of the Op, such as Add, Mult, LAND, etc.
129: .Type Type of the data
130: .BS Block size for vectorization
131: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
132: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
133: -OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
134: */
135: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
136: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
137: { \
138: Type *u = (Type*)unpacked,*u2; \
139: const Type *p = (const Type*)packed; \
140: PetscInt i,j,k,X,Y,r,bs=link->bs; \
141: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
142: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
144: if (!idx) { \
145: u += start*MBS; \
146: for (i=0; i<count; i++) \
147: for (j=0; j<M; j++) \
148: for (k=0; k<BS; k++) \
149: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
150: } else if (opt) { /* idx[] has patterns */ \
151: for (r=0; r<opt->n; r++) { \
152: u2 = u + opt->start[r]*MBS; \
153: X = opt->X[r]; \
154: Y = opt->Y[r]; \
155: for (k=0; k<opt->dz[r]; k++) \
156: for (j=0; j<opt->dy[r]; j++) { \
157: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
158: p += opt->dx[r]*MBS; \
159: } \
160: } \
161: } else { \
162: for (i=0; i<count; i++) \
163: for (j=0; j<M; j++) \
164: for (k=0; k<BS; k++) \
165: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
166: } \
167: return(0); \
168: }
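/* Illustration (editorial sketch): for DEF_UnpackAndOp(PetscInt,1,1,Add,+,OP_BINARY), the innermost
   OpApply(Op,...) in the idx[] branch expands via OP_BINARY to

     u[idx[i]] = u[idx[i]] + p[i];

   With OP_FUNCTION and Op=PetscMax it would instead expand to u[idx[i]] = PetscMax(u[idx[i]],p[i]). */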
170: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
171: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
172: { \
173: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
174: PetscInt i,j,k,r,l,bs=link->bs; \
175: const PetscInt M = (EQ) ? 1 : bs/BS; \
176: const PetscInt MBS = M*BS; \
178: for (i=0; i<count; i++) { \
179: r = (!idx ? start+i : idx[i])*MBS; \
180: l = i*MBS; \
181: for (j=0; j<M; j++) \
182: for (k=0; k<BS; k++) { \
183: tmp = u[r+j*BS+k]; \
184: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
185: p[l+j*BS+k] = tmp; \
186: } \
187: } \
188: return(0); \
189: }
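/* Illustration (editorial sketch): each entry of FetchAndAdd behaves like a (non-atomic, on the host)
   fetch-and-add: the buffer receives the old root value while the root is updated,

     Type old = u[r+j*BS+k];             // fetch
     u[r+j*BS+k] += p[l+j*BS+k];         // apply the op
     p[l+j*BS+k] = old;                  // return the pre-op value to the caller

   which is what allows fetch-and-op communication to hand the previous root values back to leaves. */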
191: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
192: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
193: { \
195: const Type *u = (const Type*)src; \
196: Type *v = (Type*)dst; \
197: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
198: const PetscInt M = (EQ) ? 1 : bs/BS; \
199: const PetscInt MBS = M*BS; \
201: if (!srcIdx) { /* src is contiguous */ \
202: u += srcStart*MBS; \
203: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
204: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
205: u += srcOpt->start[0]*MBS; \
206: v += dstStart*MBS; \
207: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
208: for (k=0; k<srcOpt->dz[0]; k++) \
209: for (j=0; j<srcOpt->dy[0]; j++) { \
210: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
211: v += srcOpt->dx[0]*MBS; \
212: } \
213: } else { /* all other cases */ \
214: for (i=0; i<count; i++) { \
215: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
216: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
217: for (j=0; j<M; j++) \
218: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
219: } \
220: } \
221: return(0); \
222: }
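/* Illustration (editorial sketch): ScatterAndOp moves data directly between two (possibly sparse)
   arrays without an intermediate pack+unpack. Note the contiguous-source fast path above delegates:

     ScatterAndAdd(link,n,s0,NULL,NULL,src,d0,dstOpt,dstIdx,dst)
       == UnpackAndAdd(link,n,d0,dstOpt,dstIdx,dst,src + s0*MBS)

   since a contiguous source region is exactly what UnpackAndOp expects as its packed buffer. */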
224: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
225: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
226: { \
227: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
228: const Type *ldata = (const Type*)leafdata; \
229: PetscInt i,j,k,r,l,bs = link->bs; \
230: const PetscInt M = (EQ) ? 1 : bs/BS; \
231: const PetscInt MBS = M*BS; \
233: for (i=0; i<count; i++) { \
234: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
235: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
236: for (j=0; j<M; j++) \
237: for (k=0; k<BS; k++) { \
238: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
239: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
240: } \
241: } \
242: return(0); \
243: }
245: /* Pack, Unpack/Fetch ops */
246: #define DEF_Pack(Type,BS,EQ) \
247: DEF_PackFunc(Type,BS,EQ) \
248: DEF_UnpackFunc(Type,BS,EQ) \
249: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
250: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
251: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
252: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
253: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
254: }
256: /* Add, Mult ops */
257: #define DEF_Add(Type,BS,EQ) \
258: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
259: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
260: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
261: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
262: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
263: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
264: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
265: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
266: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
267: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
268: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
269: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
270: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
271: }
273: /* Max, Min ops */
274: #define DEF_Cmp(Type,BS,EQ) \
275: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
276: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
277: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
278: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
279: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
280: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
281: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
282: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
283: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
284: }
286: /* Logical ops.
287: The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
288: the compilation warning "empty macro arguments are undefined in ISO C90"
289: */
290: #define DEF_Log(Type,BS,EQ) \
291: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
292: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
293: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
294: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
295: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
296: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
297: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
298: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
299: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
300: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
301: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
302: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
303: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
304: }
306: /* Bitwise ops */
307: #define DEF_Bit(Type,BS,EQ) \
308: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
309: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
310: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
311: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
312: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
313: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
314: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
315: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
316: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
317: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
318: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
319: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
320: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
321: }
323: /* Maxloc, Minloc ops */
324: #define DEF_Xloc(Type,BS,EQ) \
325: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
326: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
327: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
328: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
329: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
330: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
331: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
332: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
333: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
334: }
336: #define DEF_IntegerType(Type,BS,EQ) \
337: DEF_Pack(Type,BS,EQ) \
338: DEF_Add(Type,BS,EQ) \
339: DEF_Cmp(Type,BS,EQ) \
340: DEF_Log(Type,BS,EQ) \
341: DEF_Bit(Type,BS,EQ) \
342: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
343: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
344: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
345: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
346: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
347: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
348: }
350: #define DEF_RealType(Type,BS,EQ) \
351: DEF_Pack(Type,BS,EQ) \
352: DEF_Add(Type,BS,EQ) \
353: DEF_Cmp(Type,BS,EQ) \
354: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
355: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
356: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
357: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
358: }
360: #if defined(PETSC_HAVE_COMPLEX)
361: #define DEF_ComplexType(Type,BS,EQ) \
362: DEF_Pack(Type,BS,EQ) \
363: DEF_Add(Type,BS,EQ) \
364: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
365: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
366: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
367: }
368: #endif
370: #define DEF_DumbType(Type,BS,EQ) \
371: DEF_Pack(Type,BS,EQ) \
372: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
373: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
374: }
376: /* Pair types for Maxloc, Minloc */
377: #define DEF_PairType(Type,BS,EQ) \
378: DEF_Pack(Type,BS,EQ) \
379: DEF_Xloc(Type,BS,EQ) \
380: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
381: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
382: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
383: }
385: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
386: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
387: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
388: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
389: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
390: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
391: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
392: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
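/* Illustration (editorial sketch): PetscSFLinkSetUp_Host() below picks the widest matching kernel.
   E.g. a unit of 6 MPIU_INTs fails the ==8, %8==0, ==4 and %4==0 tests but passes 6%2 == 0, so it
   gets PackInit_IntegerType_PetscInt_2_0: BS=2 vectorized kernels with runtime M = bs/BS = 3. */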
394: #if defined(PETSC_USE_64BIT_INDICES) /* No need (though it would be harmless) to generate redundant functions when PetscInt is int */
395: DEF_IntegerType(int,1,1)
396: DEF_IntegerType(int,2,1)
397: DEF_IntegerType(int,4,1)
398: DEF_IntegerType(int,8,1)
399: DEF_IntegerType(int,1,0)
400: DEF_IntegerType(int,2,0)
401: DEF_IntegerType(int,4,0)
402: DEF_IntegerType(int,8,0)
403: #endif
405: /* The typedefs are used to get type names without spaces that CPPJoin4 can handle */
406: typedef signed char SignedChar;
407: DEF_IntegerType(SignedChar,1,1)
408: DEF_IntegerType(SignedChar,2,1)
409: DEF_IntegerType(SignedChar,4,1)
410: DEF_IntegerType(SignedChar,8,1)
411: DEF_IntegerType(SignedChar,1,0)
412: DEF_IntegerType(SignedChar,2,0)
413: DEF_IntegerType(SignedChar,4,0)
414: DEF_IntegerType(SignedChar,8,0)
416: typedef unsigned char UnsignedChar;
417: DEF_IntegerType(UnsignedChar,1,1)
418: DEF_IntegerType(UnsignedChar,2,1)
419: DEF_IntegerType(UnsignedChar,4,1)
420: DEF_IntegerType(UnsignedChar,8,1)
421: DEF_IntegerType(UnsignedChar,1,0)
422: DEF_IntegerType(UnsignedChar,2,0)
423: DEF_IntegerType(UnsignedChar,4,0)
424: DEF_IntegerType(UnsignedChar,8,0)
426: DEF_RealType(PetscReal,1,1)
427: DEF_RealType(PetscReal,2,1)
428: DEF_RealType(PetscReal,4,1)
429: DEF_RealType(PetscReal,8,1)
430: DEF_RealType(PetscReal,1,0)
431: DEF_RealType(PetscReal,2,0)
432: DEF_RealType(PetscReal,4,0)
433: DEF_RealType(PetscReal,8,0)
435: #if defined(PETSC_HAVE_COMPLEX)
436: DEF_ComplexType(PetscComplex,1,1)
437: DEF_ComplexType(PetscComplex,2,1)
438: DEF_ComplexType(PetscComplex,4,1)
439: DEF_ComplexType(PetscComplex,8,1)
440: DEF_ComplexType(PetscComplex,1,0)
441: DEF_ComplexType(PetscComplex,2,0)
442: DEF_ComplexType(PetscComplex,4,0)
443: DEF_ComplexType(PetscComplex,8,0)
444: #endif
446: #define PairType(Type1,Type2) Type1##_##Type2
447: typedef struct {int u; int i;} PairType(int,int);
448: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
449: DEF_PairType(PairType(int,int),1,1)
450: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
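/* Illustration (editorial sketch): these pair structs mirror the layout MPI prescribes for MPI_2INT /
   MPIU_2INT used with MPI_MAXLOC/MPI_MINLOC: member u is the value being compared and member i the
   index attached to it. E.g. combining {5,7} with {5,3} under MAXLOC yields {5,3} (minimum index on
   ties), matching OP_XLOC above. */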
452: /* If we don't know the basic type, we treat it as a stream of chars or ints */
453: DEF_DumbType(char,1,1)
454: DEF_DumbType(char,2,1)
455: DEF_DumbType(char,4,1)
456: DEF_DumbType(char,1,0)
457: DEF_DumbType(char,2,0)
458: DEF_DumbType(char,4,0)
460: typedef int DumbInt; /* A name distinct from the 'int' used above, since type names are used to build routine names. */
461: DEF_DumbType(DumbInt,1,1)
462: DEF_DumbType(DumbInt,2,1)
463: DEF_DumbType(DumbInt,4,1)
464: DEF_DumbType(DumbInt,8,1)
465: DEF_DumbType(DumbInt,1,0)
466: DEF_DumbType(DumbInt,2,0)
467: DEF_DumbType(DumbInt,4,0)
468: DEF_DumbType(DumbInt,8,0)
470: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
471: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
472: {
473: int ierr;
474: ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
475: ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
476: return MPI_SUCCESS;
477: }
478: #endif
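/* Illustration (editorial sketch): MPI_Type_contiguous(1,datatype,newtype) creates a derived type with
   the same typemap as datatype, so committing it is a workable stand-in for MPI_Type_dup() on MPI
   implementations lacking it. Either way the caller must eventually free the result:

     MPI_Datatype dup;
     MPI_Type_dup(MPIU_2INT,&dup);   // the real MPI_Type_dup or the fallback above
     // ... use dup in communication ...
     MPI_Type_free(&dup);            // as PetscSFLinkDestroy() does for non-builtin units
*/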
480: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
481: {
482: PetscErrorCode ierr;
483: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
484: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
487: /* Destroy device-specific fields */
488: if (link->deviceinited) {(*link->Destroy)(sf,link);}
490: /* Destroy host related fields */
491: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
492: if (!link->use_nvshmem) {
493: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
494: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
495: }
496: PetscFree(link->reqs);
497: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
498: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
499: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
500: }
501: }
502: PetscFree(link);
503: return(0);
504: }
506: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
507: {
508: PetscErrorCode ierr;
511: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
512: #if defined(PETSC_HAVE_NVSHMEM)
513: {
514: PetscBool use_nvshmem;
515: PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);
516: if (use_nvshmem) {
517: PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
518: return(0);
519: }
520: }
521: #endif
522: PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
523: return(0);
524: }
526: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
527: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
528: */
529: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
530: {
531: PetscErrorCode ierr;
532: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
533: PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
534: const PetscInt *rootoffset,*leafoffset;
535: PetscMPIInt n;
536: MPI_Aint disp;
537: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
538: MPI_Datatype unit = link->unit;
539: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
540: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
543: /* Init persistent MPI requests if not done yet. Currently only SFBasic uses persistent requests */
544: if (sf->persistent) {
545: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
546: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
547: if (direction == PETSCSF_LEAF2ROOT) {
548: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
549: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
550: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
551: MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
552: }
553: } else { /* PETSCSF_ROOT2LEAF */
554: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
555: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
556: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
557: MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
558: }
559: }
560: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
561: }
563: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
564: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
565: if (direction == PETSCSF_LEAF2ROOT) {
566: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
567: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
568: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
569: MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
570: }
571: } else { /* PETSCSF_ROOT2LEAF */
572: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
573: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
574: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
575: MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
576: }
577: }
578: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
579: }
580: }
581: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
582: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
583: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
584: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
585: return(0);
586: }
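/* Illustration (editorial sketch of the standard persistent-request pattern used above): a request
   created with MPI_Send_init/MPI_Recv_init is started and completed once per communication round and
   reused across rounds:

     MPI_Request req;
     MPI_Recv_init(buf,n,unit,rank,tag,comm,&req);   // once, as in the loops above
     while (more_rounds) {                           // more_rounds is a placeholder condition
       MPI_Start(&req);                              // done by the SF begin routines
       MPI_Wait(&req,MPI_STATUS_IGNORE);             // done by the SF end routines
     }
     MPI_Request_free(&req);                         // done in PetscSFLinkDestroy()

   This is why the requests are cached on the link and initialized only on first use. */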
589: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
590: {
591: PetscErrorCode ierr;
592: PetscSFLink link,*p;
593: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
596: /* Look for types in cache */
597: for (p=&bas->inuse; (link=*p); p=&link->next) {
598: PetscBool match;
599: MPIPetsc_Type_compare(unit,link->unit,&match);
600: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
601: switch (cmode) {
602: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
603: case PETSC_USE_POINTER: break;
604: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
605: }
606: *mylink = link;
607: return(0);
608: }
609: }
610: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
611: }
613: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
614: {
615: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
616: PetscSFLink link = *mylink;
619: link->rootdata = NULL;
620: link->leafdata = NULL;
621: link->next = bas->avail;
622: bas->avail = link;
623: *mylink = NULL;
624: return(0);
625: }
627: /* Error out on unsupported overlapped communications */
628: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
629: {
630: PetscErrorCode ierr;
631: PetscSFLink link,*p;
632: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
633: PetscBool match;
636: if (PetscDefined(USE_DEBUG)) {
637: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
638: the potential overlapping since this process does not participate in communication. Overlapping is harmless.
639: */
640: if (rootdata || leafdata) {
641: for (p=&bas->inuse; (link=*p); p=&link->next) {
642: MPIPetsc_Type_compare(unit,link->unit,&match);
643: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
644: }
645: }
646: }
647: return(0);
648: }
650: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
651: {
653: if (n) {PetscMemcpy(dst,src,n);}
654: return(0);
655: }
658: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
659: {
661: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
662: PetscBool is2Int,is2PetscInt;
663: PetscMPIInt ni,na,nd,combiner;
664: #if defined(PETSC_HAVE_COMPLEX)
665: PetscInt nPetscComplex=0;
666: #endif
669: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
670: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
671: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
672: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
673: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
674: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
675: #if defined(PETSC_HAVE_COMPLEX)
676: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
677: #endif
678: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
679: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
680: /* TODO: shall we also handle Fortran MPI_2REAL? */
681: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
682: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
683: link->bs = 1; /* default */
685: if (is2Int) {
686: PackInit_PairType_int_int_1_1(link);
687: link->bs = 1;
688: link->unitbytes = 2*sizeof(int);
689: link->isbuiltin = PETSC_TRUE; /* unit is MPI builtin */
690: link->basicunit = MPI_2INT;
691: link->unit = MPI_2INT;
692: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
693: PackInit_PairType_PetscInt_PetscInt_1_1(link);
694: link->bs = 1;
695: link->unitbytes = 2*sizeof(PetscInt);
696: link->basicunit = MPIU_2INT;
697: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
698: link->unit = MPIU_2INT;
699: } else if (nPetscReal) {
700: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
701: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
702: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
703: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
704: link->bs = nPetscReal;
705: link->unitbytes = nPetscReal*sizeof(PetscReal);
706: link->basicunit = MPIU_REAL;
707: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
708: } else if (nPetscInt) {
709: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
710: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
711: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
712: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
713: link->bs = nPetscInt;
714: link->unitbytes = nPetscInt*sizeof(PetscInt);
715: link->basicunit = MPIU_INT;
716: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
717: #if defined(PETSC_USE_64BIT_INDICES)
718: } else if (nInt) {
719: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
720: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
721: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
722: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
723: link->bs = nInt;
724: link->unitbytes = nInt*sizeof(int);
725: link->basicunit = MPI_INT;
726: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
727: #endif
728: } else if (nSignedChar) {
729: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
730: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
731: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
732: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
733: link->bs = nSignedChar;
734: link->unitbytes = nSignedChar*sizeof(SignedChar);
735: link->basicunit = MPI_SIGNED_CHAR;
736: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
737: } else if (nUnsignedChar) {
738: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
739: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
740: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
741: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
742: link->bs = nUnsignedChar;
743: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
744: link->basicunit = MPI_UNSIGNED_CHAR;
745: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
746: #if defined(PETSC_HAVE_COMPLEX)
747: } else if (nPetscComplex) {
748: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
749: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
750: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
751: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
752: link->bs = nPetscComplex;
753: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
754: link->basicunit = MPIU_COMPLEX;
755: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
756: #endif
757: } else {
758: MPI_Aint lb,nbyte;
759: MPI_Type_get_extent(unit,&lb,&nbyte);
760: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
761: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
762: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
763: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
764: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
765: link->bs = nbyte;
766: link->unitbytes = nbyte;
767: link->basicunit = MPI_BYTE;
768: } else {
769: nInt = nbyte / sizeof(int);
770: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
771: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
772: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
773: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
774: link->bs = nInt;
775: link->unitbytes = nbyte;
776: link->basicunit = MPI_INT;
777: }
778: if (link->isbuiltin) link->unit = unit;
779: }
781: if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
783: link->Memcpy = PetscSFLinkMemcpy_Host;
784: return(0);
785: }
787: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
788: {
790: *UnpackAndOp = NULL;
791: if (PetscMemTypeHost(mtype)) {
792: if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
793: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
794: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
795: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
796: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
797: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
798: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
799: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
800: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
801: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
802: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
803: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
804: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
805: }
806: #if defined(PETSC_HAVE_DEVICE)
807: else if (PetscMemTypeDevice(mtype) && !atomic) {
808: if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
809: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
810: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
811: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
812: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
813: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
814: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
815: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
816: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
817: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
818: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
819: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
820: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
821: } else if (PetscMemTypeDevice(mtype) && atomic) {
822: if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
823: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
824: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
825: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
826: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
827: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
828: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
829: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
830: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
831: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
832: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
833: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
834: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
835: }
836: #endif
837: return(0);
838: }
840: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
841: {
843: *ScatterAndOp = NULL;
844: if (PetscMemTypeHost(mtype)) {
845: if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
846: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
847: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
848: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
849: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
850: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
851: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
852: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
853: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
854: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
855: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
856: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
857: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
858: }
859: #if defined(PETSC_HAVE_DEVICE)
860: else if (PetscMemTypeDevice(mtype) && !atomic) {
861: if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
862: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
863: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
864: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
865: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
866: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
867: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
868: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
869: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
870: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
871: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
872: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
873: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
874: } else if (PetscMemTypeDevice(mtype) && atomic) {
875: if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
876: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
877: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
878: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
879: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
880: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
881: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
882: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
883: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
884: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
885: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
886: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
887: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
888: }
889: #endif
890: return(0);
891: }
893: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
894: {
896: *FetchAndOp = NULL;
897: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
898: if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
899: #if defined(PETSC_HAVE_DEVICE)
900: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
901: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
902: #endif
903: return(0);
904: }
906: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
907: {
909: *FetchAndOpLocal = NULL;
910: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
911: if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
912: #if defined(PETSC_HAVE_DEVICE)
913: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
914: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
915: #endif
916: return(0);
917: }
919: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
920: {
922: PetscLogDouble flops;
923: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
926: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
927: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
928: #if defined(PETSC_HAVE_DEVICE)
929: if (PetscMemTypeDevice(link->rootmtype)) {PetscLogGpuFlops(flops);} else
930: #endif
931: {PetscLogFlops(flops);}
932: }
933: return(0);
934: }
936: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
937: {
938: PetscLogDouble flops;
942: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
943: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
944: #if defined(PETSC_HAVE_DEVICE)
945: if (PetscMemTypeDevice(link->leafmtype)) {PetscLogGpuFlops(flops);} else
946: #endif
947: {PetscLogFlops(flops);}
948: }
949: return(0);
950: }
952: /* When SF cannot find a proper UnpackAndOp() on the link, it falls back to MPI_Reduce_local.
953: Input Arguments:
954: +sf - The StarForest
955: .link - The link
956: .count - Number of entries to unpack
957: .start - The first index, significant when indices=NULL
958: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
959: .buf - A contiguous buffer to unpack from
960: -op - Operation after unpack
962: Output Arguments:
963: .data - The data to unpack to
964: */
965: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
966: {
968: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
969: {
971: PetscInt i;
972: PetscMPIInt n;
973: if (indices) {
974: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
975: basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
976: */
977: for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
978: } else {
979: PetscMPIIntCast(count,&n);
980: MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
981: }
982: }
983: return(0);
984: #else
985: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
986: #endif
987: }
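/* Illustration (editorial sketch): MPI_Reduce_local(inbuf,inoutbuf,count,datatype,op) computes
   inoutbuf[k] = inbuf[k] op inoutbuf[k] for k in [0,count), entirely locally (no communication).
   For op = MPI_SUM the indexed loop above is therefore equivalent to

     for (i=0; i<count; i++) data[indices[i]] += buf[i];    // per whole unit of link->unitbytes

   but MPI_Reduce_local also covers units and ops for which this file has no hand-written kernel. */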
989: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
990: {
992: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
993: {
995: PetscInt i,disp;
996: if (!srcIdx) {
997: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
998: } else {
999: for (i=0; i<count; i++) {
1000: disp = dstIdx? dstIdx[i] : dstStart + i;
1001: MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1002: }
1003: }
1004: }
1005: return(0);
1006: #else
1007: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1008: #endif
1009: }
1011: /*=============================================================================
1012: Pack/Unpack/Fetch/Scatter routines
1013: ============================================================================*/
1015: /* Pack rootdata to rootbuf
1016: Input Arguments:
1017: + sf - The SF this packing works on.
1018: . link - It gives the memtype of the roots and also provides root buffer.
1019: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1020: - rootdata - Where to read the roots.
1022: Notes:
1023: When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
1024: in a place the underlying MPI can access (whether or not use_gpu_aware_mpi is set)
1025: */
1026: PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1027: {
1028: PetscErrorCode ierr;
1029: const PetscInt *rootindices = NULL;
1030: PetscInt count,start;
1031: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1032: PetscMemType rootmtype = link->rootmtype;
1033: PetscSFPackOpt opt = NULL;
1036: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1037: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1038: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1039: PetscSFLinkGetPack(link,rootmtype,&Pack);
1040: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1041: }
1042: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1043: return(0);
1044: }
1046: /* Pack leafdata to leafbuf */
1047: PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1048: {
1049: PetscErrorCode ierr;
1050: const PetscInt *leafindices = NULL;
1051: PetscInt count,start;
1052: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1053: PetscMemType leafmtype = link->leafmtype;
1054: PetscSFPackOpt opt = NULL;
1057: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1058: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1059: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1060: PetscSFLinkGetPack(link,leafmtype,&Pack);
1061: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1062: }
1063: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1064: return(0);
1065: }
1067: /* Pack rootdata to rootbuf, which are in the same memory space */
1068: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1069: {
1070: PetscErrorCode ierr;
1071: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1074: if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
1075: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1076: if (link->PrePack) {(*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF);} /* Used by SF nvshmem */
1077: }
1078: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1079: if (bas->rootbuflen[scope]) {PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);}
1080: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1081: return(0);
1082: }
1083: /* Pack leafdata to leafbuf, which are in the same memory space */
1084: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1085: {
1086: PetscErrorCode ierr;
1089: if (scope == PETSCSF_REMOTE) {
1090: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1091: if (link->PrePack) {(*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT);} /* Used by SF nvshmem */
1092: }
1093: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1094: if (sf->leafbuflen[scope]) {PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);}
1095: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1096: return(0);
1097: }
1099: PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1100: {
1101: PetscErrorCode ierr;
1102: const PetscInt *rootindices = NULL;
1103: PetscInt count,start;
1104: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1105: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1106: PetscMemType rootmtype = link->rootmtype;
1107: PetscSFPackOpt opt = NULL;
1110: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1111: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1112: if (UnpackAndOp) {
1113: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1114: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1115: } else {
1116: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1117: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1118: }
1119: }
1120: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1121: return(0);
1122: }
1124: PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1125: {
1126: PetscErrorCode ierr;
1127: const PetscInt *leafindices = NULL;
1128: PetscInt count,start;
1129: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1130: PetscMemType leafmtype = link->leafmtype;
1131: PetscSFPackOpt opt = NULL;
1134: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1135: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1136: if (UnpackAndOp) {
1137: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1138: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1139: } else {
1140: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1141: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1142: }
1143: }
1144: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1145: return(0);
1146: }
1147: /* Unpack rootbuf to rootdata, which are in the same memory space */
1148: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1149: {
1150: PetscErrorCode ierr;
1151: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1154: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1155: if (bas->rootbuflen[scope]) {PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);}
1156: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1157: if (scope == PETSCSF_REMOTE) {
1158: if (link->PostUnpack) {(*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT);} /* Used by SF nvshmem */
1159: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1160: }
1161: return(0);
1162: }
1164: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1165: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1166: {
1167: PetscErrorCode ierr;
1170: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1171: if (sf->leafbuflen[scope]) {PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);}
1172: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1173: if (scope == PETSCSF_REMOTE) {
1174: if (link->PostUnpack) {(*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF);} /* Used by SF nvshmem */
1175: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) {(*link->SyncDevice)(link);}
1176: }
1177: return(0);
1178: }
1180: /* FetchAndOp rootdata with rootbuf; it is a kind of unpack on rootdata, except that it also updates rootbuf */
1181: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1182: {
1183: PetscErrorCode ierr;
1184: const PetscInt *rootindices = NULL;
1185: PetscInt count,start;
1186: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1187: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1188: PetscMemType rootmtype = link->rootmtype;
1189: PetscSFPackOpt opt = NULL;
1192: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1193: if (bas->rootbuflen[PETSCSF_REMOTE]) {
1194: /* Do FetchAndOp on rootdata with rootbuf */
1195: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);
1196: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);
1197: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);
1198: }
1199: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);
1200: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1201: return(0);
1202: }
1204: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1205: {
1206: PetscErrorCode ierr;
1207: const PetscInt *rootindices = NULL,*leafindices = NULL;
1208: PetscInt count,rootstart,leafstart;
1209: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1210: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1211: PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1212: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1213: PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL];
1214: char *srcbuf = NULL,*dstbuf = NULL;
1215: PetscBool dstdups;
1217: if (!buflen) return(0);
1218: if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1219: if (direction == PETSCSF_ROOT2LEAF) {
1220: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1221: srcmtype = rootmtype;
1222: srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1223: dstmtype = leafmtype;
1224: dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1225: } else {
1226: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1227: srcmtype = leafmtype;
1228: srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1229: dstmtype = rootmtype;
1230: dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1231: }
1232: (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);
1233: /* If the above is a device-to-host copy, we have to sync the stream before accessing the buffer on the host */
1234: if (PetscMemTypeHost(dstmtype)) {(*link->SyncStream)(link);}
1235: if (direction == PETSCSF_ROOT2LEAF) {
1236: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1237: } else {
1238: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1239: }
1240: } else {
1241: dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1242: dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1243: PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);
1244: if (ScatterAndOp) {
1245: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1246: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1247: if (direction == PETSCSF_ROOT2LEAF) {
1248: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1249: } else {
1250: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1251: }
1252: } else {
1253: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1254: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1255: if (direction == PETSCSF_ROOT2LEAF) {
1256: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1257: } else {
1258: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1259: }
1260: }
1261: }
1262: return(0);
1263: }
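/* Illustrative summary (not part of the original source): the cross-memtype branch above is a
   three-stage pipeline; e.g. for PETSCSF_ROOT2LEAF with rootdata on device and leafdata on host:

     pack:   rootbuf[LOCAL][device] <- rootdata[rootindices]    (device pack kernel)
     copy:   leafbuf[LOCAL][host]   <- rootbuf[LOCAL][device]   (link->Memcpy, device to host)
     sync:   wait on the stream, since the destination is host memory
     unpack: leafdata[leafindices] op= leafbuf[LOCAL][host]     (host unpack loop)

   When both sides live in the same memory space, the buffers are skipped entirely and a single
   ScatterAndOp kernel (or the MPI_Reduce_local fallback) moves the data directly. */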
1265: /* Fetch the old rootdata into leafupdate and update rootdata with leafdata, all locally */
1266: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1267: {
1268: PetscErrorCode ierr;
1269: const PetscInt *rootindices = NULL,*leafindices = NULL;
1270: PetscInt count,rootstart,leafstart;
1271: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1272: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1273: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1274: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1277: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1278: if (rootmtype != leafmtype) {
1279: /* The local communication has to go through pack and unpack */
1280: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1281: } else {
1282: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1283: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1284: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1285: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1286: }
1287: return(0);
1288: }
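/* Semantics sketch (illustrative): for each of the 'count' local edges, with r = rootindices[i]
   and l = leafindices[i], the FetchAndOpLocal kernel performs

     leafupdate[l] = rootdata[r];                    (fetch the old root value first)
     rootdata[r]   = op(rootdata[r], leafdata[l]);   (then update the root)

   which is why it needs both the const leafdata input and the leafupdate output. */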
1290: /*
1291: Create per-rank pack/unpack optimizations based on index patterns
1293: Input Parameters:
1294: + n - Number of destination ranks
1295: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1296: - idx - [*] Array storing indices
1298: Output Parameter:
1299: . out - Pack optimizations. NULL if no optimization is possible.
1300: */
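/* Worked example (illustrative, not from the original source): suppose one destination rank has

     idx[] = {100,101,102, 108,109,110, 116,117,118}

   i.e. a 3x3 window of a row-major 2D array with row length 8. The search below finds start=100
   and dx=3 (three consecutive indices), then X = idx[3]-start = 8 (the stride between x-walks),
   then dy=3, and finally dz=1 with Y=dy=3. The nine indices are thus encoded as a 3x3x1 block,
   so packing becomes three contiguous copies of length 3 instead of nine indexed loads. */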
1301: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1302: {
1304: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1305: PetscBool optimizable = PETSC_TRUE;
1306: PetscSFPackOpt opt;
1309: PetscMalloc1(1,&opt);
1310: PetscMalloc1(7*n+2,&opt->array);
1311: opt->n = opt->array[0] = n;
1312: opt->offset = opt->array + 1;
1313: opt->start = opt->array + n + 2;
1314: opt->dx = opt->array + 2*n + 2;
1315: opt->dy = opt->array + 3*n + 2;
1316: opt->dz = opt->array + 4*n + 2;
1317: opt->X = opt->array + 5*n + 2;
1318: opt->Y = opt->array + 6*n + 2;
1320: for (r=0; r<n; r++) { /* For each destination rank */
1321: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1322: p = offset[r];
1323: start = idx[p]; /* First index for this rank */
1324: p++;
1326: /* Search in X dimension */
1327: for (dx=1; dx<m; dx++,p++) {
1328: if (start+dx != idx[p]) break;
1329: }
1331: dydz = m/dx;
1332: X = dydz > 1 ? (idx[p]-start) : dx;
1333: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1334: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1335: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1336: for (i=0; i<dx; i++,p++) {
1337: if (start+X*dy+i != idx[p]) {
1338: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1339: else goto Z_dimension;
1340: }
1341: }
1342: }
1344: Z_dimension:
1345: dz = m/(dx*dy);
1346: Y = dz > 1 ? (idx[p]-start)/X : dy;
1347: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1348: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1349: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1350: for (j=0; j<dy; j++) {
1351: for (i=0; i<dx; i++,p++) {
1352: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1353: }
1354: }
1355: }
1356: opt->start[r] = start;
1357: opt->dx[r] = dx;
1358: opt->dy[r] = dy;
1359: opt->dz[r] = dz;
1360: opt->X[r] = X;
1361: opt->Y[r] = Y;
1362: }
1364: finish:
1365: /* If not optimizable, free arrays to save memory */
1366: if (!n || !optimizable) {
1367: PetscFree(opt->array);
1368: PetscFree(opt);
1369: *out = NULL;
1370: } else {
1371: opt->offset[0] = 0;
1372: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1373: *out = opt;
1374: }
1375: return(0);
1376: }
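/* Usage sketch with hypothetical values: a single destination rank whose six indices form a
   3x2 block of an array with row length 10:

     PetscInt       offset[2] = {0,6};
     PetscInt       idx[6]    = {20,21,22, 30,31,32};
     PetscSFPackOpt opt;
     PetscSFCreatePackOpt(1,offset,idx,&opt);   (yields start=20, dx=3, dy=2, dz=1, X=10, Y=2)

   A scrambled pattern, e.g. idx[] starting {20,22,21,...}, would instead return *out = NULL. */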
1378: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1379: {
1381: PetscSFPackOpt opt = *out;
1384: if (opt) {
1385: PetscSFFree(sf,mtype,opt->array);
1386: PetscFree(opt);
1387: *out = NULL;
1388: }
1389: return(0);
1390: }
1392: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1393: {
1395: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1396: PetscInt i,j;
1399: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1400: for (i=0; i<2; i++) { /* Set defaults */
1401: sf->leafstart[i] = 0;
1402: sf->leafcontig[i] = PETSC_TRUE;
1403: sf->leafdups[i] = PETSC_FALSE;
1404: bas->rootstart[i] = 0;
1405: bas->rootcontig[i] = PETSC_TRUE;
1406: bas->rootdups[i] = PETSC_FALSE;
1407: }
1409: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1410: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1412: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1413: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1415: /* Are leaf indices for self and remote contiguous? If so, packing/unpacking is a single contiguous copy, the best case */
1416: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1417: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1418: }
1419: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1420: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1421: }
1423: /* If not, see if we can have per-rank optimizations by doing index analysis */
1424: if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);}
1425: if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}
1427: /* Are root indices for self and remote contiguous? */
1428: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1429: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1431: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1432: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1434: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1435: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1436: }
1437: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1438: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1439: }
1441: if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);}
1442: if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}
1444: #if defined(PETSC_HAVE_DEVICE)
1445: /* Check for dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1446: if (PetscDefined(HAVE_DEVICE)) {
1447: PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
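/* A sketch of the dup checks that follow (assuming PETSc's PetscCheckDupsInt(n,idx,&dups)
   helper; illustrative only): when sf is not a multi-SF, scan the noncontiguous remote indices
   and record whether any index repeats, e.g.

     if (!sf->leafcontig[PETSCSF_REMOTE] && !ismulti)
       PetscCheckDupsInt(sf->leafbuflen[PETSCSF_REMOTE],sf->rmine+sf->roffset[sf->ndranks],&sf->leafdups[PETSCSF_REMOTE]);

   and similarly for bas->rootdups[PETSCSF_REMOTE] using bas->irootloc. */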
1452: }
1453: #endif
1454: return(0);
1455: }
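/* Example (illustrative): if sf->rmine holds {5,6,7,8} for the remote part, leafstart[1]=5 and
   the scan keeps leafcontig[1]=PETSC_TRUE, so remote leaf packing is one contiguous copy
   starting at entry 5. A single out-of-order index, say {5,6,9,8}, flips leafcontig[1] to
   PETSC_FALSE and triggers the PetscSFCreatePackOpt index analysis instead. */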
1457: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1458: {
1460: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1461: PetscInt i;
1464: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1465: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1466: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1467: #if defined(PETSC_HAVE_DEVICE)
1468: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1469: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1470: #endif
1471: }
1472: return(0);
1473: }