Actual source code: sfpack.c
petsc-3.13.6 2020-09-29
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
2: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4: #if defined(PETSC_HAVE_CUDA)
5: #include <cuda_runtime.h>
6: #endif
7: /*
8: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction";
9: * therefore we pack data types manually. This file defines packing routines for the standard data types.
10: */
12: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
14: /* Operations working like s += t */
15: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */
16: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */
17: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */
18: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while(0)
19: /* Ref MPI MAXLOC */
20: #define OP_XLOC(op,s,t) \
21: do { \
22: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23: else if (!((s).u op (t).u)) s = t; \
24: } while(0)
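/* Editor's note (illustrative, not part of the original source): with s and t of type int,
   OP_BINARY(+,s,t)          expands to   do { (s) = (s) + (t); } while(0)
   OP_FUNCTION(PetscMax,s,t) expands to   do { (s) = PetscMax((s),(t)); } while(0)
   OP_XLOC(>,s,t) implements the MPI_MAXLOC rule on a {u,i} pair: keep the entry with the larger u,
   and on equal u keep the smaller index i. */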
26: /* DEF_PackFunc - macro defining a Pack routine
28: Arguments of the macro:
29: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30: .BS Block size for vectorization. It is a factor of bsz.
31: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
33: Arguments of the Pack routine:
34: +count Number of indices in idx[].
35: .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
36: .opt Per-pack optimization plan. NULL means no such plan.
37: .idx Indices of entries to be packed.
38: .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
39: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
40: -packed Address of the packed data.
41: */
42: #define DEF_PackFunc(Type,BS,EQ) \
43: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44: { \
46: const Type *u = (const Type*)unpacked,*u2; \
47: Type *p = (Type*)packed,*p2; \
48: PetscInt i,j,k,X,Y,r,bs=link->bs; \
49: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
50: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
52: if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);}/* idx[] are contiguous */ \
53: else if (opt) { /* has optimizations available */ \
54: p2 = p; \
55: for (r=0; r<opt->n; r++) { \
56: u2 = u + opt->start[r]*MBS; \
57: X = opt->X[r]; \
58: Y = opt->Y[r]; \
59: for (k=0; k<opt->dz[r]; k++) \
60: for (j=0; j<opt->dy[r]; j++) { \
61: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
62: p2 += opt->dx[r]*MBS; \
63: } \
64: } \
65: } else { \
66: for (i=0; i<count; i++) \
67: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
68: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
69: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
70: } \
71: return(0); \
72: }
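/* Editor's sketch (simplified, for reading only) of what DEF_PackFunc(PetscInt,2,1) expands to,
   ignoring the opt (3D-strided) branch. With EQ=1, M=1 and MBS is the compile-time constant 2:

     static PetscErrorCode Pack_PetscInt_2_1(PetscSFLink link,PetscInt count,PetscInt start,
                                             PetscSFPackOpt opt,const PetscInt *idx,
                                             const void *unpacked,void *packed)
     {
       const PetscInt *u = (const PetscInt*)unpacked;
       PetscInt       *p = (PetscInt*)packed,i,k;
       if (!idx) PetscArraycpy(p,u+start*2,2*count);
       else for (i=0; i<count; i++) for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];
       return(0);
     }
*/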
74: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
75: and inserts it into a sparse array.
77: Arguments:
78: .Type Type of the data
79: .BS Block size for vectorization
80: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
82: Notes:
83: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
84: */
85: #define DEF_UnpackFunc(Type,BS,EQ) \
86: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
87: { \
89: Type *u = (Type*)unpacked,*u2; \
90: const Type *p = (const Type*)packed; \
91: PetscInt i,j,k,X,Y,r,bs=link->bs; \
92: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
93: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
95: if (!idx) { \
96: u += start*MBS; \
97: if (u != p) {PetscArraycpy(u,p,count*MBS);} \
98: } else if (opt) { /* has optimizations available */ \
99: for (r=0; r<opt->n; r++) { \
100: u2 = u + opt->start[r]*MBS; \
101: X = opt->X[r]; \
102: Y = opt->Y[r]; \
103: for (k=0; k<opt->dz[r]; k++) \
104: for (j=0; j<opt->dy[r]; j++) { \
105: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
106: p += opt->dx[r]*MBS; \
107: } \
108: } \
109: } else { \
110: for (i=0; i<count; i++) \
111: for (j=0; j<M; j++) \
112: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
113: } \
114: return(0); \
115: }
117: /* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op is not Insert
119: Arguments:
120: +Opname Name of the Op, such as Add, Mult, LAND, etc.
121: .Type Type of the data
122: .BS Block size for vectorization
123: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
124: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
125: .OpApply Macro defining the application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
126: */
127: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
128: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
129: { \
130: Type *u = (Type*)unpacked,*u2; \
131: const Type *p = (const Type*)packed; \
132: PetscInt i,j,k,X,Y,r,bs=link->bs; \
133: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
134: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
136: if (!idx) { \
137: u += start*MBS; \
138: for (i=0; i<count; i++) \
139: for (j=0; j<M; j++) \
140: for (k=0; k<BS; k++) \
141: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
142: } else if (opt) { /* idx[] has patterns */ \
143: for (r=0; r<opt->n; r++) { \
144: u2 = u + opt->start[r]*MBS; \
145: X = opt->X[r]; \
146: Y = opt->Y[r]; \
147: for (k=0; k<opt->dz[r]; k++) \
148: for (j=0; j<opt->dy[r]; j++) { \
149: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
150: p += opt->dx[r]*MBS; \
151: } \
152: } \
153: } else { \
154: for (i=0; i<count; i++) \
155: for (j=0; j<M; j++) \
156: for (k=0; k<BS; k++) \
157: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
158: } \
159: return(0); \
160: }
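/* Editor's note (illustrative): DEF_UnpackAndOp(PetscReal,1,1,Add,+,OP_BINARY) generates
   UnpackAndAdd_PetscReal_1_1(), whose inner statement for the sparse case is effectively
     u[idx[i]] = u[idx[i]] + p[i];
   i.e., the packed buffer is reduced into the sparse root/leaf array with the given Op. */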
162: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
163: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
164: { \
165: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
166: PetscInt i,j,k,r,l,bs=link->bs; \
167: const PetscInt M = (EQ) ? 1 : bs/BS; \
168: const PetscInt MBS = M*BS; \
170: for (i=0; i<count; i++) { \
171: r = (!idx ? start+i : idx[i])*MBS; \
172: l = i*MBS; \
173: for (j=0; j<M; j++) \
174: for (k=0; k<BS; k++) { \
175: tmp = u[r+j*BS+k]; \
176: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
177: p[l+j*BS+k] = tmp; \
178: } \
179: } \
180: return(0); \
181: }
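/* Editor's note (illustrative): a FetchAndOp routine returns the old value of the destination in the
   packed buffer while applying Op, i.e., per entry it does
     tmp = u[j]; u[j] = u[j] Op p[i]; p[i] = tmp;
   For example, with Op '+', u = {10} and p = {3}, the result is u = {13} and p = {10}. */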
183: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
184: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
185: { \
187: const Type *u = (const Type*)src; \
188: Type *v = (Type*)dst; \
189: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
190: const PetscInt M = (EQ) ? 1 : bs/BS; \
191: const PetscInt MBS = M*BS; \
193: if (!srcIdx) { /* src is contiguous */ \
194: u += srcStart*MBS; \
195: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
196: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
197: u += srcOpt->start[0]*MBS; \
198: v += dstStart*MBS; \
199: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
200: for (k=0; k<srcOpt->dz[0]; k++) \
201: for (j=0; j<srcOpt->dy[0]; j++) { \
202: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
203: v += srcOpt->dx[0]*MBS; \
204: } \
205: } else { /* all other cases */ \
206: for (i=0; i<count; i++) { \
207: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
208: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
209: for (j=0; j<M; j++) \
210: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
211: } \
212: } \
213: return(0); \
214: }
216: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
217: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
218: { \
219: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
220: const Type *ldata = (const Type*)leafdata; \
221: PetscInt i,j,k,r,l,bs = link->bs; \
222: const PetscInt M = (EQ) ? 1 : bs/BS; \
223: const PetscInt MBS = M*BS; \
225: for (i=0; i<count; i++) { \
226: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
227: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
228: for (j=0; j<M; j++) \
229: for (k=0; k<BS; k++) { \
230: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
231: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
232: } \
233: } \
234: return(0); \
235: }
237: /* Pack, Unpack/Fetch ops */
238: #define DEF_Pack(Type,BS,EQ) \
239: DEF_PackFunc(Type,BS,EQ) \
240: DEF_UnpackFunc(Type,BS,EQ) \
241: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
242: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
243: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
244: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
245: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
246: }
248: /* Add, Mult ops */
249: #define DEF_Add(Type,BS,EQ) \
250: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
251: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
252: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
253: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
254: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
255: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
256: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
257: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
258: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
259: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
260: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
261: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
262: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
263: }
265: /* Max, Min ops */
266: #define DEF_Cmp(Type,BS,EQ) \
267: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
268: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
269: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
270: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
271: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
272: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
273: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
274: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
275: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
276: }
278: /* Logical ops.
279: The operator argument passed to OP_LXOR should be empty, but || is supplied instead. It is not used; it is only there to avoid
280: the compilation warning "empty macro arguments are undefined in ISO C90".
281: */
282: #define DEF_Log(Type,BS,EQ) \
283: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
284: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
285: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
286: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
287: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
288: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
289: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
290: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
291: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
292: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
293: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
294: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
295: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
296: }
298: /* Bitwise ops */
299: #define DEF_Bit(Type,BS,EQ) \
300: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
301: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
302: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
303: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
304: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
305: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
306: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
307: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
308: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
309: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
310: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
311: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
312: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
313: }
315: /* Maxloc, Minloc ops */
316: #define DEF_Xloc(Type,BS,EQ) \
317: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
318: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
319: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
320: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
321: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
322: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
323: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
324: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
325: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
326: }
328: #define DEF_IntegerType(Type,BS,EQ) \
329: DEF_Pack(Type,BS,EQ) \
330: DEF_Add(Type,BS,EQ) \
331: DEF_Cmp(Type,BS,EQ) \
332: DEF_Log(Type,BS,EQ) \
333: DEF_Bit(Type,BS,EQ) \
334: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
335: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
336: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
337: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
338: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
339: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
340: }
342: #define DEF_RealType(Type,BS,EQ) \
343: DEF_Pack(Type,BS,EQ) \
344: DEF_Add(Type,BS,EQ) \
345: DEF_Cmp(Type,BS,EQ) \
346: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
347: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
348: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
349: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
350: }
352: #if defined(PETSC_HAVE_COMPLEX)
353: #define DEF_ComplexType(Type,BS,EQ) \
354: DEF_Pack(Type,BS,EQ) \
355: DEF_Add(Type,BS,EQ) \
356: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
357: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
358: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
359: }
360: #endif
362: #define DEF_DumbType(Type,BS,EQ) \
363: DEF_Pack(Type,BS,EQ) \
364: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
365: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
366: }
368: /* Maxloc, Minloc */
369: #define DEF_PairType(Type,BS,EQ) \
370: DEF_Pack(Type,BS,EQ) \
371: DEF_Xloc(Type,BS,EQ) \
372: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
373: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
374: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
375: }
377: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
378: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
379: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
380: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
381: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
382: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
383: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
384: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
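/* Editor's note (illustrative): PetscSFLinkSetUp_Host() below picks, for a unit made of n basic types,
   the largest BS in {8,4,2,1} that divides n, with EQ=1 when n == BS and EQ=0 otherwise. For example,
   a unit of 6 MPIU_INTs selects PackInit_IntegerType_PetscInt_2_0() and sets link->bs = 6, while a unit
   of exactly 4 MPIU_INTs selects PackInit_IntegerType_PetscInt_4_1(), where MBS is a compile-time constant. */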
386: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
387: DEF_IntegerType(int,1,1)
388: DEF_IntegerType(int,2,1)
389: DEF_IntegerType(int,4,1)
390: DEF_IntegerType(int,8,1)
391: DEF_IntegerType(int,1,0)
392: DEF_IntegerType(int,2,0)
393: DEF_IntegerType(int,4,0)
394: DEF_IntegerType(int,8,0)
395: #endif
397: /* The typedefs are used to get a typename without spaces that CPPJoin can handle */
398: typedef signed char SignedChar;
399: DEF_IntegerType(SignedChar,1,1)
400: DEF_IntegerType(SignedChar,2,1)
401: DEF_IntegerType(SignedChar,4,1)
402: DEF_IntegerType(SignedChar,8,1)
403: DEF_IntegerType(SignedChar,1,0)
404: DEF_IntegerType(SignedChar,2,0)
405: DEF_IntegerType(SignedChar,4,0)
406: DEF_IntegerType(SignedChar,8,0)
408: typedef unsigned char UnsignedChar;
409: DEF_IntegerType(UnsignedChar,1,1)
410: DEF_IntegerType(UnsignedChar,2,1)
411: DEF_IntegerType(UnsignedChar,4,1)
412: DEF_IntegerType(UnsignedChar,8,1)
413: DEF_IntegerType(UnsignedChar,1,0)
414: DEF_IntegerType(UnsignedChar,2,0)
415: DEF_IntegerType(UnsignedChar,4,0)
416: DEF_IntegerType(UnsignedChar,8,0)
418: DEF_RealType(PetscReal,1,1)
419: DEF_RealType(PetscReal,2,1)
420: DEF_RealType(PetscReal,4,1)
421: DEF_RealType(PetscReal,8,1)
422: DEF_RealType(PetscReal,1,0)
423: DEF_RealType(PetscReal,2,0)
424: DEF_RealType(PetscReal,4,0)
425: DEF_RealType(PetscReal,8,0)
427: #if defined(PETSC_HAVE_COMPLEX)
428: DEF_ComplexType(PetscComplex,1,1)
429: DEF_ComplexType(PetscComplex,2,1)
430: DEF_ComplexType(PetscComplex,4,1)
431: DEF_ComplexType(PetscComplex,8,1)
432: DEF_ComplexType(PetscComplex,1,0)
433: DEF_ComplexType(PetscComplex,2,0)
434: DEF_ComplexType(PetscComplex,4,0)
435: DEF_ComplexType(PetscComplex,8,0)
436: #endif
438: #define PairType(Type1,Type2) Type1##_##Type2
439: typedef struct {int u; int i;} PairType(int,int);
440: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
441: DEF_PairType(PairType(int,int),1,1)
442: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
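/* Editor's note (illustrative): for MPI_MAXLOC on MPI_2INT each entry is a {u,i} (value, index) pair.
   Following OP_XLOC above, combining s = {5,2} with t = {5,0} keeps {5,0} (equal values, smaller index),
   and combining s = {5,2} with t = {7,9} keeps {7,9} (larger value wins). */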
444: /* If we don't know the basic type, we treat it as a stream of chars or ints */
445: DEF_DumbType(char,1,1)
446: DEF_DumbType(char,2,1)
447: DEF_DumbType(char,4,1)
448: DEF_DumbType(char,1,0)
449: DEF_DumbType(char,2,0)
450: DEF_DumbType(char,4,0)
452: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
453: DEF_DumbType(DumbInt,1,1)
454: DEF_DumbType(DumbInt,2,1)
455: DEF_DumbType(DumbInt,4,1)
456: DEF_DumbType(DumbInt,8,1)
457: DEF_DumbType(DumbInt,1,0)
458: DEF_DumbType(DumbInt,2,0)
459: DEF_DumbType(DumbInt,4,0)
460: DEF_DumbType(DumbInt,8,0)
462: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
463: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
464: {
465: int ierr;
466: ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
467: ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
468: return MPI_SUCCESS;
469: }
470: #endif
472: /*
473: The routine creates a communication link for the given operation. It first looks up its link cache. If
474: there is a free & suitable one, it uses it. Otherwise it creates a new one.
476: A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
477: root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
478: can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
479: those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.
481: The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.
483: In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.
485: The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
486: need to pack/unpack data.
487: */
488: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
489: {
490: PetscErrorCode ierr;
491: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
492: PetscInt i,j,k,nrootreqs,nleafreqs,nreqs;
493: PetscSFLink *p,link;
494: PetscSFDirection direction;
495: MPI_Request *reqs = NULL;
496: PetscBool match,rootdirect[2],leafdirect[2];
497: PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
498: PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
501: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
503: /* Can we directly use root/leafdirect with the given sf, sfop and op? */
504: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
505: if (sfop == PETSCSF_BCAST) {
506: rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
507: leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
508: } else if (sfop == PETSCSF_REDUCE) {
509: leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
510: rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
511: } else { /* PETSCSF_FETCH */
512: rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
513: leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
514: }
515: }
517: if (sf->use_gpu_aware_mpi) {
518: rootmtype_mpi = rootmtype;
519: leafmtype_mpi = leafmtype;
520: } else {
521: rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
522: }
523: /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
524: rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
525: leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
527: direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
528: nrootreqs = bas->nrootreqs;
529: nleafreqs = sf->nleafreqs;
531: /* Look for free links in cache */
532: for (p=&bas->avail; (link=*p); p=&link->next) {
533: MPIPetsc_Type_compare(unit,link->unit,&match);
534: if (match) {
535: /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
536: If not, free the old requests. New requests will be lazily initialized when one calls PetscSFLinkGetMPIBuffersAndRequests().
537: */
538: if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
539: reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
540: for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
541: link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
542: }
543: if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
544: reqs = link->leafreqs[direction][leafmtype][1];
545: for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
546: link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
547: }
548: *p = link->next; /* Remove from available list */
549: goto found;
550: }
551: }
553: PetscNew(&link);
554: PetscSFLinkSetUp_Host(sf,link,unit);
555: PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag); /* One tag per link */
557: nreqs = (nrootreqs+nleafreqs)*8;
558: PetscMalloc1(nreqs,&link->reqs);
559: for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
561: for (i=0; i<2; i++) { /* Two communication directions */
562: for (j=0; j<2; j++) { /* Two memory types */
563: for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
564: link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
565: link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
566: }
567: }
568: }
570: found:
571: if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {PetscSFLinkSetUp_Device(sf,link,unit);}
573: /* Allocate buffers for root/leafdata */
574: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
575: /* For local communication, buffers are only needed when roots and leaves have different mtypes */
576: if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
577: if (bas->rootbuflen[i]) {
578: if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
579: link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
580: } else { /* Have to have a separate rootbuf */
581: if (!link->rootbuf_alloc[i][rootmtype]) {
582: PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);
583: }
584: link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
585: }
586: }
588: if (sf->leafbuflen[i]) {
589: if (leafdirect[i]) {
590: link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
591: } else {
592: if (!link->leafbuf_alloc[i][leafmtype]) {
593: PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);
594: }
595: link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
596: }
597: }
598: }
600: /* Allocate buffers on the host for buffering device data when not using gpu-aware MPI */
601: if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
602: if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
603: PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
604: }
605: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
606: }
607: if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
608: if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
609: PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
610: }
611: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
612: }
614: /* Set the `current` state of the link. It may change between SF invocations that reuse the same link */
615: if (sf->persistent) { /* If data is directly passed to MPI and initializes MPI requests, record the data for comparison in future invocations */
616: if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
617: if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
618: }
620: link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
621: link->leafdata = leafdata;
622: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
623: link->rootdirect[i] = rootdirect[i];
624: link->leafdirect[i] = leafdirect[i];
625: }
626: link->rootdirect_mpi = rootdirect_mpi;
627: link->leafdirect_mpi = leafdirect_mpi;
628: link->rootmtype = rootmtype;
629: link->leafmtype = leafmtype;
630: link->rootmtype_mpi = rootmtype_mpi;
631: link->leafmtype_mpi = leafmtype_mpi;
633: link->next = bas->inuse;
634: bas->inuse = link;
635: *mylink = link;
636: return(0);
637: }
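/* Editor's note (illustrative, based on the logic above): for a PETSCSF_BCAST with op = MPIU_REPLACE,
   contiguous remote roots and contiguous remote leaves, both rootdirect and leafdirect are true, so
   root/leafdata are used directly as the MPI buffers for the remote part and no extra remote buffers
   are allocated. With op = MPI_SUM instead, leaves must be unpacked with an op, so leafdirect is false
   and a separate leafbuf is (lazily) allocated for the remote part. */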
639: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
640: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
641: */
642: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
643: {
644: PetscErrorCode ierr;
645: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
646: PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
647: const PetscInt *rootoffset,*leafoffset;
648: PetscMPIInt n;
649: MPI_Aint disp;
650: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
651: MPI_Datatype unit = link->unit;
652: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
653: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
656: /* Initialize persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests */
657: if (sf->persistent) {
658: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
659: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
660: if (direction == PETSCSF_LEAF2ROOT) {
661: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
662: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
663: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
664: MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
665: }
666: } else { /* PETSCSF_ROOT2LEAF */
667: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
668: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
669: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
670: MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
671: }
672: }
673: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
674: }
676: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
677: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
678: if (direction == PETSCSF_LEAF2ROOT) {
679: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
680: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
681: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
682: MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
683: }
684: } else { /* PETSCSF_ROOT2LEAF */
685: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
686: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
687: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
688: MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
689: }
690: }
691: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
692: }
693: }
694: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
695: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
696: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
697: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
698: return(0);
699: }
702: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
703: {
704: PetscErrorCode ierr;
705: PetscSFLink link,*p;
706: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
709: /* Look for a matching link (same unit and root/leafdata) in the in-use list */
710: for (p=&bas->inuse; (link=*p); p=&link->next) {
711: PetscBool match;
712: MPIPetsc_Type_compare(unit,link->unit,&match);
713: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
714: switch (cmode) {
715: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
716: case PETSC_USE_POINTER: break;
717: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
718: }
719: *mylink = link;
720: return(0);
721: }
722: }
723: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
724: return(0);
725: }
727: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
728: {
729: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
732: (*link)->rootdata = NULL;
733: (*link)->leafdata = NULL;
734: (*link)->next = bas->avail;
735: bas->avail = *link;
736: *link = NULL;
737: return(0);
738: }
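/* Editor's sketch (simplified, error checks omitted) of the typical life cycle of a link across an
   SF Begin/End pair, using only routines defined in this file plus standard MPI calls; the actual
   SFBasic begin/end routines also pack/unpack data and log events:

     In XxxBegin:
       PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,MPIU_REPLACE,PETSCSF_BCAST,&link);
       PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,NULL,NULL,&rootreqs,&leafreqs);
       MPI_Startall(...,leafreqs);  MPI_Startall(...,rootreqs);     start the persistent recvs/sends
     In XxxEnd:
       PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);
       MPI_Waitall(...,rootreqs,MPI_STATUSES_IGNORE);  MPI_Waitall(...,leafreqs,MPI_STATUSES_IGNORE);
       PetscSFLinkReclaim(sf,&link);                                return the link to the avail list
*/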
740: /* Destroy all links chained in 'avail' */
741: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
742: {
743: PetscErrorCode ierr;
744: PetscSFLink link = *avail,next;
745: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
746: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
749: for (; link; link=next) {
750: next = link->next;
751: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
752: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
753: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
754: }
755: PetscFree(link->reqs);
756: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
757: #if defined(PETSC_HAVE_CUDA)
758: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
759: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
760: if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
761: #endif
762: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
763: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
764: }
765: PetscFree(link);
766: }
767: *avail = NULL;
768: return(0);
769: }
771: #if defined(PETSC_USE_DEBUG)
772: /* Error out on unsupported overlapped communications */
773: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
774: {
775: PetscErrorCode ierr;
776: PetscSFLink link,*p;
777: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
778: PetscBool match;
781: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
782: the potential overlap since this process does not participate in the communication; such an overlap is harmless.
783: */
784: if (rootdata || leafdata) {
785: for (p=&bas->inuse; (link=*p); p=&link->next) {
786: MPIPetsc_Type_compare(unit,link->unit,&match);
787: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
788: }
789: }
790: return(0);
791: }
792: #endif
794: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
795: {
797: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
798: PetscBool is2Int,is2PetscInt;
799: PetscMPIInt ni,na,nd,combiner;
800: #if defined(PETSC_HAVE_COMPLEX)
801: PetscInt nPetscComplex=0;
802: #endif
805: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
806: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
807: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
808: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
809: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
810: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
811: #if defined(PETSC_HAVE_COMPLEX)
812: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
813: #endif
814: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
815: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
816: /* TODO: shall we also handle Fortran MPI_2REAL? */
817: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
818: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
819: link->bs = 1; /* default */
821: if (is2Int) {
822: PackInit_PairType_int_int_1_1(link);
823: link->bs = 1;
824: link->unitbytes = 2*sizeof(int);
825: link->isbuiltin = PETSC_TRUE; /* unit is MPI builtin */
826: link->basicunit = MPI_2INT;
827: link->unit = MPI_2INT;
828: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
829: PackInit_PairType_PetscInt_PetscInt_1_1(link);
830: link->bs = 1;
831: link->unitbytes = 2*sizeof(PetscInt);
832: link->basicunit = MPIU_2INT;
833: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
834: link->unit = MPIU_2INT;
835: } else if (nPetscReal) {
836: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
837: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
838: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
839: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
840: link->bs = nPetscReal;
841: link->unitbytes = nPetscReal*sizeof(PetscReal);
842: link->basicunit = MPIU_REAL;
843: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
844: } else if (nPetscInt) {
845: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
846: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
847: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
848: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
849: link->bs = nPetscInt;
850: link->unitbytes = nPetscInt*sizeof(PetscInt);
851: link->basicunit = MPIU_INT;
852: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
853: #if defined(PETSC_USE_64BIT_INDICES)
854: } else if (nInt) {
855: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
856: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
857: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
858: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
859: link->bs = nInt;
860: link->unitbytes = nInt*sizeof(int);
861: link->basicunit = MPI_INT;
862: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
863: #endif
864: } else if (nSignedChar) {
865: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
866: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
867: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
868: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
869: link->bs = nSignedChar;
870: link->unitbytes = nSignedChar*sizeof(SignedChar);
871: link->basicunit = MPI_SIGNED_CHAR;
872: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
873: } else if (nUnsignedChar) {
874: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
875: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
876: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
877: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
878: link->bs = nUnsignedChar;
879: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
880: link->basicunit = MPI_UNSIGNED_CHAR;
881: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
882: #if defined(PETSC_HAVE_COMPLEX)
883: } else if (nPetscComplex) {
884: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
885: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
886: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
887: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
888: link->bs = nPetscComplex;
889: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
890: link->basicunit = MPIU_COMPLEX;
891: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
892: #endif
893: } else {
894: MPI_Aint lb,nbyte;
895: MPI_Type_get_extent(unit,&lb,&nbyte);
896: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
897: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
898: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
899: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
900: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
901: link->bs = nbyte;
902: link->unitbytes = nbyte;
903: link->basicunit = MPI_BYTE;
904: } else {
905: nInt = nbyte / sizeof(int);
906: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
907: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
908: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
909: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
910: link->bs = nInt;
911: link->unitbytes = nbyte;
912: link->basicunit = MPI_INT;
913: }
914: if (link->isbuiltin) link->unit = unit;
915: }
917: if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
918: return(0);
919: }
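/* Editor's example (hypothetical caller code, for illustration only) of the dispatch above:

     MPI_Datatype unit;
     MPI_Type_contiguous(3,MPIU_REAL,&unit);
     MPI_Type_commit(&unit);

   For this unit, nPetscReal = 3, so PackInit_RealType_PetscReal_1_0() is chosen and the link gets
   bs = 3, unitbytes = 3*sizeof(PetscReal), basicunit = MPIU_REAL; since the combiner is not
   MPI_COMBINER_NAMED, isbuiltin stays PETSC_FALSE and link->unit becomes an MPI_Type_dup of unit. */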
921: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
922: {
924: *UnpackAndOp = NULL;
925: if (mtype == PETSC_MEMTYPE_HOST) {
926: if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
927: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
928: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
929: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
930: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
931: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
932: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
933: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
934: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
935: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
936: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
937: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
938: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
939: }
940: #if defined(PETSC_HAVE_CUDA)
941: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
942: if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
943: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
944: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
945: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
946: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
947: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
948: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
949: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
950: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
951: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
952: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
953: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
954: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
955: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
956: if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
957: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
958: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
959: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
960: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
961: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
962: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
963: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
964: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
965: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
966: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
967: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
968: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
969: }
970: #endif
971: return(0);
972: }
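/* Editor's sketch (illustrative) of how a caller typically uses the getter above:

     PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
     PetscSFLinkGetUnpackAndOp(link,PETSC_MEMTYPE_HOST,MPI_SUM,PETSC_FALSE,&UnpackAndOp);
     if (UnpackAndOp) (*UnpackAndOp)(link,count,start,opt,idx,rootdata,rootbuf);

   A NULL result means the op/unit combination has no specialized kernel; the caller then falls back to
   MPI_Reduce_local, see PetscSFLinkUnpackDataWithMPIReduceLocal() near the end of this file. */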
974: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
975: {
977: *ScatterAndOp = NULL;
978: if (mtype == PETSC_MEMTYPE_HOST) {
979: if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
980: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
981: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
982: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
983: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
984: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
985: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
986: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
987: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
988: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
989: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
990: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
991: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
992: }
993: #if defined(PETSC_HAVE_CUDA)
994: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
995: if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
996: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
997: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
998: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
999: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1000: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
1001: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
1002: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
1003: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
1004: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
1005: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
1006: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
1007: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
1008: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1009: if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
1010: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1011: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
1012: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1013: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1014: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
1015: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
1016: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
1017: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
1018: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
1019: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
1020: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
1021: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
1022: }
1023: #endif
1024: return(0);
1025: }
1027: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1028: {
1030: *FetchAndOp = NULL;
1031: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1032: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1033: #if defined(PETSC_HAVE_CUDA)
1034: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1035: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd;
1036: #endif
1037: return(0);
1038: }
1040: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1041: {
1043: *FetchAndOpLocal = NULL;
1044: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1045: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1046: #if defined(PETSC_HAVE_CUDA)
1047: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1048: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
1049: #endif
1050: return(0);
1051: }
1053: /*=============================================================================
1054: A set of helper routines for Pack/Unpack/Scatter on GPUs
1055: ============================================================================*/
1056: #if defined(PETSC_HAVE_CUDA)
1057: /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1058: make sure the data is ready for packing.
1059: */
1060: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1061: {
1063: if (sf->use_default_stream) return(0);
1064: if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1065: cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1066: }
1067: return(0);
1068: }
1070: /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
1071: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1072: {
1073: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1076: /* Do nothing if we use stream-aware MPI or there is nothing for the remote */
1077: if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) return(0);
1078: /* Sync if we called a packing kernel, or we async-copied rootdata from device to host, or no cudaDeviceSynchronize was called (since the default stream is assumed) */
1079: if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1080: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1081: }
1082: return(0);
1083: }
1084: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1085: {
1087: /* See comments above */
1088: if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) return(0);
1089: if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1090: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1091: }
1092: return(0);
1093: }
1095: /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1096: does not know which stream the callers will use.
1097: */
1098: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1099: {
1100: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1101: PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1104: /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
1105: if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) return(0);
1107: /* If rootmtype is HOST or DEVICE:
1108: If we have data from local, then we called a scatter kernel (on link->stream), so we must sync it;
1109: If we have data from remote && no rootdirect (i.e., we called an unpack kernel), then we must also sync it (if rootdirect,
1110: i.e., no unpack kernel after MPI, MPI guarantees rootbuf is ready to use, so we do not need the sync).
1112: Note a tricky case is when leafmtype=DEVICE and rootmtype=HOST on a uni-processor: we must sync the stream, otherwise
1113: the CPU thread might use the yet-to-be-updated rootdata pending in the stream.
1114: */
1115: if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1116: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1117: }
1118: return(0);
1119: }
1120: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1121: {
1122: PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
1125: /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
1126: if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) return(0);
1127: if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1128: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1129: }
1130: return(0);
1131: }
1133: /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
1134: to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1135: */
1136: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1137: {
1139: cudaError_t cerr;
1140: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1143: if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1144: void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1145: void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1146: size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1147: if (device2host) {
1148: cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1149: PetscLogGpuToCpu(count);
1150: } else {
1151: cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1152: PetscLogCpuToGpu(count);
1153: }
1154: }
1155: return(0);
1156: }
1158: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1159: {
1161: cudaError_t cerr;
1164: if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1165: void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1166: void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1167: size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1168: if (device2host) {
1169: cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1170: PetscLogGpuToCpu(count);
1171: } else {
1172: cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1173: PetscLogCpuToGpu(count);
1174: }
1175: }
1176: return(0);
1177: }
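/* Editor's note: a hedged sketch (not from the original source) of the staging pattern the two routines
   above implement when the MPI implementation cannot access device memory directly: copy the device
   buffer to a host mirror before the MPI call, and copy it back afterwards. The buffer and variable
   names are illustrative only.

     size_t      bytes = count*unitbytes;
     cudaError_t cerr;
     cerr = cudaMemcpyAsync(h_buf,d_buf,bytes,cudaMemcpyDeviceToHost,stream);CHKERRCUDA(cerr);
     cerr = cudaStreamSynchronize(stream);CHKERRCUDA(cerr);             h_buf is now valid on the host
     MPI_Allreduce(MPI_IN_PLACE,h_buf,count,MPI_DOUBLE,MPI_SUM,comm);   MPI only ever sees host memory
     cerr = cudaMemcpyAsync(d_buf,h_buf,bytes,cudaMemcpyHostToDevice,stream);CHKERRCUDA(cerr);
*/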
1178: #else
1180: #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0
1181: #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0
1182: #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0
1183: #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0
1184: #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0
1185: #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1186: #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1188: #endif
1190: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1191: {
1193: PetscLogDouble flops;
1194: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1198: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1199: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1200: #if defined(PETSC_HAVE_CUDA)
1201: if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1202: #endif
1203: {PetscLogFlops(flops);}
1204: }
1205: return(0);
1206: }
1208: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1209: {
1210: PetscLogDouble flops;
1214: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1215: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
1216: #if defined(PETSC_HAVE_CUDA)
1217: if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1218: #endif
1219: {PetscLogFlops(flops);}
1220: }
1221: return(0);
1222: }
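/* Editor's note: a short worked example of the flop accounting above. If the remote leaf buffer holds
   1000 units and each unit contains bs = 3 PetscScalars, then a reduction such as MPI_SUM logs
   flops = 1000*3 = 3000, attributed to the GPU or the CPU depending on where the unpack kernel ran.
   With op = MPIU_REPLACE nothing is logged, since no arithmetic is performed.
*/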
1224: /* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local; see the example after this routine.
1225: Input Arguments:
1226: +sf - The StarForest
1227: .link - The link
1228: .count - Number of entries to unpack
1229: .start - The first index, significant when indices=NULL
1230: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1231: .buf - A contiguous buffer to unpack from
1232: -op - Operation after unpack
1234: Output Arguments:
1235: .data - The data to unpack to
1236: */
1237: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1238: {
1240: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1241: {
1243: PetscInt i;
1244: PetscMPIInt n;
1245: if (indices) {
1246: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
1247: element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
1248: */
1249: for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
1250: } else {
1251: PetscMPIIntCast(count,&n);
1252: MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
1253: }
1254: }
1255: #else
1256: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1257: #endif
1258: return(0);
1259: }
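/* Editor's note: a minimal example (not from the original source) of the MPI_Reduce_local() semantics the
   fallback above relies on: MPI_Reduce_local(in,inout,count,datatype,op) combines the two local buffers
   as inout[i] = in[i] op inout[i]; no communication takes place.

     int in[3]    = {1,2,3};
     int inout[3] = {10,20,30};
     MPI_Reduce_local(in,inout,3,MPI_INT,MPI_SUM);   inout is now {11,22,33}
*/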
1261: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1262: {
1264: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1265: {
1267: PetscInt i,disp;
1268: if (!srcIdx) {
1269: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
1270: } else {
1271: for (i=0; i<count; i++) {
1272: disp = dstIdx? dstIdx[i] : dstStart + i;
1273: MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1274: }
1275: }
1276: }
1277: #else
1278: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1279: #endif
1280: return(0);
1281: }
1283: /*=============================================================================
1284: Pack/Unpack/Fetch/Scatter routines
1285: ============================================================================*/
1287: /* Pack rootdata to rootbuf
1288: Input Arguments:
1289: + sf - The SF this packing works on.
1290: . link - It gives the memtype of the roots and also provides the root buffer.
1291: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1292: - rootdata - Where to read the roots.
1294: Notes:
1295: When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, root data is
1296: in a place the underlying MPI can access (whether use_gpu_aware_mpi or not). An illustrative call sequence follows this routine.
1297: */
1298: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1299: {
1300: PetscErrorCode ierr;
1301: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1302: const PetscInt *rootindices = NULL;
1303: PetscInt count,start;
1304: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1305: PetscMemType rootmtype = link->rootmtype;
1306: PetscSFPackOpt opt = NULL;
1310: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1311: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1312: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1313: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1314: PetscSFLinkGetPack(link,rootmtype,&Pack);
1315: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1316: }
1317: if (scope == PETSCSF_REMOTE) {
1318: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1319: PetscSFLinkSyncStreamAfterPackRootData(sf,link);
1320: }
1321: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1322: return(0);
1323: }
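/* Editor's note: a hedged sketch of how the routines in this file are meant to be combined by an SF
   backend for a remote broadcast; the actual MPI exchange is handled elsewhere in the SF implementation
   and is shown only as a placeholder.

     PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);        rootdata -> rootbuf, staged for MPI
     ...                                                              MPI sends rootbuf, receives into leafbuf
     PetscSFLinkUnpackLeafData(sf,link,PETSCSF_REMOTE,leafdata,op);   leafbuf -> leafdata, applying op (defined below)
*/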
1325: /* Pack leafdata to leafbuf */
1326: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1327: {
1328: PetscErrorCode ierr;
1329: const PetscInt *leafindices = NULL;
1330: PetscInt count,start;
1331: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1332: PetscMemType leafmtype = link->leafmtype;
1333: PetscSFPackOpt opt = NULL;
1336: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1337: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1338: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1339: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1340: PetscSFLinkGetPack(link,leafmtype,&Pack);
1341: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1342: }
1343: if (scope == PETSCSF_REMOTE) {
1344: PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1345: PetscSFLinkSyncStreamAfterPackLeafData(sf,link);
1346: }
1347: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1348: return(0);
1349: }
1351: /* Unpack rootbuf to rootdata */
1352: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1353: {
1354: PetscErrorCode ierr;
1355: const PetscInt *rootindices = NULL;
1356: PetscInt count,start;
1357: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1358: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1359: PetscMemType rootmtype = link->rootmtype;
1360: PetscSFPackOpt opt = NULL;
1363: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1364: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1365: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1366: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1367: if (UnpackAndOp) {
1368: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1369: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1370: } else {
1371: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1372: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1373: }
1374: }
1375: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);}
1376: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1377: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1378: return(0);
1379: }
1381: /* Unpack leafbuf to leafdata */
1382: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1383: {
1384: PetscErrorCode ierr;
1385: const PetscInt *leafindices = NULL;
1386: PetscInt count,start;
1387: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1388: PetscMemType leafmtype = link->leafmtype;
1389: PetscSFPackOpt opt = NULL;
1392: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1393: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1394: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1395: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1396: if (UnpackAndOp) {
1397: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1398: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1399: } else {
1400: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1401: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1402: }
1403: }
1404: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);}
1405: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1406: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1407: return(0);
1408: }
1410: /* FetchAndOp rootdata with rootbuf */
1411: PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1412: {
1413: PetscErrorCode ierr;
1414: const PetscInt *rootindices = NULL;
1415: PetscInt count,start;
1416: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1417: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1418: PetscMemType rootmtype = link->rootmtype;
1419: PetscSFPackOpt opt = NULL;
1422: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1423: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1424: if (bas->rootbuflen[scope]) {
1425: /* Do FetchAndOp on rootdata with rootbuf */
1426: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);
1427: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1428: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1429: }
1430: if (scope == PETSCSF_REMOTE) {
1431: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);
1432: PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);
1433: }
1434: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1435: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1436: return(0);
1437: }
1439: /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1440: PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1441: {
1442: PetscErrorCode ierr;
1443: const PetscInt *rootindices = NULL,*leafindices = NULL;
1444: PetscInt count,rootstart,leafstart;
1445: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1446: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1447: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1448: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1451: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1452: if (rootmtype != leafmtype) { /* Uncommon case */
1453: /* The local communication has to go through pack and unpack */
1454: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1455: PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);
1456: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1457: } else {
1458: PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);
1459: if (ScatterAndOp) {
1460: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1461: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1462: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1463: } else {
1464: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1465: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1466: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1467: }
1468: }
1469: return(0);
1470: }
1472: /* Reduce leafdata to rootdata locally */
1473: PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1474: {
1475: PetscErrorCode ierr;
1476: const PetscInt *rootindices = NULL,*leafindices = NULL;
1477: PetscInt count,rootstart,leafstart;
1478: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1479: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1480: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1481: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1484: if (!sf->leafbuflen[PETSCSF_LOCAL]) return(0);
1485: if (rootmtype != leafmtype) {
1486: /* The local communication has to go through pack and unpack */
1487: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1488: PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);
1489: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1490: } else {
1491: PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);
1492: if (ScatterAndOp) {
1493: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1494: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1495: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1496: } else {
1497: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1498: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1499: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1500: }
1501: }
1502: return(0);
1503: }
1505: /* Fetch rootdata into leafupdate and then update rootdata with leafdata, all on local (PETSCSF_LOCAL) edges */
1506: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1507: {
1508: PetscErrorCode ierr;
1509: const PetscInt *rootindices = NULL,*leafindices = NULL;
1510: PetscInt count,rootstart,leafstart;
1511: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1512: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1513: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1514: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1517: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1518: if (rootmtype != leafmtype) {
1519: /* The local communication has to go through pack and unpack */
1520: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1521: } else {
1522: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1523: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1524: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1525: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1526: }
1527: return(0);
1528: }
1530: /*
1531: Create per-rank pack/unpack optimizations based on index patterns (a worked example follows this routine)
1533: Input Parameters:
1534: + n - Number of destination ranks
1535: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1536: - idx - [*] Array storing indices
1538: Output Parameters:
1539: . out - Pack optimizations. NULL if no optimization is possible.
1540: */
1541: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1542: {
1544: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1545: PetscBool optimizable = PETSC_TRUE;
1546: PetscSFPackOpt opt;
1549: PetscMalloc1(1,&opt);
1550: PetscMalloc1(7*n+2,&opt->array);
1551: opt->n = opt->array[0] = n;
1552: opt->offset = opt->array + 1;
1553: opt->start = opt->array + n + 2;
1554: opt->dx = opt->array + 2*n + 2;
1555: opt->dy = opt->array + 3*n + 2;
1556: opt->dz = opt->array + 4*n + 2;
1557: opt->X = opt->array + 5*n + 2;
1558: opt->Y = opt->array + 6*n + 2;
1560: for (r=0; r<n; r++) { /* For each destination rank */
1561: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1562: p = offset[r];
1563: start = idx[p]; /* First index for this rank */
1564: p++;
1566: /* Search in X dimension */
1567: for (dx=1; dx<m; dx++,p++) {
1568: if (start+dx != idx[p]) break;
1569: }
1571: dydz = m/dx;
1572: X = dydz > 1 ? (idx[p]-start) : dx;
1573: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1574: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1575: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1576: for (i=0; i<dx; i++,p++) {
1577: if (start+X*dy+i != idx[p]) {
1578: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1579: else goto Z_dimension;
1580: }
1581: }
1582: }
1584: Z_dimension:
1585: dz = m/(dx*dy);
1586: Y = dz > 1 ? (idx[p]-start)/X : dy;
1587: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1588: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1589: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1590: for (j=0; j<dy; j++) {
1591: for (i=0; i<dx; i++,p++) {
1592: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1593: }
1594: }
1595: }
1596: opt->start[r] = start;
1597: opt->dx[r] = dx;
1598: opt->dy[r] = dy;
1599: opt->dz[r] = dz;
1600: opt->X[r] = X;
1601: opt->Y[r] = Y;
1602: }
1604: finish:
1605: /* If not optimizable, free arrays to save memory */
1606: if (!n || !optimizable) {
1607: PetscFree(opt->array);
1608: PetscFree(opt);
1609: *out = NULL;
1610: } else {
1611: opt->offset[0] = 0;
1612: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1613: *out = opt;
1614: }
1615: return(0);
1616: }
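/* Editor's note: a worked example (not from the original source) of the factorization above. Suppose one
   destination rank (n = 1) has the index list

     idx = {100,101,102, 110,111,112}     so offset[2] = {0,6} and m = 6

   The x-walk finds dx = 3 consecutive indices starting at start = 100; the jump to 110 gives the row
   stride X = 110-100 = 10. The y-walk matches one more row of 3 at start+X, so dy = 2, and m/(dx*dy) = 1
   leaves a single plane, dz = 1, with Y = dy = 2. Calling

     PetscSFCreatePackOpt(1,offset,idx,&opt);

   therefore yields opt->start[0]=100, dx=3, dy=2, dz=1, X=10, Y=2, and a packer can copy the six entries
   as two contiguous strips of three units each, at offsets start + X*Y*k + X*j, i.e. 100 and 110, instead
   of gathering them index by index.
*/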
1618: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1619: {
1621: PetscSFPackOpt opt = *out;
1624: if (opt) {
1625: if (mtype == PETSC_MEMTYPE_HOST) {PetscFree(opt->array);}
1626: #if defined(PETSC_HAVE_CUDA)
1627: else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1628: #endif
1629: PetscFree(opt);
1630: *out = NULL;
1631: }
1632: return(0);
1633: }
1635: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1636: {
1638: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1639: PetscInt i,j;
1642: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1643: for (i=0; i<2; i++) { /* Set defaults */
1644: sf->leafstart[i] = 0;
1645: sf->leafcontig[i] = PETSC_TRUE;
1646: sf->leafdups[i] = PETSC_FALSE;
1647: bas->rootstart[i] = 0;
1648: bas->rootcontig[i] = PETSC_TRUE;
1649: bas->rootdups[i] = PETSC_FALSE;
1650: }
1652: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1653: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1655: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1656: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1658: /* Are leaf indices for self and remote contiguous? If yes, pack/unpack can treat them as one contiguous block, which is the best case (see the illustration after this routine) */
1659: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1660: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1661: }
1662: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1663: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1664: }
1666: /* If not, see if we can have per-rank optimizations by doing index analysis */
1667: if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);}
1668: if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}
1670: /* Are root indices for self and remote contiguous? */
1671: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1672: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1674: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1675: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1677: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1678: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1679: }
1680: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1681: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1682: }
1684: if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);}
1685: if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}
1687: #if defined(PETSC_HAVE_CUDA)
1688: /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1693: #endif
1695: return(0);
1696: }
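/* Editor's note: a small illustration (not from the original source) of the contiguity test above. If the
   remote portion of sf->rmine is {7,8,9,10} then leafstart[1] = 7, every index equals leafstart[1]+j, and
   leafcontig[1] stays PETSC_TRUE, so the remote leaves can be packed as one contiguous block. If instead
   that portion is {7,9,8,10}, the test fails at the second entry, leafcontig[1] becomes PETSC_FALSE, and
   PetscSFCreatePackOpt() is asked to look for a strided (dx,dy,dz) pattern (for this particular list it
   finds none and returns NULL).
*/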
1698: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1699: {
1701: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1702: PetscInt i;
1705: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1706: PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1707: PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1708: PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1709: PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1710: }
1711: return(0);
1712: }