Actual source code: veccuda2.cu
1: /*
2:   Implements the sequential CUDA vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/cudavecimpl.h>
13: #include <cuda_runtime.h>
14: #include <thrust/device_ptr.h>
15: #include <thrust/transform.h>
16: #include <thrust/functional.h>
17: #include <thrust/reduce.h>
19: /*
20:     Allocates space for the vector array on the GPU if it does not exist.
21:     Does NOT change the vector's offload mask, except to initialize it when it is PETSC_OFFLOAD_UNALLOCATED.
22:     Does NOT zero the CUDA array.
24: */
25: PetscErrorCode VecCUDAAllocateCheck(Vec v)
26: {
28: cudaError_t err;
29: Vec_CUDA *veccuda;
30: PetscBool option_set;
33: if (!v->spptr) {
34: PetscReal pinned_memory_min;
35: PetscCalloc(sizeof(Vec_CUDA),&v->spptr);
36: veccuda = (Vec_CUDA*)v->spptr;
37: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
38: veccuda->GPUarray = veccuda->GPUarray_allocated;
39: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->offloadmask = PETSC_OFFLOAD_CPU;
42: } else {
43: v->offloadmask = PETSC_OFFLOAD_GPU;
44: }
45: }
46: pinned_memory_min = 0;
48: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
49: Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
50: PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
51: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
52: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
53: PetscOptionsEnd();
54: }
55: return(0);
56: }
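/* Example (illustrative, not part of the original file): the pinned-memory threshold parsed
   above can be set from the command line, e.g.

       ./app -vec_type seqcuda -vec_pinned_memory_min 1048576

   or in code with VecSetPinnedMemoryMin(v,1048576) before the host array is allocated; host
   allocations of at least that many bytes then use pinned (page-locked) memory. "./app" is a
   placeholder executable name. */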
58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
59: PetscErrorCode VecCUDACopyToGPU(Vec v)
60: {
62: cudaError_t err;
63: Vec_CUDA *veccuda;
64: PetscScalar *varray;
68: VecCUDAAllocateCheck(v);
69: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
70: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
71: veccuda = (Vec_CUDA*)v->spptr;
72: varray = veccuda->GPUarray;
73: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
74: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
75: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
76: v->offloadmask = PETSC_OFFLOAD_BOTH;
77: }
78: return(0);
79: }
81: /*
82: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
83: */
84: PetscErrorCode VecCUDACopyFromGPU(Vec v)
85: {
87: cudaError_t err;
88: Vec_CUDA *veccuda;
89: PetscScalar *varray;
93: VecCUDAAllocateCheckHost(v);
94: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
95: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
96: veccuda = (Vec_CUDA*)v->spptr;
97: varray = veccuda->GPUarray;
98: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
99: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
100: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
101: v->offloadmask = PETSC_OFFLOAD_BOTH;
102: }
103: return(0);
104: }
106: /*MC
107: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
109: Options Database Keys:
110: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
112: Level: beginner
114: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
115: M*/
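/* Example (illustrative sketch, not part of the original file): creating a sequential CUDA
   vector either explicitly with VecSetType() or through the options database. Error checking
   (ierr = ...;CHKERRQ(ierr);) is omitted for brevity.

     Vec      v;
     PetscInt n = 100;

     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,PETSC_DECIDE,n);
     VecSetType(v,VECSEQCUDA);        (or VecSetFromOptions(v) together with -vec_type seqcuda)
     VecSet(v,1.0);
     VecDestroy(&v);
*/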
117: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
118: {
119: const PetscScalar *xarray;
120: PetscScalar *yarray;
121: PetscErrorCode ierr;
122: PetscBLASInt one = 1,bn = 0;
123: PetscScalar sone = 1.0;
124: cublasHandle_t cublasv2handle;
125: cublasStatus_t cberr;
126: cudaError_t err;
129: PetscCUBLASGetHandle(&cublasv2handle);
130: PetscBLASIntCast(yin->map->n,&bn);
131: VecCUDAGetArrayRead(xin,&xarray);
132: VecCUDAGetArray(yin,&yarray);
133: PetscLogGpuTimeBegin();
134: if (alpha == (PetscScalar)0.0) {
135: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
136: } else if (alpha == (PetscScalar)1.0) {
137: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
138: PetscLogGpuFlops(1.0*yin->map->n);
139: } else {
140: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
141: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
142: PetscLogGpuFlops(2.0*yin->map->n);
143: }
144: err = WaitForCUDA();CHKERRCUDA(err);
145: PetscLogGpuTimeEnd();
146: VecCUDARestoreArrayRead(xin,&xarray);
147: VecCUDARestoreArray(yin,&yarray);
148: PetscLogCpuToGpu(sizeof(PetscScalar));
149: return(0);
150: }
152: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
153: {
154: const PetscScalar *xarray;
155: PetscScalar *yarray;
156: PetscErrorCode ierr;
157: PetscBLASInt one = 1,bn = 0;
158: cublasHandle_t cublasv2handle;
159: cublasStatus_t cberr;
160: PetscBool xiscuda;
161: cudaError_t err;
164: if (alpha == (PetscScalar)0.0) return(0);
165: PetscCUBLASGetHandle(&cublasv2handle);
166: PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
167: if (xiscuda) {
168: PetscBLASIntCast(yin->map->n,&bn);
169: VecCUDAGetArrayRead(xin,&xarray);
170: VecCUDAGetArray(yin,&yarray);
171: PetscLogGpuTimeBegin();
172: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
173: err = WaitForCUDA();CHKERRCUDA(err);
174: PetscLogGpuTimeEnd();
175: VecCUDARestoreArrayRead(xin,&xarray);
176: VecCUDARestoreArray(yin,&yarray);
177: PetscLogGpuFlops(2.0*yin->map->n);
178: PetscLogCpuToGpu(sizeof(PetscScalar));
179: } else {
180: VecAXPY_Seq(yin,alpha,xin);
181: }
182: return(0);
183: }
185: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
186: {
187: PetscInt n = xin->map->n;
188: const PetscScalar *xarray=NULL,*yarray=NULL;
189: PetscScalar *warray=NULL;
190: thrust::device_ptr<const PetscScalar> xptr,yptr;
191: thrust::device_ptr<PetscScalar> wptr;
192: PetscErrorCode ierr;
193: cudaError_t err;
196: VecCUDAGetArrayWrite(win,&warray);
197: VecCUDAGetArrayRead(xin,&xarray);
198: VecCUDAGetArrayRead(yin,&yarray);
199: PetscLogGpuTimeBegin();
200: try {
201: wptr = thrust::device_pointer_cast(warray);
202: xptr = thrust::device_pointer_cast(xarray);
203: yptr = thrust::device_pointer_cast(yarray);
204: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
205: err = WaitForCUDA();CHKERRCUDA(err);
206: } catch (char *ex) {
207: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
208: }
209: PetscLogGpuTimeEnd();
210: PetscLogGpuFlops(n);
211: VecCUDARestoreArrayRead(xin,&xarray);
212: VecCUDARestoreArrayRead(yin,&yarray);
213: VecCUDARestoreArrayWrite(win,&warray);
214: return(0);
215: }
217: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
218: {
219: const PetscScalar *xarray=NULL,*yarray=NULL;
220: PetscScalar *warray=NULL;
221: PetscErrorCode ierr;
222: PetscBLASInt one = 1,bn = 0;
223: cublasHandle_t cublasv2handle;
224: cublasStatus_t stat;
225: cudaError_t cerr;
226: cudaStream_t stream;
229: PetscCUBLASGetHandle(&cublasv2handle);
230: PetscBLASIntCast(win->map->n,&bn);
231: if (alpha == (PetscScalar)0.0) {
232: VecCopy_SeqCUDA(yin,win);
233: } else {
234: VecCUDAGetArrayRead(xin,&xarray);
235: VecCUDAGetArrayRead(yin,&yarray);
236: VecCUDAGetArrayWrite(win,&warray);
237: PetscLogGpuTimeBegin();
238: stat = cublasGetStream(cublasv2handle,&stream);CHKERRCUBLAS(stat);
239: cerr = cudaMemcpyAsync(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,stream);CHKERRCUDA(cerr);
240: stat = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(stat);
241: cerr = WaitForCUDA();CHKERRCUDA(cerr);
242: PetscLogGpuTimeEnd();
243: PetscLogGpuFlops(2*win->map->n);
244: VecCUDARestoreArrayRead(xin,&xarray);
245: VecCUDARestoreArrayRead(yin,&yarray);
246: VecCUDARestoreArrayWrite(win,&warray);
247: PetscLogCpuToGpu(sizeof(PetscScalar));
248: }
249: return(0);
250: }
252: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
253: {
254: PetscErrorCode ierr;
255: cudaError_t err;
256: PetscInt n = xin->map->n,j;
257: PetscScalar *xarray;
258: const PetscScalar *yarray;
259: PetscBLASInt one = 1,bn = 0;
260: cublasHandle_t cublasv2handle;
261: cublasStatus_t cberr;
264: PetscLogGpuFlops(nv*2.0*n);
265: PetscLogCpuToGpu(nv*sizeof(PetscScalar));
266: PetscCUBLASGetHandle(&cublasv2handle);
267: PetscBLASIntCast(n,&bn);
268: PetscLogGpuTimeBegin();
269: VecCUDAGetArray(xin,&xarray);
270: for (j=0; j<nv; j++) {
271: VecCUDAGetArrayRead(y[j],&yarray);
272: cberr = cublasXaxpy(cublasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRCUBLAS(cberr);
273: VecCUDARestoreArrayRead(y[j],&yarray);
274: }
275: err = WaitForCUDA();CHKERRCUDA(err);
276: PetscLogGpuTimeEnd();
277: VecCUDARestoreArray(xin,&xarray);
278: return(0);
279: }
281: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
282: {
283: const PetscScalar *xarray,*yarray;
284: PetscErrorCode ierr;
285: PetscBLASInt one = 1,bn = 0;
286: cublasHandle_t cublasv2handle;
287: cublasStatus_t cerr;
290: PetscCUBLASGetHandle(&cublasv2handle);
291: PetscBLASIntCast(yin->map->n,&bn);
292: VecCUDAGetArrayRead(xin,&xarray);
293: VecCUDAGetArrayRead(yin,&yarray);
294: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
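  /* (added note, not in the original) PETSc defines VecDot(x,y) = sum_i x_i * conj(y_i), while
     cublasXdot() conjugates its first vector argument, so passing (yarray,xarray) reproduces the
     PETSc convention. */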
295: PetscLogGpuTimeBegin();
296: cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
297: PetscLogGpuTimeEnd();
298: if (xin->map->n >0) {
299: PetscLogGpuFlops(2.0*xin->map->n-1);
300: }
301: PetscLogGpuToCpu(sizeof(PetscScalar));
302: VecCUDARestoreArrayRead(xin,&xarray);
303: VecCUDARestoreArrayRead(yin,&yarray);
304: return(0);
305: }
307: //
308: // CUDA kernels for MDot to follow
309: //
311: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
312: #define MDOT_WORKGROUP_SIZE 128
313: #define MDOT_WORKGROUP_NUM 128
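// Added note: each kernel below handles M right-hand-side vectors per launch and writes
// M*MDOT_WORKGROUP_NUM partial sums into group_results, with the partials for vector k stored
// contiguously at offsets [k*gridDim.x, (k+1)*gridDim.x); VecMDot_SeqCUDA() then finishes the
// reduction on the host by summing the MDOT_WORKGROUP_NUM partials belonging to each vector.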
315: #if !defined(PETSC_USE_COMPLEX)
316: // M = 2:
317: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
318: PetscInt size, PetscScalar *group_results)
319: {
320: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
321: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
322: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
323: PetscInt vec_start_index = blockIdx.x * entries_per_group;
324: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
326: PetscScalar entry_x = 0;
327: PetscScalar group_sum0 = 0;
328: PetscScalar group_sum1 = 0;
329: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
330: entry_x = x[i]; // load only once from global memory!
331: group_sum0 += entry_x * y0[i];
332: group_sum1 += entry_x * y1[i];
333: }
334: tmp_buffer[threadIdx.x] = group_sum0;
335: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
337: // parallel reduction
338: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
339: __syncthreads();
340: if (threadIdx.x < stride) {
341: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
342: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
343: }
344: }
346: // write result of group to group_results
347: if (threadIdx.x == 0) {
348: group_results[blockIdx.x] = tmp_buffer[0];
349: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
350: }
351: }
353: // M = 3:
354: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
355: PetscInt size, PetscScalar *group_results)
356: {
357: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
358: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
359: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
360: PetscInt vec_start_index = blockIdx.x * entries_per_group;
361: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
363: PetscScalar entry_x = 0;
364: PetscScalar group_sum0 = 0;
365: PetscScalar group_sum1 = 0;
366: PetscScalar group_sum2 = 0;
367: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
368: entry_x = x[i]; // load only once from global memory!
369: group_sum0 += entry_x * y0[i];
370: group_sum1 += entry_x * y1[i];
371: group_sum2 += entry_x * y2[i];
372: }
373: tmp_buffer[threadIdx.x] = group_sum0;
374: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
375: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
377: // parallel reduction
378: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
379: __syncthreads();
380: if (threadIdx.x < stride) {
381: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
382: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
383: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
384: }
385: }
387: // write result of group to group_results
388: if (threadIdx.x == 0) {
389: group_results[blockIdx.x ] = tmp_buffer[0];
390: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
391: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
392: }
393: }
395: // M = 4:
396: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
397: PetscInt size, PetscScalar *group_results)
398: {
399: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
400: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
401: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
402: PetscInt vec_start_index = blockIdx.x * entries_per_group;
403: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
405: PetscScalar entry_x = 0;
406: PetscScalar group_sum0 = 0;
407: PetscScalar group_sum1 = 0;
408: PetscScalar group_sum2 = 0;
409: PetscScalar group_sum3 = 0;
410: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
411: entry_x = x[i]; // load only once from global memory!
412: group_sum0 += entry_x * y0[i];
413: group_sum1 += entry_x * y1[i];
414: group_sum2 += entry_x * y2[i];
415: group_sum3 += entry_x * y3[i];
416: }
417: tmp_buffer[threadIdx.x] = group_sum0;
418: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
419: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
420: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
422: // parallel reduction
423: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
424: __syncthreads();
425: if (threadIdx.x < stride) {
426: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
427: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
428: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
429: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
430: }
431: }
433: // write result of group to group_results
434: if (threadIdx.x == 0) {
435: group_results[blockIdx.x ] = tmp_buffer[0];
436: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
437: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
438: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
439: }
440: }
442: // M = 8:
443: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
444: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
445: PetscInt size, PetscScalar *group_results)
446: {
447: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
448: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
449: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
450: PetscInt vec_start_index = blockIdx.x * entries_per_group;
451: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
453: PetscScalar entry_x = 0;
454: PetscScalar group_sum0 = 0;
455: PetscScalar group_sum1 = 0;
456: PetscScalar group_sum2 = 0;
457: PetscScalar group_sum3 = 0;
458: PetscScalar group_sum4 = 0;
459: PetscScalar group_sum5 = 0;
460: PetscScalar group_sum6 = 0;
461: PetscScalar group_sum7 = 0;
462: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
463: entry_x = x[i]; // load only once from global memory!
464: group_sum0 += entry_x * y0[i];
465: group_sum1 += entry_x * y1[i];
466: group_sum2 += entry_x * y2[i];
467: group_sum3 += entry_x * y3[i];
468: group_sum4 += entry_x * y4[i];
469: group_sum5 += entry_x * y5[i];
470: group_sum6 += entry_x * y6[i];
471: group_sum7 += entry_x * y7[i];
472: }
473: tmp_buffer[threadIdx.x] = group_sum0;
474: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
475: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
476: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
477: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
478: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
479: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
480: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
482: // parallel reduction
483: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
484: __syncthreads();
485: if (threadIdx.x < stride) {
486: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
487: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
488: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
489: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
490: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
491: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
492: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
493: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
494: }
495: }
497: // write result of group to group_results
498: if (threadIdx.x == 0) {
499: group_results[blockIdx.x ] = tmp_buffer[0];
500: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
501: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
502: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
503: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
504: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
505: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
506: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
507: }
508: }
509: #endif /* !defined(PETSC_USE_COMPLEX) */
511: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
512: {
513: PetscErrorCode ierr;
514: PetscInt i,n = xin->map->n,current_y_index = 0;
515: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
516: #if !defined(PETSC_USE_COMPLEX)
517: PetscInt nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
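  /* Added note: nv1 drops the trailing vector when nv % 4 == 1, because that final dot product is
     computed directly with cublasXdot() below and written straight into z[], so it needs no
     scratchpad slots in group_results. */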
518: PetscScalar *group_results_gpu,*group_results_cpu;
519: cudaError_t cuda_ierr;
520: #endif
521: PetscBLASInt one = 1,bn = 0;
522: cublasHandle_t cublasv2handle;
523: cublasStatus_t cberr;
524: cudaError_t err;
527: PetscCUBLASGetHandle(&cublasv2handle);
528: PetscBLASIntCast(xin->map->n,&bn);
529: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
530: /* Handle the case of local size zero first */
531: if (!xin->map->n) {
532: for (i=0; i<nv; ++i) z[i] = 0;
533: return(0);
534: }
536: #if !defined(PETSC_USE_COMPLEX)
537: // allocate scratchpad memory for the results of individual work groups:
538:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRCUDA(cuda_ierr);
539: PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
540: #endif
541: VecCUDAGetArrayRead(xin,&xptr);
542: PetscLogGpuTimeBegin();
544: while (current_y_index < nv)
545: {
546: switch (nv - current_y_index) {
548: case 7:
549: case 6:
550: case 5:
551: case 4:
552: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
553: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
554: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
555: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
556: #if defined(PETSC_USE_COMPLEX)
557: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
558: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
559: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
560: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
561: #else
562: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
564: #endif
565: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
566: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
567: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
568: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
569: current_y_index += 4;
570: break;
572: case 3:
573: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
574: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
575: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
577: #if defined(PETSC_USE_COMPLEX)
578: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
579: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
580: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
581: #else
582: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
583: #endif
584: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
585: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
586: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
587: current_y_index += 3;
588: break;
590: case 2:
591: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
592: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
593: #if defined(PETSC_USE_COMPLEX)
594: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
595: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
596: #else
597: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
598: #endif
599: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
600: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
601: current_y_index += 2;
602: break;
604: case 1:
605: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
606: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
607: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
608: current_y_index += 1;
609: break;
611: default: // 8 or more vectors left
612: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
613: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
614: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
615: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
616: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
617: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
618: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
619: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
620: #if defined(PETSC_USE_COMPLEX)
621: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
622: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
623: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
624: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
625: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
626: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
627: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
628: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
629: #else
630: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
631: #endif
632: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
633: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
634: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
635: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
636: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
637: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
638: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
639: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
640: current_y_index += 8;
641: break;
642: }
643: }
644: err = WaitForCUDA();CHKERRCUDA(err);
645: PetscLogGpuTimeEnd();
646: VecCUDARestoreArrayRead(xin,&xptr);
648: #if defined(PETSC_USE_COMPLEX)
649: PetscLogGpuToCpu(nv*sizeof(PetscScalar));
650: #else
651: // copy results to CPU
652:   cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
654: // sum group results into z
655: for (j=0; j<nv1; ++j) {
656: z[j] = 0;
657: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
658: }
659: PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
660:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
661: PetscFree(group_results_cpu);
662: PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
663: #endif
664: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
665: return(0);
666: }
668: #undef MDOT_WORKGROUP_SIZE
669: #undef MDOT_WORKGROUP_NUM
671: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
672: {
673: PetscInt n = xin->map->n;
674: PetscScalar *xarray = NULL;
675: thrust::device_ptr<PetscScalar> xptr;
676: PetscErrorCode ierr;
677: cudaError_t err;
680: VecCUDAGetArrayWrite(xin,&xarray);
681: PetscLogGpuTimeBegin();
682: if (alpha == (PetscScalar)0.0) {
683: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
684: } else {
685: try {
686: xptr = thrust::device_pointer_cast(xarray);
687: thrust::fill(xptr,xptr+n,alpha);
688: } catch (char *ex) {
689: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
690: }
691: PetscLogCpuToGpu(sizeof(PetscScalar));
692: }
693: err = WaitForCUDA();CHKERRCUDA(err);
694: PetscLogGpuTimeEnd();
695: VecCUDARestoreArrayWrite(xin,&xarray);
696: return(0);
697: }
699: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
700: {
701: PetscScalar *xarray;
703: PetscBLASInt one = 1,bn = 0;
704: cublasHandle_t cublasv2handle;
705: cublasStatus_t cberr;
706: cudaError_t err;
709: if (alpha == (PetscScalar)0.0) {
710: VecSet_SeqCUDA(xin,alpha);
711: err = WaitForCUDA();CHKERRCUDA(err);
712: } else if (alpha != (PetscScalar)1.0) {
713: PetscCUBLASGetHandle(&cublasv2handle);
714: PetscBLASIntCast(xin->map->n,&bn);
715: VecCUDAGetArray(xin,&xarray);
716: PetscLogGpuTimeBegin();
717: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
718: VecCUDARestoreArray(xin,&xarray);
719: err = WaitForCUDA();CHKERRCUDA(err);
720: PetscLogGpuTimeEnd();
721: PetscLogCpuToGpu(sizeof(PetscScalar));
722: PetscLogGpuFlops(xin->map->n);
723: }
724: return(0);
725: }
727: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
728: {
729: const PetscScalar *xarray,*yarray;
730: PetscErrorCode ierr;
731: PetscBLASInt one = 1,bn = 0;
732: cublasHandle_t cublasv2handle;
733: cublasStatus_t cerr;
736: PetscCUBLASGetHandle(&cublasv2handle);
737: PetscBLASIntCast(xin->map->n,&bn);
738: VecCUDAGetArrayRead(xin,&xarray);
739: VecCUDAGetArrayRead(yin,&yarray);
740: PetscLogGpuTimeBegin();
741: cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
742: PetscLogGpuTimeEnd();
743: if (xin->map->n > 0) {
744: PetscLogGpuFlops(2.0*xin->map->n-1);
745: }
746: PetscLogGpuToCpu(sizeof(PetscScalar));
747: VecCUDARestoreArrayRead(yin,&yarray);
748: VecCUDARestoreArrayRead(xin,&xarray);
749: return(0);
750: }
752: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
753: {
754: const PetscScalar *xarray;
755: PetscScalar *yarray;
756: PetscErrorCode ierr;
757: cudaError_t err;
760: if (xin != yin) {
761: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
762: PetscBool yiscuda;
764: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
765: VecCUDAGetArrayRead(xin,&xarray);
766: if (yiscuda) {
767: VecCUDAGetArrayWrite(yin,&yarray);
768: } else {
769: VecGetArrayWrite(yin,&yarray);
770: }
771: PetscLogGpuTimeBegin();
772: if (yiscuda) {
773: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
774: } else {
775: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
776: }
777: err = WaitForCUDA();CHKERRCUDA(err);
778: PetscLogGpuTimeEnd();
779: VecCUDARestoreArrayRead(xin,&xarray);
780: if (yiscuda) {
781: VecCUDARestoreArrayWrite(yin,&yarray);
782: } else {
783: VecRestoreArrayWrite(yin,&yarray);
784: }
785: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
786: /* copy in CPU if we are on the CPU */
787: VecCopy_SeqCUDA_Private(xin,yin);
788: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
789: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
790: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
791: /* copy in CPU */
792: VecCopy_SeqCUDA_Private(xin,yin);
793: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
794: /* copy in GPU */
795: VecCUDAGetArrayRead(xin,&xarray);
796: VecCUDAGetArrayWrite(yin,&yarray);
797: PetscLogGpuTimeBegin();
798: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
799: PetscLogGpuTimeEnd();
800: VecCUDARestoreArrayRead(xin,&xarray);
801: VecCUDARestoreArrayWrite(yin,&yarray);
802: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
803:       /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
804:          default to copying on the GPU (this is an arbitrary choice) */
805: VecCUDAGetArrayRead(xin,&xarray);
806: VecCUDAGetArrayWrite(yin,&yarray);
807: PetscLogGpuTimeBegin();
808: err = cudaMemcpyAsync(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream);CHKERRCUDA(err);
809: PetscLogGpuTimeEnd();
810: VecCUDARestoreArrayRead(xin,&xarray);
811: VecCUDARestoreArrayWrite(yin,&yarray);
812: } else {
813: VecCopy_SeqCUDA_Private(xin,yin);
814: }
815: }
816: }
817: return(0);
818: }
820: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
821: {
823: PetscBLASInt one = 1,bn = 0;
824: PetscScalar *xarray,*yarray;
825: cublasHandle_t cublasv2handle;
826: cublasStatus_t cberr;
827: cudaError_t err;
830: PetscCUBLASGetHandle(&cublasv2handle);
831: PetscBLASIntCast(xin->map->n,&bn);
832: if (xin != yin) {
833: VecCUDAGetArray(xin,&xarray);
834: VecCUDAGetArray(yin,&yarray);
835: PetscLogGpuTimeBegin();
836: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
837: err = WaitForCUDA();CHKERRCUDA(err);
838: PetscLogGpuTimeEnd();
839: VecCUDARestoreArray(xin,&xarray);
840: VecCUDARestoreArray(yin,&yarray);
841: }
842: return(0);
843: }
845: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
846: {
847: PetscErrorCode ierr;
848: PetscScalar a = alpha,b = beta;
849: const PetscScalar *xarray;
850: PetscScalar *yarray;
851: PetscBLASInt one = 1, bn = 0;
852: cublasHandle_t cublasv2handle;
853: cublasStatus_t cberr;
854: cudaError_t err;
857: PetscCUBLASGetHandle(&cublasv2handle);
858: PetscBLASIntCast(yin->map->n,&bn);
859: if (a == (PetscScalar)0.0) {
860: VecScale_SeqCUDA(yin,beta);
861: } else if (b == (PetscScalar)1.0) {
862: VecAXPY_SeqCUDA(yin,alpha,xin);
863: } else if (a == (PetscScalar)1.0) {
864: VecAYPX_SeqCUDA(yin,beta,xin);
865: } else if (b == (PetscScalar)0.0) {
866: VecCUDAGetArrayRead(xin,&xarray);
867: VecCUDAGetArray(yin,&yarray);
868: PetscLogGpuTimeBegin();
869: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
870: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
871: err = WaitForCUDA();CHKERRCUDA(err);
872: PetscLogGpuTimeEnd();
873: PetscLogGpuFlops(xin->map->n);
874: PetscLogCpuToGpu(sizeof(PetscScalar));
875: VecCUDARestoreArrayRead(xin,&xarray);
876: VecCUDARestoreArray(yin,&yarray);
877: } else {
878: VecCUDAGetArrayRead(xin,&xarray);
879: VecCUDAGetArray(yin,&yarray);
880: PetscLogGpuTimeBegin();
881: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
882: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
883: VecCUDARestoreArrayRead(xin,&xarray);
884: VecCUDARestoreArray(yin,&yarray);
885: err = WaitForCUDA();CHKERRCUDA(err);
886: PetscLogGpuTimeEnd();
887: PetscLogGpuFlops(3.0*xin->map->n);
888: PetscLogCpuToGpu(2*sizeof(PetscScalar));
889: }
890: return(0);
891: }
893: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
894: {
896: PetscInt n = zin->map->n;
899: if (gamma == (PetscScalar)1.0) {
900:     /* z = a*x + b*y + z */
901: VecAXPY_SeqCUDA(zin,alpha,xin);
902: VecAXPY_SeqCUDA(zin,beta,yin);
903: PetscLogGpuFlops(4.0*n);
904: } else {
905: /* z = a*x + b*y + c*z */
906: VecScale_SeqCUDA(zin,gamma);
907: VecAXPY_SeqCUDA(zin,alpha,xin);
908: VecAXPY_SeqCUDA(zin,beta,yin);
909: PetscLogGpuFlops(5.0*n);
910: }
911: return(0);
912: }
914: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
915: {
916: PetscInt n = win->map->n;
917: const PetscScalar *xarray,*yarray;
918: PetscScalar *warray;
919: thrust::device_ptr<const PetscScalar> xptr,yptr;
920: thrust::device_ptr<PetscScalar> wptr;
921: PetscErrorCode ierr;
922: cudaError_t err;
925: VecCUDAGetArrayRead(xin,&xarray);
926: VecCUDAGetArrayRead(yin,&yarray);
927: VecCUDAGetArrayWrite(win,&warray);
928: PetscLogGpuTimeBegin();
929: try {
930: wptr = thrust::device_pointer_cast(warray);
931: xptr = thrust::device_pointer_cast(xarray);
932: yptr = thrust::device_pointer_cast(yarray);
933: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
934: err = WaitForCUDA();CHKERRCUDA(err);
935: } catch (char *ex) {
936: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
937: }
938: PetscLogGpuTimeEnd();
939: VecCUDARestoreArrayRead(xin,&xarray);
940: VecCUDARestoreArrayRead(yin,&yarray);
941: VecCUDARestoreArrayWrite(win,&warray);
942: PetscLogGpuFlops(n);
943: return(0);
944: }
946: /* should do the infinity norm entirely in cuda (an illustrative kernel sketch follows VecNorm_SeqCUDA() below) */
948: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
949: {
950: PetscErrorCode ierr;
951: PetscInt n = xin->map->n;
952: PetscBLASInt one = 1, bn = 0;
953: const PetscScalar *xarray;
954: cublasHandle_t cublasv2handle;
955: cublasStatus_t cberr;
956: cudaError_t err;
959: PetscCUBLASGetHandle(&cublasv2handle);
960: PetscBLASIntCast(n,&bn);
961: if (type == NORM_2 || type == NORM_FROBENIUS) {
962: VecCUDAGetArrayRead(xin,&xarray);
963: PetscLogGpuTimeBegin();
964: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
965: PetscLogGpuTimeEnd();
966: VecCUDARestoreArrayRead(xin,&xarray);
967: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
968: } else if (type == NORM_INFINITY) {
969: int i;
970: VecCUDAGetArrayRead(xin,&xarray);
971: PetscLogGpuTimeBegin();
972: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
973: PetscLogGpuTimeEnd();
974: if (bn) {
975: PetscScalar zs;
976: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
977: *z = PetscAbsScalar(zs);
978: } else *z = 0.0;
979: VecCUDARestoreArrayRead(xin,&xarray);
980: } else if (type == NORM_1) {
981: VecCUDAGetArrayRead(xin,&xarray);
982: PetscLogGpuTimeBegin();
983: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
984: VecCUDARestoreArrayRead(xin,&xarray);
985: PetscLogGpuTimeEnd();
986: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
987: } else if (type == NORM_1_AND_2) {
988: VecNorm_SeqCUDA(xin,NORM_1,z);
989: VecNorm_SeqCUDA(xin,NORM_2,z+1);
990: }
991: PetscLogGpuToCpu(sizeof(PetscReal));
992: return(0);
993: }
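#if !defined(PETSC_USE_COMPLEX)
/* Hypothetical sketch (not part of the original file) of the fully on-device infinity norm
   alluded to above: each block reduces a strided slice of |x| to one maximum with the same
   shared-memory pattern as the VecMDot kernels, and a short host loop (or a second kernel)
   over block_results would produce the final norm. Assumes a power-of-two block size of at
   most 128 threads; the kernel name is made up for illustration. */
__global__ void VecNormInf_SeqCUDA_kernel_sketch(const PetscScalar *x,PetscInt n,PetscReal *block_results)
{
  __shared__ PetscReal tmp_buffer[128];
  PetscReal m = 0.0;
  for (PetscInt i = blockIdx.x*blockDim.x + threadIdx.x; i < n; i += gridDim.x*blockDim.x) {
    PetscReal a = PetscAbsScalar(x[i]);
    if (a > m) m = a;
  }
  tmp_buffer[threadIdx.x] = m;
  // parallel reduction within the block, keeping the running maximum
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride && tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) {
      tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x+stride];
    }
  }
  if (threadIdx.x == 0) block_results[blockIdx.x] = tmp_buffer[0];
}
#endif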
995: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
996: {
1000: VecDot_SeqCUDA(s,t,dp);
1001: VecDot_SeqCUDA(t,t,nm);
1002: return(0);
1003: }
1005: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1006: {
1008: cudaError_t cerr;
1009: Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
1012: if (v->spptr) {
1013: if (veccuda->GPUarray_allocated) {
1014: #if defined(PETSC_HAVE_NVSHMEM)
1015: if (veccuda->nvshmem) {
1016: PetscNvshmemFree(veccuda->GPUarray_allocated);
1017: veccuda->nvshmem = PETSC_FALSE;
1018: }
1019: else
1020: #endif
1021: {cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);}
1022: veccuda->GPUarray_allocated = NULL;
1023: }
1024: if (veccuda->stream) {
1025: cerr = cudaStreamDestroy(veccuda->stream);CHKERRCUDA(cerr);
1026: }
1027: }
1028: VecDestroy_SeqCUDA_Private(v);
1029: PetscFree(v->spptr);
1030: return(0);
1031: }
1033: #if defined(PETSC_USE_COMPLEX)
1034: struct conjugate
1035: {
1036: __host__ __device__
1037: PetscScalar operator()(PetscScalar x)
1038: {
1039: return PetscConj(x);
1040: }
1041: };
1042: #endif
1044: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1045: {
1046: #if defined(PETSC_USE_COMPLEX)
1047: PetscScalar *xarray;
1048: PetscErrorCode ierr;
1049: PetscInt n = xin->map->n;
1050: thrust::device_ptr<PetscScalar> xptr;
1051: cudaError_t err;
1054: VecCUDAGetArray(xin,&xarray);
1055: PetscLogGpuTimeBegin();
1056: try {
1057: xptr = thrust::device_pointer_cast(xarray);
1058: thrust::transform(xptr,xptr+n,xptr,conjugate());
1059: err = WaitForCUDA();CHKERRCUDA(err);
1060: } catch (char *ex) {
1061: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1062: }
1063: PetscLogGpuTimeEnd();
1064: VecCUDARestoreArray(xin,&xarray);
1065: #else
1067: #endif
1068: return(0);
1069: }
1071: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1072: {
1074: cudaError_t err;
1081: if (w->data) {
1082: if (((Vec_Seq*)w->data)->array_allocated) {
1083: if (w->pinned_memory) {
1084: PetscMallocSetCUDAHost();
1085: }
1086: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1087: if (w->pinned_memory) {
1088: PetscMallocResetCUDAHost();
1089: w->pinned_memory = PETSC_FALSE;
1090: }
1091: }
1092: ((Vec_Seq*)w->data)->array = NULL;
1093: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1094: }
1095: if (w->spptr) {
1097: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1098: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1099: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1100: }
1101: if (((Vec_CUDA*)w->spptr)->stream) {
1102: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1103: }
1104: PetscFree(w->spptr);
1105: }
1107: if (v->petscnative) {
1108: PetscFree(w->data);
1109: w->data = v->data;
1110: w->offloadmask = v->offloadmask;
1111: w->pinned_memory = v->pinned_memory;
1112: w->spptr = v->spptr;
1113: PetscObjectStateIncrease((PetscObject)w);
1114: } else {
1115: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1116: w->offloadmask = PETSC_OFFLOAD_CPU;
1117: VecCUDAAllocateCheck(w);
1118: }
1119: return(0);
1120: }
1122: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1123: {
1125: cudaError_t err;
1133: if (v->petscnative) {
1134: v->data = w->data;
1135: v->offloadmask = w->offloadmask;
1136: v->pinned_memory = w->pinned_memory;
1137: v->spptr = w->spptr;
1138: w->data = 0;
1139: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1140: w->spptr = 0;
1141: } else {
1142: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1143: if ((Vec_CUDA*)w->spptr) {
1144: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1145: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1146:       if (((Vec_CUDA*)w->spptr)->stream) {
1147: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1148: }
1149: PetscFree(w->spptr);
1150: }
1151: }
1152: return(0);
1153: }
1155: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1156: {
1157: __host__ __device__
1158: PetscReal operator()(PetscScalar x) {
1159: return PetscRealPart(x);
1160: }
1161: };
1163: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1164: {
1165: __host__ __device__
1166: thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscScalar, PetscInt> x) {
1167: return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1168: }
1169: };
1171: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1172: {
1173: __host__ __device__
1174: PetscReal operator()(PetscReal x, PetscReal y) {
1175: return x < y ? y : x;
1176: }
1177: };
1179: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1180: {
1181: __host__ __device__
1182: thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1183: return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1184: (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1185: (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1186: }
1187: };
1189: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1190: {
1191: __host__ __device__
1192: PetscReal operator()(PetscReal x, PetscReal y) {
1193: return x < y ? x : y;
1194: }
1195: };
1197: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1198: {
1199: __host__ __device__
1200: thrust::tuple<PetscReal, PetscInt> operator()(thrust::tuple<PetscReal, PetscInt> x, thrust::tuple<PetscReal, PetscInt> y) {
1201: return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1202: (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1203: (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1204: }
1205: };
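/* Added note: the functors above drive the thrust reductions in VecMax_SeqCUDA() and
   VecMin_SeqCUDA() below. The scalar variants (petscmax/petscmin) reduce plain real values;
   the tuple variants (petscmaxi/petscmini) operate on (value,index) pairs produced by a zip
   iterator with a counting iterator and break ties by keeping the smallest index. */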
1207: PetscErrorCode VecMax_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1208: {
1209: PetscErrorCode ierr;
1210: PetscInt n = v->map->n;
1211: const PetscScalar *av;
1212: thrust::device_ptr<const PetscScalar> avpt;
1216: if (!n) {
1217: *m = PETSC_MIN_REAL;
1218: if (p) *p = -1;
1219: return(0);
1220: }
1221: VecCUDAGetArrayRead(v,&av);
1222: avpt = thrust::device_pointer_cast(av);
1223: PetscLogGpuTimeBegin();
1224: if (p) {
1225: thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1226: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1227: try {
1228: #if defined(PETSC_USE_COMPLEX)
1229: res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1230: #else
1231: res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1232: #endif
1233: } catch (char *ex) {
1234: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1235: }
1236: *m = res.get<0>();
1237: *p = res.get<1>();
1238: } else {
1239: try {
1240: #if defined(PETSC_USE_COMPLEX)
1241: *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1242: #else
1243: *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1244: #endif
1245: } catch (char *ex) {
1246: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1247: }
1248: }
1249: PetscLogGpuTimeEnd();
1250: VecCUDARestoreArrayRead(v,&av);
1251: return(0);
1252: }
1254: PetscErrorCode VecMin_SeqCUDA(Vec v, PetscInt *p, PetscReal *m)
1255: {
1256: PetscErrorCode ierr;
1257: PetscInt n = v->map->n;
1258: const PetscScalar *av;
1259: thrust::device_ptr<const PetscScalar> avpt;
1263: if (!n) {
1264: *m = PETSC_MAX_REAL;
1265: if (p) *p = -1;
1266: return(0);
1267: }
1268: VecCUDAGetArrayRead(v,&av);
1269: avpt = thrust::device_pointer_cast(av);
1270: PetscLogGpuTimeBegin();
1271: if (p) {
1272: thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1273: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1274: try {
1275: #if defined(PETSC_USE_COMPLEX)
1276: res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1277: #else
1278: res = thrust::reduce(zibit,zibit+n,res,petscmini());
1279: #endif
1280: } catch (char *ex) {
1281: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1282: }
1283: *m = res.get<0>();
1284: *p = res.get<1>();
1285: } else {
1286: try {
1287: #if defined(PETSC_USE_COMPLEX)
1288: *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1289: #else
1290: *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1291: #endif
1292: } catch (char *ex) {
1293: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1294: }
1295: }
1296: PetscLogGpuTimeEnd();
1297: VecCUDARestoreArrayRead(v,&av);
1298: return(0);
1299: }
1300: #if defined(PETSC_HAVE_NVSHMEM)
1301: /* Free the old CUDA array and re-allocate a new one from the nvshmem symmetric heap.
1302:    The new array does not retain the values of the old array. The offload mask is not changed.
1304: Note: the function is only meant to be used in MatAssemblyEnd_MPIAIJCUSPARSE.
1305: */
1306: PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec v)
1307: {
1309: cudaError_t cerr;
1310: Vec_CUDA *veccuda = (Vec_CUDA*)v->spptr;
1311: PetscInt n;
1314: cerr = cudaFree(veccuda->GPUarray_allocated);CHKERRCUDA(cerr);
1315: VecGetLocalSize(v,&n);
1316: MPIU_Allreduce(MPI_IN_PLACE,&n,1,MPIU_INT,MPI_MAX,PETSC_COMM_WORLD);
1317: PetscNvshmemMalloc(n*sizeof(PetscScalar),(void**)&veccuda->GPUarray_allocated);
1318: veccuda->GPUarray = veccuda->GPUarray_allocated;
1319: veccuda->nvshmem = PETSC_TRUE;
1320: return(0);
1321: }
1322: #endif