Skip to content

Commit 78c41df

Browse files
authored
Merge branch 'develop' into fix_elastic_born_silicon
2 parents 7a4ab64 + fac62e9 commit 78c41df

File tree

168 files changed

+4050
-7833
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

168 files changed

+4050
-7833
lines changed

cmake/Modules/Packages/GPU.cmake

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,11 @@ if(GPU_API STREQUAL "CUDA")
154154
endif()
155155
endif()
156156

157-
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
158-
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES})
157+
set(NVCC_FLAGS -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES})
158+
if(CUDPP_OPT)
159+
# NVCC_FLAGS is a ;-separated list: add the define as a separate element so nvcc receives it as its own argument.
list(APPEND NVCC_FLAGS -DUSE_CUDPP)
160+
endif()
161+
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC} ${NVCC_FLAGS})
159162

160163
cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
161164
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES})
@@ -253,9 +256,9 @@ elseif(GPU_API STREQUAL "OPENCL")
253256
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
254257
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
255258
if(GPU_DEBUG)
256-
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
259+
target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP -DLAL_SERIALIZE_INIT)
257260
else()
258-
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
261+
target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DLAL_SERIALIZE_INIT)
259262
endif()
260263

261264
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)

doc/src/package.rst

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
package command
44
===============
55

6+
.. contents::
7+
68
Syntax
79
""""""
810

@@ -19,9 +21,10 @@ Syntax
1921
Ngpu = # of GPUs per node
2022
zero or more keyword/value pairs may be appended
2123
keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *blocksize* or *omp* or *platform* or *device_type* or *ocl_args*
22-
*neigh* value = *yes* or *no*
24+
*neigh* value = *yes* or *no* or *hybrid*
2325
*yes* = neighbor list build on GPU (default)
2426
*no* = neighbor list build on CPU
27+
*hybrid* = perform binning on the CPU but build neighbor list on the GPU
2528
*newton* = *off* or *on*
2629
*off* = set Newton pairwise flag off (default and required)
2730
*on* = set Newton pairwise flag on (currently not allowed)
@@ -195,10 +198,11 @@ See the :doc:`Accelerator packages <Speed_packages>` page for more details
195198
about using the various accelerator packages for speeding up LAMMPS
196199
simulations.
197200

198-
----------
201+
GPU package settings
202+
^^^^^^^^^^^^^^^^^^^^
199203

200-
The *gpu* style invokes settings associated with the use of the GPU
201-
package.
204+
The *gpu* style invokes settings associated with the use of the
205+
:ref:`GPU package <PKG-GPU>`.
202206

203207
The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0
204208
and no other keywords are specified, GPU or accelerator devices are
@@ -216,15 +220,25 @@ tasks (per node) than GPUs, multiple MPI tasks will share each GPU.
216220
Optional keyword/value pairs can also be specified. Each has a
217221
default value as listed below.
218222

223+
.. versionchanged:: TBD
224+
225+
Updated description to the current state of the GPU package
226+
219227
The *neigh* keyword specifies where neighbor lists for pair style
220228
computation will be built. If *neigh* is *yes*, which is the default,
221229
neighbor list building is performed on the GPU. If *neigh* is *no*,
222-
neighbor list building is performed on the CPU. GPU neighbor list
223-
building currently cannot be used with a triclinic box. GPU neighbor
224-
lists are not compatible with commands that are not GPU-enabled. When
225-
a non-GPU enabled command requires a neighbor list, it will also be
226-
built on the CPU. In these cases, it will typically be more efficient
227-
to only use CPU neighbor list builds.
230+
neighbor list building is instead performed on the CPU. If *neigh* is
231+
*hybrid* the binning step of the neighbor list build is performed on the
232+
CPU and the lists themselves are built on the GPU. GPU neighbor list building
233+
currently is not fully compatible with a triclinic box; if the behavior
234+
is significantly different from the CPU case, use the *neigh no*
235+
setting. GPU neighbor lists are not accessible for commands that are
236+
not GPU-enabled. When a non-GPU enabled command requires a neighbor
237+
list, it will be built on the CPU. In these cases, it can be more
238+
efficient to only use CPU neighbor list builds, particularly if the CPU
239+
neighbor list is perpetual, i.e. used in every step. If a GPU
240+
environment does not support building neighbor lists on the GPU, the
241+
default setting will automatically change to *neigh no*.
228242

229243
The *newton* keyword sets the Newton flags for pairwise (not bonded)
230244
interactions to *off* or *on*, the same as the :doc:`newton <newton>`
@@ -355,7 +369,8 @@ For OpenCL, the routines are compiled at runtime for the specified GPU
355369
or accelerator architecture. The *ocl\_args* keyword can be used to
356370
specify additional flags for the runtime build.
357371

358-
----------
372+
INTEL package settings
373+
^^^^^^^^^^^^^^^^^^^^^^
359374

360375
The *intel* style invokes settings associated with the use of the INTEL
361376
package. The keywords *balance*, *ghost*, *tpc*, and *tptask* are
@@ -458,7 +473,8 @@ to prevent MPI tasks and OpenMP threads from being on separate NUMA
458473
domains and to prevent offload threads from interfering with other
459474
processes/threads used for LAMMPS.
460475

461-
----------
476+
KOKKOS package settings
477+
^^^^^^^^^^^^^^^^^^^^^^^
462478

463479
The *kokkos* style invokes settings associated with the use of the
464480
KOKKOS package.
@@ -649,7 +665,8 @@ The *bond/block/size* keyword sets the number of GPU threads per block
649665
used for launching the bond force kernel on the GPU. The default value
650666
of this parameter is determined based on the GPU architecture at runtime.
651667

652-
----------
668+
OPENMP package settings
669+
^^^^^^^^^^^^^^^^^^^^^^^
653670

654671
The *omp* style invokes settings associated with the use of the
655672
OPENMP package.

lib/gpu/geryon/hip_device.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,7 +443,7 @@ void UCL_Device::clear() {
443443
// List all devices along with all properties
444444
void UCL_Device::print_all(std::ostream &out) {
445445
int driver_version;
446-
hipDriverGetVersion(&driver_version);
446+
(void)hipDriverGetVersion(&driver_version);
447447
out << "Driver Version: "
448448
<< driver_version/1000 << "." << driver_version%100
449449
<< std::endl;

lib/gpu/geryon/hip_macros.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,20 +61,20 @@
6161
#else // not DEBUG
6262

6363
// void macros for performance reasons
64-
#define CU_SAFE_CALL_NS( call ) call
65-
#define CU_SAFE_CALL( call) call
64+
#define CU_SAFE_CALL_NS(call) (void)call
65+
#define CU_SAFE_CALL(call) (void)call
6666

6767
#endif
6868

6969
#ifdef UCL_DESTRUCT_CHECK
7070

71-
#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
72-
#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
71+
#define CU_DESTRUCT_CALL(call) CU_SAFE_CALL(call)
72+
#define CU_DESTRUCT_CALL_NS(call) CU_SAFE_CALL_NS(call)
7373

7474
#else
7575

76-
#define CU_DESTRUCT_CALL( call) call
77-
#define CU_DESTRUCT_CALL_NS( call) call
76+
#define CU_DESTRUCT_CALL(call) (void)call
77+
#define CU_DESTRUCT_CALL_NS(call) (void)call
7878

7979
#endif
8080

lib/gpu/geryon/ocl_kernel.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,22 @@ class UCL_Program {
6767
/** \note Must call init() after each clear **/
6868
inline void clear() {
6969
if (_init_done) {
70+
#ifdef CL_VERSION_2_0
71+
cl_context ctx_from_queue = nullptr;
72+
cl_int err = clGetCommandQueueInfo(_cq,
73+
CL_QUEUE_CONTEXT,
74+
sizeof(ctx_from_queue),
75+
&ctx_from_queue,
76+
nullptr);
77+
if (err == CL_SUCCESS)
78+
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
79+
CL_DESTRUCT_CALL(clReleaseProgram(_program));
80+
CL_DESTRUCT_CALL(clReleaseContext(_context));
81+
#else
7082
CL_DESTRUCT_CALL(clReleaseProgram(_program));
7183
CL_DESTRUCT_CALL(clReleaseContext(_context));
7284
CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
85+
#endif
7386
_init_done=false;
7487
}
7588
}

lib/gpu/lal_base_atomic.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,30 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
182182
_max_an_bytes=bytes;
183183
}
184184

185+
template <class numtyp, class acctyp>
186+
inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
187+
const int nall, double **host_x,
188+
int *host_type, double *sublo,
189+
double *subhi, tagint *tag,
190+
int **nspecial, tagint **special,
191+
double* prd, int* periodicity, bool &success) {
192+
success=true;
193+
resize_atom(inum,nall,success);
194+
resize_local(inum,host_inum,nbor->max_nbors(),success);
195+
if (!success)
196+
return;
197+
atom->cast_copy_x(host_x,host_type);
198+
199+
int mn;
200+
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
201+
tag, nspecial, special, success, mn, prd, periodicity,
202+
ans->error_flag);
203+
204+
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
205+
if (bytes>_max_an_bytes)
206+
_max_an_bytes=bytes;
207+
}
208+
185209
// ---------------------------------------------------------------------------
186210
// Copy nbor list from host if necessary and then calculate forces, virials,..
187211
// ---------------------------------------------------------------------------
@@ -248,7 +272,8 @@ int **BaseAtomicT::compute(const int ago, const int inum_full,
248272
const bool eflag_in, const bool vflag_in,
249273
const bool eatom, const bool vatom,
250274
int &host_start, int **ilist, int **jnum,
251-
const double cpu_time, bool &success) {
275+
const double cpu_time, bool &success, double *prd,
276+
int *periodicity) {
252277
acc_timers();
253278
int eflag, vflag;
254279
if (eatom) eflag=2;
@@ -280,7 +305,9 @@ int **BaseAtomicT::compute(const int ago, const int inum_full,
280305
// Build neighbor list on GPU if necessary
281306
if (ago==0) {
282307
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
283-
sublo, subhi, tag, nspecial, special, success);
308+
sublo, subhi, tag, nspecial, special,
309+
prd, periodicity, success);
310+
284311
if (!success)
285312
return nullptr;
286313
hd_balancer.start_timer();

lib/gpu/lal_base_atomic.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@ class BaseAtomic {
125125
double *sublo, double *subhi, tagint *tag, int **nspecial,
126126
tagint **special, bool &success);
127127

128+
void build_nbor_list(const int inum, const int host_inum,
129+
const int nall, double **host_x, int *host_type,
130+
double *sublo, double *subhi, tagint *tag, int **nspecial,
131+
tagint **special, double *prd, int *periodicity,
132+
bool &success);
133+
128134
/// Pair loop with host neighboring
129135
void compute(const int f_ago, const int inum_full,
130136
const int nall, double **host_x, int *host_type,
@@ -138,7 +144,8 @@ class BaseAtomic {
138144
double *subhi, tagint *tag, int **nspecial,
139145
tagint **special, const bool eflag, const bool vflag,
140146
const bool eatom, const bool vatom, int &host_start,
141-
int **ilist, int **numj, const double cpu_time, bool &success);
147+
int **ilist, int **numj, const double cpu_time, bool &success,
148+
double *prd=nullptr, int *periodicity=nullptr);
142149

143150
// -------------------------- DEVICE DATA -------------------------
144151

lib/gpu/lal_base_charge.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,30 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
184184
_max_an_bytes=bytes;
185185
}
186186

187+
template <class numtyp, class acctyp>
188+
inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
189+
const int nall, double **host_x,
190+
int *host_type, double *sublo,
191+
double *subhi, tagint *tag,
192+
int **nspecial, tagint **special,
193+
double* prd, int* periodicity, bool &success) {
194+
success=true;
195+
resize_atom(inum,nall,success);
196+
resize_local(inum,host_inum,nbor->max_nbors(),success);
197+
if (!success)
198+
return;
199+
atom->cast_copy_x(host_x,host_type);
200+
201+
int mn;
202+
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
203+
tag, nspecial, special, success, mn, prd, periodicity,
204+
ans->error_flag);
205+
206+
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
207+
if (bytes>_max_an_bytes)
208+
_max_an_bytes=bytes;
209+
}
210+
187211
// ---------------------------------------------------------------------------
188212
// Copy nbor list from host if necessary and then calculate forces, virials,..
189213
// ---------------------------------------------------------------------------
@@ -257,7 +281,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
257281
const bool eatom, const bool vatom, int &host_start,
258282
int **ilist, int **jnum,
259283
const double cpu_time, bool &success,
260-
double *host_q, double *boxlo, double *prd) {
284+
double *host_q, double *boxlo, double *prd,
285+
int* periodicity) {
261286
acc_timers();
262287
int eflag, vflag;
263288
if (eatom) eflag=2;
@@ -289,7 +314,9 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
289314
// Build neighbor list on GPU if necessary
290315
if (ago==0) {
291316
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
292-
sublo, subhi, tag, nspecial, special, success);
317+
sublo, subhi, tag, nspecial, special,
318+
prd, periodicity, success);
319+
293320
if (!success)
294321
return nullptr;
295322
atom->cast_q_data(host_q);

lib/gpu/lal_base_charge.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,12 @@ class BaseCharge {
129129
double *sublo, double *subhi, tagint *tag, int **nspecial,
130130
tagint **special, bool &success);
131131

132+
void build_nbor_list(const int inum, const int host_inum,
133+
const int nall, double **host_x, int *host_type,
134+
double *sublo, double *subhi, tagint *tag, int **nspecial,
135+
tagint **special, double *prd, int *periodicity,
136+
bool &success);
137+
132138
/// Pair loop with host neighboring
133139
void compute(const int f_ago, const int inum_full, const int nall,
134140
double **host_x, int *host_type, int *ilist, int *numj,
@@ -144,7 +150,7 @@ class BaseCharge {
144150
tagint **special, const bool eflag, const bool vflag,
145151
const bool eatom, const bool vatom, int &host_start,
146152
int **ilist, int **numj, const double cpu_time, bool &success,
147-
double *charge, double *boxlo, double *prd);
153+
double *charge, double *boxlo, double *prd, int* periodicity=nullptr);
148154

149155
// -------------------------- DEVICE DATA -------------------------
150156

lib/gpu/lal_beck_ext.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,10 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full,
9898
tagint **special, const bool eflag, const bool vflag,
9999
const bool eatom, const bool vatom, int &host_start,
100100
int **ilist, int **jnum, const double cpu_time,
101-
bool &success) {
101+
bool &success, double *prd, int *periodicity) {
102102
return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
103103
subhi, tag, nspecial, special, eflag, vflag, eatom,
104-
vatom, host_start, ilist, jnum, cpu_time, success);
104+
vatom, host_start, ilist, jnum, cpu_time, success, prd, periodicity);
105105
}
106106

107107
void beck_gpu_compute(const int ago, const int inum_full, const int nall,

0 commit comments

Comments
 (0)