jrgissing
diff --git a/‎cmake/Modules/Packages/GPU.cmake‎
Lines changed: 7 additions & 4 deletions b/‎cmake/Modules/Packages/GPU.cmake‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎doc/src/package.rst‎
Lines changed: 30 additions & 13 deletions b/‎doc/src/package.rst‎
Lines changed: 30 additions & 13 deletions
diff --git a/‎lib/gpu/geryon/hip_device.h‎
Lines changed: 1 addition & 1 deletion b/‎lib/gpu/geryon/hip_device.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎lib/gpu/geryon/hip_macros.h‎
Lines changed: 6 additions & 6 deletions b/‎lib/gpu/geryon/hip_macros.h‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎lib/gpu/geryon/ocl_kernel.h‎
Lines changed: 13 additions & 0 deletions b/‎lib/gpu/geryon/ocl_kernel.h‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎lib/gpu/lal_base_atomic.cpp‎
Lines changed: 29 additions & 2 deletions b/‎lib/gpu/lal_base_atomic.cpp‎
Lines changed: 29 additions & 2 deletions
diff --git a/‎lib/gpu/lal_base_atomic.h‎
Lines changed: 8 additions & 1 deletion b/‎lib/gpu/lal_base_atomic.h‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎lib/gpu/lal_base_charge.cpp‎
Lines changed: 29 additions & 2 deletions b/‎lib/gpu/lal_base_charge.cpp‎
Lines changed: 29 additions & 2 deletions
diff --git a/‎lib/gpu/lal_base_charge.h‎
Lines changed: 7 additions & 1 deletion b/‎lib/gpu/lal_base_charge.h‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎lib/gpu/lal_beck_ext.cpp‎
Lines changed: 2 additions & 2 deletions b/‎lib/gpu/lal_beck_ext.cpp‎
Lines changed: 2 additions & 2 deletions
@@ -154,8 +154,11 @@ if(GPU_API STREQUAL "CUDA")
     endif()
   endif()
 
-  cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC}
-          -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES})
+  set(NVCC_FLAGS -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES})
+  if(CUDPP_OPT)
+    list(APPEND NVCC_FLAGS -DUSE_CUDPP)  # NVCC_FLAGS is a list: add the define as its own element, not as text glued onto the last flag
+  endif()
+  cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC} ${NVCC_FLAGS})
 
   cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
           -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES})
@@ -253,9 +256,9 @@ elseif(GPU_API STREQUAL "OPENCL")
   target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
   target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING})
   if(GPU_DEBUG)
-    target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP)
+    target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP -DLAL_SERIALIZE_INIT)
   else()
-    target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
+    target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DLAL_SERIALIZE_INIT)
   endif()
 
   add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
 
@@ -3,6 +3,8 @@
 package command
 ===============
 
+.. contents::
+
 Syntax
 """"""
 
@@ -19,9 +21,10 @@ Syntax
       Ngpu = # of GPUs per node
       zero or more keyword/value pairs may be appended
       keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *blocksize* or *omp* or *platform* or *device_type* or *ocl_args*
-        *neigh* value = *yes* or *no*
+        *neigh* value = *yes* or *no* or *hybrid*
           *yes* = neighbor list build on GPU (default)
           *no* = neighbor list build on CPU
+          *hybrid* = perform binning on the CPU but build neighbor list on the GPU
         *newton* = *off* or *on*
           *off* = set Newton pairwise flag off (default and required)
           *on* = set Newton pairwise flag on (currently not allowed)
@@ -195,10 +198,11 @@ See the :doc:`Accelerator packages <Speed_packages>` page for more details
 about using the various accelerator packages for speeding up LAMMPS
 simulations.
 
-----------
+GPU package settings
+^^^^^^^^^^^^^^^^^^^^
 
-The *gpu* style invokes settings associated with the use of the GPU
-package.
+The *gpu* style invokes settings associated with the use of the
+:ref:`GPU package <PKG-GPU>`.
 
 The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0
 and no other keywords are specified, GPU or accelerator devices are
@@ -216,15 +220,25 @@ tasks (per node) than GPUs, multiple MPI tasks will share each GPU.
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
+.. versionchanged:: TBD
+
+   Updated description to the current state of the GPU package
+
 The *neigh* keyword specifies where neighbor lists for pair style
 computation will be built.  If *neigh* is *yes*, which is the default,
 neighbor list building is performed on the GPU.  If *neigh* is *no*,
-neighbor list building is performed on the CPU.  GPU neighbor list
-building currently cannot be used with a triclinic box.  GPU neighbor
-lists are not compatible with commands that are not GPU-enabled.  When
-a non-GPU enabled command requires a neighbor list, it will also be
-built on the CPU.  In these cases, it will typically be more efficient
-to only use CPU neighbor list builds.
+neighbor list building is instead performed on the CPU.  If *neigh* is
+*hybrid*, the binning step of the neighbor list build is performed on the
+CPU and the lists themselves are built on the GPU.  GPU neighbor list building
+currently is not fully compatible with a triclinic box; if the behavior
+is significantly different from the CPU case, use the *neigh no*
+setting.  GPU neighbor lists are not accessible for commands that are
+not GPU-enabled.  When a non-GPU enabled command requires a neighbor
+list, it will be built on the CPU.  In these cases, it can be more
+efficient to only use CPU neighbor list builds, particularly if the CPU
+neighbor list is perpetual, i.e. used in every step.  If a GPU
+environment does not support building neighbor lists on the GPU, the
+default setting will automatically change to *neigh no*.
 
 The *newton* keyword sets the Newton flags for pairwise (not bonded)
 interactions to *off* or *on*, the same as the :doc:`newton <newton>`
@@ -355,7 +369,8 @@ For OpenCL, the routines are compiled at runtime for the specified GPU
 or accelerator architecture. The *ocl\_args* keyword can be used to
 specify additional flags for the runtime build.
 
-----------
+INTEL package settings
+^^^^^^^^^^^^^^^^^^^^^^
 
 The *intel* style invokes settings associated with the use of the INTEL
 package.  The keywords *balance*, *ghost*, *tpc*, and *tptask* are
@@ -458,7 +473,8 @@ to prevent MPI tasks and OpenMP threads from being on separate NUMA
 domains and to prevent offload threads from interfering with other
 processes/threads used for LAMMPS.
 
-----------
+KOKKOS package settings
+^^^^^^^^^^^^^^^^^^^^^^^
 
 The *kokkos* style invokes settings associated with the use of the
 KOKKOS package.
@@ -649,7 +665,8 @@ The *bond/block/size* keyword sets the number of GPU threads per block
 used for launching the bond force kernel on the GPU.  The default value
 of this parameter is determined based on the GPU architecture at runtime.
 
-----------
+OPENMP package settings
+^^^^^^^^^^^^^^^^^^^^^^^
 
 The *omp* style invokes settings associated with the use of the
 OPENMP package.
 
@@ -443,7 +443,7 @@ void UCL_Device::clear() {
 // List all devices along with all properties
 void UCL_Device::print_all(std::ostream &out) {
   int driver_version;
-  hipDriverGetVersion(&driver_version);
+  (void)hipDriverGetVersion(&driver_version);
   out << "Driver Version:                           "
       << driver_version/1000 << "." << driver_version%100
                   << std::endl;
 
@@ -61,20 +61,20 @@
 #else  // not DEBUG
 
 // void macros for performance reasons
-#define CU_SAFE_CALL_NS( call ) call
-#define CU_SAFE_CALL( call) call
+#define CU_SAFE_CALL_NS(call) (void)(call)  // discard the result; parentheses make the cast cover the whole argument
+#define CU_SAFE_CALL(call) (void)(call)  // same as above: no checking in non-debug builds
 
 #endif
 
 #ifdef UCL_DESTRUCT_CHECK
 
-#define CU_DESTRUCT_CALL( call) CU_SAFE_CALL( call)
-#define CU_DESTRUCT_CALL_NS( call) CU_SAFE_CALL_NS( call)
+#define CU_DESTRUCT_CALL(call) CU_SAFE_CALL(call)
+#define CU_DESTRUCT_CALL_NS(call) CU_SAFE_CALL_NS(call)
 
 #else
 
-#define CU_DESTRUCT_CALL( call) call
-#define CU_DESTRUCT_CALL_NS( call) call
+#define CU_DESTRUCT_CALL(call) (void)(call)  // unchecked variant: discard the result; parentheses make the cast cover the whole argument
+#define CU_DESTRUCT_CALL_NS(call) (void)(call)  // same as above
 
 #endif
 
 
@@ -67,9 +67,22 @@ class UCL_Program {
   /** \note Must call init() after each clear **/
   inline void clear() {
     if (_init_done) {
+      #ifdef CL_VERSION_2_0  // OpenCL 2.0+ headers: release the queue first, and only if it can still be queried
+      cl_context ctx_from_queue = nullptr;
+      cl_int err = clGetCommandQueueInfo(_cq,  // probe _cq; ctx_from_queue itself is not read afterwards
+                                  CL_QUEUE_CONTEXT,
+                                  sizeof(ctx_from_queue),
+                                  &ctx_from_queue,
+                                  nullptr);
+      if (err == CL_SUCCESS)  // NOTE(review): unbraced if around a macro - relies on CL_DESTRUCT_CALL expanding to a single statement
+        CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
+      CL_DESTRUCT_CALL(clReleaseProgram(_program));
+      CL_DESTRUCT_CALL(clReleaseContext(_context));
+      #else  // headers without CL_VERSION_2_0: release program, context, then queue unconditionally
       CL_DESTRUCT_CALL(clReleaseProgram(_program));
       CL_DESTRUCT_CALL(clReleaseContext(_context));
       CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
+      #endif
       _init_done=false;
     }
   }
 
@@ -182,6 +182,30 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
     _max_an_bytes=bytes;
 }
 
+template <class numtyp, class acctyp>
+inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,  // overload that additionally takes prd and periodicity
+                                         const int nall, double **host_x,
+                                         int *host_type, double *sublo,
+                                         double *subhi, tagint *tag,
+                                         int **nspecial, tagint **special,
+                                         double* prd, int* periodicity, bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)  // presumably cleared by a failed resize above; return before copying positions or building the list
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;  // handed to nbor->build_nbor_list below; not read again in this function
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn, prd, periodicity,  // prd and periodicity are forwarded unchanged
+                        ans->error_flag);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)  // keep the largest combined ans+nbor byte count seen so far
+    _max_an_bytes=bytes;
+}
+
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
@@ -248,7 +272,8 @@ int **BaseAtomicT::compute(const int ago, const int inum_full,
                            const bool eflag_in, const bool vflag_in,
                            const bool eatom, const bool vatom,
                            int &host_start, int **ilist, int **jnum,
-                           const double cpu_time, bool &success) {
+                           const double cpu_time, bool &success, double *prd,
+                           int *periodicity) {
   acc_timers();
   int eflag, vflag;
   if (eatom) eflag=2;
@@ -280,7 +305,9 @@ int **BaseAtomicT::compute(const int ago, const int inum_full,
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    sublo, subhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special,
+                    prd, periodicity, success);
+
     if (!success)
       return nullptr;
     hd_balancer.start_timer();
 
@@ -125,6 +125,12 @@ class BaseAtomic {
                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                        tagint **special, bool &success);
 
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, double *prd, int *periodicity,
+                       bool &success);
+
   /// Pair loop with host neighboring
   void compute(const int f_ago, const int inum_full,
                const int nall, double **host_x, int *host_type,
@@ -138,7 +144,8 @@ class BaseAtomic {
                 double *subhi, tagint *tag, int **nspecial,
                 tagint **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
-                int **ilist, int **numj, const double cpu_time, bool &success);
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *prd=nullptr, int *periodicity=nullptr);
 
   // -------------------------- DEVICE DATA -------------------------
 
 
@@ -184,6 +184,30 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
     _max_an_bytes=bytes;
 }
 
+template <class numtyp, class acctyp>
+inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,  // overload that additionally takes prd and periodicity
+                                         const int nall, double **host_x,
+                                         int *host_type, double *sublo,
+                                         double *subhi, tagint *tag,
+                                         int **nspecial, tagint **special,
+                                         double* prd, int* periodicity, bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)  // presumably cleared by a failed resize above; return before copying positions or building the list
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;  // handed to nbor->build_nbor_list below; not read again in this function
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn, prd, periodicity,  // prd and periodicity are forwarded unchanged
+                        ans->error_flag);
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)  // keep the largest combined ans+nbor byte count seen so far
+    _max_an_bytes=bytes;
+}
+
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
@@ -257,7 +281,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum,
                            const double cpu_time, bool &success,
-                           double *host_q, double *boxlo, double *prd) {
+                           double *host_q, double *boxlo, double *prd,
+                           int* periodicity) {
   acc_timers();
   int eflag, vflag;
   if (eatom) eflag=2;
@@ -289,7 +314,9 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
   // Build neighbor list on GPU if necessary
   if (ago==0) {
     build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
-                    sublo, subhi, tag, nspecial, special, success);
+                    sublo, subhi, tag, nspecial, special,
+                    prd, periodicity, success);
+
     if (!success)
       return nullptr;
     atom->cast_q_data(host_q);
 
@@ -129,6 +129,12 @@ class BaseCharge {
                        double *sublo, double *subhi, tagint *tag, int **nspecial,
                        tagint **special, bool &success);
 
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, double *prd, int *periodicity,
+                       bool &success);
+
   /// Pair loop with host neighboring
   void compute(const int f_ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *ilist, int *numj,
@@ -144,7 +150,7 @@ class BaseCharge {
                 tagint **special, const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **numj, const double cpu_time, bool &success,
-                double *charge, double *boxlo, double *prd);
+                double *charge, double *boxlo, double *prd, int* periodicity=nullptr);
 
   // -------------------------- DEVICE DATA -------------------------
 
 
@@ -98,10 +98,10 @@ int ** beck_gpu_compute_n(const int ago, const int inum_full,
                            tagint **special, const bool eflag, const bool vflag,
                            const bool eatom, const bool vatom, int &host_start,
                            int **ilist, int **jnum, const double cpu_time,
-                           bool &success) {
+                           bool &success, double *prd, int *periodicity) {
   return BLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
                       subhi, tag, nspecial, special, eflag, vflag, eatom,
-                      vatom, host_start, ilist, jnum, cpu_time, success);
+                      vatom, host_start, ilist, jnum, cpu_time, success, prd, periodicity);
 }
 
 void beck_gpu_compute(const int ago, const int inum_full, const int nall,