Snapshot before adding OpenCL buffer acquisition facilities

ctpeterson · ctpeterson · commit 6b130b397a8b · 2026-02-24T10:54:53.000-05:00
diff --git a/src/memory/bufferpool.nim b/src/memory/bufferpool.nim
@@ -33,6 +33,10 @@ import std/[strutils]
 
 import types/[composite]
 
+import opencl/[oclbufferpool]
+
+export oclbufferpool
+
 const
   MaxPoolEntries* = 128
   MaxPoolBytes* = 2 * 1024 * 1024 * 1024 # 2 GB
diff --git a/src/memory/simdlayout.nim b/src/memory/simdlayout.nim
@@ -146,3 +146,4 @@ implement SIMDLayout with:
     for d in 0..<D:
       let numFaceSites = this.numDeviceSites() div this.deviceGrid[d]
       result += 2 * numFaceSites * ghostGrid[d]
+
diff --git a/src/opencl/oclbufferpool.nim b/src/opencl/oclbufferpool.nim
@@ -0,0 +1,45 @@
+#[
+  ReliQ lattice field theory framework: https://github.com/reliq-lft/ReliQ
+  Source file: src/opencl/oclbufferpool.nim
+  Contact: reliq-lft@proton.me
+
+  Author: Andrea Ferretti
+  Modifications: Curtis Taylor Peterson <curtistaylorpetersonwork@gmail.com>
+
+  Original License:
+
+  Copyright 2016-2017 UniCredit S.p.A.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+  ReliQ Modifications License:
+
+  Copyright (c) 2025 reliq-lft
+  
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in all
+  copies or substantial portions of the Software.
+  
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 
+  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
+  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+]#
diff --git a/src/reliq.nim b/src/reliq.nim
@@ -37,10 +37,13 @@ import memory/[coherence]
 # <***> will be where view is exported <***>
 when defined(UseOpenMP):
   import openmp/[openmp]
+  export openmp
 elif defined(UseOpenCL):
   import opencl/[opencl]
+  export opencl
 else:
   import opencl/[opencl]
+  export opencl
 
 when defined(UseOpenMP):
   type
@@ -55,8 +58,8 @@ else:
     DeviceBuffer* = PMem
     DeviceQueue* = PCommandQueue
     
-var globalBufferPool* {.inject.} = newBufferPool()
-var globalCoherenceManager* {.inject.} = newCoherenceManager()
+var globalBufferPool* {.global.} = newBufferPool()
+var globalCoherenceManager* {.global.} = newCoherenceManager()
 
 template reliq*(body: untyped): untyped =
   gaParallel:
diff --git a/src/tensor/tensorfieldview.nim b/src/tensor/tensorfieldview.nim
@@ -95,12 +95,16 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
   let ghostGrid = tensor.ghostGrid()
   var view = tensor.newTensorFieldView(true)
 
-  #[ host/device coherence check ]#
+  #[ host coherence check ]#
 
   globalCoherenceManager.ensureEntry(view.aos)
   view.state = globalCoherenceManager.open(view.aos, io)
 
-  #[ buffer acquisition ]#
+  #[ get SIMD layout ]#
+
+  view.simdLayout = newSIMDLayout(localGrid, inputSIMDGrid)
+
+  #[ host buffer acquisition ]#
 
   when isComplex32(T):
     type StorageType = float32
@@ -109,22 +113,23 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
   else:
     type StorageType = T
 
-  view.simdLayout = newSIMDLayout(localGrid, inputSIMDGrid)
+  var numLocalSites = view.simdLayout.numLocalDeviceSites()
+  var numGhostSites = view.simdLayout.numGhostDeviceSites(ghostGrid)
+  let key = BufferKey(
+    numSites: numLocalSites + numGhostSites,
+    elementsPerSite: product(view.shape) * (if isComplex(T): 2 else: 1),
+    elementSize: sizeof(StorageType)
+  )
+  let bytes = key.numSites * key.elementsPerSite * key.elementSize
+
   if view.state.canReuseBuffer and view.state.buffer != nil:
     (view.buffer, view.reference) = (view.state.buffer, view.state.reference)
   else: # try buffer pool
-    var numLocalSites = view.simdLayout.numLocalDeviceSites()
-    var numGhostSites = view.simdLayout.numGhostDeviceSites(ghostGrid)
-    let key = BufferKey(
-      numSites: numLocalSites + numGhostSites,
-      elementsPerSite: product(view.shape) * (if isComplex(T): 2 else: 1),
-      elementSize: sizeof(StorageType)
-    )
     let (found, entry) = globalBufferPool.lookup(key)
     if found: (view.buffer, view.reference) = (entry.buffer, entry.reference)
-    else: view.buffer = alloc entry.bytes
+    else: view.buffer = alloc bytes
   
-  #[ AoS -> AoSoA transformation ]#
+  #[ host AoS -> AoSoA transformation ]#
 
   view.aosoa = cast[LocalStorage[StorageType]](view.buffer)
   if view.state.needsTransform:
@@ -139,12 +144,34 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
       ): T
     of iokWrite: discard
 
-  #[ host to device transfer ]#
+  #[ device buffer acquisition ]#
+
+  # TODO: implement the OpenCL buffer pool; device path below stays disabled.
 
-  case globalCoherenceManager[view.buffer].state
-  of cskDeviceDirty:
-  of cskEmpty:
-  of cskHostDirty, cskConsistent: discard
+  #[
+  when defined(UseOpenMP):
+    view.deviceQueue = nil.DeviceQueue
+    view.deviceBuffer = cast[DeviceBuffer](view.buffer)
+  elif defined(UseOpenCL):
+    view.deviceQueue = clQueues[0] # one device per MPI rank
+
+    # cl_mem allocation: no coherence-level caching exists for device
+    # buffers yet, so every view gets a fresh cl_mem.  The host AoSoA
+    # buffer *is* reused via the coherence manager above, but the cl_mem
+    # is not (a CLBufferPool would be needed for that — see §3.4 of
+    # memory_management_strategy.txt).
+    view.deviceBuffer = buffer[StorageType](clContext, key.numSites * key.elementsPerSite)
+
+    # Upload decision: for iokRead/iokReadWrite we always need to
+    # upload because the cl_mem is freshly allocated (no cl_mem reuse).
+    # Once a CLBufferPool is added, this can be conditioned on whether
+    # the cl_mem was reused and the data is already current.
+    case io
+    of iokRead, iokReadWrite:
+      view.deviceQueue.write(addr view.aosoa[0], view.deviceBuffer, bytes)
+      check finish(view.deviceQueue)
+    of iokWrite: discard
+  ]#
   
   # return view
   move view