Skip to content

Commit 6b130b3

Browse files
committed
Snapshot before adding OpenCL buffer acquisition facilities
1 parent bd6b3a5 commit 6b130b3

5 files changed

Lines changed: 99 additions & 19 deletions

File tree

src/memory/bufferpool.nim

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ import std/[strutils]
3333

3434
import types/[composite]
3535

36+
import opencl/[oclbufferpool]
37+
38+
export oclbufferpool
39+
3640
const
3741
MaxPoolEntries* = 128
3842
MaxPoolBytes* = 2 * 1024 * 1024 * 1024 # 2 GB

src/memory/simdlayout.nim

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,4 @@ implement SIMDLayout with:
146146
for d in 0..<D:
147147
let numFaceSites = this.numDeviceSites() div this.deviceGrid[d]
148148
result += 2 * numFaceSites * ghostGrid[d]
149+

src/opencl/oclbufferpool.nim

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#[
2+
ReliQ lattice field theory framework: https://github.com/reliq-lft/ReliQ
3+
Source file: src/opencl/oclbufferpool.nim
4+
Contact: reliq-lft@proton.me
5+
6+
Author: Andrea Ferretti
7+
Modifications: Curtis Taylor Peterson <curtistaylorpetersonwork@gmail.com>
8+
9+
Original License:
10+
11+
Copyright 2016-2017 UniCredit S.p.A.
12+
13+
Licensed under the Apache License, Version 2.0 (the "License");
14+
you may not use this file except in compliance with the License.
15+
You may obtain a copy of the License at
16+
17+
http://www.apache.org/licenses/LICENSE-2.0
18+
19+
Unless required by applicable law or agreed to in writing, software
20+
distributed under the License is distributed on an "AS IS" BASIS,
21+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22+
See the License for the specific language governing permissions and
23+
limitations under the License.
24+
25+
ReliQ Modifications License:
26+
27+
Copyright (c) 2025 reliq-lft
28+
29+
Permission is hereby granted, free of charge, to any person obtaining a copy
30+
of this software and associated documentation files (the "Software"), to deal
31+
in the Software without restriction, including without limitation the rights
32+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
33+
copies of the Software, and to permit persons to whom the Software is
34+
furnished to do so, subject to the following conditions:
35+
36+
The above copyright notice and this permission notice shall be included in all
37+
copies or substantial portions of the Software.
38+
39+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
40+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
43+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
44+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
45+
]#

src/reliq.nim

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,13 @@ import memory/[coherence]
3737
# <***> will be where view is exported <***>
3838
when defined(UseOpenMP):
3939
import openmp/[openmp]
40+
export openmp
4041
elif defined(UseOpenCL):
4142
import opencl/[opencl]
43+
export opencl
4244
else:
4345
import opencl/[opencl]
46+
export opencl
4447

4548
when defined(UseOpenMP):
4649
type
@@ -55,8 +58,8 @@ else:
5558
DeviceBuffer* = PMem
5659
DeviceQueue* = PCommandQueue
5760

58-
var globalBufferPool* {.inject.} = newBufferPool()
59-
var globalCoherenceManager* {.inject.} = newCoherenceManager()
61+
var globalBufferPool* {.global.} = newBufferPool()
62+
var globalCoherenceManager* {.global.} = newCoherenceManager()
6063

6164
template reliq*(body: untyped): untyped =
6265
gaParallel:

src/tensor/tensorfieldview.nim

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,16 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
9595
let ghostGrid = tensor.ghostGrid()
9696
var view = tensor.newTensorFieldView(true)
9797

98-
#[ host/device coherence check ]#
98+
#[ host coherence check ]#
9999

100100
globalCoherenceManager.ensureEntry(view.aos)
101101
view.state = globalCoherenceManager.open(view.aos, io)
102102

103-
#[ buffer acquisition ]#
103+
#[ get SIMD layout ]#
104+
105+
view.simdLayout = newSIMDLayout(localGrid, inputSIMDGrid)
106+
107+
#[ host buffer acquisition ]#
104108

105109
when isComplex32(T):
106110
type StorageType = float32
@@ -109,22 +113,23 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
109113
else:
110114
type StorageType = T
111115

112-
view.simdLayout = newSIMDLayout(localGrid, inputSIMDGrid)
116+
var numLocalSites = view.simdLayout.numLocalDeviceSites()
117+
var numGhostSites = view.simdLayout.numGhostDeviceSites(ghostGrid)
118+
let key = BufferKey(
119+
numSites: numLocalSites + numGhostSites,
120+
elementsPerSite: product(view.shape) * (if isComplex(T): 2 else: 1),
121+
elementSize: sizeof(StorageType)
122+
)
123+
let bytes = key.numSites * key.elementsPerSite * key.elementSize
124+
113125
if view.state.canReuseBuffer and view.state.buffer != nil:
114126
(view.buffer, view.reference) = (view.state.buffer, view.state.reference)
115127
else: # try buffer pool
116-
var numLocalSites = view.simdLayout.numLocalDeviceSites()
117-
var numGhostSites = view.simdLayout.numGhostDeviceSites(ghostGrid)
118-
let key = BufferKey(
119-
numSites: numLocalSites + numGhostSites,
120-
elementsPerSite: product(view.shape) * (if isComplex(T): 2 else: 1),
121-
elementSize: sizeof(StorageType)
122-
)
123128
let (found, entry) = globalBufferPool.lookup(key)
124129
if found: (view.buffer, view.reference) = (entry.buffer, entry.reference)
125-
else: view.buffer = alloc entry.bytes
130+
else: view.buffer = alloc bytes
126131

127-
#[ AoS -> AoSoA transformation ]#
132+
#[ host AoS -> AoSoA transformation ]#
128133

129134
view.aosoa = cast[LocalStorage[StorageType]](view.buffer)
130135
if view.state.needsTransform:
@@ -139,12 +144,34 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
139144
): T
140145
of iokWrite: discard
141146

142-
#[ host to device transfer ]#
147+
#[ device buffer acquisition ]#
148+
149+
# TODO: implement the OpenCL buffer pool (device code below is commented out).
143150

144-
case globalCoherenceManager[view.buffer].state
145-
of cskDeviceDirty:
146-
of cskEmpty:
147-
of cskHostDirty, cskConsistent: discard
151+
#[
152+
when defined(UseOpenMP):
153+
view.deviceQueue = nil.DeviceQueue
154+
view.deviceBuffer = cast[DeviceBuffer](view.buffer)
155+
elif defined(UseOpenCL):
156+
view.deviceQueue = clQueues[0] # one device per MPI rank
157+
158+
# cl_mem allocation: no coherence-level caching exists for device
159+
# buffers yet, so every view gets a fresh cl_mem. The host AoSoA
160+
# buffer *is* reused via the coherence manager above, but the cl_mem
161+
# is not (a CLBufferPool would be needed for that — see §3.4 of
162+
# memory_management_strategy.txt).
163+
view.deviceBuffer = buffer[StorageType](clContext, key.numSites * key.elementsPerSite)
164+
165+
# Upload decision: for iokRead/iokReadWrite we always need to
166+
# upload because the cl_mem is freshly allocated (no cl_mem reuse).
167+
# Once a CLBufferPool is added, this can be conditioned on whether
168+
# the cl_mem was reused and the data is already current.
169+
case io
170+
of iokRead, iokReadWrite:
171+
view.deviceQueue.write(addr view.aosoa[0], view.deviceBuffer, bytes)
172+
check finish(view.deviceQueue)
173+
of iokWrite: discard
174+
]#
148175

149176
# return view
150177
move view

0 commit comments

Comments
 (0)