@@ -95,12 +95,16 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
9595 let ghostGrid = tensor.ghostGrid ()
9696 var view = tensor.newTensorFieldView (true )
9797
98- #[ host/device coherence check ]#
98+ #[ host coherence check ]#
9999
100100 globalCoherenceManager.ensureEntry (view.aos)
101101 view.state = globalCoherenceManager.open (view.aos, io)
102102
103- #[ buffer acquisition ]#
103+ #[ get SIMD layout ]#
104+
105+ view.simdLayout = newSIMDLayout (localGrid, inputSIMDGrid)
106+
107+ #[ host buffer acquisition ]#
104108
105109 when isComplex32 (T):
106110 type StorageType = float32
@@ -109,22 +113,23 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
109113 else :
110114 type StorageType = T
111115
112- view.simdLayout = newSIMDLayout (localGrid, inputSIMDGrid)
116+ var numLocalSites = view.simdLayout.numLocalDeviceSites ()
117+ var numGhostSites = view.simdLayout.numGhostDeviceSites (ghostGrid)
118+ let key = BufferKey (
119+ numSites: numLocalSites + numGhostSites,
120+ elementsPerSite: product (view.shape) * (if isComplex (T): 2 else : 1 ),
121+ elementSize: sizeof (StorageType )
122+ )
123+ let bytes = key.numSites * key.elementsPerSite * key.elementSize
124+
113125 if view.state.canReuseBuffer and view.state.buffer != nil :
114126 (view.buffer, view.reference) = (view.state.buffer, view.state.reference)
115127 else : # try buffer pool
116- var numLocalSites = view.simdLayout.numLocalDeviceSites ()
117- var numGhostSites = view.simdLayout.numGhostDeviceSites (ghostGrid)
118- let key = BufferKey (
119- numSites: numLocalSites + numGhostSites,
120- elementsPerSite: product (view.shape) * (if isComplex (T): 2 else : 1 ),
121- elementSize: sizeof (StorageType )
122- )
123128 let (found, entry) = globalBufferPool.lookup (key)
124129 if found: (view.buffer, view.reference) = (entry.buffer, entry.reference)
125- else : view.buffer = alloc entry. bytes
130+ else : view.buffer = alloc bytes
126131
127- #[ AoS -> AoSoA transformation ]#
132+ #[ host AoS -> AoSoA transformation ]#
128133
129134 view.aosoa = cast [LocalStorage [StorageType ]](view.buffer)
130135 if view.state.needsTransform:
@@ -139,12 +144,34 @@ template newTensorFieldView*[D: static[int], R: static[int], L: Lattice[D], T](
139144 ): T
140145 of iokWrite: discard
141146
142- #[ host to device transfer ]#
147+ #[ device buffer acquisition ]#
148+
149+ # !!!! NEED TO IMPLEMENT OpenCL BUFFER POOL !!!!
143150
144- case globalCoherenceManager[view.buffer].state
145- of cskDeviceDirty:
146- of cskEmpty:
147- of cskHostDirty, cskConsistent: discard
151+ #[
152+ when defined(UseOpenMP):
153+ view.deviceQueue = nil.DeviceQueue
154+ view.deviceBuffer = cast[DeviceBuffer](view.buffer)
155+ elif defined(UseOpenCL):
156+ view.deviceQueue = clQueues[0] # one device per MPI rank
157+
158+ # cl_mem allocation: no coherence-level caching exists for device
159+ # buffers yet, so every view gets a fresh cl_mem. The host AoSoA
160+ # buffer *is* reused via the coherence manager above, but the cl_mem
161+ # is not (a CLBufferPool would be needed for that — see §3.4 of
162+ # memory_management_strategy.txt).
163+ view.deviceBuffer = buffer[StorageType](clContext, key.numSites * key.elementsPerSite)
164+
165+ # Upload decision: for iokRead/iokReadWrite we always need to
166+ # upload because the cl_mem is freshly allocated (no cl_mem reuse).
167+ # Once a CLBufferPool is added, this can be conditioned on whether
168+ # the cl_mem was reused and the data is already current.
169+ case io
170+ of iokRead, iokReadWrite:
171+ view.deviceQueue.write(addr view.aosoa[0], view.deviceBuffer, bytes)
172+ check finish(view.deviceQueue)
173+ of iokWrite: discard
174+ ]#
148175
149176 # return view
150177 move view
0 commit comments