Skip to content

Commit 6137e7a

Browse files
committed
Refactor
1 parent e2e669e commit 6137e7a

11 files changed

Lines changed: 322 additions & 367 deletions

dace/codegen/targets/experimental_cuda.py

Lines changed: 29 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
2-
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
2+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
33
import networkx as nx
44

55
import dace
@@ -21,18 +21,12 @@
2121
from dace.codegen.target import TargetCodeGenerator, make_absolute
2222

2323
from dace.transformation.passes import analysis as ap
24-
from dace.transformation.pass_pipeline import Pipeline
25-
from dace.transformation.passes.gpu_specialization.gpu_specialization_pipeline import (GPUPostExpansionPipeline,
26-
GPUStreamPipeline)
27-
from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import (read_stream_assignments_from_wired_sdfg,
28-
validate_stream_indices_within_bounds)
24+
from dace.transformation.passes.gpu_specialization.gpu_specialization_pipeline import GPUCodegenPreprocessPipeline
25+
from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import read_stream_assignments_from_wired_sdfg
2926
from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync
30-
from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap
31-
from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize
3227

3328
from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager
3429
from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call
35-
from dace.sdfg.core_dialect import (CoreDialectCompliant, warn_if_not_core_dialect)
3630

3731
from dace.codegen.targets import cpp
3832

@@ -96,110 +90,40 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG):
9690
self._kernel_arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {}
9791

9892
def preprocess(self, sdfg: SDFG) -> None:
99-
"""Prepare the SDFG for GPU code generation."""
100-
101-
# ----------------------------------------------------------------
102-
# Pipeline 1 — codegen preparation. Establishes invariants the
103-
# transformation pipeline below relies on: every descriptor has
104-
# decided storage / schedule, and every Scalar that cannot live on
105-
# the GPU as a Scalar (rule 1) or that the kernel writes to (rule 2)
106-
# has been promoted to a length-1 Array. After this pipeline, the
107-
# SDFG is "well-formed for GPU codegen" — no further inference or
108-
# descriptor rewrites should be needed.
109-
# ----------------------------------------------------------------
110-
from dace.transformation.passes.promote_gpu_scalars_to_arrays import (InferDefaultSchedulesAndStorages,
111-
PromoteGPUScalarsToArrays)
112-
codegen_preparation_pipeline = Pipeline([
113-
InferDefaultSchedulesAndStorages(),
114-
PromoteGPUScalarsToArrays(),
115-
])
116-
codegen_preparation_pipeline.apply_pass(sdfg, {})
117-
118-
# ``AddThreadBlockMap`` is intentionally deferred until *after* the
119-
# GPU specialization pipeline + ``expand_library_nodes``: tiling
120-
# before the hoist (`MoveArrayOutOfKernel`) introduces an inner
121-
# ``GPU_ThreadBlock`` map whose range expression references the
122-
# outer block index, which then leaks into host-side
123-
# ``cudaMalloc`` size expressions for any transient lifted out of
124-
# the kernel.
93+
"""Prepare the SDFG for GPU code generation.
94+
95+
All SDFG-level transformation lives in
96+
:class:`GPUCodegenPreprocessPipeline`. This method only does
97+
framecode-target bookkeeping: the ``gpu_context`` statestruct
98+
entry, kernel-dimension cache hand-off, frame symbol cache rebuild,
99+
``GPUStreamManager`` construction, pool-release computation, and
100+
the per-kernel arglist build.
101+
"""
125102
self._frame.statestruct.append('dace::cuda::Context *gpu_context;')
126-
127-
# ----------------------------------------------------------------
128-
# Pipeline 2 — GPU specialization. Two-phase around
129-
# ``expand_library_nodes(recursive=True)``:
130-
# * ``GPUStreamPipeline`` (pre-expansion) lifts implicit copies,
131-
# schedules streams, threads ``gpu_streams`` everywhere needed,
132-
# wires each consumer's ``__stream`` connector, emits sync
133-
# tasklets. Idempotent via the ``is_gpu_lowering_applied`` signal.
134-
# * ``GPUPostExpansionPipeline`` (post-expansion) reconnects internal
135-
# GPU consumers of expansion-spawned NestedSDFGs and lifts
136-
# ``GPU_Shared`` transients out of inner NestedSDFGs so the
137-
# framecode walker pins their ``__shared__`` allocation to the
138-
# kernel scope.
139-
# ----------------------------------------------------------------
140103
self._dispatcher._used_targets.add(self)
141-
GPUStreamPipeline().apply_pass(sdfg, {})
142-
# Strategy stamps the full WCC assignment dict on the SDFG; codegen
143-
# consumers (memory-pool path needs AccessNode stream ids, not just
144-
# wired-consumer ids) read from there. If the SDFG comes in already
145-
# lowered, fall back to reading consumers from wired connectors —
146-
# caller-pre-lowered fixtures don't have the cache attribute.
147-
gpustream_assignments = (getattr(sdfg, '_gpu_stream_assignments', None)
148-
or read_stream_assignments_from_wired_sdfg(sdfg))
149104

150-
# Defensive bounds check: catch out-of-range stream ids before the
151-
# codegen emits an out-of-bounds ``__state->gpu_context->streams[i]``.
152-
validate_stream_indices_within_bounds(sdfg)
153-
154-
sdfg.expand_library_nodes(recursive=True)
155-
GPUPostExpansionPipeline().apply_pass(sdfg, {})
156-
157-
# Core-dialect compliance is a property of the *post-pipeline* SDFG —
158-
# probing earlier would warn about every implicit copy the pipeline
159-
# subsequently lifts to a ``CopyLibraryNode``, drowning real bugs in
160-
# noise. The strict guard against leftover implicit GPU-memory copies
161-
# also runs here, after both ``expand_library_nodes`` rounds, so an
162-
# offender introduced by library expansion is caught instead of slipping
163-
# through into ill-formed generated code.
164-
warn_if_not_core_dialect(sdfg, source='ExperimentalCUDACodeGen')
165-
leftover = CoreDialectCompliant.offenders_implicit_gpu_copies(sdfg)
166-
if leftover:
167-
raise ValueError("ExperimentalCUDACodeGen: " + str(len(leftover)) +
168-
" implicit GPU-memory copy edge(s) survived InsertExplicitGPUGlobalMemoryCopies + "
169-
"expand_library_nodes. Every CPU↔GPU and GPU↔GPU AccessNode→AccessNode edge must be "
170-
"expressed via an explicit CopyLibraryNode. Offenders:\n - " + "\n - ".join(leftover))
171-
172-
from dace.sdfg import infer_types
173-
from dace.transformation.passes.promote_gpu_scalars_to_arrays import invalidate_array_connectors
174-
# Reset stale Array-vs-scalar connector types on NestedSDFGs (some
175-
# are spawned by library expansion with construction-time typing
176-
# that no longer matches the inner descriptor) and re-infer per
177-
# sub-SDFG — ``infer_connector_types`` only walks top-level states.
178-
invalidate_array_connectors(sdfg)
179-
for nsdfg in sdfg.all_sdfgs_recursive():
180-
infer_types.infer_connector_types(nsdfg)
181-
182-
# Library-node expansion can add new nested SDFGs with new cfg_ids; re-seed
105+
pipeline_results: Dict[str, Any] = {}
106+
GPUCodegenPreprocessPipeline().apply_pass(sdfg, pipeline_results)
107+
108+
# The ``AddThreadBlockMaps`` Pass returns the kernel-dimension
109+
# map and the set of kernels it tiled; the codegen consults both
110+
# when emitting kernel launches.
111+
atb_results = pipeline_results.get('AddThreadBlockMaps', {}) or {}
112+
self._kernel_dimensions_map = atb_results.get('kernel_dimensions_map', {})
113+
self._tb_inserted_kernels = atb_results.get('tb_inserted_kernels', set())
114+
115+
# Library-node expansion adds new nested SDFGs with new cfg_ids; re-seed
183116
# the framecode's symbol/constant cache so lookups succeed for them.
184117
self._rebuild_frame_symbol_cache(sdfg)
185118

119+
# Strategy stamps the WCC assignment dict on the SDFG; codegen
120+
# consumers (memory-pool path needs AccessNode stream ids, not
121+
# just wired-consumer ids) read it from there. Pre-lowered
122+
# fixtures fall back to reading consumers from wired connectors.
123+
gpustream_assignments = (getattr(sdfg, '_gpu_stream_assignments', None)
124+
or read_stream_assignments_from_wired_sdfg(sdfg))
186125
self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments)
187126

188-
# No ``_cuda_stream`` annotation pass: the CPU codegen prelude in
189-
# ``cpp.py`` binds ``__dace_current_stream`` directly from the
190-
# ``gpuStream_t``-typed in-connector when one is present. Legacy
191-
# codegen keeps its own ``_cuda_stream`` path in ``cuda.py``.
192-
193-
# Tile every ``GPU_Device`` map with an explicit ``GPU_ThreadBlock``
194-
# inner map. Done here — as late as possible, after the GPU
195-
# specialization pipeline, ``expand_library_nodes``, and the
196-
# post-expansion pipeline — so the kernel-internal transient hoist
197-
# (``MoveArrayOutOfKernel``) sees the user-authored kernel shape,
198-
# not the post-tile shape. Tiling earlier introduces an inner map
199-
# range like ``Min(N-1, b_i+31) - b_i + 1`` whose ``b_i`` outer-loop
200-
# symbol then leaks into host-side ``cudaMalloc`` size expressions.
201-
self._infer_kernel_dimensions(sdfg)
202-
203127
if Config.get('compiler', 'cuda', 'auto_syncthreads_insertion'):
204128
DefaultSharedMemorySync().apply_pass(sdfg, None)
205129

@@ -213,28 +137,6 @@ def preprocess(self, sdfg: SDFG) -> None:
213137
self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms,
214138
shared_transients[state.parent])
215139

216-
def _infer_kernel_dimensions(self, sdfg: SDFG):
217-
"""Run ``AddThreadBlockMap`` over any GPU_Device maps that don't yet
218-
carry a ThreadBlock map and refresh ``_kernel_dimensions_map`` for
219-
every GPU_Device map currently in the SDFG. Idempotent — safe to call
220-
repeatedly between library-expansion rounds, since
221-
``InferGPUGridAndBlockSize`` re-walks the SDFG and re-emits the full
222-
mapping. ``_tb_inserted_kernels`` accumulates across calls so that a
223-
kernel auto-tiled in an earlier round still uses
224-
``_get_inserted_gpu_block_size`` (and not ``_infer_gpu_block_size``,
225-
which would flag the user's explicit ``gpu_block_size`` against the
226-
tile-derived inner map size as a conflict)."""
227-
old_nodes = set(node for node, _ in sdfg.all_nodes_recursive())
228-
sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)
229-
new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes
230-
for n in new_nodes:
231-
if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device:
232-
self._tb_inserted_kernels.add(n)
233-
# Pre-existing entries are preserved by re-running the inference pass:
234-
# it walks every GPU_Device map in the SDFG, so an unmodified kernel
235-
# gets an identical (grid, block) tuple back.
236-
self._kernel_dimensions_map.update(InferGPUGridAndBlockSize().apply_pass(sdfg, self._tb_inserted_kernels))
237-
238140
def _rebuild_frame_symbol_cache(self, sdfg: SDFG) -> None:
239141
"""Re-seed the framecode's symbol/constant cache for the current SDFG hierarchy.
240142
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copyright 2019-2026 ETH Zurich and the DaCe authors. All rights reserved.
2+
"""Wrapper :class:`Pass` classes that turn previously-imperative steps in
3+
``experimental_cuda.preprocess`` into composable Pipeline members.
4+
5+
Each pass corresponds to one of the manual operations the codegen target
6+
used to call directly (``sdfg.expand_library_nodes``,
7+
``apply_transformations_once_everywhere(AddThreadBlockMap)``, etc.) and
8+
exposes the same behaviour through the Pipeline framework so the order
9+
becomes declarative and testable.
10+
"""
11+
from typing import Any, Dict, Optional
12+
13+
from dace import SDFG, dtypes, nodes, properties
14+
from dace.transformation import pass_pipeline as ppl, transformation
15+
16+
17+
@properties.make_properties
@transformation.explicit_cf_compatible
class ExpandLibraryNodes(ppl.Pass):
    """Pipeline pass that runs library-node expansion on an SDFG.

    Delegates to :meth:`SDFG.expand_library_nodes` with ``recursive=True`` so
    the expansion step can be ordered declaratively next to the other passes
    of a pipeline instead of being invoked by hand.
    """

    def depends_on(self):
        """Declare that this pass has no prerequisite passes."""
        no_dependencies = set()
        return no_dependencies

    def modifies(self) -> ppl.Modifies:
        """Return the SDFG aspects this pass declares as modified."""
        graph_structure = ppl.Modifies.States | ppl.Modifies.Nodes | ppl.Modifies.Edges
        data_and_symbols = ppl.Modifies.Descriptors | ppl.Modifies.Symbols
        return graph_structure | data_and_symbols

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        """Never request re-application, whatever ``modified`` contains."""
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[bool]:
        """Expand the library nodes of ``sdfg`` in place.

        :param sdfg: The SDFG whose library nodes are expanded.
        :param pipeline_results: Results of previously run passes; not read here.
        :return: Always ``True``. No attempt is made to detect whether any
                 node was actually expanded.
        """
        sdfg.expand_library_nodes(recursive=True)
        return True
37+
38+
39+
@properties.make_properties
@transformation.explicit_cf_compatible
class AddThreadBlockMaps(ppl.Pass):
    """Apply :class:`AddThreadBlockMap` everywhere and infer kernel dimensions.

    The pass applies the ``AddThreadBlockMap`` transformation once everywhere
    in the SDFG, records which ``GPU_Device`` map entries appeared as a
    result, and then runs ``InferGPUGridAndBlockSize`` to obtain the kernel
    dimensions used by code generation.

    The returned dictionary has two entries, ``'kernel_dimensions_map'`` and
    ``'tb_inserted_kernels'`` (a set of ``MapEntry`` nodes); callers such as
    the codegen target read it back out of ``pipeline_results``.

    Design rationale: tiling is meant to happen late so that the
    kernel-internal transient hoist (``MoveArrayOutOfKernel``) observes the
    user-authored kernel shapes rather than post-tile shapes. Tiling earlier
    introduces an inner-map range such as ``Min(N-1, b_i+31) - b_i + 1``
    whose outer-loop symbol ``b_i`` then leaks into host-side ``cudaMalloc``
    size expressions for any transient lifted out of the kernel.
    """

    def depends_on(self):
        """Declare that this pass has no prerequisite passes."""
        no_dependencies = set()
        return no_dependencies

    def modifies(self) -> ppl.Modifies:
        """Return the SDFG aspects this pass declares as modified."""
        declared = ppl.Modifies.States
        declared |= ppl.Modifies.Nodes
        declared |= ppl.Modifies.Edges
        return declared

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        """Never request re-application, whatever ``modified`` contains."""
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Dict[str, Any]:
        """Tile the SDFG's kernels and compute their launch dimensions.

        :param sdfg: The SDFG to transform in place.
        :param pipeline_results: Results of previously run passes; not read here.
        :return: A dictionary with the keys ``'kernel_dimensions_map'`` (the
                 result of ``InferGPUGridAndBlockSize``, or an empty dict if
                 that result is falsy) and ``'tb_inserted_kernels'`` (the
                 ``GPU_Device`` map entries that did not exist before the
                 transformation ran).
        """
        from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap
        from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize

        def snapshot_nodes():
            # Every node of the SDFG hierarchy, as a set, at the time of the call.
            return {graph_node for graph_node, _ in sdfg.all_nodes_recursive()}

        nodes_before = snapshot_nodes()
        sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)

        # Nodes that only exist after the transformation are the ones it
        # created; keep the GPU_Device map entries among them.
        tb_inserted_kernels = set()
        for candidate in snapshot_nodes() - nodes_before:
            if not isinstance(candidate, nodes.MapEntry):
                continue
            if candidate.schedule == dtypes.ScheduleType.GPU_Device:
                tb_inserted_kernels.add(candidate)

        inferred_dimensions = InferGPUGridAndBlockSize().apply_pass(sdfg, tb_inserted_kernels)
        return {
            'kernel_dimensions_map': inferred_dimensions or {},
            'tb_inserted_kernels': tb_inserted_kernels,
        }
84+
85+
86+
@properties.make_properties
@transformation.explicit_cf_compatible
class InvalidateAndInferConnectorTypes(ppl.Pass):
    """Reset and re-infer connector types across the whole SDFG hierarchy.

    Stale Array-vs-Scalar connector typings on NestedSDFGs are reset first
    (some NestedSDFGs are spawned by library expansion with construction-time
    typing that no longer matches the inner descriptor). Connector types are
    then re-inferred for each SDFG in the hierarchy individually, because
    ``infer_connector_types`` only walks top-level states.
    """

    def depends_on(self):
        """Declare that this pass has no prerequisite passes."""
        no_dependencies = set()
        return no_dependencies

    def modifies(self) -> ppl.Modifies:
        """Return the SDFG aspects this pass declares as modified."""
        # NOTE(review): confirm that ``ppl.Modifies`` defines a ``Connectors``
        # flag in this branch; its definition is not visible from here.
        connector_flag = ppl.Modifies.Connectors
        return connector_flag | ppl.Modifies.Descriptors

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        """Never request re-application, whatever ``modified`` contains."""
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> None:
        """Invalidate array connectors, then re-infer connector types.

        :param sdfg: The root SDFG to process in place.
        :param pipeline_results: Results of previously run passes; not read here.
        :return: ``None`` in all cases.
        """
        from dace.sdfg.infer_types import infer_connector_types
        from dace.transformation.passes.promote_gpu_scalars_to_arrays import invalidate_array_connectors

        invalidate_array_connectors(sdfg)
        # Run inference once per SDFG yielded by the recursive walk.
        for current_sdfg in sdfg.all_sdfgs_recursive():
            infer_connector_types(current_sdfg)
113+
114+

0 commit comments

Comments
 (0)