@@ -1,5 +1,5 @@
 # Copyright 2019-2025 ETH Zurich and the DaCe authors. All rights reserved.
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
 import networkx as nx
 
 import dace
@@ -21,18 +21,12 @@
 from dace.codegen.target import TargetCodeGenerator, make_absolute
 
 from dace.transformation.passes import analysis as ap
-from dace.transformation.pass_pipeline import Pipeline
-from dace.transformation.passes.gpu_specialization.gpu_specialization_pipeline import (GPUPostExpansionPipeline,
-                                                                                        GPUStreamPipeline)
-from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import (read_stream_assignments_from_wired_sdfg,
-                                                                                validate_stream_indices_within_bounds)
+from dace.transformation.passes.gpu_specialization.gpu_specialization_pipeline import GPUCodegenPreprocessPipeline
+from dace.transformation.passes.gpu_specialization.helpers.gpu_helpers import read_stream_assignments_from_wired_sdfg
 from dace.transformation.passes.shared_memory_synchronization import DefaultSharedMemorySync
-from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap
-from dace.transformation.passes.analysis.infer_gpu_grid_and_block_size import InferGPUGridAndBlockSize
 
 from dace.codegen.targets.experimental_cuda_helpers.gpu_stream_manager import GPUStreamManager
 from dace.codegen.targets.experimental_cuda_helpers.gpu_utils import generate_sync_debug_call
-from dace.sdfg.core_dialect import (CoreDialectCompliant, warn_if_not_core_dialect)
 
 from dace.codegen.targets import cpp
 
@@ -96,110 +90,40 @@ def __init__(self, frame_codegen: 'DaCeCodeGenerator', sdfg: SDFG):
         self._kernel_arglists: Dict[nodes.MapEntry, Dict[str, dt.Data]] = {}
 
     def preprocess(self, sdfg: SDFG) -> None:
-        """Prepare the SDFG for GPU code generation."""
-
-        # ----------------------------------------------------------------
-        # Pipeline 1 — codegen preparation. Establishes invariants the
-        # transformation pipeline below relies on: every descriptor has
-        # decided storage / schedule, and every Scalar that cannot live on
-        # the GPU as a Scalar (rule 1) or that the kernel writes to (rule 2)
-        # has been promoted to a length-1 Array. After this pipeline, the
-        # SDFG is "well-formed for GPU codegen" — no further inference or
-        # descriptor rewrites should be needed.
-        # ----------------------------------------------------------------
-        from dace.transformation.passes.promote_gpu_scalars_to_arrays import (InferDefaultSchedulesAndStorages,
-                                                                              PromoteGPUScalarsToArrays)
-        codegen_preparation_pipeline = Pipeline([
-            InferDefaultSchedulesAndStorages(),
-            PromoteGPUScalarsToArrays(),
-        ])
-        codegen_preparation_pipeline.apply_pass(sdfg, {})
-
-        # ``AddThreadBlockMap`` is intentionally deferred until *after* the
-        # GPU specialization pipeline + ``expand_library_nodes``: tiling
-        # before the hoist (`MoveArrayOutOfKernel`) introduces an inner
-        # ``GPU_ThreadBlock`` map whose range expression references the
-        # outer block index, which then leaks into host-side
-        # ``cudaMalloc`` size expressions for any transient lifted out of
-        # the kernel.
93+ """Prepare the SDFG for GPU code generation.
94+
95+ All SDFG-level transformation lives in
96+ :class:`GPUCodegenPreprocessPipeline`. This method only does
97+ framecode-target bookkeeping: the ``gpu_context`` statestruct
98+ entry, kernel-dimension cache hand-off, frame symbol cache rebuild,
99+ ``GPUStreamManager`` construction, pool-release computation, and
100+ the per-kernel arglist build.
101+ """
         self._frame.statestruct.append('dace::cuda::Context *gpu_context;')
-
-        # ----------------------------------------------------------------
-        # Pipeline 2 — GPU specialization. Two-phase around
-        # ``expand_library_nodes(recursive=True)``:
-        #  * ``GPUStreamPipeline`` (pre-expansion) lifts implicit copies,
-        #    schedules streams, threads ``gpu_streams`` everywhere needed,
-        #    wires each consumer's ``__stream`` connector, emits sync
-        #    tasklets. Idempotent via the ``is_gpu_lowering_applied`` signal.
-        #  * ``GPUPostExpansionPipeline`` (post-expansion) reconnects internal
-        #    GPU consumers of expansion-spawned NestedSDFGs and lifts
-        #    ``GPU_Shared`` transients out of inner NestedSDFGs so the
-        #    framecode walker pins their ``__shared__`` allocation to the
-        #    kernel scope.
-        # ----------------------------------------------------------------
         self._dispatcher._used_targets.add(self)
-        GPUStreamPipeline().apply_pass(sdfg, {})
-        # Strategy stamps the full WCC assignment dict on the SDFG; codegen
-        # consumers (memory-pool path needs AccessNode stream ids, not just
-        # wired-consumer ids) read from there. If the SDFG comes in already
-        # lowered, fall back to reading consumers from wired connectors —
-        # caller-pre-lowered fixtures don't have the cache attribute.
-        gpustream_assignments = (getattr(sdfg, '_gpu_stream_assignments', None)
-                                 or read_stream_assignments_from_wired_sdfg(sdfg))
 
-        # Defensive bounds check: catch out-of-range stream ids before the
-        # codegen emits an out-of-bounds ``__state->gpu_context->streams[i]``.
-        validate_stream_indices_within_bounds(sdfg)
-
-        sdfg.expand_library_nodes(recursive=True)
-        GPUPostExpansionPipeline().apply_pass(sdfg, {})
-
-        # Core-dialect compliance is a property of the *post-pipeline* SDFG —
-        # probing earlier would warn about every implicit copy the pipeline
-        # subsequently lifts to a ``CopyLibraryNode``, drowning real bugs in
-        # noise. The strict guard against leftover implicit GPU-memory copies
-        # also runs here, after both ``expand_library_nodes`` rounds, so an
-        # offender introduced by library expansion is caught instead of slipping
-        # through into ill-formed generated code.
-        warn_if_not_core_dialect(sdfg, source='ExperimentalCUDACodeGen')
-        leftover = CoreDialectCompliant.offenders_implicit_gpu_copies(sdfg)
-        if leftover:
-            raise ValueError("ExperimentalCUDACodeGen: " + str(len(leftover)) +
-                             " implicit GPU-memory copy edge(s) survived InsertExplicitGPUGlobalMemoryCopies + "
-                             "expand_library_nodes. Every CPU↔GPU and GPU↔GPU AccessNode→AccessNode edge must be "
-                             "expressed via an explicit CopyLibraryNode. Offenders:\n - " + "\n - ".join(leftover))
-
-        from dace.sdfg import infer_types
-        from dace.transformation.passes.promote_gpu_scalars_to_arrays import invalidate_array_connectors
-        # Reset stale Array-vs-scalar connector types on NestedSDFGs (some
-        # are spawned by library expansion with construction-time typing
-        # that no longer matches the inner descriptor) and re-infer per
-        # sub-SDFG — ``infer_connector_types`` only walks top-level states.
-        invalidate_array_connectors(sdfg)
-        for nsdfg in sdfg.all_sdfgs_recursive():
-            infer_types.infer_connector_types(nsdfg)
-
-        # Library-node expansion can add new nested SDFGs with new cfg_ids; re-seed
+        pipeline_results: Dict[str, Any] = {}
+        GPUCodegenPreprocessPipeline().apply_pass(sdfg, pipeline_results)
+
+        # The ``AddThreadBlockMaps`` Pass returns the kernel-dimension
+        # map and the set of kernels it tiled; the codegen consults both
+        # when emitting kernel launches.
+        atb_results = pipeline_results.get('AddThreadBlockMaps', {}) or {}
+        self._kernel_dimensions_map = atb_results.get('kernel_dimensions_map', {})
+        self._tb_inserted_kernels = atb_results.get('tb_inserted_kernels', set())
+
+        # Library-node expansion adds new nested SDFGs with new cfg_ids; re-seed
         # the framecode's symbol/constant cache so lookups succeed for them.
         self._rebuild_frame_symbol_cache(sdfg)
 
+        # Strategy stamps the WCC assignment dict on the SDFG; codegen
+        # consumers (memory-pool path needs AccessNode stream ids, not
+        # just wired-consumer ids) read it from there. Pre-lowered
+        # fixtures fall back to reading consumers from wired connectors.
+        gpustream_assignments = (getattr(sdfg, '_gpu_stream_assignments', None)
+                                 or read_stream_assignments_from_wired_sdfg(sdfg))
         self._gpu_stream_manager = GPUStreamManager(sdfg, gpustream_assignments)
 
-        # No ``_cuda_stream`` annotation pass: the CPU codegen prelude in
-        # ``cpp.py`` binds ``__dace_current_stream`` directly from the
-        # ``gpuStream_t``-typed in-connector when one is present. Legacy
-        # codegen keeps its own ``_cuda_stream`` path in ``cuda.py``.
-
-        # Tile every ``GPU_Device`` map with an explicit ``GPU_ThreadBlock``
-        # inner map. Done here — as late as possible, after the GPU
-        # specialization pipeline, ``expand_library_nodes``, and the
-        # post-expansion pipeline — so the kernel-internal transient hoist
-        # (``MoveArrayOutOfKernel``) sees the user-authored kernel shape,
-        # not the post-tile shape. Tiling earlier introduces an inner map
-        # range like ``Min(N-1, b_i+31) - b_i + 1`` whose ``b_i`` outer-loop
-        # symbol then leaks into host-side ``cudaMalloc`` size expressions.
-        self._infer_kernel_dimensions(sdfg)
-
         if Config.get('compiler', 'cuda', 'auto_syncthreads_insertion'):
             DefaultSharedMemorySync().apply_pass(sdfg, None)
 
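The removed comments in this hunk justify deferring ``AddThreadBlockMap`` by pointing at the post-tile inner-map extent ``Min(N-1, b_i+31) - b_i + 1``. As a side note for reviewers, here is a minimal SymPy-only sketch (no DaCe internals assumed) of the leak they describe: the extent still contains the outer block symbol ``b_i``, so it cannot serve as a host-side ``cudaMalloc`` size.

```python
import sympy as sp

# Symbols as in the removed comment: N is the problem size, b_i the
# outer (block) map index introduced by tiling.
N, b_i = sp.symbols('N b_i', integer=True)

# Post-tile inner-map extent quoted in the removed comment.
inner_extent = sp.Min(N - 1, b_i + 31) - b_i + 1

# b_i survives in the expression, so a transient sized by this extent
# cannot be allocated on the host before the block loop runs.
print(inner_extent.free_symbols)   # {N, b_i}
print(inner_extent.subs(b_i, 0))   # Min(N - 1, 31) + 1
```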
@@ -213,28 +137,6 @@ def preprocess(self, sdfg: SDFG) -> None:
             self._kernel_arglists[node] = state.scope_subgraph(node).arglist(defined_syms,
                                                                              shared_transients[state.parent])
 
-    def _infer_kernel_dimensions(self, sdfg: SDFG):
-        """Run ``AddThreadBlockMap`` over any GPU_Device maps that don't yet
-        carry a ThreadBlock map and refresh ``_kernel_dimensions_map`` for
-        every GPU_Device map currently in the SDFG. Idempotent — safe to call
-        repeatedly between library-expansion rounds, since
-        ``InferGPUGridAndBlockSize`` re-walks the SDFG and re-emits the full
-        mapping. ``_tb_inserted_kernels`` accumulates across calls so that a
-        kernel auto-tiled in an earlier round still uses
-        ``_get_inserted_gpu_block_size`` (and not ``_infer_gpu_block_size``,
-        which would flag the user's explicit ``gpu_block_size`` against the
-        tile-derived inner map size as a conflict)."""
-        old_nodes = set(node for node, _ in sdfg.all_nodes_recursive())
-        sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)
-        new_nodes = set(node for node, _ in sdfg.all_nodes_recursive()) - old_nodes
-        for n in new_nodes:
-            if isinstance(n, nodes.MapEntry) and n.schedule == dtypes.ScheduleType.GPU_Device:
-                self._tb_inserted_kernels.add(n)
-        # Pre-existing entries are preserved by re-running the inference pass:
-        # it walks every GPU_Device map in the SDFG, so an unmodified kernel
-        # gets an identical (grid, block) tuple back.
-        self._kernel_dimensions_map.update(InferGPUGridAndBlockSize().apply_pass(sdfg, self._tb_inserted_kernels))
-
     def _rebuild_frame_symbol_cache(self, sdfg: SDFG) -> None:
         """Re-seed the framecode's symbol/constant cache for the current SDFG hierarchy.
 
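For reviewers unfamiliar with the pass-result plumbing the new ``preprocess`` relies on, here is a minimal, self-contained sketch. ``CountStates`` and its payload are hypothetical; the sketch assumes, as the ``pipeline_results.get('AddThreadBlockMaps', ...)`` lookup above implies, that a ``Pipeline`` records each pass's non-``None`` ``apply_pass`` return value in the shared results dict under the pass name.

```python
from typing import Any, Dict, Optional

from dace import SDFG
from dace.transformation import pass_pipeline as ppl


class CountStates(ppl.Pass):
    """Hypothetical pass that returns a payload instead of mutating the SDFG."""

    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.Nothing

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        # Whatever a pass returns here is what consumers later read back
        # out of the results dict, keyed by the pass name.
        return {'num_states': len(list(sdfg.states()))}


results: Dict[str, Any] = {}
ppl.Pipeline([CountStates()]).apply_pass(SDFG('example'), results)
print(results.get('CountStates'))  # {'num_states': 0} for a fresh SDFG
```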