"""
Custom build hooks for hatchling to initialize JSON data during wheel creation.
This hook is called during `uv sync` and wheel building. It skips regeneration
if the required resource files already exist, making incremental syncs fast.
"""
import os
import sys
import time
from pathlib import Path
from typing import Any
# hatchling is a build system for Python projects, and this hook will be used to
# create JSON data structures for the IMAS Codex server during the wheel build process.
from hatchling.builders.hooks.plugin.interface import (
BuildHookInterface, # type: ignore[import]
)
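
# Presumed registration (a sketch; the authoritative configuration lives in
# this repository's pyproject.toml). Hatchling loads in-tree hooks from the
# `custom` hook table, whose extra keys surface here as `self.config`:
#
#   [tool.hatch.build.hooks.custom]
#   path = "hatch_build_hooks.py"
#   # ids-filter = "core_profiles equilibrium"  # optional, read by initialize()
#   # imas-dd-version = "4.0.0"                 # optional, read by initialize()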


class CustomBuildHook(BuildHookInterface):
    """Custom build hook to create JSON data structures during wheel building."""

    PLUGIN_NAME = "imas-build-hook"

    def _trace(self, message: str) -> None:
        """Print trace message for debugging build hook execution."""
        print(f"[BUILD HOOK] {message}", flush=True)

    def _check_graph_models_exist(self) -> bool:
        """Check if graph models exist and are up to date."""
        package_root = Path(__file__).parent
        schemas_dir = package_root / "imas_codex" / "schemas"

        # Check all schema files that contribute to generated models
        schema_files = [
            schemas_dir / "facility.yaml",
            schemas_dir / "common.yaml",
            schemas_dir / "imas_dd.yaml",
            schemas_dir / "facility_config.yaml",
            schemas_dir / "standard_name.yaml",
            schemas_dir / "task_groups.yaml",
        ]

        # All generated output files that must exist
        output_files = [
            package_root / "imas_codex" / "graph" / "models.py",
            package_root / "imas_codex" / "graph" / "dd_models.py",
            package_root / "imas_codex" / "config" / "models.py",
            # schema_context_data.py is a runtime dependency
            # (imported by schema_context.py, query_builder.py, client.py)
            package_root / "imas_codex" / "graph" / "schema_context_data.py",
        ]

        # If no schema files exist yet, nothing to generate
        existing_schemas = [f for f in schema_files if f.exists()]
        if not existing_schemas:
            return True

        # Check if all output files exist
        existing_outputs = [f for f in output_files if f.exists()]
        if len(existing_outputs) != len(output_files):
            return False

        # Stale if any schema is newer than the oldest output
        oldest_output_mtime = min(f.stat().st_mtime for f in existing_outputs)
        return all(f.stat().st_mtime <= oldest_output_mtime for f in existing_schemas)

    def _generate_graph_models(self, package_root: Path) -> None:
        """Generate graph Pydantic models from the LinkML schema."""
        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from scripts.build_models import build_models

            # Let build_models' own freshness checks decide what to regenerate
            # instead of passing --force, which rebuilds everything.
            result = build_models.main(args=[], standalone_mode=False)
            if result == 0:
                self._trace("Graph models generated successfully")
            else:
                self._trace(f"Graph models generation returned {result}")
        except SystemExit as e:
            if e.code == 0:
                self._trace("Graph models generated successfully")
            else:
                self._trace(f"Failed to generate graph models: exit {e.code}")
        except Exception as e:
            self._trace(f"Failed to generate graph models: {e}")
        finally:
            sys.path[:] = original_path

    def _generate_schema_reference(self, package_root: Path) -> None:
        """Generate agents/schema-reference.md from LinkML schemas."""
        output_file = package_root / "agents" / "schema-reference.md"
        schemas_dir = package_root / "imas_codex" / "schemas"

        # Freshness check
        schema_files = [
            schemas_dir / "facility.yaml",
            schemas_dir / "common.yaml",
            schemas_dir / "imas_dd.yaml",
        ]
        existing = [f for f in schema_files if f.exists()]
        if not existing:
            return
        if output_file.exists():
            output_mtime = output_file.stat().st_mtime
            if all(f.stat().st_mtime <= output_mtime for f in existing):
                self._trace("Schema reference up to date")
                return

        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from scripts.gen_schema_reference import generate_schema_reference

            generate_schema_reference(force=True)
            self._trace("Schema reference generated")
        except Exception as e:
            self._trace(f"Failed to generate schema reference: {e}")
        finally:
            sys.path[:] = original_path

    def _generate_schema_context(self, package_root: Path) -> None:
        """Generate imas_codex/graph/schema_context_data.py from LinkML schemas."""
        output_file = package_root / "imas_codex" / "graph" / "schema_context_data.py"
        schemas_dir = package_root / "imas_codex" / "schemas"

        schema_files = [
            schemas_dir / "facility.yaml",
            schemas_dir / "common.yaml",
            schemas_dir / "imas_dd.yaml",
            schemas_dir / "task_groups.yaml",
        ]
        existing = [f for f in schema_files if f.exists()]
        if not existing:
            return
        if output_file.exists():
            output_mtime = output_file.stat().st_mtime
            if all(f.stat().st_mtime <= output_mtime for f in existing):
                self._trace("Schema context up to date")
                return

        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from scripts.gen_schema_context import generate_schema_context

            generate_schema_context(force=True)
            self._trace("Schema context generated")
        except Exception as e:
            self._trace(f"Failed to generate schema context: {e}")
        finally:
            sys.path[:] = original_path

    def _check_schemas_exist(self, schemas_dir: Path) -> bool:
        """Check if schema files already exist and are valid."""
        catalog_path = schemas_dir / "ids_catalog.json"
        detailed_dir = schemas_dir / "detailed"
        return (
            catalog_path.exists()
            and detailed_dir.exists()
            and any(detailed_dir.glob("*.json"))
        )

    def _check_path_map_exists(self, mappings_dir: Path) -> bool:
        """Check if the path map file already exists."""
        mapping_file = mappings_dir / "path_mappings.json"
        return mapping_file.exists()

    def _sync_grammar_best_effort(self, package_root: Path) -> None:
        """Sync ISN grammar into the graph if reachable.

        Best-effort: silently skipped when the graph is unreachable (CI, Docker
        build, offline dev) or when `IMAS_CODEX_SKIP_GRAMMAR_SYNC=1` is set.
        Prevents grammar-version drift between the installed ISN package and
        the graph's `GrammarToken` nodes, which otherwise causes the compose
        worker to fall back to stale tokens and emit noisy warnings.
        """
        if os.environ.get("IMAS_CODEX_SKIP_GRAMMAR_SYNC") == "1":
            self._trace("Grammar sync skipped (IMAS_CODEX_SKIP_GRAMMAR_SYNC=1)")
            return

        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from imas_codex.standard_names.grammar_sync import (
                sync_isn_grammar_to_graph,
            )

            report = sync_isn_grammar_to_graph(dry_run=False)
            self._trace(
                f"Grammar sync OK: ISN={report.isn_version} "
                f"segments={report.segments} templates={report.templates}"
            )
        except Exception as e:  # graph unreachable, auth failure, package absent, etc.
            self._trace(f"Grammar sync skipped (best-effort): {type(e).__name__}: {e}")
        finally:
            sys.path[:] = original_path

    def initialize(self, version: str, build_data: dict[str, Any]) -> None:
        """
        Initialize the build hook and create JSON data structures.

        Skips regeneration if files already exist to keep `uv sync` fast.

        Args:
            version: The version string for the build
            build_data: Dictionary containing build configuration data
        """
        start_time = time.time()
        self._trace(f"initialize() called with version={version}")

        # Signal to imas_codex internals that we are in a build context.
        # This prevents the schema.py daemon thread from starting
        # (avoids import-lock contention with linkml_runtime on NFS).
        os.environ["_IMAS_CODEX_BUILD"] = "1"

        # Add package root to sys.path temporarily to resolve internal imports
        package_root = Path(__file__).parent
        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            # Lightweight import for path resolution
            from imas_codex.resource_path_accessor import ResourcePathAccessor
        finally:
            sys.path[:] = original_path

        # Get configuration options
        ids_filter = self.config.get("ids-filter", "")
        dd_version_config = self.config.get("imas-dd-version", "")

        # Allow environment variable override for ASV builds
        ids_filter = os.environ.get("IDS_FILTER", ids_filter)
        dd_version_config = os.environ.get("IMAS_DD_VERSION", dd_version_config)

        # Transform ids_filter from a space- or comma-separated string to a set
        ids_set = None
        if ids_filter:
            ids_set = set(ids_filter.replace(",", " ").split())
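        # For example (hypothetical values; any IDS names work the same way):
        #   IDS_FILTER="core_profiles,equilibrium" uv sync
        # results in ids_set == {"core_profiles", "equilibrium"}.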

        # Determine DD version without loading the heavy accessor
        if dd_version_config:
            resolved_dd_version = dd_version_config
        else:
            # Get version from the default package
            if str(package_root) not in sys.path:
                sys.path.insert(0, str(package_root))
            try:
                from imas_codex import dd_version

                resolved_dd_version = dd_version
            finally:
                sys.path[:] = original_path
        self._trace(f"Using DD version: {resolved_dd_version}")

        # Check if graph models need generation
        graph_models_exist = self._check_graph_models_exist()
        self._trace(f"graph_models_exist={graph_models_exist}")
        if not graph_models_exist:
            self._trace("Generating graph models from LinkML schema...")
            self._generate_graph_models(package_root)
        # Schema reference (agents/schema-reference.md) and schema context
        # (imas_codex/graph/schema_context_data.py) are generated inside
        # build_models.main() above, with their own freshness checks.
        # No need to call them again here.

        # Best-effort sync of ISN grammar into the graph so compose workers
        # see the current segments/tokens/templates. Silent when the graph
        # is unreachable (CI, Docker build, offline dev).
        self._sync_grammar_best_effort(package_root)

        # Get resource paths for this version
        path_accessor = ResourcePathAccessor(dd_version=resolved_dd_version)
        schemas_dir = path_accessor.schemas_dir
        mappings_dir = path_accessor.mappings_dir

        # Check if all required files already exist
        schemas_exist = self._check_schemas_exist(schemas_dir)
        path_map_exists = self._check_path_map_exists(mappings_dir)
        self._trace(f"schemas_exist={schemas_exist}, path_map_exists={path_map_exists}")
        if schemas_exist and path_map_exists:
            elapsed = time.time() - start_time
            self._trace(f"Resources exist, skipping build. Total time: {elapsed:.2f}s")
            return

        # Need to build - import heavy modules now
        self._trace("Resources missing, starting build...")
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from imas_codex.core.xml_parser import DataDictionaryTransformer
            from scripts.build_path_map import build_path_map
        finally:
            sys.path[:] = original_path

        # Create DD accessor based on version config
        if dd_version_config:
            from imas_codex.dd_accessor import ImasDataDictionariesAccessor

            dd_accessor = ImasDataDictionariesAccessor(dd_version_config)
            self._trace(f"Building with IMAS DD version: {dd_version_config}")
        else:
            from imas_codex.dd_accessor import ImasDataDictionaryAccessor

            dd_accessor = ImasDataDictionaryAccessor()
            self._trace(f"Building with IMAS DD version: {dd_accessor.get_version()}")

        # Build schemas only if they don't exist.
        # IMPORTANT: Always build ALL schemas (ids_set=None), not a filtered subset.
        # The ids_set is for runtime filtering, not build-time filtering.
        if not schemas_exist:
            self._trace("Building schemas...")
            json_transformer = DataDictionaryTransformer(
                dd_accessor=dd_accessor, ids_set=None
            )
            json_transformer.build()
            self._trace("Schemas built")
        else:
            self._trace("Schemas already exist, skipping")

        # Build path map only if it doesn't exist
        if not path_map_exists:
            import json

            self._trace("Building path map...")
            mapping_file = mappings_dir / "path_mappings.json"
            mapping_data = build_path_map(
                target_version=resolved_dd_version,
                ids_filter=ids_set,
                verbose=True,
            )
            with open(mapping_file, "w") as f:
                json.dump(mapping_data, f, indent=2)
            self._trace(
                f"Built path map with "
                f"{mapping_data['metadata']['total_mappings']} mappings"
            )
        else:
            self._trace("Path map already exists, skipping")

        elapsed = time.time() - start_time
        self._trace(f"Build complete. Total time: {elapsed:.2f}s")