"""
Custom build hooks for hatchling to initialize JSON data during wheel creation.
This hook is called during `uv sync` and wheel building. It skips regeneration
if the required resource files already exist, making incremental syncs fast.
"""
import os
import sys
import time
from pathlib import Path
from typing import Any
# hatchling is a build system for Python projects, and this hook will be used to
# create JSON data structures for the IMAS Codex server during the wheel build process.
from hatchling.builders.hooks.plugin.interface import (
BuildHookInterface, # type: ignore[import]
)
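
# Presumed registration (a sketch; the authoritative configuration lives in
# this repository's pyproject.toml). Hatchling loads in-tree hooks from the
# `custom` hook table, whose extra keys surface here as `self.config`:
#
#   [tool.hatch.build.hooks.custom]
#   path = "hatch_build_hooks.py"
#   # ids-filter = "core_profiles equilibrium"  # optional, read by initialize()
#   # imas-dd-version = "4.0.0"                 # optional, read by initialize()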


class CustomBuildHook(BuildHookInterface):
    """Custom build hook to create JSON data structures during wheel building."""

    PLUGIN_NAME = "imas-build-hook"

    def _trace(self, message: str) -> None:
        """Print trace message for debugging build hook execution."""
        print(f"[BUILD HOOK] {message}", flush=True)

    def _check_graph_models_exist(self) -> bool:
        """Check if graph models exist and are up to date."""
        package_root = Path(__file__).parent
        schemas_dir = package_root / "imas_codex" / "schemas"

        # Check all schema files that contribute to generated models
        schema_files = [
            schemas_dir / "facility.yaml",
            schemas_dir / "common.yaml",
            schemas_dir / "imas_dd.yaml",
            schemas_dir / "facility_config.yaml",
            schemas_dir / "standard_name.yaml",
            schemas_dir / "task_groups.yaml",
        ]

        # All generated output files that must exist
        output_files = [
            package_root / "imas_codex" / "graph" / "models.py",
            package_root / "imas_codex" / "graph" / "dd_models.py",
            package_root / "imas_codex" / "config" / "models.py",
            # schema_context_data.py is a runtime dependency
            # (imported by schema_context.py, query_builder.py, client.py)
            package_root / "imas_codex" / "graph" / "schema_context_data.py",
        ]

        # If no schema files exist yet, nothing to generate
        existing_schemas = [f for f in schema_files if f.exists()]
        if not existing_schemas:
            return True

        # Check if all output files exist
        existing_outputs = [f for f in output_files if f.exists()]
        if len(existing_outputs) != len(output_files):
            return False

        # Stale if any schema is newer than the oldest output
        oldest_output_mtime = min(f.stat().st_mtime for f in existing_outputs)
        return all(f.stat().st_mtime <= oldest_output_mtime for f in existing_schemas)

    def _generate_graph_models(self, package_root: Path) -> None:
        """Generate graph Pydantic models from the LinkML schema."""
        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from scripts.build_models import build_models

            # Let build_models' own freshness checks decide what to regenerate
            # instead of passing --force, which rebuilds everything.
            result = build_models.main(args=[], standalone_mode=False)
            if result == 0:
                self._trace("Graph models generated successfully")
            else:
                self._trace(f"Graph models generation returned {result}")
        except SystemExit as e:
            if e.code == 0:
                self._trace("Graph models generated successfully")
            else:
                self._trace(f"Failed to generate graph models: exit {e.code}")
        except Exception as e:
            self._trace(f"Failed to generate graph models: {e}")
        finally:
            sys.path[:] = original_path

    def _generate_schema_reference(self, package_root: Path) -> None:
        """Generate agents/schema-reference.md from LinkML schemas."""
        output_file = package_root / "agents" / "schema-reference.md"
        schemas_dir = package_root / "imas_codex" / "schemas"

        # Freshness check
        schema_files = [
            schemas_dir / "facility.yaml",
            schemas_dir / "common.yaml",
            schemas_dir / "imas_dd.yaml",
        ]
        existing = [f for f in schema_files if f.exists()]
        if not existing:
            return
        if output_file.exists():
            output_mtime = output_file.stat().st_mtime
            if all(f.stat().st_mtime <= output_mtime for f in existing):
                self._trace("Schema reference up to date")
                return

        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from scripts.gen_schema_reference import generate_schema_reference

            generate_schema_reference(force=True)
            self._trace("Schema reference generated")
        except Exception as e:
            self._trace(f"Failed to generate schema reference: {e}")
        finally:
            sys.path[:] = original_path

    def _generate_schema_context(self, package_root: Path) -> None:
        """Generate imas_codex/graph/schema_context_data.py from LinkML schemas."""
        output_file = package_root / "imas_codex" / "graph" / "schema_context_data.py"
        schemas_dir = package_root / "imas_codex" / "schemas"

        schema_files = [
            schemas_dir / "facility.yaml",
            schemas_dir / "common.yaml",
            schemas_dir / "imas_dd.yaml",
            schemas_dir / "task_groups.yaml",
        ]
        existing = [f for f in schema_files if f.exists()]
        if not existing:
            return
        if output_file.exists():
            output_mtime = output_file.stat().st_mtime
            if all(f.stat().st_mtime <= output_mtime for f in existing):
                self._trace("Schema context up to date")
                return

        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from scripts.gen_schema_context import generate_schema_context

            generate_schema_context(force=True)
            self._trace("Schema context generated")
        except Exception as e:
            self._trace(f"Failed to generate schema context: {e}")
        finally:
            sys.path[:] = original_path

    def _check_schemas_exist(self, schemas_dir: Path) -> bool:
        """Check if schema files already exist and are valid."""
        catalog_path = schemas_dir / "ids_catalog.json"
        detailed_dir = schemas_dir / "detailed"
        return (
            catalog_path.exists()
            and detailed_dir.exists()
            and any(detailed_dir.glob("*.json"))
        )

    def _check_path_map_exists(self, mappings_dir: Path) -> bool:
        """Check if the path map file already exists."""
        mapping_file = mappings_dir / "path_mappings.json"
        return mapping_file.exists()

    def _sync_grammar_best_effort(self, package_root: Path) -> None:
        """Sync ISN grammar into the graph if reachable.

        Best-effort: silently skipped when the graph is unreachable (CI, Docker
        build, offline dev) or when `IMAS_CODEX_SKIP_GRAMMAR_SYNC=1` is set.
        Prevents grammar-version drift between the installed ISN package and
        the graph's `GrammarToken` nodes, which otherwise causes the compose
        worker to fall back to stale tokens and emit noisy warnings.
        """
        if os.environ.get("IMAS_CODEX_SKIP_GRAMMAR_SYNC") == "1":
            self._trace("Grammar sync skipped (IMAS_CODEX_SKIP_GRAMMAR_SYNC=1)")
            return

        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from imas_codex.standard_names.grammar_sync import (
                sync_isn_grammar_to_graph,
            )

            report = sync_isn_grammar_to_graph(dry_run=False)
            self._trace(
                f"Grammar sync OK: ISN={report.isn_version} "
                f"segments={report.segments} templates={report.templates}"
            )
        except Exception as e:  # graph unreachable, auth failure, package absent, etc.
            self._trace(f"Grammar sync skipped (best-effort): {type(e).__name__}: {e}")
        finally:
            sys.path[:] = original_path

    def initialize(self, version: str, build_data: dict[str, Any]) -> None:
        """
        Initialize the build hook and create JSON data structures.

        Skips regeneration if files already exist to keep `uv sync` fast.

        Args:
            version: The version string for the build
            build_data: Dictionary containing build configuration data
        """
        start_time = time.time()
        self._trace(f"initialize() called with version={version}")

        # Signal to imas_codex internals that we are in a build context.
        # This prevents the schema.py daemon thread from starting
        # (avoids import-lock contention with linkml_runtime on NFS).
        os.environ["_IMAS_CODEX_BUILD"] = "1"

        # Add package root to sys.path temporarily to resolve internal imports
        package_root = Path(__file__).parent
        original_path = sys.path[:]
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            # Lightweight import for path resolution
            from imas_codex.resource_path_accessor import ResourcePathAccessor
        finally:
            sys.path[:] = original_path

        # Get configuration options
        ids_filter = self.config.get("ids-filter", "")
        dd_version_config = self.config.get("imas-dd-version", "")

        # Allow environment variable override for ASV builds
        ids_filter = os.environ.get("IDS_FILTER", ids_filter)
        dd_version_config = os.environ.get("IMAS_DD_VERSION", dd_version_config)

        # Transform ids_filter from a space- or comma-separated string to a set
        ids_set = None
        if ids_filter:
            ids_set = set(ids_filter.replace(",", " ").split())
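        # For example (hypothetical values; any IDS names work the same way):
        #   IDS_FILTER="core_profiles,equilibrium" uv sync
        # results in ids_set == {"core_profiles", "equilibrium"}.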

        # Determine DD version without loading the heavy accessor
        if dd_version_config:
            resolved_dd_version = dd_version_config
        else:
            # Get version from the default package
            if str(package_root) not in sys.path:
                sys.path.insert(0, str(package_root))
            try:
                from imas_codex import dd_version

                resolved_dd_version = dd_version
            finally:
                sys.path[:] = original_path
        self._trace(f"Using DD version: {resolved_dd_version}")

        # Check if graph models need generation
        graph_models_exist = self._check_graph_models_exist()
        self._trace(f"graph_models_exist={graph_models_exist}")
        if not graph_models_exist:
            self._trace("Generating graph models from LinkML schema...")
            self._generate_graph_models(package_root)
        # Schema reference (agents/schema-reference.md) and schema context
        # (imas_codex/graph/schema_context_data.py) are generated inside
        # build_models.main() above, with their own freshness checks.
        # No need to call them again here.

        # Best-effort sync of ISN grammar into the graph so compose workers
        # see the current segments/tokens/templates. Silent when the graph
        # is unreachable (CI, Docker build, offline dev).
        self._sync_grammar_best_effort(package_root)

        # Get resource paths for this version
        path_accessor = ResourcePathAccessor(dd_version=resolved_dd_version)
        schemas_dir = path_accessor.schemas_dir
        mappings_dir = path_accessor.mappings_dir

        # Check if all required files already exist
        schemas_exist = self._check_schemas_exist(schemas_dir)
        path_map_exists = self._check_path_map_exists(mappings_dir)
        self._trace(f"schemas_exist={schemas_exist}, path_map_exists={path_map_exists}")
        if schemas_exist and path_map_exists:
            elapsed = time.time() - start_time
            self._trace(f"Resources exist, skipping build. Total time: {elapsed:.2f}s")
            return

        # Need to build - import heavy modules now
        self._trace("Resources missing, starting build...")
        if str(package_root) not in sys.path:
            sys.path.insert(0, str(package_root))
        try:
            from imas_codex.core.xml_parser import DataDictionaryTransformer
            from scripts.build_path_map import build_path_map
        finally:
            sys.path[:] = original_path

        # Create DD accessor based on version config
        if dd_version_config:
            from imas_codex.dd_accessor import ImasDataDictionariesAccessor

            dd_accessor = ImasDataDictionariesAccessor(dd_version_config)
            self._trace(f"Building with IMAS DD version: {dd_version_config}")
        else:
            from imas_codex.dd_accessor import ImasDataDictionaryAccessor

            dd_accessor = ImasDataDictionaryAccessor()
            self._trace(f"Building with IMAS DD version: {dd_accessor.get_version()}")

        # Build schemas only if they don't exist.
        # IMPORTANT: Always build ALL schemas (ids_set=None), not a filtered subset.
        # The ids_set is for runtime filtering, not build-time filtering.
        if not schemas_exist:
            self._trace("Building schemas...")
            json_transformer = DataDictionaryTransformer(
                dd_accessor=dd_accessor, ids_set=None
            )
            json_transformer.build()
            self._trace("Schemas built")
        else:
            self._trace("Schemas already exist, skipping")

        # Build path map only if it doesn't exist
        if not path_map_exists:
            import json

            self._trace("Building path map...")
            mapping_file = mappings_dir / "path_mappings.json"
            mapping_data = build_path_map(
                target_version=resolved_dd_version,
                ids_filter=ids_set,
                verbose=True,
            )
            with open(mapping_file, "w") as f:
                json.dump(mapping_data, f, indent=2)
            self._trace(
                f"Built path map with "
                f"{mapping_data['metadata']['total_mappings']} mappings"
            )
        else:
            self._trace("Path map already exists, skipping")

        elapsed = time.time() - start_time
        self._trace(f"Build complete. Total time: {elapsed:.2f}s")