Merge pull request #499 from iiasa/enh/newclimate

khaeru · web-flow · commit e5939f07937b · 2026-04-26T14:37:21.000+02:00
Add `.tools.newclimate`
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -74,3 +74,6 @@
 /doc/project/sparccle.rst @adrivinca
 /doc/project/ssp.rst      @OFR-IIASA
 /doc/project/uptake.rst   @ywpratama
+
+/message_ix_models/tools/newclimate                @khaeru
+/message_ix_models/tests/tools/test_newclimate.py  @khaeru
diff --git a/doc/api/data-sources.rst b/doc/api/data-sources.rst
@@ -180,3 +180,11 @@ These files were characterized by:
    .. [1] the column is sometimes labelled "UNIT", but the contents appear to be the same.
 
 This source is discontinued and will not publish subsequent editions of the data.
+
+.. _tools-newclimate:
+
+NewClimate Institute (:mod:`.tools.newclimate`)
+===============================================
+
+.. automodule:: message_ix_models.tools.newclimate
+   :members:
diff --git a/doc/api/tools.rst b/doc/api/tools.rst
@@ -119,6 +119,8 @@ Policies (:mod:`.tools.policy`)
 .. automodule:: message_ix_models.tools.policy
    :members:
 
+   See also :ref:`tools-newclimate`.
+
 .. _tools-wb:
 
 World Bank structures (:mod:`.tools.wb`)
@@ -127,7 +129,6 @@ World Bank structures (:mod:`.tools.wb`)
 .. automodule:: message_ix_models.tools.wb
    :members:
 
-
 Tools for scenario manipulation
 ===============================
 
diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst
@@ -5,6 +5,7 @@ Next release
 ============
 
 - Add IAMC code list :class:`~.iamc.structure.CL_SCENARIO_DIAGNOSTIC` (:pull:`501`).
+- New module :ref:`tools-newclimate` (:pull:`499`).
 - Add :doc:`/api/model-bmt` (:pull:`433`).
 
   - Add
diff --git a/message_ix_models/tests/tools/test_newclimate.py b/message_ix_models/tests/tools/test_newclimate.py
@@ -0,0 +1,89 @@
+import pytest
+
+from message_ix_models.testing import KEY as STASH_KEY
+from message_ix_models.tools.newclimate import SECTOR, fetch, get, read
+from message_ix_models.tools.newclimate.structure import STRINGENCY
+
+
+class TestSTRINGENCY:
+    def test_int(self) -> None:
+        """Lookup of str containing only digits gives an enumeration member."""
+        assert STRINGENCY["1"] == STRINGENCY._1
+
+
+@pytest.mark.parametrize(
+    "version",
+    ("2024", "2023", "2022", "2021", "2020", "2019"),
+)
+def test_fetch(version: str) -> None:
+    # File can be fetched
+    p = fetch(version)
+
+    assert p.exists()
+
+
+@pytest.mark.parametrize(
+    "version, N_total, N_transport",
+    (
+        ("2024", 6507, 1298),
+        ("2023", 6273, 1246),
+        ("2022", 5883, 1203),
+        pytest.param("2021", 1, 1, marks=pytest.mark.xfail(raises=NotImplementedError)),
+        pytest.param("2020", 1, 1, marks=pytest.mark.xfail(raises=NotImplementedError)),
+        pytest.param("2019", 1, 1, marks=pytest.mark.xfail(raises=NotImplementedError)),
+    ),
+)
+def test_get(version: str, N_total: int, N_transport: int) -> None:
+    # Data can be fetched and read
+    result = get(version)
+
+    # Expected number of records
+    N = len(result)
+    assert N_total == N
+
+    # Objects can be filtered using enumerations
+    subset = {k: p for k, p in result.items() if SECTOR.Transport in p.sector}
+
+    N = len(subset)
+    assert N_transport == N
+
+
+def test_read0() -> None:
+    result = get("2024")
+
+    # Retrieve one entry
+    p = result["211000001"]
+
+    # Enumerated field is parsed to a list of enum items
+    assert [SECTOR.Electricity_and_heat, SECTOR.Renewables] == p.sector
+
+    # Geo field contains a pycountry object
+    assert 1 == len(p.geo)
+    # …that can be used to access various fields, as needed
+    assert "ITA" == p.country.alpha_3
+    assert "Italy" == p.country.name
+
+
+@pytest.mark.parametrize(
+    "filename, N_total",
+    (
+        ("Canada_edits_additions0.csv", 18),
+        ("Canada_edits_additions1.csv", 1),
+        ("climate_policy_database_policies_2025.csv", 6507),
+    ),
+)
+def test_read_local_data(
+    pytestconfig: pytest.Config, filename: str, N_total: int
+) -> None:
+    """Test files in user's local data path."""
+    path = pytestconfig.stash[STASH_KEY["user-local-data"]].joinpath(
+        "newclimate", filename
+    )
+
+    if path.exists():
+        # Function runs
+        result = read(path)
+
+        # Expected number of records
+        N = len(result)
+        assert N_total == N
diff --git a/message_ix_models/tools/newclimate/__init__.py b/message_ix_models/tools/newclimate/__init__.py
@@ -0,0 +1,215 @@
+"""Handle data from the NewClimate Institute's Climate Policy Database (CPDB).
+
+This module provides:
+
+- :class:`.NewClimatePolicy`, a concrete subclass of the abstract/generic
+  :class:`.Policy`, that reflects the data model appearing in the CPDB.
+
+  - Enumerations that reflect values appearing in fields of the database which appear to
+    be enumerated (as opposed to free text):
+    :class:`HIGH_IMPACT`,
+    :class:`JURISDICTION`,
+    :class:`OBJECTIVE`,
+    :class:`SECTOR`,
+    :class:`STATUS`,
+    :class:`STRINGENCY`,
+    :class:`TYPE`, and
+    :class:`UPDATE`.
+
+  - A method :meth:`.NewClimatePolicy.from_csv_dict` that interprets the CSV data
+    format in which the database is expressed.
+
+- Functions to :func:`fetch` versions of the database from Zenodo, :func:`read` into
+  collections of Python objects, or do both (:func:`get`).
+
+These enable programmatic use of the information in the database. For example:
+
+.. code-block:: python
+
+   from message_ix_models.tools.newclimate import SECTOR, get
+   from pycountry import countries
+
+   # Fetch and parse the 2024 edition of the database
+   policies = get("2024")
+   print(len(policies))  # 6507 objects
+
+   # Filter the dict to a list of policy objects matching a certain sector
+   p_transport = list(filter(lambda p: SECTOR.Transport in p.sector, policies.values()))
+   print(len(p_transport))  # 1298 objects
+
+   # Filter for any policies concerning the country of Austria, or the EU
+   match = {countries.lookup("Austria"), "European Union"}
+   p_AUT = list(filter(lambda p: set(p.geo) & match, policies.values()))
+   print(len(p_AUT))  # 259 objects
+
+.. todo:: Extend the module:
+
+   - Serialize :class:`.NewClimatePolicy` objects in 1 or more formats, preferably
+     standards-based.
+   - :func:`fetch` versions of the database more recent than the latest Zenodo record,
+     using the `cpdb_api package
+     <https://github.com/https-github-com-NewClimateInstitute/CPDB-API>`_ or other code.
+   - Convert to/from other data models.
+"""
+
+import csv
+import logging
+from functools import cache
+from typing import TYPE_CHECKING
+
+from .structure import (
+    HIGH_IMPACT,
+    JURISDICTION,
+    OBJECTIVE,
+    SECTOR,
+    STATUS,
+    STRINGENCY,
+    TYPE,
+    UPDATE,
+    NewClimatePolicy,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+__all__ = [
+    "HIGH_IMPACT",
+    "JURISDICTION",
+    "NewClimatePolicy",
+    "OBJECTIVE",
+    "SECTOR",
+    "STATUS",
+    "STRINGENCY",
+    "TYPE",
+    "UPDATE",
+    "read",
+    "get",
+    "fetch",
+]
+
+log = logging.getLogger(__name__)
+
+#: Pooch information for fetching files from the static version of the database.
+SOURCE = {  # noqa: E501
+    "newclimate-2024": dict(
+        pooch_args=dict(
+            base_url="doi:10.5281/zenodo.15432946",
+            registry={
+                "ClimatePolicyDatabase_v2024.csv": (
+                    "sha256:e893745bc26d225d8e91d063eb1fdbcbb5da4a51ce05d28ce5b9f51f6ef4408f"
+                ),
+            },
+        ),
+    ),
+    "newclimate-2023": dict(
+        pooch_args=dict(
+            base_url="doi:10.5281/zenodo.10869734",
+            registry={
+                "ClimatePolicyDatabase_v2023.xlsx": (
+                    "sha256:bdce700c6b0c2eeb7fa06584cb8523793b64ec5799d91ae65818209aaf9de682"
+                ),
+            },
+        ),
+    ),
+    "newclimate-2022": dict(
+        pooch_args=dict(
+            base_url="doi:10.5281/zenodo.7774473",
+            registry={
+                "ClimatePolicyDatabase_v2022.csv": (
+                    "sha256:fe431e41c4c2fb8513d6718fba6ba3bc0a1fd2c5b9016256a106b998f5f48946"
+                ),
+            },
+        ),
+    ),
+    "newclimate-2021": dict(
+        pooch_args=dict(
+            base_url="doi:10.5281/zenodo.7774471",
+            registry={
+                "ClimatePolicyDatabase_v2021.xlsx": (
+                    "sha256:d880c2c94c7d8da84bb9cf8d315faf7230e4965cbc679ac1783222ecfe84062a"
+                ),
+            },
+        ),
+    ),
+    "newclimate-2020": dict(
+        pooch_args=dict(
+            base_url="doi:10.5281/zenodo.7774462",
+            registry={
+                "ClimatePolicyDatabase_v2020.xlsx": (
+                    "sha256:08818156401200ec094985c34250ef65cea6ff5246cbbeb1d0ade317f8fdaa0c"
+                ),
+            },
+        ),
+    ),
+    "newclimate-2019": dict(
+        pooch_args=dict(
+            base_url="doi:10.5281/zenodo.7774110",
+            registry={
+                "ClimatePolicyDatabase _v2019.xlsx": (
+                    "sha256:c28cdd613496d503ae00bacf637fc052128e04361580110829843b4bf0235368"
+                ),
+            },
+        )
+    ),
+}
+
+
+def fetch(version: str) -> "Path":
+    """Retrieve data for `version` of the Climate Policy Database from Zenodo."""
+    from message_ix_models.util import pooch
+
+    # Ensure sources for this module are registered
+    pooch.SOURCE.update(SOURCE)
+
+    # Construct the key
+    source_id = f"newclimate-{version}"
+
+    return pooch.fetch(**pooch.SOURCE[source_id], extra_cache_path="newclimate")[0]
+
+
+def get(version: str) -> dict[str, NewClimatePolicy]:
+    """:func:`fetch` and then :func:`read` data for `version` of the database."""
+    f_source = fetch(version)
+
+    if f_source.suffix == ".xlsx":
+        # Convert Excel to CSV
+        import pandas as pd
+
+        f_read = f_source.with_suffix(".csv")
+        if not f_read.exists():
+            log.info(f"Unpack {f_source} to {f_read}")
+            pd.read_excel(f_source).to_csv(f_read, index=False)
+    else:
+        f_read = f_source
+
+    # - Force use of UTF-8 on macOS and Windows.
+    # - The 2022 CSV file is not in UTF-8 format; use a different encoding.
+    kwargs = dict(encoding="latin-1" if version == "2022" else "utf-8")
+
+    try:
+        return read(f_read, **kwargs)
+    except Exception as e:
+        if version in ("2021", "2020", "2019"):
+            raise NotImplementedError("Read 2021 and earlier data format") from e
+        else:  # pragma: no cover
+            raise
+
+
+@cache
+def read(path: "Path", **kwargs) -> dict[str, NewClimatePolicy]:
+    """Read the CSV file at `path` into a :class:`dict` of Policy objects.
+
+    `kwargs` are passed to :func:`open`; results are cached per distinct arguments.
+
+    Returns
+    -------
+    dict
+        Keys are :attr:`.NewClimatePolicy.id`. If the file contains records with the
+        same IDs, only the last appears, and a warning is logged.
+    """
+    with open(path, **kwargs) as f:
+        policies = [NewClimatePolicy.from_csv_dict(row) for row in csv.DictReader(f)]
+    result = {p.id: p for p in policies}  # Later records overwrite earlier ones
+    if len(result) < len(policies):
+        log.warning(f"{len(policies) - len(result)} duplicate IDs in {path}")
+    return result
diff --git a/message_ix_models/tools/newclimate/structure.py b/message_ix_models/tools/newclimate/structure.py
diff --git a/message_ix_models/util/pooch.py b/message_ix_models/util/pooch.py
diff --git a/message_ix_models/util/pycountry.py b/message_ix_models/util/pycountry.py