BlogPapers/_scripts/generate_feed_and_sitemap.py at master · SummerSec/BlogPapers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/usr/bin/env python3
"""
从仓库内的 Markdown 生成 resources/rss.xml、resources/atom.xml、resources/sitemap.xml，
与线上文章列表保持一致（不依赖外链 blog-rss，也不爬 sumsec.me）。

博文根目录从 resources/Archives.md 中自动解析：凡出现指向 ../<段>/README.md 的链接（相对
Archives.md 所在目录），即把 <段> 视为仓库根下的一层归档目录（如 2026、PL）。新增年份时
只需编辑 Archives.md，不必改本脚本。

sitemap 顺序：站点根 → 文章 URL（四位年份目录优先按年份降序，同年份内按 Git 时间降序；
非四位目录如 PL 排在所有年份之后）→ 导航类静态页 → resources 下其余 md → 各归档
README.html（同年份规则）→ 最后为 resources/README.html。不包含 *-ppt.html。

依赖：Python 3.10+ 标准库；可选 git 用于每篇文章的最近提交时间。
"""

from __future__ import annotations

import html
import re
import subprocess
import sys
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from email.utils import format_datetime
from pathlib import Path
from urllib.parse import quote

REPO_ROOT = Path(__file__).resolve().parent.parent
RESOURCES = REPO_ROOT / "resources"
SITE = "https://sumsec.me"
SITE_DESC = "像清水一般清澈透明"
CHANNEL_TITLE = "SUMSEC"

ARCHIVES_MD = RESOURCES / "Archives.md"
# Archives 未配置任何归档链接时的回退（避免 CI 直接失败）
_FALLBACK_POST_ROOTS = ("2021", "2022", "2023", "2026", "PL")

# 固定收录的导航类页面（不含首页与 resources/README，二者在 sitemap 中单独插入顺序）
STATIC_SITEMAP_PATHS = (
    "/resources/Archives.html",
    "/resources/AboutMe.html",
    "/resources/Advertisements.html",
    "/resources/subdomain.html",
    "/resources/rss.xml",
    "/resources/atom.xml",
)


def git_last_commit_datetime(rel_posix: str) -> datetime | None:
    try:
        out = subprocess.check_output(
            [
                "git",
                "-c",
                "core.quotepath=false",
                "log",
                "-1",
                "--format=%aI",
                "--",
                rel_posix,
            ],
            cwd=REPO_ROOT,
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
        if not out:
            return None
        # 2024-01-01T12:00:00+08:00
        return datetime.fromisoformat(out.replace("Z", "+00:00"))
    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
        return None


def file_mtime_utc(path: Path) -> datetime:
    return datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc)


def title_from_md(path: Path) -> str:
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return path.stem
    for line in text.splitlines()[:60]:
        s = line.strip()
        if s.startswith("# "):
            return s[2:].strip()
    return path.stem


def discover_post_roots() -> tuple[str, ...]:
    """
    从 Archives.md 提取归档目录名：匹配 (../NAME/README.md)（不区分 README 大小写）。
    NAME 为仓库根下的一级目录；排除 resources（其下 md 由 sitemap 另段处理）。
    """
    if not ARCHIVES_MD.is_file():
        print(
            f"警告：未找到 {ARCHIVES_MD.relative_to(REPO_ROOT).as_posix()}，使用回退归档列表。",
            file=sys.stderr,
        )
        return _FALLBACK_POST_ROOTS
    text = ARCHIVES_MD.read_text(encoding="utf-8", errors="replace")
    pat = re.compile(r"\(\.\./([^/)]+)/README\.md\s*\)", re.IGNORECASE)
    order: list[str] = []
    seen: set[str] = set()
    for m in pat.finditer(text):
        name = m.group(1).strip()
        if not name or name in seen:
            continue
        if name == "resources" or ".." in name or "/" in name:
            continue
        seen.add(name)
        order.append(name)
    if not order:
        print(
            "警告：Archives.md 中未解析到任何 (../<目录>/README.md) 链接，使用回退归档列表。",
            file=sys.stderr,
        )
        return _FALLBACK_POST_ROOTS
    return tuple(order)


def _archive_segment_sort_key(segment: str) -> tuple:
    """四位年份降序在前；其余目录（如 PL）在后，按名字排序。"""
    if segment.isdigit() and len(segment) == 4:
        return (0, -int(segment))
    return (1, segment)


def _post_sitemap_sort_key(item: tuple[Path, str, datetime]) -> tuple:
    """文章 URL 排序：年份降序 → 同年内时间降序。"""
    path, _rel, dt = item
    parts = path.relative_to(REPO_ROOT).parts
    seg = parts[0] if parts else ""
    tier, y = _archive_segment_sort_key(seg)
    if tier == 0:
        return (0, y, -dt.timestamp())
    return (1, y, -dt.timestamp())  # y 为 segment 字符串


def url_for_md(rel: Path) -> str:
    """Jekyll 默认：2026/foo.md -> https://sumsec.me/2026/foo.html"""
    parts = rel.as_posix().split("/")
    stem = parts[-1][:-3]  # .md
    dir_parts = parts[:-1]
    enc_dir = "/".join(quote(seg, safe="") for seg in dir_parts)
    enc_stem = quote(stem, safe="")
    if enc_dir:
        return f"{SITE}/{enc_dir}/{enc_stem}.html"
    return f"{SITE}/{enc_stem}.html"


def collect_posts(post_roots: tuple[str, ...]) -> list[tuple[Path, str, datetime]]:
    """返回 (绝对路径, repo 相对 posix, 排序用时间)。"""
    rows: list[tuple[Path, str, datetime]] = []
    for root_name in post_roots:
        root = REPO_ROOT / root_name
        if not root.is_dir():
            continue
        for path in sorted(root.rglob("*.md")):
            if path.name == "README.md":
                continue
            rel = path.relative_to(REPO_ROOT)
            rel_posix = rel.as_posix()
            dt = git_last_commit_datetime(rel_posix) or file_mtime_utc(path)
            rows.append((path, rel_posix, dt))
    rows.sort(key=lambda x: x[2], reverse=True)
    return rows


def collect_sitemap_urls(
    posts: list[tuple[Path, str, datetime]], post_roots: tuple[str, ...]
) -> list[str]:
    seen: set[str] = set()
    out: list[str] = []

    def add(u: str) -> None:
        if u not in seen:
            seen.add(u)
            out.append(u)

    add(SITE + "/")

    for item in sorted(posts, key=_post_sitemap_sort_key):
        path, _, _ = item
        add(url_for_md(path.relative_to(REPO_ROOT)))

    for p in STATIC_SITEMAP_PATHS:
        add(SITE.rstrip("/") + p)

    if RESOURCES.is_dir():
        for path in sorted(RESOURCES.glob("*.md")):
            if path.name == "README.md":
                continue
            add(url_for_md(path.relative_to(REPO_ROOT)))

    for root_name in sorted(post_roots, key=_archive_segment_sort_key):
        readme = REPO_ROOT / root_name / "README.md"
        if readme.is_file():
            add(url_for_md(readme.relative_to(REPO_ROOT)))

    readme_res = RESOURCES / "README.md"
    if readme_res.is_file():
        add(url_for_md(readme_res.relative_to(REPO_ROOT)))

    return out


def write_sitemap(urls: list[str]) -> None:
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    for loc in urls:
        u = ET.SubElement(urlset, "url")
        ET.SubElement(u, "loc").text = loc
    tree = ET.ElementTree(urlset)
    ET.indent(tree, space="  ")
    out_path = RESOURCES / "sitemap.xml"
    tree.write(out_path, encoding="utf-8", xml_declaration=True, default_namespace=None)
    # ElementTree 默认 standalone 无；与旧文件风格接近即可


def write_rss(posts: list[tuple[Path, str, datetime]]) -> None:
    now = datetime.now(timezone.utc)
    lines = [
        "<?xml version='1.0' encoding='UTF-8'?>",
        '<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">',
        "<channel>",
        f"<title>{html.escape(CHANNEL_TITLE)}</title>",
        f"<link>{SITE}</link>",
        f"<description>{html.escape(SITE_DESC)}</description>",
        "<docs>http://www.rssboard.org/rss-specification</docs>",
        "<generator>BlogPapers generate_feed_and_sitemap.py</generator>",
        "<language>zh-CN</language>",
        f"<lastBuildDate>{format_datetime(now)}</lastBuildDate>",
        f'<atom:link href="{SITE}/resources/rss.xml" rel="self" type="application/rss+xml"/>',
    ]
    for path, rel_posix, dt in posts:
        link = url_for_md(path.relative_to(REPO_ROOT))
        title = html.escape(title_from_md(path))
        pub = format_datetime(dt.astimezone(timezone.utc))
        guid = html.escape(link)
        lines.append("<item>")
        lines.append(f"<title>{title}</title>")
        lines.append(f"<link>{html.escape(link)}</link>")
        lines.append(f"<guid isPermaLink=\"true\">{guid}</guid>")
        lines.append(f"<pubDate>{pub}</pubDate>")
        lines.append("</item>")
    lines.append("</channel></rss>")
    (RESOURCES / "rss.xml").write_text("\n".join(lines) + "\n", encoding="utf-8")


def write_atom(posts: list[tuple[Path, str, datetime]]) -> None:
    now = datetime.now(timezone.utc)
    feed = ET.Element(
        "feed",
        {
            "xmlns": "http://www.w3.org/2005/Atom",
            "xml:lang": "zh-CN",
        },
    )
    ET.SubElement(feed, "id").text = SITE + "/"
    ET.SubElement(feed, "title").text = CHANNEL_TITLE
    ET.SubElement(feed, "updated").text = now.isoformat(timespec="seconds").replace("+00:00", "Z")
    ET.SubElement(feed, "link", {"href": f"{SITE}/resources/atom.xml", "rel": "self"})
    ET.SubElement(feed, "link", {"href": SITE, "rel": "alternate"})
    ET.SubElement(feed, "generator", uri="https://github.com/SummerSec/BlogPapers", version="1").text = (
        "BlogPapers generate_feed_and_sitemap.py"
    )
    ET.SubElement(feed, "subtitle").text = SITE_DESC

    for path, rel_posix, dt in posts:
        link = url_for_md(path.relative_to(REPO_ROOT))
        entry = ET.SubElement(feed, "entry")
        ET.SubElement(entry, "id").text = link
        ET.SubElement(entry, "title").text = title_from_md(path)
        ET.SubElement(entry, "updated").text = dt.astimezone(timezone.utc).isoformat(timespec="seconds").replace(
            "+00:00", "Z"
        )
        ET.SubElement(entry, "link", href=link, rel="alternate")

    ET.indent(feed, space="  ")
    atom_path = RESOURCES / "atom.xml"
    tree = ET.ElementTree(feed)
    tree.write(atom_path, encoding="utf-8", xml_declaration=True, default_namespace=None)


def main() -> int:
    if not REPO_ROOT.joinpath("_config.yml").is_file():
        print("请在 BlogPapers 仓库根目录运行。", file=sys.stderr)
        return 1
    post_roots = discover_post_roots()
    posts = collect_posts(post_roots)
    if not posts:
        print("未发现任何博文 Markdown。", file=sys.stderr)
        return 1
    write_rss(posts)
    write_atom(posts)
    urls = collect_sitemap_urls(posts, post_roots)
    write_sitemap(urls)
    print(
        f"已写入 {len(posts)} 条 feed 条目，sitemap {len(urls)} 个 URL；"
        f"归档根目录（来自 Archives.md）：{', '.join(post_roots)}。"
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())