vault backup: 2025-12-09 21:46:11

2025-12-09 21:46:11 +01:00
parent a2f484b3af
commit 9e3b6b2cd3
48 changed files with 717 additions and 145 deletions
--- a/wip/extract-provfråga.py
+++ b/wip/extract-provfråga.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import re
+import csv
+import argparse
+from pathlib import Path
+
+FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.S)  # front matter at the very start of the text; group 1 = what lies between the --- lines
+FENCE_RE = re.compile(r"^```([^\n]*)\n(.*?)\n```", re.S | re.M)  # fenced block opening at a line start; group 1 = rest of the opening line (info string), group 2 = content
+DATE_DIR_RE = re.compile(r"\d{4}-\d{2}-\d{2}")  # date-shaped name such as 2025-12-09; digits only, not validated as a calendar date
+
+def find_date(path: Path):
+    for p in path.parents:
+        if DATE_DIR_RE.fullmatch(p.name):
+            return p.name
+    return ""
+
+def parse_frontmatter(text: str):
+    m = FRONTMATTER_RE.match(text)
+    if not m:
+        return {}, text
+    fm_raw = m.group(1)
+    rest = text[m.end():]
+    tags = []
+    # simple tags parsing: look for lines under "tags:"
+    if re.search(r"^\s*tags\s*:\s*$", fm_raw, re.M):
+        in_tags = False
+        for line in fm_raw.splitlines():
+            if re.match(r"^\s*tags\s*:\s*$", line):
+                in_tags = True
+                continue
+            if in_tags:
+                mm = re.match(r"^\s*-\s*(.+)$", line)
+                if mm:
+                    tags.append(mm.group(1).strip())
+                else:
+                    break
+    # also try single-line tags: tags: [a, b]
+    if not tags:
+        m2 = re.search(r"tags\s*:\s*\[([^\]]+)\]", fm_raw)
+        if m2:
+            tags = [t.strip() for t in m2.group(1).split(",")]
+    return {"tags": tags}, rest
+
+def extract_question_answer(body: str):
+    # find first fenced block (prefer spoiler)
+    fences = list(FENCE_RE.finditer(body))
+    spoiler = None
+    for f in fences:
+        info = (f.group(1) or "").lower()
+        if "spoiler" in info:
+            spoiler = f
+            break
+    if spoiler is None and fences:
+        spoiler = fences[0]
+    if spoiler:
+        answer = spoiler.group(2).strip()
+        question = (body[:spoiler.start()]).strip()
+    else:
+        # fallback: everything is question, no answer
+        answer = ""
+        question = body.strip()
+    return question, answer
+
+def main(root: Path, out: Path):
+    rows = []
+    for md in root.rglob("*.md"):
+        rel = md.relative_to(root)
+        date = find_date(md.parent)
+        qnum = md.stem
+        text = md.read_text(encoding="utf-8")
+        fm, body = parse_frontmatter(text)
+        tags = fm.get("tags", [])
+        # choose first tag that's not biokemi or provfråga
+        category = ""
+        for t in tags:
+            if t.lower() not in ("biokemi", "provfråga"):
+                category = t
+                break
+        question, answer = extract_question_answer(body)
+        # normalize whitespace
+        question = re.sub(r"\s+", " ", question).strip()
+        answer = re.sub(r"\s+", " ", answer).strip()
+        details = f"{category}; {date} {qnum} {answer}"
+        rows.append((question, details))
+    # write CSV with semicolon delimiter and quoting
+    out.parent.mkdir(parents=True, exist_ok=True)
+    with out.open("w", encoding="utf-8", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["question", "details"])
+        for r in rows:
+            writer.writerow(r)
+
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser(description="Extract questions+answers to CSV")
+    ap.add_argument("root", nargs="?", default="content", help="root content folder (default: content)")
+    ap.add_argument("-o", "--out", default="output.csv", help="output CSV file (default: output.csv)")
+    args = ap.parse_args()
+    main(Path(args.root), Path(args.out))