vault backup: 2025-12-09 22:35:53

2025-12-09 22:35:53 +01:00
parent 9e3b6b2cd3
commit 313fef0b98
24 changed files with 3894 additions and 612 deletions
--- a/wip/extract-provfråga.py
+++ b/wip/extract-provfråga.py
@@ -1,9 +1,12 @@
+# python
 #!/usr/bin/env python3
 import re
 import csv
 import argparse
 from pathlib import Path

+from markdown import markdown
+
 FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.S)
 FENCE_RE = re.compile(r"^```([^\n]*)\n(.*?)\n```", re.S | re.M)
 DATE_DIR_RE = re.compile(r"\d{4}-\d{2}-\d{2}")
@@ -17,7 +20,7 @@ def find_date(path: Path):
 def parse_frontmatter(text: str):
    m = FRONTMATTER_RE.match(text)
    if not m:
-        return {}, text
+        return {"tags": [], "date": ""}, text
    fm_raw = m.group(1)
    rest = text[m.end():]
    tags = []
@@ -36,10 +39,21 @@ def parse_frontmatter(text: str):
                    break
    # also try single-line tags: tags: [a, b]
    if not tags:
-        m2 = re.search(r"tags\s*:\s*\[([^\]]+)\]", fm_raw)
-        if m2:
-            tags = [t.strip() for t in m2.group(1).split(",")]
-    return {"tags": tags}, rest
+        # handle single-line tags like: tags: [a, b]
+        idx = fm_raw.find("tags:")
+        if idx != -1:
+            # take the first '[' found anywhere after the 'tags:' token (the search is not limited to that line) and the first ']' that follows it
+            br_start = fm_raw.find("[", idx)
+            br_end = fm_raw.find("]", br_start + 1) if br_start != -1 else -1
+            if br_start != -1 and br_end != -1:
+                inner = fm_raw[br_start+1:br_end]
+                tags = [t.strip().strip('"\'') for t in inner.split(",") if t.strip()]
+    # parse date from frontmatter if present
+    date_val = ""
+    mdate = re.search(r"^date\s*:\s*(.+)$", fm_raw, re.M)
+    if mdate:
+        date_val = mdate.group(1).strip().strip('"\'')
+    return {"tags": tags, "date": date_val}, rest

 def extract_question_answer(body: str):
    # find first fenced block (prefer spoiler)
@@ -64,29 +78,41 @@ def extract_question_answer(body: str):
 def main(root: Path, out: Path):
    rows = []
    for md in root.rglob("*.md"):
-        rel = md.relative_to(root)
-        date = find_date(md.parent)
-        qnum = md.stem
+        # only process files whose stem is at most two characters long; skip all others
+        if len(md.stem) > 2:
+            continue
        text = md.read_text(encoding="utf-8")
        fm, body = parse_frontmatter(text)
+        date = fm.get("date") or find_date(md.parent)
+        qnum = md.stem
        tags = fm.get("tags", [])
        # choose first tag that's not biokemi or provfråga
        category = ""
        for t in tags:
-            if t.lower() not in ("biokemi", "provfråga"):
+            if t and t.lower() not in ("biokemi", "provfråga"):
                category = t
                break
        question, answer = extract_question_answer(body)
-        # normalize whitespace
-        question = re.sub(r"\s+", " ", question).strip()
-        answer = re.sub(r"\s+", " ", answer).strip()
-        details = f"{category}; {date} {qnum} {answer}"
-        rows.append((question, details))
-    # write CSV with semicolon delimiter and quoting
+        # keep original markdown (preserve line breaks) so markdown can render properly
+        question_md = question.strip()
+        answer_md = answer.strip()
+
+        # Render question and answer markdown to HTML. Enable common extensions.
+        question_html = markdown(question_md, extensions=["fenced_code", "tables"])
+        answer_html = markdown(answer_md, extensions=["fenced_code", "tables"])
+
+        # metadata as simple HTML paragraphs so CSV consumer can display it
+        meta_html = f"<p>kategory: {category}</p><p>prov: {date}</p><p>fråga: {qnum}</p>"
+
+        # second column contains the rendered answer followed by metadata HTML
+        details = answer_html + "\n\n" + meta_html
+
+        rows.append((question_html, details, category))
+    # write CSV with semicolon delimiter
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["question", "details"])
+        writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_ALL)
+        #writer.writerow(["fråga", "svar", "kategori"])
        for r in rows:
            writer.writerow(r)