#!/usr/bin/env python3 import re import csv import argparse from pathlib import Path FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.S) FENCE_RE = re.compile(r"^```([^\n]*)\n(.*?)\n```", re.S | re.M) DATE_DIR_RE = re.compile(r"\d{4}-\d{2}-\d{2}") def find_date(path: Path): for p in path.parents: if DATE_DIR_RE.fullmatch(p.name): return p.name return "" def parse_frontmatter(text: str): m = FRONTMATTER_RE.match(text) if not m: return {}, text fm_raw = m.group(1) rest = text[m.end():] tags = [] # simple tags parsing: look for lines under "tags:" if re.search(r"^\s*tags\s*:\s*$", fm_raw, re.M): in_tags = False for line in fm_raw.splitlines(): if re.match(r"^\s*tags\s*:\s*$", line): in_tags = True continue if in_tags: mm = re.match(r"^\s*-\s*(.+)$", line) if mm: tags.append(mm.group(1).strip()) else: break # also try single-line tags: tags: [a, b] if not tags: m2 = re.search(r"tags\s*:\s*\[([^\]]+)\]", fm_raw) if m2: tags = [t.strip() for t in m2.group(1).split(",")] return {"tags": tags}, rest def extract_question_answer(body: str): # find first fenced block (prefer spoiler) fences = list(FENCE_RE.finditer(body)) spoiler = None for f in fences: info = (f.group(1) or "").lower() if "spoiler" in info: spoiler = f break if spoiler is None and fences: spoiler = fences[0] if spoiler: answer = spoiler.group(2).strip() question = (body[:spoiler.start()]).strip() else: # fallback: everything is question, no answer answer = "" question = body.strip() return question, answer def main(root: Path, out: Path): rows = [] for md in root.rglob("*.md"): rel = md.relative_to(root) date = find_date(md.parent) qnum = md.stem text = md.read_text(encoding="utf-8") fm, body = parse_frontmatter(text) tags = fm.get("tags", []) # choose first tag that's not biokemi or provfråga category = "" for t in tags: if t.lower() not in ("biokemi", "provfråga"): category = t break question, answer = extract_question_answer(body) # normalize whitespace question = re.sub(r"\s+", " ", question).strip() answer = re.sub(r"\s+", " ", answer).strip() details = f"{category}; {date} {qnum} {answer}" rows.append((question, details)) # write CSV with semicolon delimiter and quoting out.parent.mkdir(parents=True, exist_ok=True) with out.open("w", encoding="utf-8", newline="") as f: writer = csv.writer(f) writer.writerow(["question", "details"]) for r in rows: writer.writerow(r) if __name__ == "__main__": ap = argparse.ArgumentParser(description="Extract questions+answers to CSV") ap.add_argument("root", nargs="?", default="content", help="root content folder (default: content)") ap.add_argument("-o", "--out", default="output.csv", help="output CSV file (default: output.csv)") args = ap.parse_args() main(Path(args.root), Path(args.out))