# medical-notes/wip/extract-provfråga.py

# python
#!/usr/bin/env python3
import re
import csv
import argparse
from pathlib import Path

from markdown import markdown

# Frontmatter block at the very start of a note: an opening "---" line, the
# raw frontmatter text (group 1, matched lazily across lines), and a closing
# "---" line that must itself end with a newline.
FRONTMATTER_RE = re.compile(
    r"^---\s*\n(.*?)\n---\s*\n",
    flags=re.DOTALL,
)
# Triple-backtick fence that opens at the start of a line: group 1 is the
# rest of the opening line (the info string), group 2 is the fenced content.
FENCE_RE = re.compile(
    r"^```([^\n]*)\n(.*?)\n```",
    flags=re.DOTALL | re.MULTILINE,
)
# A path component that is exactly a YYYY-MM-DD style date, e.g. "2024-01-31".
DATE_DIR_RE = re.compile(r"\d{4}-\d{2}-\d{2}")

def find_date(path: Path) -> str:
    """Return the nearest date-named component of *path*, or "" if there is none.

    The last component of *path* is checked first, followed by each ancestor
    from nearest to farthest. ``Path.parents`` does not include the path it is
    called on, so without the explicit self-check a directory such as
    ``content/2024-01-31`` passed in directly would never be recognised.

    Args:
        path: File or directory path to inspect.

    Returns:
        The first component name that fully matches ``DATE_DIR_RE``, or the
        empty string when no component does.
    """
    for candidate in (path, *path.parents):
        if DATE_DIR_RE.fullmatch(candidate.name):
            return candidate.name
    return ""

def parse_frontmatter(text: str):
    """Split leading ``---`` frontmatter off *text* and read tags, date and max points.

    Only a small, line-oriented subset of YAML is understood; the frontmatter
    is not passed through a real YAML parser.

    Args:
        text: Full contents of a markdown note.

    Returns:
        A ``(meta, rest)`` tuple. ``meta`` is a dict with the keys ``"tags"``
        (list of str), ``"date"`` (str) and ``"maxpoints"`` (str, read from
        either ``max_points`` or ``maxpoints``); values that are absent or
        empty come back as ``[]`` / ``""``. ``rest`` is *text* with the
        frontmatter block removed, or *text* unchanged when it does not start
        with a frontmatter block.
    """
    m = FRONTMATTER_RE.match(text)
    if not m:
        return {"tags": [], "date": "", "maxpoints": ""}, text
    fm_raw = m.group(1)
    rest = text[m.end():]
    # Block-style list first; flow-style ``tags: [a, b]`` only as a fallback.
    tags = _frontmatter_block_tags(fm_raw) or _frontmatter_inline_tags(fm_raw)
    meta = {
        "tags": tags,
        "date": _frontmatter_scalar(fm_raw, "date"),
        "maxpoints": _frontmatter_scalar(fm_raw, "max_points", "maxpoints"),
    }
    return meta, rest


# ``tags: [a, b]`` anchored to the ``tags`` key itself, so a bracket list that
# belongs to a later key (or a key merely ending in "tags") is never mistaken
# for the tag list. The ``\s*`` after the colon lets the list start on the
# following line.
_INLINE_TAGS_RE = re.compile(r"^[ \t]*tags[ \t]*:\s*\[(.*?)\]", re.M | re.S)


def _unquote(value: str) -> str:
    """Trim surrounding whitespace, then any leading/trailing quote characters."""
    return value.strip().strip('"\'')


def _frontmatter_block_tags(fm_raw: str) -> list:
    """Collect the ``- item`` lines that directly follow a bare ``tags:`` line."""
    tags = []
    in_tags = False
    for line in fm_raw.splitlines():
        if re.match(r"^\s*tags\s*:\s*$", line):
            in_tags = True
            continue
        if not in_tags:
            continue
        item = re.match(r"^\s*-\s*(.+)$", line)
        if item is None:
            # The first non-item line ends the list.
            break
        tag = _unquote(item.group(1))
        if tag:
            tags.append(tag)
    return tags


def _frontmatter_inline_tags(fm_raw: str) -> list:
    """Parse a flow-style ``tags: [a, b]`` entry; return [] when there is none."""
    found = _INLINE_TAGS_RE.search(fm_raw)
    if found is None:
        return []
    candidates = (_unquote(part) for part in found.group(1).split(","))
    return [tag for tag in candidates if tag]


def _frontmatter_scalar(fm_raw: str, *keys: str) -> str:
    """Return the unquoted same-line value of the first line using one of *keys*.

    Only spaces and tabs are skipped around the colon. Skipping ``\\s`` would
    also skip the newline, so an empty ``date:`` would swallow the next line.
    """
    key_alt = "|".join(re.escape(key) for key in keys)
    found = re.search(rf"^(?:{key_alt})[ \t]*:[ \t]*(.+)$", fm_raw, re.M)
    return _unquote(found.group(1)) if found else ""

def extract_question_answer(body: str):
    """Split a note body into ``(question, answer)`` at a fenced block.

    The first fence whose info string contains "spoiler" (case-insensitive)
    is used; if there is none, the first fence of any kind is used. The
    answer is that fence's content and the question is everything before the
    fence, both stripped. Without any fence the whole stripped body is the
    question and the answer is "".
    """
    all_fences = list(FENCE_RE.finditer(body))
    chosen = next(
        (fence for fence in all_fences if "spoiler" in (fence.group(1) or "").lower()),
        all_fences[0] if all_fences else None,
    )
    if chosen is None:
        # No fenced block at all: nothing to treat as the answer.
        return body.strip(), ""
    return body[:chosen.start()].strip(), chosen.group(2).strip()


def _plain_text_from_md(md_text: str) -> str:
    """Flatten markdown into one line of plain text for CSV/Excel cells.

    A deliberately small cleanup rather than a markdown renderer: fenced code
    blocks are dropped, the characters ``#``, ``*``, ``_`` and backtick are
    removed, leading ``-`` list markers are stripped, and every run of
    whitespace becomes a single space.
    """
    # Applied strictly in this order; each entry is (pattern, replacement, flags).
    substitutions = (
        (r"```.*?```", "", re.S),   # fenced code blocks, content included
        (r"[#*_`]+", "", 0),        # header / emphasis / inline-code markers
        (r"^\s*-\s+", "", re.M),    # list markers at the start of a line
        (r"\s+", " ", 0),           # collapse whitespace, newlines included
    )
    plain = md_text
    for pattern, replacement, flags in substitutions:
        plain = re.sub(pattern, replacement, plain, flags=flags)
    return plain.strip()


def main(root: Path, out: Path, mode: str = "anki"):
    """Collect question notes under *root* and write them to *out* as CSV.

    Every ``*.md`` file below *root* whose stem is at most two characters
    long is read as one question. Tags, date and max points come from its
    frontmatter (the date falls back to a date-named folder on the note's
    path), and the body is split into question and answer by
    ``extract_question_answer``.

    Args:
        root: Directory searched recursively for markdown notes.
        out: CSV file to write (semicolon-delimited, every field quoted);
            missing parent directories are created.
        mode: ``"excel"`` writes a header row followed by plain-text rows
            ``date;question_number;category;question;maxpoints;question;answer``.
            Any other value (default ``"anki"``) writes header-less rows of
            ``question HTML; answer HTML + metadata HTML; category``.
    """
    rows = []
    # Sort the matches so the row order is reproducible between runs instead
    # of following whatever order the directory walk happens to produce.
    for md in sorted(root.rglob("*.md")):
        # Files with a stem longer than two characters are skipped;
        # presumably only question-number files such as "7.md" are notes.
        if len(md.stem) > 2:
            continue
        fm, body = parse_frontmatter(md.read_text(encoding="utf-8"))
        # Hand find_date the note path itself so that the folder directly
        # containing the note is among the ancestors it can match.
        date = fm.get("date") or find_date(md)
        qnum = md.stem
        maxpoints = fm.get("maxpoints", "")
        # Category is the first non-empty tag other than "biokemi"/"provfråga".
        category = next(
            (t for t in fm.get("tags", []) if t and t.lower() not in ("biokemi", "provfråga")),
            "",
        )
        question, answer = extract_question_answer(body)
        # Keep the markdown (line breaks included) so it can still be rendered.
        question_md = question.strip()
        answer_md = answer.strip()
        # Remove the bold section labels from the question text.
        for label in ("**Uppgift**", "**Rätt svar**", "**Svar**", "**Answer**"):
            question_md = question_md.replace(label, "")

        if mode == "excel":
            q_plain = _plain_text_from_md(question_md)
            a_plain = _plain_text_from_md(answer_md)
            # The question text is written twice (4th and 6th column),
            # matching the header row emitted below.
            rows.append((date or "", qnum, category, q_plain, maxpoints, q_plain, a_plain))
            continue

        # Anki mode: render question and answer markdown to HTML.
        question_html = markdown(question_md, extensions=["fenced_code", "tables"])
        answer_html = markdown(answer_md, extensions=["fenced_code", "tables"])
        # Metadata as plain HTML paragraphs, appended after the rendered answer.
        meta_html = f"<p>kategory: {category}</p><p>prov: {date}</p><p>fråga: {qnum}</p>"
        rows.append((question_html, answer_html + "\n\n" + meta_html, category))

    out.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line-ending handling to the csv writer.
    with out.open("w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_ALL)
        # Only the Excel export gets a header row; Anki rows are written bare.
        if mode == "excel":
            writer.writerow(["date", "question_number", "category", "question", "maxpoints", "question", "answer"])
        writer.writerows(rows)


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the export.
    parser = argparse.ArgumentParser(description="Extract questions+answers to CSV")
    parser.add_argument(
        "root",
        nargs="?",
        default="content",
        help="root content folder (default: content)",
    )
    parser.add_argument(
        "-o",
        "--out",
        default="output.csv",
        help="output CSV file (default: output.csv)",
    )
    # The column list in this help text mirrors the header row that main()
    # writes in excel mode (category is the third column).
    parser.add_argument(
        "--mode",
        choices=["anki", "excel"],
        default="anki",
        help="output mode: 'anki' (default) or 'excel' (date;question_number;category;question;maxpoints;question;answer)",
    )
    args = parser.parse_args()
    main(Path(args.root), Path(args.out), args.mode)