1
0

vault backup: 2025-12-09 21:46:11
All checks were successful
Deploy Quartz site to GitHub Pages / build (push) Successful in 1m20s

This commit is contained in:
2025-12-09 21:46:11 +01:00
parent a2f484b3af
commit 9e3b6b2cd3
48 changed files with 717 additions and 145 deletions

98
wip/extract-provfråga.py Normal file
View File

@@ -0,0 +1,98 @@
#!/usr/bin/env python3
import re
import csv
import argparse
from pathlib import Path
# Leading YAML front matter: an opening `---` line, the raw front-matter text
# (group 1, without its final newline; may be empty) and a closing `---` line.
# The closing delimiter is anchored to a line start and may be the very last
# line of the text, so empty blocks and files without a trailing newline match.
FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?)\n?^---\s*(?:\n|\Z)", re.S | re.M)
# Fenced block: group 1 is the info string after the opening backticks,
# group 2 is the content (it may end with a newline; callers strip it).
# The closing fence is anchored to a line start so that an empty block closes
# on its own fence instead of running on to the next fence in the document.
FENCE_RE = re.compile(r"^```([^\n]*)\n(.*?)^```", re.S | re.M)
# A path component that is exactly an ISO-style date, e.g. "2025-12-09".
DATE_DIR_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


def find_date(path: Path) -> str:
    """Return the nearest date-named component of *path*, or "" if none.

    The final component of *path* is checked first, then every ancestor from
    the closest outward.  Including *path* itself matters when a directory
    that is named like a date is passed in: ``Path.parents`` never contains
    the path it is taken from, so that directory would otherwise be skipped.
    """
    for candidate in (path, *path.parents):
        if DATE_DIR_RE.fullmatch(candidate.name):
            return candidate.name
    return ""
def parse_frontmatter(text: str):
    """Split leading YAML front matter from *text* and extract its tags.

    Only the ``tags`` key is interpreted, in two layouts:

    * a block list: a ``tags:`` line followed by ``- item`` lines (the first
      line that is not an item ends the list);
    * an inline list ``tags: [a, b]``, tried only when the block form yielded
      no tags.

    Surrounding quotes are removed from each tag and empty entries dropped.

    Returns:
        ``({"tags": [...]}, body)`` when front matter is present, where
        *body* is the text after the front matter; ``({}, text)`` otherwise.
    """
    m = FRONTMATTER_RE.match(text)
    if not m:
        return {}, text
    fm_raw = m.group(1)
    rest = text[m.end():]

    def clean(raw: str) -> str:
        # YAML scalars may be quoted; the quotes are not part of the tag.
        return raw.strip().strip("\"'").strip()

    tags = []
    in_tags = False
    for line in fm_raw.splitlines():
        if re.match(r"^\s*tags\s*:\s*$", line):
            in_tags = True
            continue
        if not in_tags:
            continue
        item = re.match(r"^\s*-\s*(.+)$", line)
        if item is None:
            break
        tags.append(clean(item.group(1)))
    tags = [t for t in tags if t]

    if not tags:
        # Anchor the key to the start of a line so that keys merely ending in
        # "tags" (e.g. "subtags: [...]") are not mistaken for it.
        inline = re.search(r"^\s*tags\s*:\s*\[([^\]]*)\]", fm_raw, re.M)
        if inline:
            candidates = (clean(part) for part in inline.group(1).split(","))
            tags = [t for t in candidates if t]
    return {"tags": tags}, rest
def extract_question_answer(body: str):
    """Split *body* into ``(question, answer)`` around one fenced block.

    The answer is the stripped content of the first fence whose info string
    contains "spoiler" (compared in lower case); if no fence qualifies, the
    first fence of any kind is used.  The question is the stripped text that
    precedes the chosen fence.  When *body* has no fence at all, the whole
    stripped body is the question and the answer is the empty string.
    """
    blocks = list(FENCE_RE.finditer(body))
    if not blocks:
        # nothing fenced: treat everything as the question
        return body.strip(), ""
    chosen = next(
        (blk for blk in blocks if "spoiler" in (blk.group(1) or "").lower()),
        blocks[0],
    )
    return body[:chosen.start()].strip(), chosen.group(2).strip()
def main(root: Path, out: Path) -> None:
    """Collect question/answer pairs from Markdown notes into a CSV file.

    Every ``*.md`` file found recursively under *root* becomes one row:

    * ``question`` - the note text before the answer fence, with runs of
      whitespace collapsed to single spaces;
    * ``details`` - ``"<category>; <date> <qnum> <answer>"`` where *category*
      is the first tag other than "biokemi"/"provfråga" (empty if none),
      *date* is what ``find_date`` returns for the note's path, and *qnum*
      is the file name without its suffix.

    Files are processed in sorted path order so repeated runs give the same
    row order.  *out* is written as UTF-8 with the ``csv`` module defaults
    (comma delimiter, fields quoted only when necessary); its parent
    directory is created if missing.
    """
    excluded = ("biokemi", "provfråga")
    rows = []
    for md in sorted(root.rglob("*.md")):
        # Pass the file path itself so that the directory containing the note
        # is among the ancestors searched for a date-like name.
        date = find_date(md)
        qnum = md.stem
        text = md.read_text(encoding="utf-8")
        fm, body = parse_frontmatter(text)
        tags = fm.get("tags", [])
        # choose first tag that's not biokemi or provfråga
        category = next((t for t in tags if t.lower() not in excluded), "")
        question, answer = extract_question_answer(body)
        # normalize whitespace
        question = re.sub(r"\s+", " ", question).strip()
        answer = re.sub(r"\s+", " ", answer).strip()
        details = f"{category}; {date} {qnum} {answer}"
        rows.append((question, details))
    # write CSV using the csv module's default dialect (comma-separated)
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["question", "details"])
        writer.writerows(rows)
if __name__ == "__main__":
    # Command-line entry point: an optional content root and an -o/--out
    # target, both converted to Path objects before being handed to main().
    parser = argparse.ArgumentParser(
        description="Extract questions+answers to CSV"
    )
    parser.add_argument(
        "root",
        nargs="?",
        default="content",
        help="root content folder (default: content)",
    )
    parser.add_argument(
        "-o",
        "--out",
        default="output.csv",
        help="output CSV file (default: output.csv)",
    )
    options = parser.parse_args()
    main(Path(options.root), Path(options.out))