All checks were successful
Deploy Quartz site to GitHub Pages / build (push) Successful in 1m34s
172 lines
7.1 KiB
Python
172 lines
7.1 KiB
Python
# python
|
|
#!/usr/bin/env python3
|
|
import re
|
|
import csv
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
from markdown import markdown
|
|
|
|
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.S)
|
|
FENCE_RE = re.compile(r"^```([^\n]*)\n(.*?)\n```", re.S | re.M)
|
|
DATE_DIR_RE = re.compile(r"\d{4}-\d{2}-\d{2}")
|
|
|
|
def find_date(path: Path) -> str:
    """Return the name of the closest directory in *path*'s chain that is
    named like YYYY-MM-DD, or "" when none is.

    Fix: the original iterated only ``path.parents``, which excludes *path*
    itself — so for a layout like ``content/2024-05-01/1.md`` (the caller
    passes the file's parent directory) the date directory was never
    checked. The path itself is now examined first, then its ancestors.
    """
    for candidate in (path, *path.parents):
        # Same pattern as the module-level DATE_DIR_RE; inlined here so the
        # function is self-contained (re caches compiled patterns).
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", candidate.name):
            return candidate.name
    return ""
|
|
|
|
def parse_frontmatter(text: str):
    """Split *text* into (metadata, body).

    Recognizes a leading YAML-ish frontmatter block delimited by ``---``
    lines and extracts a small fixed set of fields with lightweight regex
    parsing (deliberately no YAML dependency).

    Returns:
        A 2-tuple of (dict with keys "tags" (list of str), "date" (str),
        "maxpoints" (str)) and the text following the frontmatter. When no
        frontmatter is present, empty defaults and the original text are
        returned unchanged.
    """
    m = re.match(r"^---\s*\n(.*?)\n---\s*\n", text, re.S)
    if not m:
        return {"tags": [], "date": "", "maxpoints": ""}, text
    fm_raw = m.group(1)
    rest = text[m.end():]
    return {
        "tags": _parse_tags(fm_raw),
        "date": _parse_scalar(fm_raw, r"^date\s*:\s*(.+)$"),
        # Accept either spelling of the points field.
        "maxpoints": _parse_scalar(fm_raw, r"^(?:max_points|maxpoints)\s*:\s*(.+)$"),
    }, rest


def _parse_tags(fm_raw: str) -> list:
    """Extract tags from raw frontmatter text.

    Handles the two layouts used in this content tree: a block list
    ("tags:" followed by "- item" lines) and an inline list
    ("tags: [a, b]"). The block form wins when both are present.
    """
    tags = []
    # Block-list form: a bare "tags:" line followed by "- item" lines.
    if re.search(r"^\s*tags\s*:\s*$", fm_raw, re.M):
        in_tags = False
        for line in fm_raw.splitlines():
            if re.match(r"^\s*tags\s*:\s*$", line):
                in_tags = True
                continue
            if in_tags:
                mm = re.match(r"^\s*-\s*(.+)$", line)
                if mm:
                    tags.append(mm.group(1).strip())
                else:
                    # First non-item line ends the list.
                    break
    if not tags:
        # Inline form: first '[' ... ']' pair after the "tags:" token.
        # NOTE(review): the bracket search is not anchored to the tags line,
        # so a '[' belonging to a later field could be picked up — behavior
        # kept as in the original.
        idx = fm_raw.find("tags:")
        if idx != -1:
            br_start = fm_raw.find("[", idx)
            br_end = fm_raw.find("]", br_start + 1) if br_start != -1 else -1
            if br_start != -1 and br_end != -1:
                inner = fm_raw[br_start + 1:br_end]
                tags = [t.strip().strip('"\'') for t in inner.split(",") if t.strip()]
    return tags


def _parse_scalar(fm_raw: str, pattern: str) -> str:
    """Return the first multiline match of *pattern* in *fm_raw*, stripped
    of surrounding whitespace and quotes; "" when absent."""
    m = re.search(pattern, fm_raw, re.M)
    return m.group(1).strip().strip('"\'') if m else ""
|
|
|
|
def extract_question_answer(body: str):
    """Split a note body into (question, answer).

    The answer comes from the first fenced code block whose info string
    contains "spoiler"; if no such fence exists, the first fence of any
    kind is used. Text preceding the chosen fence is the question. With no
    fences at all, the whole body is the question and the answer is empty.
    """
    chosen = None
    first_fence = None
    for match in re.finditer(r"^```([^\n]*)\n(.*?)\n```", body, re.S | re.M):
        if first_fence is None:
            first_fence = match
        if "spoiler" in (match.group(1) or "").lower():
            chosen = match
            break
    if chosen is None:
        chosen = first_fence
    if chosen:
        return body[:chosen.start()].strip(), chosen.group(2).strip()
    # No fenced block: everything is question, no answer.
    return body.strip(), ""
|
|
|
|
|
|
def _plain_text_from_md(md_text: str) -> str:
|
|
"""Simple cleanup to produce plain text from markdown for CSV/Excel export.
|
|
Removes fenced code blocks, basic markdown punctuation, list markers, and collapses whitespace.
|
|
This is intentionally lightweight (not a full markdown->text renderer) but good enough for spreadsheet import.
|
|
"""
|
|
# remove fenced code blocks
|
|
txt = re.sub(r"```.*?```", "", md_text, flags=re.S)
|
|
# remove inline code markers and emphasis/headers
|
|
txt = re.sub(r"[#*_`]+", "", txt)
|
|
# remove list markers at start of lines
|
|
txt = re.sub(r"^\s*-\s+", "", txt, flags=re.M)
|
|
# collapse whitespace to single spaces
|
|
txt = re.sub(r"\s+", " ", txt)
|
|
return txt.strip()
|
|
|
|
|
|
def main(root: Path, out: Path, mode: str = "anki"):
    """Walk *root* for question markdown files and export them to *out* as a
    semicolon-delimited CSV.

    mode="anki":  rows are (question_html, answer_html + metadata, category,
                  deck_name), preceded by an Anki "#deck column:4" file
                  directive so Anki reads the deck name from column 4.
    mode="excel": rows are plain text in the order
                  date;question_number;category;question;maxpoints;question;answer
                  with a matching header row.

    Bug fix: the "#deck column:4" directive was previously written in BOTH
    modes, leaving a stray Anki-only line at the top of the Excel export;
    it is now written only for mode="anki".
    """
    rows = []
    for md in root.rglob("*.md"):
        # Question files are named by number ("1.md", "12.md", ...);
        # anything with a longer stem is an ordinary note — skip it.
        if len(md.stem) > 2:
            continue
        text = md.read_text(encoding="utf-8")
        fm, body = parse_frontmatter(text)
        # Prefer an explicit frontmatter date; fall back to a YYYY-MM-DD
        # ancestor directory name.
        date = fm.get("date") or find_date(md.parent)
        qnum = md.stem
        tags = fm.get("tags", [])
        maxpoints = fm.get("maxpoints", "")
        # Category = first tag that is not one of the generic tree tags.
        category = ""
        for t in tags:
            if t and t.lower() not in ("biokemi", "provfråga"):
                category = t
                break
        question, answer = extract_question_answer(body)
        # Keep original markdown (preserving line breaks) so it renders properly.
        question_md = question.strip()
        answer_md = answer.strip()
        # Strip boilerplate headings left over from the note template.
        for boilerplate in ("**Uppgift**", "**Rätt svar**", "**Svar**", "**Answer**"):
            question_md = question_md.replace(boilerplate, "")

        if mode == "excel":
            # Plain-text (no HTML) row. Column order:
            # date;question_number;category;question;maxpoints;question;answer
            # (the question column is intentionally duplicated).
            q_plain = _plain_text_from_md(question_md)
            a_plain = _plain_text_from_md(answer_md)
            rows.append((date or "", qnum, category, q_plain, maxpoints, q_plain, a_plain))
            continue

        # Render question and answer markdown to HTML with common extensions.
        question_html = markdown(question_md, extensions=["fenced_code", "tables"])
        answer_html = markdown(answer_md, extensions=["fenced_code", "tables"])
        # Metadata as simple HTML paragraphs so the CSV consumer can display it.
        meta_html = f"<p>kategory: {category}</p><p>prov: {date}</p><p>fråga: {qnum}</p>"
        # Second column: rendered answer followed by the metadata HTML.
        details = answer_html + "\n\n" + meta_html
        # Anki deck name: use the category when present, otherwise "?".
        deck_suffix = category if category else "?"
        deck_name = f"Biokemi::Johan D - Tentafrågor::{deck_suffix.capitalize()}"
        # Deck name goes in column 4 (see the "#deck column:4" directive below).
        rows.append((question_html, details, category, deck_name))

    # Write the CSV with a semicolon delimiter and full quoting.
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8", newline="") as f:
        if mode == "excel":
            writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_ALL)
            # Header row matching the excel columns. (The Anki deck
            # directive is deliberately NOT written in this mode.)
            writer.writerow(["date", "question_number", "category", "question", "maxpoints", "question", "answer"])
        else:
            # Anki file directive: read the deck name from column 4.
            f.write("#deck column:4\n")
            writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_ALL)
        writer.writerows(rows)
|
|
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: extract questions + answers from a content tree to CSV.
    parser = argparse.ArgumentParser(description="Extract questions+answers to CSV")
    parser.add_argument("root", nargs="?", default="content",
                        help="root content folder (default: content)")
    parser.add_argument("-o", "--out", default="output.csv",
                        help="output CSV file (default: output.csv)")
    parser.add_argument("--mode", choices=["anki", "excel"], default="anki",
                        help="output mode: 'anki' (default) or 'excel' (date;question_number;question;category;maxpoints;question;answer)")
    args = parser.parse_args()
    main(Path(args.root), Path(args.out), args.mode)
|