vault backup: 2025-12-26 02:09:22
All checks were successful
Deploy Quartz site to GitHub Pages / build (push) Successful in 2m29s
All checks were successful
Deploy Quartz site to GitHub Pages / build (push) Successful in 2m29s
This commit is contained in:
465
stroma/quiz/utils/unified_parser.py
Normal file
465
stroma/quiz/utils/unified_parser.py
Normal file
@@ -0,0 +1,465 @@
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from quiz.utils.question_parser import Node, parse_question_from_content
|
||||
|
||||
# === REGEX PATTERNS ===

# Obsidian-style embed such as ![[image.png]] or ![[image.png|300]]:
# "![[" + shortest possible content + "]]".
EMBED_RE = re.compile(r"!\[\[.*?\]\]")

# Same embed syntax, but group 1 captures the filename (everything before the
# first "|" or "]"); an optional "|..." part is matched and ignored.
IMAGE_RE = re.compile(r"!\[\[([^|\]]+)(?:\|.*?)?\]\]")

# Lettered option at the start of a string, e.g. "A: Text" or "B. Text".
# Group 1 is the uppercase letter, group 2 the rest; the ":" / "." separator
# and the whitespace after it are both optional.
OPTION_LETTER_RE = re.compile(r"^([A-Z])[:\.]?\s*(.*)$")

# Standalone uppercase letter used in answers, e.g. "A" or "A och B".
ANSWER_LETTER_RE = re.compile(r"\b([A-Z])\b")

# Lettered sub-question marker at the start of a line: a), b), ...
SUB_QUESTION_LETTER_RE = re.compile(r"^\s*([a-z])\)", re.MULTILINE)

# Numbered sub-question marker at the start of a line: 1), 2), ...
SUB_QUESTION_NUMBER_RE = re.compile(r"^\s*(\d+)\)", re.MULTILINE)

# Numeric select range such as (1..10); groups are the start and end numbers.
SELECT_RANGE_RE = re.compile(r"\((\d+)\.\.(\d+)\)")

# Letter select range such as (A..H); groups are the start and end letters.
SELECT_LETTER_RANGE_RE = re.compile(r"\(([A-Z])\.\.([A-Z])\)")

# Select list such as (A, B, C): a parenthesised span containing at least one
# comma. Group 1 is the text between the parentheses.
SELECT_LIST_RE = re.compile(r"\(([^)]+,[^)]+)\)")

# Sub-question marker anywhere in the text (used for splitting intro text):
# a single lowercase letter or a number, followed by ")".
FIELD_MARKER_RE = re.compile(r"\b([a-z]|\d+)\)")

# Sub-question marker (a, b or 1, 2) at the start of a line, including the
# whitespace that follows it; used with split(), so group 1 (the marker id)
# is kept in the result.
SUB_QUESTION_SPLIT_RE = re.compile(r"^\s*([a-z]|\d+)\)\s*", re.MULTILINE)

# Point marker such as (0.5 p) or (1 p).
POINTS_RE = re.compile(r"\(\d+(?:\.\d+)?\s*p\)")
|
||||
|
||||
|
||||
class QuestionType(Enum):
    """Closed set of question kinds handled by the parser.

    Each value is the lowercase identifier string for the kind, so
    ``QuestionType("mcq")`` looks a member up from its identifier.
    """

    # Lettered choice questions
    MCQ = "mcq"
    SCQ = "scq"

    # Pairing / option-list questions
    MATCHING = "matching"
    TEXTALTERNATIV = "textalternativ"

    # Free-input fields
    TEXTFÄLT = "textfält"
    SIFFERFÄLT = "sifferfält"

    # Image-based and composite questions
    HOTSPOT = "hotspot"
    SAMMANSATT = "sammansatt"
    DND_TEXT = "dnd-text"
    DND_BILD = "dnd-bild"

    # True / false statements
    SANT_FALSKT = "sant-falskt"
|
||||
|
||||
|
||||
@dataclass
|
||||
class SubQuestion:
|
||||
id: str # 'a', 'b', etc.
|
||||
text: str # Text for this part
|
||||
answer: Any = None
|
||||
options: list[str] | None = None # None if text input
|
||||
|
||||
|
||||
@dataclass
class QuestionData:
    """Parsed representation of one question.

    Only ``type``, ``question`` and ``answer`` are required; every other
    field has a default, and the container fields get a fresh object per
    instance.
    """

    # Kind of question.
    type: QuestionType
    # Question text.
    question: str
    # Parsed answer: str | list[str] | list[list[str]]
    answer: Any
    # Total sub-questions (a, b, c...).
    num_questions: int = 1
    # Completeness flag; False until something sets it.
    is_complete: bool = False
    # Answer options; empty when there are none.
    options: list[str] = field(default_factory=list)
    # Image for the question, if any.
    image: str | None = None
    # Image belonging to the answer, if any.
    answer_image: str | None = None
    # Instruction text, if any.
    instruction: str | None = None
    # Metadata mapping attached to the question.
    metadata: dict = field(default_factory=dict)
    # Individual parts of the question.
    sub_questions: list[SubQuestion] = field(default_factory=list)
|
||||
|
||||
|
||||
class UnifiedParser:
|
||||
def __init__(self, content: str):
|
||||
self.content = content
|
||||
self.parsed = parse_question_from_content(content)
|
||||
self.metadata = self.parsed.metadata
|
||||
self.nodes = self.parsed.nodes
|
||||
|
||||
# Pre-extract common fields
|
||||
self.type = self._extract_type()
|
||||
self.question = self._extract_question_text()
|
||||
self.instruction = self._extract_instruction()
|
||||
self.image = self._extract_image()
|
||||
self.num_questions = self._count_sub_questions()
|
||||
|
||||
def parse(self) -> QuestionData:
|
||||
match self.type:
|
||||
case QuestionType.MCQ | QuestionType.SCQ:
|
||||
data = self._parse_choice_question()
|
||||
case QuestionType.MATCHING:
|
||||
data = self._create_question(
|
||||
answer=self._extract_answer_pairs(),
|
||||
options=self._extract_bullet_list_options()
|
||||
)
|
||||
case QuestionType.TEXTALTERNATIV:
|
||||
data = self._create_question(
|
||||
answer=self._extract_raw_answer(),
|
||||
options=self._extract_bullet_list_options()
|
||||
)
|
||||
case QuestionType.TEXTFÄLT:
|
||||
data = self._parse_text_field()
|
||||
case QuestionType.SIFFERFÄLT:
|
||||
data = self._create_question(answer=self._extract_raw_answer())
|
||||
case QuestionType.HOTSPOT:
|
||||
data = self._parse_hotspot()
|
||||
case QuestionType.SAMMANSATT:
|
||||
data = self._create_question(answer=self._extract_answer_lines())
|
||||
case QuestionType.DND_TEXT:
|
||||
data = self._create_question(answer=self._extract_answer_lines())
|
||||
case QuestionType.DND_BILD:
|
||||
data = self._create_question(answer=self._extract_answer_lines())
|
||||
case QuestionType.SANT_FALSKT:
|
||||
data = self._create_question(answer=self._extract_answer_pairs())
|
||||
case _:
|
||||
raise ValueError(f"Unsupported question type: {self.type}")
|
||||
|
||||
data.num_questions = self.num_questions
|
||||
data.sub_questions = self._extract_sub_questions(data)
|
||||
data.is_complete = self._check_completeness(data)
|
||||
return data
|
||||
|
||||
def _check_completeness(self, data: QuestionData) -> bool:
|
||||
"""Verify if the answer is complete (no TODOs, matches sub-question count)."""
|
||||
content = self._extract_raw_answer()
|
||||
if not content or "TODO" in content:
|
||||
return False
|
||||
|
||||
# If we have sub-questions, ensure we have enough answer lines/parts
|
||||
if data.num_questions > 1:
|
||||
if isinstance(data.answer, list):
|
||||
if data.type in [QuestionType.MCQ, QuestionType.SCQ]:
|
||||
return len(data.answer) > 0
|
||||
return len(data.answer) >= data.num_questions
|
||||
else:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _count_sub_questions(self) -> int:
|
||||
"""Count sub-questions like a), b), c) or 1), 2) in the question text."""
|
||||
md_content = self.parsed.raw_content
|
||||
|
||||
# Count lettered sub-questions: a), b), c)...
|
||||
letters = SUB_QUESTION_LETTER_RE.findall(md_content)
|
||||
if letters:
|
||||
unique_letters = sorted(list(set(letters)))
|
||||
if "a" in unique_letters:
|
||||
max_letter = max(unique_letters)
|
||||
return ord(max_letter) - ord("a") + 1
|
||||
|
||||
# Count numbered sub-questions: 1), 2), 3)...
|
||||
numbers = SUB_QUESTION_NUMBER_RE.findall(md_content)
|
||||
if numbers:
|
||||
unique_numbers = sorted(list(set(map(int, numbers))))
|
||||
if 1 in unique_numbers:
|
||||
return max(unique_numbers)
|
||||
|
||||
return 1
|
||||
|
||||
def _create_question(
|
||||
self,
|
||||
answer: Any,
|
||||
options: list[str] = None,
|
||||
answer_image: str | None = None
|
||||
) -> QuestionData:
|
||||
"""Create a QuestionData object with common fields pre-populated."""
|
||||
return QuestionData(
|
||||
type=self.type,
|
||||
question=self.question,
|
||||
answer=answer,
|
||||
options=options or [],
|
||||
image=self.image,
|
||||
answer_image=answer_image,
|
||||
instruction=self.instruction,
|
||||
metadata=self.metadata
|
||||
)
|
||||
|
||||
# === Extraction Helpers ===
|
||||
|
||||
def _extract_type(self) -> QuestionType:
|
||||
tags = self.metadata.get("tags", [])
|
||||
for tag in tags:
|
||||
if tag.startswith("frågetyp/"):
|
||||
type_str = tag.split("/", 1)[1]
|
||||
try:
|
||||
return QuestionType(type_str)
|
||||
except ValueError:
|
||||
continue
|
||||
return QuestionType.MCQ # Default
|
||||
|
||||
def _extract_question_text(self) -> str:
|
||||
texts = []
|
||||
for node in self.nodes:
|
||||
if node.type == "paragraph":
|
||||
text = node.text.strip()
|
||||
# Skip instructions
|
||||
if text.startswith("Välj") and "alternativ" in text:
|
||||
continue
|
||||
|
||||
# If paragraph contains a sub-question marker, stop there
|
||||
# We use a more liberal search here because mistune might have joined lines
|
||||
first_marker = FIELD_MARKER_RE.search(text)
|
||||
|
||||
if first_marker:
|
||||
text = text[:first_marker.start()].strip()
|
||||
if text:
|
||||
# Only add if it doesn't look like an instruction we already skipped
|
||||
if not (text.startswith("Välj") and "alternativ" in text):
|
||||
texts.append(text)
|
||||
break # Stop collecting intro text once we hit a sub-question
|
||||
|
||||
# Clean and collect
|
||||
text = EMBED_RE.sub("", text).strip()
|
||||
text = text.replace("**", "")
|
||||
if text:
|
||||
texts.append(text)
|
||||
return "\n".join(texts)
|
||||
|
||||
def _extract_instruction(self) -> str | None:
|
||||
for node in self.nodes:
|
||||
if node.type == "paragraph":
|
||||
text = node.text.strip()
|
||||
if "Välj" in text and "alternativ" in text:
|
||||
return text.replace("**", "")
|
||||
return None
|
||||
|
||||
def _extract_image(self) -> str | None:
|
||||
for node in self.nodes:
|
||||
# Check for direct embed nodes
|
||||
if node.type == "embed":
|
||||
return f"![[{node.attrs['filename']}]]"
|
||||
|
||||
# Check inside paragraphs/lists for inline embeds
|
||||
if node.type in ["paragraph", "list"]:
|
||||
for child in node.children:
|
||||
if child.type == "embed":
|
||||
return f"![[{child.attrs['filename']}]]"
|
||||
|
||||
if node.raw:
|
||||
match = IMAGE_RE.search(node.raw)
|
||||
if match:
|
||||
return f"![[{match.group(1)}]]"
|
||||
return None
|
||||
|
||||
def _extract_sub_questions(self, data: QuestionData) -> list[SubQuestion]:
|
||||
# Only split the text BEFORE the spoiler block to avoid misidentifying markers in answers
|
||||
full_raw = self.parsed.raw_content
|
||||
parts = full_raw.split("```", 1)
|
||||
question_portion = parts[0]
|
||||
|
||||
# Split by sub-question markers at the start of lines: a), b) or 1), 2)
|
||||
segments = SUB_QUESTION_SPLIT_RE.split(question_portion)[1:]
|
||||
|
||||
sub_questions = []
|
||||
# segments will be [id1, text1, id2, text2, ...]
|
||||
for i in range(0, len(segments), 2):
|
||||
q_id = segments[i]
|
||||
q_full_text = segments[i+1].strip()
|
||||
|
||||
# Extract options if any (for select fields)
|
||||
options = self._extract_select_options(q_full_text)
|
||||
|
||||
# Clean text (remove point markers like (0.5 p) and select patterns)
|
||||
clean_text = SELECT_RANGE_RE.sub("", q_full_text)
|
||||
clean_text = SELECT_LETTER_RANGE_RE.sub("", clean_text)
|
||||
clean_text = SELECT_LIST_RE.sub("", clean_text)
|
||||
clean_text = POINTS_RE.sub("", clean_text).strip()
|
||||
|
||||
# Extract answer for this part
|
||||
answer = None
|
||||
if isinstance(data.answer, list) and i//2 < len(data.answer):
|
||||
answer = data.answer[i//2]
|
||||
elif isinstance(data.answer, str):
|
||||
lines = [l.strip() for l in data.answer.split("\n") if l.strip()]
|
||||
if i//2 < len(lines):
|
||||
answer = lines[i//2]
|
||||
elif data.num_questions == 1:
|
||||
answer = data.answer
|
||||
|
||||
sub_questions.append(SubQuestion(
|
||||
id=q_id,
|
||||
text=clean_text,
|
||||
answer=answer,
|
||||
options=options
|
||||
))
|
||||
|
||||
return sub_questions
|
||||
|
||||
def _extract_select_options(self, text: str) -> list[str] | None:
|
||||
"""Extract options from patterns like (1..10), (A..D), or (A, B, C)."""
|
||||
# Numerical range (1..10)
|
||||
match = SELECT_RANGE_RE.search(text)
|
||||
if match:
|
||||
start, end = map(int, match.groups())
|
||||
return [str(x) for x in range(start, end + 1)]
|
||||
|
||||
# Letter range (A..H)
|
||||
match = SELECT_LETTER_RANGE_RE.search(text)
|
||||
if match:
|
||||
start, end = match.groups()
|
||||
return [chr(x) for x in range(ord(start), ord(end) + 1)]
|
||||
|
||||
# Comma-separated list (A, B, C)
|
||||
match = SELECT_LIST_RE.search(text)
|
||||
if match:
|
||||
items = match.group(1).split(",")
|
||||
return [item.strip() for item in items]
|
||||
|
||||
return None
|
||||
|
||||
def _extract_lettered_options(self) -> list[str]:
|
||||
options = []
|
||||
for node in self.nodes:
|
||||
if node.type == "list":
|
||||
for item in node.children:
|
||||
item_text = item.text.strip()
|
||||
if OPTION_LETTER_RE.match(item_text):
|
||||
options.append(item_text)
|
||||
return options
|
||||
|
||||
def _extract_bullet_list_options(self) -> list[str]:
|
||||
options = []
|
||||
for node in self.nodes:
|
||||
if node.type == "list":
|
||||
for item in node.children:
|
||||
options.append(item.text.strip())
|
||||
return options
|
||||
|
||||
def _extract_raw_answer(self) -> str:
|
||||
for node in self.nodes:
|
||||
if node.type == "block_code" and node.attrs.get("info") == "spoiler-block:":
|
||||
return node.raw.strip()
|
||||
return ""
|
||||
|
||||
def _extract_answer_letters(self) -> list[str]:
|
||||
content = self._extract_raw_answer()
|
||||
if not content or content == "TODO":
|
||||
return []
|
||||
return ANSWER_LETTER_RE.findall(content)
|
||||
|
||||
def _extract_answer_lines(self) -> list[str]:
|
||||
content = self._extract_raw_answer()
|
||||
if not content or content == "TODO":
|
||||
return []
|
||||
return [line.strip() for line in content.split("\n") if line.strip()]
|
||||
|
||||
def _extract_answer_pairs(self) -> list[list[str]]:
|
||||
lines = self._extract_answer_lines()
|
||||
pairs = []
|
||||
for line in lines:
|
||||
if ":" in line:
|
||||
key, value = line.split(":", 1)
|
||||
pairs.append([key.strip(), value.strip()])
|
||||
return pairs
|
||||
|
||||
# === Question Type Handlers ===
|
||||
|
||||
def _parse_choice_question(self) -> QuestionData:
|
||||
answer_letters = self._extract_answer_letters()
|
||||
if self.type == QuestionType.MCQ:
|
||||
answer = answer_letters
|
||||
else:
|
||||
answer = answer_letters[0] if answer_letters else ""
|
||||
|
||||
return self._create_question(
|
||||
answer=answer,
|
||||
options=self._extract_lettered_options()
|
||||
)
|
||||
|
||||
def _parse_text_field(self) -> QuestionData:
|
||||
lines = self._extract_answer_lines()
|
||||
return self._create_question(
|
||||
answer=lines if len(lines) > 1 else (lines[0] if lines else "")
|
||||
)
|
||||
|
||||
|
||||
def _parse_hotspot(self) -> QuestionData:
|
||||
content = self._extract_raw_answer()
|
||||
answer_image = None
|
||||
|
||||
match = IMAGE_RE.search(content)
|
||||
if match:
|
||||
answer_image = f"![[{match.group(1)}]]"
|
||||
answer_text = EMBED_RE.sub("", content).strip()
|
||||
else:
|
||||
answer_text = content
|
||||
|
||||
return self._create_question(
|
||||
answer=answer_text,
|
||||
answer_image=answer_image
|
||||
)
|
||||
Reference in New Issue
Block a user