vault backup: 2025-12-26 02:09:22

2025-12-26 02:09:22 +01:00
parent 3fddadfe50
commit 50366b9b9c
288 changed files with 58893 additions and 750 deletions
--- a/stroma/quiz/utils/unified_parser.py
+++ b/stroma/quiz/utils/unified_parser.py
@@ -0,0 +1,465 @@
+import re
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+from quiz.utils.question_parser import Node, parse_question_from_content
+
+# === REGEX PATTERNS ===
+# All patterns are compiled once at import time and shared by UnifiedParser.
+
+# Matches a whole Obsidian-style embed such as ![[image.png]] or
+# ![[image.png|300]]. The body is non-greedy, so a match ends at the first
+# "]]". No capture group: this pattern is for locating/removing embeds.
+EMBED_RE = re.compile(
+    r"!\[\["      # Start of embed
+    r".*?"        # Content (filename and optional pipes)
+    r"\]\]"       # End of embed
+)
+
+# Captures the embed target (group 1) from an Obsidian embed, ignoring an
+# optional "|..." suffix such as a width.
+IMAGE_RE = re.compile(
+    r"!\[\["      # Start of embed
+    r"([^|\]]+)"  # Group 1: Filename (everything before | or ])
+    r"(?:\|.*?)?" # Optional dimension part starting with |
+    r"\]\]"       # End of embed
+)
+
+# Matches lettered options at the start of a string, e.g. "A: Text" or
+# "B. Text". Group 1 is the letter, group 2 the remaining text.
+# NOTE(review): the punctuation and the whitespace are both optional, so any
+# single-line text that merely begins with an uppercase A-Z letter (for
+# example "Apple") matches as well. Neither MULTILINE nor DOTALL is set, so
+# text containing an inner newline does not match.
+OPTION_LETTER_RE = re.compile(
+    r"^([A-Z])"   # Group 1: Single uppercase letter at start
+    r"[:\.]?"     # Optional colon or period
+    r"\s*"        # Optional whitespace
+    r"(.*)$"      # Group 2: The rest of the text
+)
+
+# Matches standalone uppercase letters used for answers: a single A-Z
+# character with a word boundary on each side, e.g. "A", or both letters in
+# "A och B".
+ANSWER_LETTER_RE = re.compile(
+    r"\b"         # Word boundary
+    r"([A-Z])"    # Group 1: Single uppercase letter
+    r"\b"         # Word boundary
+)
+
+# Matches lettered sub-question markers like a), b) at the start of any line
+# (MULTILINE), allowing leading whitespace. Group 1 is the letter.
+SUB_QUESTION_LETTER_RE = re.compile(
+    r"^\s*"          # Start of line and optional whitespace
+    r"([a-z])"       # Group 1: Single lowercase letter
+    r"\)"            # Closing parenthesis
+, re.MULTILINE)
+
+# Matches numbered sub-question markers like 1), 2) at the start of any line
+# (MULTILINE), allowing leading whitespace. Group 1 is the digit string.
+SUB_QUESTION_NUMBER_RE = re.compile(
+    r"^\s*"          # Start of line and optional whitespace
+    r"(\d+)"         # Group 1: One or more digits
+    r"\)"            # Closing parenthesis
+, re.MULTILINE)
+
+# Matches numeric select ranges like (1..10); groups 1 and 2 are the two
+# bounds as digit strings.
+SELECT_RANGE_RE = re.compile(
+    r"\("          # Opening parenthesis
+    r"(\d+)"       # Group 1: Start number
+    r"\.\."        # Range dots
+    r"(\d+)"       # Group 2: End number
+    r"\)"          # Closing parenthesis
+)
+
+# Matches letter select ranges like (A..H); groups 1 and 2 are the two
+# uppercase bounds.
+SELECT_LETTER_RANGE_RE = re.compile(
+    r"\("          # Opening parenthesis
+    r"([A-Z])"     # Group 1: Start letter
+    r"\.\."        # Range dots
+    r"([A-Z])"     # Group 2: End letter
+    r"\)"          # Closing parenthesis
+)
+
+# Matches select lists like (A, B, C); group 1 is everything between the
+# parentheses.
+# NOTE(review): the only requirement is one comma inside the parentheses, so
+# ordinary prose such as "(red, not blue)" matches too - confirm this is
+# acceptable wherever the pattern is applied.
+SELECT_LIST_RE = re.compile(
+    r"\("          # Opening parenthesis
+    r"("           # Group 1: The list content
+    r"[^)]+"       # Anything but closing parenthesis
+    r","           # At least one comma
+    r"[^)]+"       # Anything but closing parenthesis
+    r")"
+    r"\)"          # Closing parenthesis
+)
+
+# Matches sub-question markers anywhere in the text (used for splitting intro
+# text): a lowercase letter or a digit run that starts at a word boundary and
+# is followed by ")".
+# NOTE(review): this also matches the tail of parentheticals such as
+# "(figur 1)" or "(1 p)" (the "p)" part), so anything that truncates at the
+# first match will cut the text there.
+FIELD_MARKER_RE = re.compile(
+    r"\b"          # Word boundary
+    r"([a-z]|\d+)" # Group 1: Letter or digit
+    r"\)"          # Closing parenthesis
+)
+
+# Matches sub-question markers (a, b or 1, 2) at the start of any line
+# (MULTILINE), including the whitespace around them. It has exactly one
+# capture group, so re.split() with it yields
+# [text_before, id1, text1, id2, text2, ...].
+SUB_QUESTION_SPLIT_RE = re.compile(
+    r"^\s*"          # Start of line and optional whitespace
+    r"([a-z]|\d+)"   # Group 1: Single letter or one or more digits
+    r"\)"            # Closing parenthesis
+    r"\s*"           # Optional trailing whitespace
+, re.MULTILINE)
+
+# Matches point markers like (0.5 p) or (1 p). The decimal separator must be
+# a period and the unit a lowercase "p"; there is no capture group.
+POINTS_RE = re.compile(
+    r"\("            # Opening parenthesis
+    r"\d+"           # One or more digits
+    r"(?:\.\d+)?"    # Optional decimal part
+    r"\s*"           # Optional whitespace
+    r"p"             # Literal 'p'
+    r"\)"            # Closing parenthesis
+)
+
+
+# Closed set of supported question types. Each value is the text that
+# follows "frågetyp/" in a note's tags (see UnifiedParser._extract_type).
+class QuestionType(Enum):
+    MCQ = "mcq"                        # answer is a list of option letters
+    SCQ = "scq"                        # answer is a single option letter
+    MATCHING = "matching"              # answer is a list of [key, value] pairs
+    TEXTALTERNATIV = "textalternativ"  # raw answer text plus bullet-list options
+    TEXTFÄLT = "textfält"              # one answer line, or a list of lines
+    SIFFERFÄLT = "sifferfält"          # raw answer text
+    HOTSPOT = "hotspot"                # answer text plus optional answer image
+    SAMMANSATT = "sammansatt"          # answer is a list of answer lines
+    DND_TEXT = "dnd-text"              # answer is a list of answer lines
+    DND_BILD = "dnd-bild"              # answer is a list of answer lines
+    SANT_FALSKT = "sant-falskt"        # answer is a list of [key, value] pairs
+
+
+# One marked part of a question, i.e. the text that follows an "a)" or "1)"
+# marker at the start of a line.
+@dataclass
+class SubQuestion:
+    id: str                # Marker without ")": a letter ('a', 'b') or digits ('1', '2')
+    text: str              # Part text with select patterns and point markers removed
+    answer: Any = None     # Answer assigned to this part; None when none is available
+    options: list[str] | None = None  # Select options; None if text input
+
+
+# Parsed representation of one question note, as returned by
+# UnifiedParser.parse().
+@dataclass
+class QuestionData:
+    type: QuestionType
+    question: str  # Question text with instruction paragraphs left out
+    answer: Any  # str | list[str] | list[list[str]], depending on the type
+    num_questions: int = 1  # Total sub-questions (a, b, c...)
+    is_complete: bool = False  # Set by the parser's completeness check
+    options: list[str] = field(default_factory=list)
+    image: str | None = None  # First image embed, formatted as "![[filename]]"
+    answer_image: str | None = None  # Image embed found in the answer block
+    instruction: str | None = None  # The "Välj ... alternativ" paragraph, if any
+    metadata: dict = field(default_factory=dict)  # Metadata from the parsed note
+    sub_questions: list[SubQuestion] = field(default_factory=list)
+
+
+class UnifiedParser:
+    """Turn the markdown source of one question note into a ``QuestionData``.
+
+    The note is handed to ``parse_question_from_content``; its result supplies
+    the ``metadata``, ``nodes`` and ``raw_content`` that every helper reads.
+    Fields shared by all question types are extracted once in ``__init__``;
+    ``parse()`` then adds the type-specific answer and options.
+    """
+
+    def __init__(self, content: str):
+        """Parse *content* and pre-extract the type-independent fields."""
+        self.content = content
+        self.parsed = parse_question_from_content(content)
+        self.metadata = self.parsed.metadata
+        self.nodes = self.parsed.nodes
+
+        # Pre-extract common fields
+        self.type = self._extract_type()
+        self.question = self._extract_question_text()
+        self.instruction = self._extract_instruction()
+        self.image = self._extract_image()
+        self.num_questions = self._count_sub_questions()
+
+    def parse(self) -> QuestionData:
+        """Build the ``QuestionData`` for this note.
+
+        The answer/options extraction depends on ``self.type``; sub-question
+        count, sub-questions and the completeness flag are filled in for every
+        type afterwards.
+
+        Raises:
+            ValueError: If ``self.type`` has no handler.
+        """
+        match self.type:
+            case QuestionType.MCQ | QuestionType.SCQ:
+                data = self._parse_choice_question()
+            case QuestionType.MATCHING:
+                data = self._create_question(
+                    answer=self._extract_answer_pairs(),
+                    options=self._extract_bullet_list_options(),
+                )
+            case QuestionType.TEXTALTERNATIV:
+                data = self._create_question(
+                    answer=self._extract_raw_answer(),
+                    options=self._extract_bullet_list_options(),
+                )
+            case QuestionType.TEXTFÄLT:
+                data = self._parse_text_field()
+            case QuestionType.SIFFERFÄLT:
+                data = self._create_question(answer=self._extract_raw_answer())
+            case QuestionType.HOTSPOT:
+                data = self._parse_hotspot()
+            case (
+                QuestionType.SAMMANSATT
+                | QuestionType.DND_TEXT
+                | QuestionType.DND_BILD
+            ):
+                # These three types all store one answer per line.
+                data = self._create_question(answer=self._extract_answer_lines())
+            case QuestionType.SANT_FALSKT:
+                data = self._create_question(answer=self._extract_answer_pairs())
+            case _:
+                raise ValueError(f"Unsupported question type: {self.type}")
+
+        data.num_questions = self.num_questions
+        data.sub_questions = self._extract_sub_questions(data)
+        data.is_complete = self._check_completeness(data)
+        return data
+
+    def _check_completeness(self, data: QuestionData) -> bool:
+        """Return True when the answer block looks fully filled in.
+
+        An empty answer block, or one containing "TODO", is incomplete. With
+        more than one sub-question, choice questions (MCQ/SCQ) only need a
+        non-empty answer; every other type needs a list with at least one
+        entry per sub-question.
+        """
+        content = self._extract_raw_answer()
+        if not content or "TODO" in content:
+            return False
+
+        if data.num_questions <= 1:
+            return True
+
+        # Checked before the list test on purpose: an SCQ answer is a plain
+        # string, so testing for a list first made SCQ always incomplete here.
+        if data.type in (QuestionType.MCQ, QuestionType.SCQ):
+            return bool(data.answer)
+
+        if not isinstance(data.answer, list):
+            return False
+        return len(data.answer) >= data.num_questions
+
+    def _count_sub_questions(self) -> int:
+        """Count sub-questions marked a), b), ... or 1), 2), ... at line starts.
+
+        Lettered markers take precedence over numbered ones. A series only
+        counts when it contains its first member ("a" or 1); the result is
+        then the position of the highest marker seen, so gaps are included.
+        Returns 1 when no such series is found.
+        """
+        # NOTE(review): this scans the whole raw content, whereas
+        # _extract_sub_questions only looks at the text before the first
+        # "```" fence, so markers inside the answer block are counted here.
+        # Confirm whether that difference is intended.
+        md_content = self.parsed.raw_content
+
+        letters = set(SUB_QUESTION_LETTER_RE.findall(md_content))
+        if "a" in letters:
+            return ord(max(letters)) - ord("a") + 1
+
+        numbers = {int(n) for n in SUB_QUESTION_NUMBER_RE.findall(md_content)}
+        if 1 in numbers:
+            return max(numbers)
+
+        return 1
+
+    def _create_question(
+        self,
+        answer: Any,
+        options: list[str] | None = None,
+        answer_image: str | None = None,
+    ) -> QuestionData:
+        """Create a QuestionData object with common fields pre-populated.
+
+        ``options`` falls back to an empty list when omitted or empty.
+        """
+        return QuestionData(
+            type=self.type,
+            question=self.question,
+            answer=answer,
+            options=options or [],
+            image=self.image,
+            answer_image=answer_image,
+            instruction=self.instruction,
+            metadata=self.metadata,
+        )
+
+    # === Extraction Helpers ===
+
+    def _extract_type(self) -> QuestionType:
+        """Return the type named by the first valid "frågetyp/<type>" tag.
+
+        Tags with an unknown type are skipped. Falls back to
+        ``QuestionType.MCQ`` when no tag yields a known type.
+        """
+        # A missing or empty "tags" entry is treated as "no tags", and a
+        # single string as a one-element list, so neither crashes nor gets
+        # iterated character by character.
+        tags = self.metadata.get("tags") or []
+        if isinstance(tags, str):
+            tags = [tags]
+
+        for tag in tags:
+            if not isinstance(tag, str) or not tag.startswith("frågetyp/"):
+                continue
+            try:
+                return QuestionType(tag.split("/", 1)[1])
+            except ValueError:
+                continue
+        return QuestionType.MCQ  # Default
+
+    def _extract_question_text(self) -> str:
+        """Collect the question's intro text from the paragraph nodes.
+
+        Instruction paragraphs (starting with "Välj" and containing
+        "alternativ") are skipped. Collection stops at the first paragraph
+        that contains a sub-question marker; only the text before the marker
+        is kept from it. Embeds and "**" bold markers are removed from every
+        collected piece, and the pieces are joined with newlines.
+        """
+        texts = []
+        for node in self.nodes:
+            if node.type != "paragraph":
+                continue
+
+            text = node.text.strip()
+            # Skip instructions
+            if text.startswith("Välj") and "alternativ" in text:
+                continue
+
+            # The search is deliberately liberal (anywhere in the paragraph)
+            # because mistune might have joined lines.
+            first_marker = FIELD_MARKER_RE.search(text)
+            if first_marker:
+                text = text[:first_marker.start()]
+
+            # Clean the truncated intro the same way as a full paragraph, so
+            # embeds and bold markers never leak into the question text.
+            text = EMBED_RE.sub("", text).strip()
+            text = text.replace("**", "")
+            if text:
+                texts.append(text)
+
+            if first_marker:
+                break  # Stop collecting intro text once we hit a sub-question
+        return "\n".join(texts)
+
+    def _extract_instruction(self) -> str | None:
+        """Return the first paragraph mentioning "Välj" and "alternativ".
+
+        Bold markers ("**") are removed. Returns None when there is no such
+        paragraph.
+        """
+        for node in self.nodes:
+            if node.type == "paragraph":
+                text = node.text.strip()
+                if "Välj" in text and "alternativ" in text:
+                    return text.replace("**", "")
+        return None
+
+    def _extract_image(self) -> str | None:
+        """Return the first image embed as "![[filename]]", or None.
+
+        Top-level embed nodes, embed children of paragraphs/lists, and embed
+        syntax in a paragraph's or list's raw text are checked, in node order.
+        """
+        for node in self.nodes:
+            # Check for direct embed nodes
+            if node.type == "embed":
+                return f"![[{node.attrs['filename']}]]"
+
+            # Check inside paragraphs/lists for inline embeds
+            if node.type in ("paragraph", "list"):
+                for child in node.children:
+                    if child.type == "embed":
+                        return f"![[{child.attrs['filename']}]]"
+
+                if node.raw:
+                    match = IMAGE_RE.search(node.raw)
+                    if match:
+                        return f"![[{match.group(1)}]]"
+        return None
+
+    def _extract_sub_questions(self, data: QuestionData) -> list[SubQuestion]:
+        """Split the question text into its marked parts.
+
+        Only the text before the first "```" fence is split, so markers inside
+        the answer block are not mistaken for sub-questions. Each part gets
+        its select options (if any), its text with select patterns and point
+        markers removed, and the answer at the same position in
+        ``data.answer``.
+        """
+        question_portion = self.parsed.raw_content.split("```", 1)[0]
+
+        # The pattern has one capture group, so split() returns
+        # [text_before, id1, text1, id2, text2, ...]; drop the leading text.
+        segments = SUB_QUESTION_SPLIT_RE.split(question_portion)[1:]
+
+        # A string answer is matched to parts line by line; split it once
+        # here instead of on every loop iteration.
+        answer_lines = None
+        if isinstance(data.answer, str):
+            answer_lines = [
+                line.strip() for line in data.answer.split("\n") if line.strip()
+            ]
+
+        sub_questions = []
+        pairs = zip(segments[0::2], segments[1::2])
+        for index, (q_id, q_full_text) in enumerate(pairs):
+            q_full_text = q_full_text.strip()
+
+            # Extract options if any (for select fields)
+            options = self._extract_select_options(q_full_text)
+
+            # Clean text (remove select patterns and point markers like (0.5 p))
+            clean_text = q_full_text
+            for pattern in (
+                SELECT_RANGE_RE,
+                SELECT_LETTER_RANGE_RE,
+                SELECT_LIST_RE,
+                POINTS_RE,
+            ):
+                clean_text = pattern.sub("", clean_text)
+            clean_text = clean_text.strip()
+
+            # Extract answer for this part
+            answer = None
+            if isinstance(data.answer, list) and index < len(data.answer):
+                answer = data.answer[index]
+            elif answer_lines is not None:
+                if index < len(answer_lines):
+                    answer = answer_lines[index]
+            elif data.num_questions == 1:
+                answer = data.answer
+
+            sub_questions.append(SubQuestion(
+                id=q_id,
+                text=clean_text,
+                answer=answer,
+                options=options,
+            ))
+
+        return sub_questions
+
+    def _extract_select_options(self, text: str) -> list[str] | None:
+        """Extract options from patterns like (1..10), (A..D), or (A, B, C).
+
+        The first pattern kind that matches wins, in that order. Returns None
+        when *text* contains none of them.
+        """
+        # Numerical range (1..10)
+        match = SELECT_RANGE_RE.search(text)
+        if match:
+            start, end = map(int, match.groups())
+            return [str(x) for x in range(start, end + 1)]
+
+        # Letter range (A..H)
+        match = SELECT_LETTER_RANGE_RE.search(text)
+        if match:
+            start, end = match.groups()
+            return [chr(x) for x in range(ord(start), ord(end) + 1)]
+
+        # Comma-separated list (A, B, C)
+        match = SELECT_LIST_RE.search(text)
+        if match:
+            return [item.strip() for item in match.group(1).split(",")]
+
+        return None
+
+    def _extract_lettered_options(self) -> list[str]:
+        """Return the stripped text of list items accepted by OPTION_LETTER_RE."""
+        options = []
+        for node in self.nodes:
+            if node.type != "list":
+                continue
+            for item in node.children:
+                item_text = item.text.strip()
+                if OPTION_LETTER_RE.match(item_text):
+                    options.append(item_text)
+        return options
+
+    def _extract_bullet_list_options(self) -> list[str]:
+        """Return the stripped text of every item of every list node."""
+        return [
+            item.text.strip()
+            for node in self.nodes
+            if node.type == "list"
+            for item in node.children
+        ]
+
+    def _extract_raw_answer(self) -> str:
+        """Return the stripped raw text of the first spoiler code block.
+
+        Returns "" when the note has no code block whose info string is
+        "spoiler-block:".
+        """
+        for node in self.nodes:
+            if node.type == "block_code" and node.attrs.get("info") == "spoiler-block:":
+                return node.raw.strip()
+        return ""
+
+    def _extract_answer_letters(self) -> list[str]:
+        """Return every standalone uppercase letter in the answer block.
+
+        Returns an empty list when the block is empty or exactly "TODO".
+        """
+        content = self._extract_raw_answer()
+        if not content or content == "TODO":
+            return []
+        return ANSWER_LETTER_RE.findall(content)
+
+    def _extract_answer_lines(self) -> list[str]:
+        """Return the non-blank lines of the answer block, stripped.
+
+        Returns an empty list when the block is empty or exactly "TODO".
+        """
+        content = self._extract_raw_answer()
+        if not content or content == "TODO":
+            return []
+        return [line.strip() for line in content.split("\n") if line.strip()]
+
+    def _extract_answer_pairs(self) -> list[list[str]]:
+        """Return [key, value] pairs from answer lines of the form "key: value".
+
+        Each line is split at its first colon; lines without a colon are
+        ignored.
+        """
+        pairs = []
+        for line in self._extract_answer_lines():
+            if ":" in line:
+                key, value = line.split(":", 1)
+                pairs.append([key.strip(), value.strip()])
+        return pairs
+
+    # === Question Type Handlers ===
+
+    def _parse_choice_question(self) -> QuestionData:
+        """Build an MCQ/SCQ question.
+
+        MCQ keeps every answer letter as a list; any other type keeps only the
+        first letter, or "" when there is none.
+        """
+        answer_letters = self._extract_answer_letters()
+        if self.type == QuestionType.MCQ:
+            answer = answer_letters
+        else:
+            answer = answer_letters[0] if answer_letters else ""
+
+        return self._create_question(
+            answer=answer,
+            options=self._extract_lettered_options(),
+        )
+
+    def _parse_text_field(self) -> QuestionData:
+        """Build a text-field question.
+
+        The answer is the list of answer lines when there are several, the
+        single line when there is one, and "" when there are none.
+        """
+        lines = self._extract_answer_lines()
+        if len(lines) > 1:
+            answer = lines
+        else:
+            answer = lines[0] if lines else ""
+        return self._create_question(answer=answer)
+
+    def _parse_hotspot(self) -> QuestionData:
+        """Build a hotspot question.
+
+        When the answer block contains an image embed, the first one becomes
+        ``answer_image`` and all embeds are removed from the answer text;
+        otherwise the answer is the raw block text.
+        """
+        content = self._extract_raw_answer()
+        answer_image = None
+
+        match = IMAGE_RE.search(content)
+        if match:
+            answer_image = f"![[{match.group(1)}]]"
+            answer_text = EMBED_RE.sub("", content).strip()
+        else:
+            answer_text = content
+
+        return self._create_question(
+            answer=answer_text,
+            answer_image=answer_image,
+        )