import re from dataclasses import dataclass, field from enum import Enum from typing import Any from quiz.utils.question_parser import Node, parse_question_from_content # === REGEX PATTERNS === # Matches Obsidian-style embeds like ![[image.png]] or ![[image.png|300]] EMBED_RE = re.compile( r"!\[\[" # Start of embed r".*?" # Content (filename and optional pipes) r"\]\]" # End of embed ) # Captures the filename from an Obsidian embed, ignoring dimensions IMAGE_RE = re.compile( r"!\[\[" # Start of embed r"([^|\]]+)" # Group 1: Filename (everything before | or ]) r"(?:\|.*?)?" # Optional dimension part starting with | r"\]\]" # End of embed ) # Matches lettered options at the start of a line, e.g., "A: Text" or "B. Text" OPTION_LETTER_RE = re.compile( r"^([A-Z])" # Group 1: Single uppercase letter at start r"[:\.]?" # Optional colon or period r"\s*" # Optional whitespace r"(.*)$" # Group 2: The rest of the text ) # Matches standalone uppercase letters used for answers, e.g., "A", "A och B" ANSWER_LETTER_RE = re.compile( r"\b" # Word boundary r"([A-Z])" # Group 1: Single uppercase letter r"\b" # Word boundary ) # Matches sub-question markers like a), b) at the start of a line SUB_QUESTION_LETTER_RE = re.compile( r"^\s*" # Start of line and optional whitespace r"([a-z])" # Group 1: Single lowercase letter r"\)" # Closing parenthesis , re.MULTILINE) # Matches numbered sub-question markers like 1), 2) at the start of a line SUB_QUESTION_NUMBER_RE = re.compile( r"^\s*" # Start of line and optional whitespace r"(\d+)" # Group 1: One or more digits r"\)" # Closing parenthesis , re.MULTILINE) # Matches select range patterns like (1..10) SELECT_RANGE_RE = re.compile( r"\(" # Opening parenthesis r"(\d+)" # Group 1: Start number r"\.\." # Range dots r"(\d+)" # Group 2: End number r"\)" # Closing parenthesis ) # Matches letter range patterns like (A..H) SELECT_LETTER_RANGE_RE = re.compile( r"\(" # Opening parenthesis r"([A-Z])" # Group 1: Start letter r"\.\." # Range dots r"([A-Z])" # Group 2: End letter r"\)" # Closing parenthesis ) # Matches select list patterns like (A, B, C) SELECT_LIST_RE = re.compile( r"\(" # Opening parenthesis r"(" # Group 1: The list content r"[^)]+" # Anything but closing parenthesis r"," # At least one comma r"[^)]+" # Anything but closing parenthesis r")" r"\)" # Closing parenthesis ) # Matches sub-question markers in mid-text (used for splitting intro text) FIELD_MARKER_RE = re.compile( r"\b" # Word boundary r"([a-z]|\d+)" # Group 1: Letter or digit r"\)" # Closing parenthesis ) # Matches sub-question markers (a, b or 1, 2) at start of line for splitting SUB_QUESTION_SPLIT_RE = re.compile( r"^\s*" # Start of line and optional whitespace r"([a-z]|\d+)" # Group 1: Single letter or one or more digits r"\)" # Closing parenthesis r"\s*" # Optional trailing whitespace , re.MULTILINE) # Matches point markers like (0.5 p) or (1 p) POINTS_RE = re.compile( r"\(" # Opening parenthesis r"\d+" # One or more digits r"(?:\.\d+)?" # Optional decimal part r"\s*" # Optional whitespace r"p" # Literal 'p' r"\)" # Closing parenthesis ) class QuestionType(Enum): MCQ = "mcq" SCQ = "scq" MATCHING = "matching" TEXTALTERNATIV = "textalternativ" TEXTFÄLT = "textfält" SIFFERFÄLT = "sifferfält" HOTSPOT = "hotspot" SAMMANSATT = "sammansatt" DND_TEXT = "dnd-text" DND_BILD = "dnd-bild" SANT_FALSKT = "sant-falskt" @dataclass class SubQuestion: id: str # 'a', 'b', etc. text: str # Text for this part answer: Any = None options: list[str] | None = None # None if text input @dataclass class QuestionData: type: QuestionType question: str answer: Any # str | list[str] | list[list[str]] num_questions: int = 1 # Total sub-questions (a, b, c...) is_complete: bool = False options: list[str] = field(default_factory=list) image: str | None = None answer_image: str | None = None instruction: str | None = None metadata: dict = field(default_factory=dict) sub_questions: list[SubQuestion] = field(default_factory=list) class UnifiedParser: def __init__(self, content: str): self.content = content self.parsed = parse_question_from_content(content) self.metadata = self.parsed.metadata self.nodes = self.parsed.nodes # Pre-extract common fields self.type = self._extract_type() self.question = self._extract_question_text() self.instruction = self._extract_instruction() self.image = self._extract_image() self.num_questions = self._count_sub_questions() def parse(self) -> QuestionData: match self.type: case QuestionType.MCQ | QuestionType.SCQ: data = self._parse_choice_question() case QuestionType.MATCHING: data = self._create_question( answer=self._extract_answer_pairs(), options=self._extract_bullet_list_options() ) case QuestionType.TEXTALTERNATIV: data = self._create_question( answer=self._extract_raw_answer(), options=self._extract_bullet_list_options() ) case QuestionType.TEXTFÄLT: data = self._parse_text_field() case QuestionType.SIFFERFÄLT: data = self._create_question(answer=self._extract_raw_answer()) case QuestionType.HOTSPOT: data = self._parse_hotspot() case QuestionType.SAMMANSATT: data = self._create_question(answer=self._extract_answer_lines()) case QuestionType.DND_TEXT: data = self._create_question(answer=self._extract_answer_lines()) case QuestionType.DND_BILD: data = self._create_question(answer=self._extract_answer_lines()) case QuestionType.SANT_FALSKT: data = self._create_question(answer=self._extract_answer_pairs()) case _: raise ValueError(f"Unsupported question type: {self.type}") data.num_questions = self.num_questions data.sub_questions = self._extract_sub_questions(data) data.is_complete = self._check_completeness(data) return data def _check_completeness(self, data: QuestionData) -> bool: """Verify if the answer is complete (no TODOs, matches sub-question count).""" content = self._extract_raw_answer() if not content or "TODO" in content: return False # If we have sub-questions, ensure we have enough answer lines/parts if data.num_questions > 1: if isinstance(data.answer, list): if data.type in [QuestionType.MCQ, QuestionType.SCQ]: return len(data.answer) > 0 return len(data.answer) >= data.num_questions else: return False return True def _count_sub_questions(self) -> int: """Count sub-questions like a), b), c) or 1), 2) in the question text.""" md_content = self.parsed.raw_content # Count lettered sub-questions: a), b), c)... letters = SUB_QUESTION_LETTER_RE.findall(md_content) if letters: unique_letters = sorted(list(set(letters))) if "a" in unique_letters: max_letter = max(unique_letters) return ord(max_letter) - ord("a") + 1 # Count numbered sub-questions: 1), 2), 3)... numbers = SUB_QUESTION_NUMBER_RE.findall(md_content) if numbers: unique_numbers = sorted(list(set(map(int, numbers)))) if 1 in unique_numbers: return max(unique_numbers) return 1 def _create_question( self, answer: Any, options: list[str] = None, answer_image: str | None = None ) -> QuestionData: """Create a QuestionData object with common fields pre-populated.""" return QuestionData( type=self.type, question=self.question, answer=answer, options=options or [], image=self.image, answer_image=answer_image, instruction=self.instruction, metadata=self.metadata ) # === Extraction Helpers === def _extract_type(self) -> QuestionType: tags = self.metadata.get("tags", []) for tag in tags: if tag.startswith("frågetyp/"): type_str = tag.split("/", 1)[1] try: return QuestionType(type_str) except ValueError: continue return QuestionType.MCQ # Default def _extract_question_text(self) -> str: texts = [] for node in self.nodes: if node.type == "paragraph": text = node.text.strip() # Skip instructions if text.startswith("Välj") and "alternativ" in text: continue # If paragraph contains a sub-question marker, stop there # We use a more liberal search here because mistune might have joined lines first_marker = FIELD_MARKER_RE.search(text) if first_marker: text = text[:first_marker.start()].strip() if text: # Only add if it doesn't look like an instruction we already skipped if not (text.startswith("Välj") and "alternativ" in text): texts.append(text) break # Stop collecting intro text once we hit a sub-question # Clean and collect text = EMBED_RE.sub("", text).strip() text = text.replace("**", "") if text: texts.append(text) return "\n".join(texts) def _extract_instruction(self) -> str | None: for node in self.nodes: if node.type == "paragraph": text = node.text.strip() if "Välj" in text and "alternativ" in text: return text.replace("**", "") return None def _extract_image(self) -> str | None: for node in self.nodes: # Check for direct embed nodes if node.type == "embed": return f"![[{node.attrs['filename']}]]" # Check inside paragraphs/lists for inline embeds if node.type in ["paragraph", "list"]: for child in node.children: if child.type == "embed": return f"![[{child.attrs['filename']}]]" if node.raw: match = IMAGE_RE.search(node.raw) if match: return f"![[{match.group(1)}]]" return None def _extract_sub_questions(self, data: QuestionData) -> list[SubQuestion]: # Only split the text BEFORE the spoiler block to avoid misidentifying markers in answers full_raw = self.parsed.raw_content parts = full_raw.split("```", 1) question_portion = parts[0] # Split by sub-question markers at the start of lines: a), b) or 1), 2) segments = SUB_QUESTION_SPLIT_RE.split(question_portion)[1:] sub_questions = [] # segments will be [id1, text1, id2, text2, ...] for i in range(0, len(segments), 2): q_id = segments[i] q_full_text = segments[i+1].strip() # Extract options if any (for select fields) options = self._extract_select_options(q_full_text) # Clean text (remove point markers like (0.5 p) and select patterns) clean_text = SELECT_RANGE_RE.sub("", q_full_text) clean_text = SELECT_LETTER_RANGE_RE.sub("", clean_text) clean_text = SELECT_LIST_RE.sub("", clean_text) clean_text = POINTS_RE.sub("", clean_text).strip() # Extract answer for this part answer = None if isinstance(data.answer, list) and i//2 < len(data.answer): answer = data.answer[i//2] elif isinstance(data.answer, str): lines = [l.strip() for l in data.answer.split("\n") if l.strip()] if i//2 < len(lines): answer = lines[i//2] elif data.num_questions == 1: answer = data.answer sub_questions.append(SubQuestion( id=q_id, text=clean_text, answer=answer, options=options )) return sub_questions def _extract_select_options(self, text: str) -> list[str] | None: """Extract options from patterns like (1..10), (A..D), or (A, B, C).""" # Numerical range (1..10) match = SELECT_RANGE_RE.search(text) if match: start, end = map(int, match.groups()) return [str(x) for x in range(start, end + 1)] # Letter range (A..H) match = SELECT_LETTER_RANGE_RE.search(text) if match: start, end = match.groups() return [chr(x) for x in range(ord(start), ord(end) + 1)] # Comma-separated list (A, B, C) match = SELECT_LIST_RE.search(text) if match: items = match.group(1).split(",") return [item.strip() for item in items] return None def _extract_lettered_options(self) -> list[str]: options = [] for node in self.nodes: if node.type == "list": for item in node.children: item_text = item.text.strip() if OPTION_LETTER_RE.match(item_text): options.append(item_text) return options def _extract_bullet_list_options(self) -> list[str]: options = [] for node in self.nodes: if node.type == "list": for item in node.children: options.append(item.text.strip()) return options def _extract_raw_answer(self) -> str: for node in self.nodes: if node.type == "block_code" and node.attrs.get("info") == "spoiler-block:": return node.raw.strip() return "" def _extract_answer_letters(self) -> list[str]: content = self._extract_raw_answer() if not content or content == "TODO": return [] return ANSWER_LETTER_RE.findall(content) def _extract_answer_lines(self) -> list[str]: content = self._extract_raw_answer() if not content or content == "TODO": return [] return [line.strip() for line in content.split("\n") if line.strip()] def _extract_answer_pairs(self) -> list[list[str]]: lines = self._extract_answer_lines() pairs = [] for line in lines: if ":" in line: key, value = line.split(":", 1) pairs.append([key.strip(), value.strip()]) return pairs # === Question Type Handlers === def _parse_choice_question(self) -> QuestionData: answer_letters = self._extract_answer_letters() if self.type == QuestionType.MCQ: answer = answer_letters else: answer = answer_letters[0] if answer_letters else "" return self._create_question( answer=answer, options=self._extract_lettered_options() ) def _parse_text_field(self) -> QuestionData: lines = self._extract_answer_lines() return self._create_question( answer=lines if len(lines) > 1 else (lines[0] if lines else "") ) def _parse_hotspot(self) -> QuestionData: content = self._extract_raw_answer() answer_image = None match = IMAGE_RE.search(content) if match: answer_image = f"![[{match.group(1)}]]" answer_text = EMBED_RE.sub("", content).strip() else: answer_text = content return self._create_question( answer=answer_text, answer_image=answer_image )