diff --git a/content/.obsidian/workspace.json b/content/.obsidian/workspace.json index 11ee97b..025580a 100644 --- a/content/.obsidian/workspace.json +++ b/content/.obsidian/workspace.json @@ -30,7 +30,7 @@ "state": { "file": "Anatomi & Histologi 2/Statistik.md", "mode": "source", - "source": true, + "source": false, "backlinks": false }, "icon": "lucide-file", diff --git a/content/Anatomi & Histologi 2/Statistik.md b/content/Anatomi & Histologi 2/Statistik.md index 3e13df5..70237df 100644 --- a/content/Anatomi & Histologi 2/Statistik.md +++ b/content/Anatomi & Histologi 2/Statistik.md @@ -1,11 +1,11 @@ ### Tentor -| Datum | OCR | Split | Bilder | Hotspot | Taggad | Svar | Granska | -| ---------- | :-: | :---: | :----: | :-----: | :----: | :--: | ------- | -| 2022-01-15 | ✅ | ✅ | ✅ | | ✅ | ✅ | | -| 2022-06-01 | ✅ | ✅ | ✅ | | ✅ | ✅ | | -| 2023-01-11 | ✅ | ✅ | ✅ | | ✅ | ✅ | | -| 2023-05-31 | ✅ | ✅ | ✅ | | ✅ | ✅ | | +| Datum | OCR | Split | Bilder | Hotspot | Taggad | Svar | Granska | +| ---------- | :-: | :---: | :----: | :-----: | :----: | :--: | ------- | +| 2022-01-15 | ✅ | ✅ | ✅ | | ✅ | ✅ | | +| 2022-06-01 | ✅ | ✅ | ✅ | | ✅ | ✅ | | +| 2023-01-11 | ✅ | ✅ | ✅ | | ✅ | ✅ | | +| 2023-05-31 | ✅ | ✅ | ✅ | | ✅ | ✅ | | | 2024-01-10 | ✅ | ✅ | ✅ | | ✅ | | | | 2024-05-29 | ✅ | ✅ | ✅ | | ✅ | | | | 2025-01-15 | ✅ | ✅ | ✅ | | ✅ | | | diff --git a/quiz/MATCHING_FORMAT.md b/quiz/MATCHING_FORMAT.md deleted file mode 100644 index 1ee176a..0000000 --- a/quiz/MATCHING_FORMAT.md +++ /dev/null @@ -1,18 +0,0 @@ -# Matching Questions Format Analysis - -Based on reviewing the 17 matching questions: - -## Key Finding: -Only **1 question has an answer** (2023-05-31/3.md), the rest have TODO. - -**That question uses this format:** -- Two separate bullet lists -- Answer: "ItemName: MatchName" format - -## Proposed Implementation: -1. Support two-list format (most flexible) -2. Parse answer as "Item: Match" pairs -3. Store as JSON with 0-indexed pairs -4. 
Render as n×n table with radio buttons - -## Next: Implement based on this one working example. diff --git a/quiz/parse-markdown.py b/quiz/parse-markdown.py deleted file mode 100644 index 43522a5..0000000 --- a/quiz/parse-markdown.py +++ /dev/null @@ -1,15 +0,0 @@ -import pathlib - -import mistune -markdown = mistune.create_markdown(renderer=None) - -root = pathlib.Path(__file__).parent.parent -exams = root / "content" / "Anatomi & Histologi 2" / "Gamla tentor" -print(exams.absolute()) -for file in sorted(exams.glob("*/*.md")): - if len(file.stem) > 2: - continue - print(f"Parsing {file}") - tokens = markdown(file.read_text(encoding="utf-8")) - import pprint - pprint.pprint(tokens) diff --git a/quiz/pytest.ini b/quiz/pytest.ini index 08d9cd4..06c8998 100644 --- a/quiz/pytest.ini +++ b/quiz/pytest.ini @@ -13,5 +13,6 @@ markers = admin: Admin interface tests import: Import and parsing tests import_tests: Import and parsing tests + parser: Question parser tests slow: Slow running tests diff --git a/quiz/quiz/tests/test_question_parser.py b/quiz/quiz/tests/test_question_parser.py new file mode 100644 index 0000000..16c4b0c --- /dev/null +++ b/quiz/quiz/tests/test_question_parser.py @@ -0,0 +1,537 @@ +""" +Comprehensive test suite for the question_parser module. + +This test suite uses pytest's parametrize decorator to test multiple scenarios +with minimal code duplication. It covers: + +1. Node class: + - Initialization with different token types + - Attribute handling + - Children node processing + - String representation (__repr__) + - Text extraction from nested structures + +2. parse_question function: + - Metadata parsing (tags, dates, etc.) + - Raw content extraction + - Different question types (MCQ, SCQ, text field, matching) + - Questions with images + - Edge cases (empty content, missing frontmatter) + - Document structure preservation + +3. ParsedQuestion dataclass: + - Default values + - Initialization with custom values + +4. 
Real exam questions: + - Parsing actual exam questions from the content directory + - Validation of all short-named question files + +Test execution: + pytest tests/test_question_parser.py -v # Verbose output + pytest tests/test_question_parser.py -k "mcq" # Run only MCQ tests + pytest tests/test_question_parser.py --collect-only # List all tests +""" + +import pathlib +import tempfile +import pytest +from quiz.utils.question_parser import Node, ParsedQuestion, parse_question + + +@pytest.fixture +def temp_dir(): + """Create a temporary directory for test files""" + with tempfile.TemporaryDirectory() as tmpdir: + yield pathlib.Path(tmpdir) + + +@pytest.fixture +def create_question_file(temp_dir): + """Factory fixture to create question files""" + def _create_file(filename: str, content: str) -> pathlib.Path: + file_path = temp_dir / filename + file_path.write_text(content, encoding="utf-8") + return file_path + return _create_file + + +class TestNode: + """Test the Node class""" + + @pytest.mark.parametrize("token,expected_type,expected_raw", [ + ({"type": "paragraph"}, "paragraph", ""), + ({"type": "heading", "raw": "Test Heading"}, "heading", "Test Heading"), + ({"type": "text", "raw": "Some text"}, "text", "Some text"), + ({"type": "list"}, "list", ""), + ]) + def test_node_initialization(self, token, expected_type, expected_raw): + """Test Node initialization with different token types""" + node = Node(token) + assert node.type == expected_type + assert node.raw == expected_raw + + @pytest.mark.parametrize("token,expected_attrs", [ + ({"type": "block_code", "attrs": {"info": "spoiler-block:"}}, {"info": "spoiler-block:"}), + ({"type": "paragraph"}, {}), + ({"type": "heading", "attrs": {"level": 2}}, {"level": 2}), + ]) + def test_node_attributes(self, token, expected_attrs): + """Test Node attributes handling""" + node = Node(token) + assert node.attrs == expected_attrs + + def test_node_children(self): + """Test Node children handling""" + token = { + "type": 
"paragraph", + "children": [ + {"type": "text", "raw": "Hello "}, + {"type": "text", "raw": "World"}, + ] + } + node = Node(token) + assert len(node.children) == 2 + assert node.children[0].type == "text" + assert node.children[0].raw == "Hello " + assert node.children[1].type == "text" + assert node.children[1].raw == "World" + + @pytest.mark.parametrize("token,expected_repr_contains", [ + ({"type": "text", "raw": "test"}, "Text(raw='test')"), + ({"type": "paragraph"}, "Paragraph()"), + ({"type": "block_code", "attrs": {"info": "python"}}, "BlockCode(attrs={'info': 'python'})"), + ]) + def test_node_repr(self, token, expected_repr_contains): + """Test Node __repr__ method""" + node = Node(token) + assert repr(node) == expected_repr_contains + + @pytest.mark.parametrize("token,expected_text", [ + ({"type": "text", "raw": "Simple text"}, "Simple text"), + ( + { + "type": "paragraph", + "children": [ + {"type": "text", "raw": "Hello "}, + {"type": "text", "raw": "World"}, + ] + }, + "Hello World" + ), + ( + { + "type": "paragraph", + "children": [ + {"type": "text", "raw": "Nested "}, + { + "type": "strong", + "children": [{"type": "text", "raw": "bold"}] + }, + {"type": "text", "raw": " text"}, + ] + }, + "Nested bold text" + ), + ]) + def test_node_text_property(self, token, expected_text): + """Test Node text property extraction""" + node = Node(token) + assert node.text == expected_text + + +class TestParseQuestion: + """Test the parse_question function""" + + @pytest.mark.parametrize("content,expected_tags", [ + ( + """--- +tags: [ah2, provfråga, frågetyp/mcq] +date: 2022-01-15 +--- +Question content""", + ["ah2", "provfråga", "frågetyp/mcq"] + ), + ( + """--- +tags: + - ah2 + - provfråga + - frågetyp/scq +date: 2023-05-31 +--- +Question content""", + ["ah2", "provfråga", "frågetyp/scq"] + ), + ]) + def test_parse_metadata_tags(self, create_question_file, content, expected_tags): + """Test parsing of metadata tags in different formats""" + file_path = 
create_question_file("test.md", content) + question = parse_question(file_path) + assert question.metadata["tags"] == expected_tags + + @pytest.mark.parametrize("content,expected_date", [ + ( + """--- +tags: [ah2] +date: 2022-01-15 +--- +Content""", + "2022-01-15" + ), + ( + """--- +tags: [ah2] +date: 2023-05-31 +--- +Content""", + "2023-05-31" + ), + ]) + def test_parse_metadata_date(self, create_question_file, content, expected_date): + """Test parsing of metadata date""" + file_path = create_question_file("test.md", content) + question = parse_question(file_path) + assert str(question.metadata["date"]) == expected_date + + @pytest.mark.parametrize("content,expected_raw", [ + ( + """--- +tags: [ah2] +--- +Simple question""", + "Simple question" + ), + ( + """--- +tags: [ah2] +--- +Question with **bold** text""", + "Question with **bold** text" + ), + ]) + def test_parse_raw_content(self, create_question_file, content, expected_raw): + """Test parsing of raw content""" + file_path = create_question_file("test.md", content) + question = parse_question(file_path) + assert question.raw_content.strip() == expected_raw + + def test_parse_mcq_question(self, create_question_file): + """Test parsing a complete MCQ question""" + content = """--- +tags: [ah2, provfråga, frågetyp/mcq, cerebrum] +date: 2022-01-15 +--- +Vilka av följande räknas till storhjärnans basala kärnor? 
+ +**Välj två alternativ** +- A: Putamen +- B: Nucleus Ruber +- C: Substantia nigra +- D: Nucleus caudatus + +```spoiler-block: +A och D +``` +""" + file_path = create_question_file("mcq.md", content) + question = parse_question(file_path) + + assert question.metadata["tags"] == ["ah2", "provfråga", "frågetyp/mcq", "cerebrum"] + assert len(question.nodes) > 0 + + # Find paragraph nodes + paragraphs = [n for n in question.nodes if n.type == "paragraph"] + assert len(paragraphs) > 0 + + # Find list nodes + lists = [n for n in question.nodes if n.type == "list"] + assert len(lists) > 0 + + # Find spoiler block + code_blocks = [n for n in question.nodes if n.type == "block_code"] + assert len(code_blocks) > 0 + spoiler = code_blocks[0] + assert spoiler.attrs.get("info") == "spoiler-block:" + assert "A och D" in spoiler.raw + + def test_parse_scq_question(self, create_question_file): + """Test parsing a single choice question""" + content = """--- +tags: [ah2, provfråga, frågetyp/scq, histologi] +date: 2022-06-01 +--- +Vilken del av CNS syns i bild? +- A: Cerebellum +- B: Diencephalon +- C: Medulla spinalis +- D: Cerebrum +- E: Pons + +```spoiler-block: +A +``` +""" + file_path = create_question_file("scq.md", content) + question = parse_question(file_path) + + assert "frågetyp/scq" in question.metadata["tags"] + lists = [n for n in question.nodes if n.type == "list"] + assert len(lists) > 0 + + def test_parse_text_field_question(self, create_question_file): + """Test parsing a text field question""" + content = """--- +tags: [ah2, provfråga, frågetyp/textfält, öga, anatomi] +date: 2022-01-15 +--- +![[image-2.png|301x248]] +**Fyll i rätt siffra!** + +(0.5p per rätt svar, inga avdrag för fel svar): + +a) Vilken siffra pekar på gula fläcken? +b) Vilken siffra pekar på choroidea? 
+ +```spoiler-block: +a) 7 +b) 6 +``` +""" + file_path = create_question_file("textfield.md", content) + question = parse_question(file_path) + + assert "frågetyp/textfält" in question.metadata["tags"] + assert len(question.nodes) > 0 + + def test_parse_matching_question(self, create_question_file): + """Test parsing a matching question""" + content = """--- +tags: [ah2, provfråga, frågetyp/matching, histologi] +date: 2023-05-31 +--- +Vilka av följande stödjeceller finns i CNS? Markera JA eller NEJ för varje angiven celltyp: +(1p för alla rätt, inga delpoäng) + +- a) oligodendrocyter +- b) Astrocyter +- c) satellitceller +- d) ependymceller +- e) mikroglia +- f) Schwannceller + +- JA, finn i CNS +- NEJ, finns inte i CNS + +```spoiler-block: +a) JA, finn i CNS +b) JA, finn i CNS +c) NEJ, finns inte i CNS +d) JA, finn i CNS +e) JA, finn i CNS +f) NEJ, finns inte i CNS +``` +""" + file_path = create_question_file("matching.md", content) + question = parse_question(file_path) + + assert "frågetyp/matching" in question.metadata["tags"] + lists = [n for n in question.nodes if n.type == "list"] + assert len(lists) > 0 + + def test_parse_question_with_image(self, create_question_file): + """Test parsing a question with embedded images""" + content = """--- +tags: [ah2, provfråga, frågetyp/textfält, öra, anatomi, bild] +date: 2022-01-15 +--- +![[image-4.png|292x316]] +**Fyll i rätt siffra !** + +(0.5p per rätt svar, inga avdrag för fel svar): + +a) Vilken siffra pekar på incus? (1..19) +b) Vilken siffra pekar på tuba auditiva? 
(1..19) + +```spoiler-block: +a) 7 +b) 18 +``` +""" + file_path = create_question_file("image_q.md", content) + question = parse_question(file_path) + + assert "bild" in question.metadata["tags"] + assert "![[image-4.png" in question.raw_content + embed = question.nodes[0].children[0] + assert embed.type == "embed" + assert embed.attrs == { + "filename": "image-4.png", + "width": 292, + "height": 316 + } + + @pytest.mark.parametrize("invalid_content", [ + "", # Empty content + "No frontmatter", # No frontmatter + "---\n---\n", # Empty frontmatter + ]) + def test_parse_edge_cases(self, create_question_file, invalid_content): + """Test parsing edge cases""" + file_path = create_question_file("edge.md", invalid_content) + question = parse_question(file_path) + assert isinstance(question, ParsedQuestion) + + def test_parse_question_preserves_structure(self, create_question_file): + """Test that parsing preserves the document structure""" + content = """--- +tags: [ah2] +--- +# Heading + +Paragraph text + +- List item 1 +- List item 2 + +```spoiler-block: +Answer +``` +""" + file_path = create_question_file("structure.md", content) + question = parse_question(file_path) + + node_types = [n.type for n in question.nodes] + assert "heading" in node_types + assert "paragraph" in node_types + assert "list" in node_types + assert "block_code" in node_types + + +class TestParsedQuestionDataclass: + """Test the ParsedQuestion dataclass""" + + def test_parsed_question_defaults(self): + """Test ParsedQuestion default values""" + question = ParsedQuestion() + assert question.metadata == {} + assert question.raw_content == "" + assert question.nodes == [] + + def test_parsed_question_initialization(self): + """Test ParsedQuestion initialization with values""" + metadata = {"tags": ["test"], "date": "2022-01-15"} + content = "Test content" + nodes = [Node({"type": "paragraph"})] + + question = ParsedQuestion( + metadata=metadata, + raw_content=content, + nodes=nodes + ) + + assert 
question.metadata == metadata + assert question.raw_content == content + assert question.nodes == nodes + + +class TestRealQuestions: + """Test parsing real questions from the exam files""" + + @pytest.fixture + def exam_dir(self): + """Get the real exam directory""" + root = pathlib.Path(__file__).parent.parent.parent + exam_path = root / "content" / "Anatomi & Histologi 2" / "Gamla tentor" + if exam_path.exists(): + return exam_path + pytest.skip("Exam directory not found") + + @pytest.mark.parametrize("exam_date,question_num", [ + ("2022-01-15", "1"), + ("2022-01-15", "2"), + ("2022-01-15", "3"), + ("2022-01-15", "4"), + ("2022-06-01", "8"), + ]) + def test_parse_real_exam_questions(self, exam_dir, exam_date, question_num): + """Test parsing real exam questions""" + file_path = exam_dir / exam_date / f"{question_num}.md" + if not file_path.exists(): + pytest.skip(f"Question file {file_path} not found") + + question = parse_question(file_path) + + # Verify metadata exists and has required fields + assert "tags" in question.metadata + assert isinstance(question.metadata["tags"], list) + assert "ah2" in question.metadata["tags"] + assert "provfråga" in question.metadata["tags"] + + # Verify content was parsed + assert len(question.raw_content) > 0 + assert len(question.nodes) > 0 + + def test_parse_all_short_named_questions(self, exam_dir): + """Test parsing all questions with short filenames (1-2 chars)""" + questions_found = 0 + + for file in sorted(exam_dir.glob("*/*.md")): + if len(file.stem) <= 2 and file.stem.isdigit(): + question = parse_question(file) + assert isinstance(question, ParsedQuestion) + assert "tags" in question.metadata + questions_found += 1 + + # Ensure we found at least some questions + assert questions_found > 0, "No exam questions found to test" + + +class TestNodeTextExtraction: + """Test text extraction from complex node structures""" + + @pytest.mark.parametrize("token,expected_text", [ + # Simple text + ({"type": "text", "raw": 
"Hello"}, "Hello"), + + # Paragraph with multiple text children + ( + { + "type": "paragraph", + "children": [ + {"type": "text", "raw": "A "}, + {"type": "text", "raw": "B "}, + {"type": "text", "raw": "C"}, + ] + }, + "A B C" + ), + + # Nested formatting + ( + { + "type": "paragraph", + "children": [ + {"type": "text", "raw": "Normal "}, + { + "type": "emphasis", + "children": [{"type": "text", "raw": "italic"}] + }, + {"type": "text", "raw": " "}, + { + "type": "strong", + "children": [{"type": "text", "raw": "bold"}] + }, + ] + }, + "Normal italic bold" + ), + + # Empty node + ({"type": "paragraph", "children": []}, ""), + ]) + def test_complex_text_extraction(self, token, expected_text): + """Test text extraction from complex nested structures""" + node = Node(token) + assert node.text == expected_text + diff --git a/quiz/quiz/utils/importer.py b/quiz/quiz/utils/importer.py index a16dccd..e97fc59 100644 --- a/quiz/quiz/utils/importer.py +++ b/quiz/quiz/utils/importer.py @@ -1,8 +1,13 @@ import re -from pathlib import Path from collections import defaultdict +from datetime import datetime +from pathlib import Path from typing import Tuple -from quiz.models import Question, Option + +from django.conf import settings + +from quiz.models import Course, Exam, Question, Option +from quiz.utils.question_parser import parse_question_from_content, Node class ImportStats: @@ -70,175 +75,10 @@ class ImportStats: return "\n".join(lines) -def parse_matching_question(content: str) -> Tuple[bool, dict]: - """ - Parse matching question from markdown. 
- - Expected format: - - Two consecutive bullet lists (with "- " prefix) - - First list = left column items (rows) - - Second list = top row items (columns) - - Answer format: "LeftItem: TopItem" pairs - - Returns: - (is_matching, question_data) where question_data contains: - - text: question text - - left_items: list of left column items - - top_items: list of top row items - - correct_pairs: list of [left_idx, top_idx] pairs (0-indexed) - - has_answer: whether it has an answer (not TODO) - - question_type: 'matching' - """ - lines = content.split('\n') - - # Extract question text (first non-empty line after frontmatter) - question_text = None - in_frontmatter = False - frontmatter_done = False - - for line in lines: - if line.strip() == '---': - if not in_frontmatter: - in_frontmatter = True - else: - in_frontmatter = False - frontmatter_done = True - continue - - if frontmatter_done and line.strip() and not line.startswith('![['): - if not line.startswith('-') and not line.startswith('```'): - question_text = line.strip().replace('**', '') - break - - if not question_text: - return True, { - 'text': None, - 'left_items': [], - 'top_items': [], - 'correct_pairs': [], - 'has_answer': False, - 'question_type': 'matching' - } - - # Extract two consecutive bullet lists - left_items = [] - top_items = [] - in_first_list = False - in_second_list = False - in_frontmatter = False - frontmatter_done = False - found_question_text = False - - for line in lines: - # Track frontmatter - if line.strip() == '---': - if not in_frontmatter: - in_frontmatter = True - else: - in_frontmatter = False - frontmatter_done = True - continue - - if in_frontmatter or not frontmatter_done: - continue - - # Skip spoiler blocks - if line.strip().startswith('```'): - break - - # Found question text - if not found_question_text and question_text in line: - found_question_text = True - continue - - if not found_question_text: - continue - - # Look for bullet lists - if 
line.strip().startswith('- '): - item = line.strip()[2:].strip() - if not item: # Empty bullet - continue - - if not in_first_list and not in_second_list: - in_first_list = True - left_items.append(item) - elif in_first_list: - left_items.append(item) - elif in_second_list: - top_items.append(item) - elif line.strip() == '': - # Empty line - transition from first list to second - if in_first_list and left_items: - in_first_list = False - in_second_list = True - elif not line.strip().startswith('-') and (in_first_list or in_second_list): - # Non-bullet line after starting lists - end of lists - break - - # Parse answer from spoiler block - correct_pairs = [] - has_answer = False - in_spoiler = False - answer_lines = [] - - for line in lines: - if line.strip().startswith('```spoiler-block'): - in_spoiler = True - continue - if in_spoiler: - if line.strip() == '```': - break - stripped = line.strip() - if stripped: - answer_lines.append(stripped) - - if answer_lines: - full_answer = ' '.join(answer_lines) - - # Check for TODO - if 'TODO' in full_answer.upper(): - has_answer = False - else: - has_answer = True - # Parse "Item: Match" format - # Example: "Smak: Lobus Insularis" - for line in answer_lines: - if ':' in line: - left_part, top_part = line.split(':', 1) - left_part = left_part.strip() - top_part = top_part.strip() - - # Find indices - left_idx = None - top_idx = None - - for idx, item in enumerate(left_items): - if left_part.lower() in item.lower() or item.lower() in left_part.lower(): - left_idx = idx - break - - for idx, item in enumerate(top_items): - if top_part.lower() in item.lower() or item.lower() in top_part.lower(): - top_idx = idx - break - - if left_idx is not None and top_idx is not None: - correct_pairs.append([left_idx, top_idx]) - - return True, { - 'text': question_text, - 'left_items': left_items, - 'top_items': top_items, - 'correct_pairs': correct_pairs, - 'has_answer': has_answer, - 'question_type': 'matching' - } - def 
parse_markdown_question(file_path: Path, content: str) -> Tuple[bool, dict]: """ - Parse a markdown file and extract question data. + Parse a markdown file and extract question data using the new question_parser. Returns: (is_mcq, question_data) where question_data contains: @@ -247,84 +87,67 @@ def parse_markdown_question(file_path: Path, content: str) -> Tuple[bool, dict]: - correct_answer: the correct answer letter(s) - has_answer: whether it has an answer (not TODO) - tags: list of tag strings + - question_type: type of question (mcq, scq, matching, etc.) """ - lines = content.split('\n') + # Parse from content string (works for both test cases and real files) + parsed = parse_question_from_content(content) - # Check for question tags in frontmatter - # Accept: frågetyp/mcq, frågetyp/scq, frågetyp/textalternativ, frågetyp/textfält - is_question = False + # Extract metadata + metadata = parsed.metadata + tags = metadata.get('tags', []) + + # Check for question type in tags question_type = None - in_frontmatter = False - - for line in lines: - if line.strip() == '---': - if in_frontmatter: - # End of frontmatter - in_frontmatter = False - break - else: - in_frontmatter = True - continue - - if in_frontmatter: - if 'frågetyp/' in line: - is_question = True - # Extract question type - if 'frågetyp/mcq' in line: - question_type = 'mcq' - elif 'frågetyp/scq' in line: - question_type = 'scq' - elif 'frågetyp/matching' in line: - question_type = 'matching' - elif 'frågetyp/textalternativ' in line: - question_type = 'textalternativ' - elif 'frågetyp/textfält' in line: - question_type = 'textfält' - - if line.strip().lower().startswith('tags:'): - # Extract tags - # Handle: tags: [tag1, tag2] or tags: tag1, tag2 - tag_content = line.split(':', 1)[1].strip() - # Remove brackets if present - tag_content = tag_content.strip('[]') - # Split by comma - tags = [t.strip() for t in tag_content.split(',') if t.strip()] - - # If it's a matching question, use the matching parser - 
if question_type == 'matching': - is_matching, matching_data = parse_matching_question(content) - if is_matching: - # Add tags to the data - matching_data['tags'] = tags if 'tags' in locals() else [] - return True, matching_data + is_question = False + for tag in tags: + if 'frågetyp/' in tag: + is_question = True + if 'frågetyp/mcq' in tag: + question_type = 'mcq' + elif 'frågetyp/scq' in tag: + question_type = 'scq' + elif 'frågetyp/matching' in tag: + question_type = 'matching' + elif 'frågetyp/textalternativ' in tag: + question_type = 'textalternativ' + elif 'frågetyp/textfält' in tag: + question_type = 'textfält' if not is_question: return False, {} - # Extract question text (first non-empty line after frontmatter) - question_text = None - in_frontmatter = False - frontmatter_done = False + # Handle matching questions separately + if question_type == 'matching': + return parse_matching_question_from_nodes(parsed.nodes, tags) - for line in lines: - if line.strip() == '---': - if not in_frontmatter: - in_frontmatter = True - else: - in_frontmatter = False - frontmatter_done = True + # Extract question text from first paragraph (skip images and special instructions) + question_text = None + for node in parsed.nodes: + if node.type != "paragraph": + continue + text = node.text.strip() + # Skip empty paragraphs + if not text: continue - if frontmatter_done and line.strip() and not line.startswith('![['): - # Skip "Välj ett/två alternativ:" lines - if 'Välj' in line and 'alternativ' in line: - continue - if not line.startswith('-') and not line.startswith('```'): - question_text = line.strip().replace('**', '') - break + # Remove inline images from text first + text = re.sub(r'!\[\[.*?\]\]', '', text).strip() + + # Skip if paragraph was only an image reference + if not text: + continue + + # Skip "Välj X alternativ" instructions + if 'Välj' in text and 'alternativ' in text: + continue + + # Clean up bold markers + text = text.replace('**', '') + if text: + 
question_text = text + break - # Return early if no question text found, but include has_answer field if not question_text: return True, { 'text': None, @@ -332,62 +155,38 @@ def parse_markdown_question(file_path: Path, content: str) -> Tuple[bool, dict]: 'correct_answer': '', 'has_answer': False, 'question_type': question_type, - 'tags': tags if 'tags' in locals() else [] + 'tags': tags } - - # Extract options (pattern: "- A:" or "- A" for MCQ, or text for textalternativ) + # Extract options from list nodes options_data = [] - in_frontmatter = False - frontmatter_done = False - in_spoiler = False - for line in lines: - # Track frontmatter to skip it - if line.strip() == '---': - if not in_frontmatter: - in_frontmatter = True - else: - in_frontmatter = False - frontmatter_done = True + for node in parsed.nodes: + if node.type != "list": continue + for item in node.children: + # Get the text of the list item + if item.type != "list_item": + continue + item_text = item.text.strip() - # Skip frontmatter and spoiler blocks - if in_frontmatter or not frontmatter_done: - continue - - if line.strip().startswith('```spoiler-block:'): - in_spoiler = True - continue - if in_spoiler: - if line.strip() == '```': - in_spoiler = False - continue - - # Match "- A: text" or "- A: " or just "- A" - match = re.match(r'^-\s*([A-Z]):\s*(.*)$', line.strip()) - if not match: - # Also try "- A" without colon - match = re.match(r'^-\s*([A-Z])$', line.strip()) - - if match: - letter = match.group(1) - text = match.group(2) if len(match.groups()) > 1 else "" - options_data.append((letter, text.strip())) - else: - # For textalternativ, options might be plain text items - if question_type in ['textalternativ', 'textfält'] and line.strip().startswith('-') and not line.strip().startswith('--'): - # Extract text after dash - option_text = line.strip()[1:].strip() - # Skip if it's a sub-question marker like "a)" or "b)" - if option_text and not re.match(r'^[a-z]\)', option_text): - # Use 
incrementing letters for text options + # Match "A: text" or just "A" + match = re.match(r'^([A-Z]):\s*(.*)$', item_text) + if match: + letter = match.group(1) + text = match.group(2).strip() + options_data.append((letter, text)) + elif re.match(r'^([A-Z])$', item_text): + letter = item_text + options_data.append((letter, '')) + elif question_type in ['textalternativ', 'textfält']: + # For text-based questions, use incrementing letters + if not re.match(r'^[a-z]\)', item_text): # Skip sub-question markers letter = chr(ord('A') + len(options_data)) - options_data.append((letter, option_text)) + options_data.append((letter, item_text)) # For text-based questions, options are optional if not options_data: - # At least return something for single-option questions options_data = [('A', '')] elif len(options_data) < 2 and question_type in ['mcq', 'scq']: return True, { @@ -395,44 +194,34 @@ def parse_markdown_question(file_path: Path, content: str) -> Tuple[bool, dict]: 'options': options_data, 'correct_answer': '', 'has_answer': False, - 'question_type': question_type + 'question_type': question_type, + 'tags': tags } # Extract answer from spoiler block correct_answer = None has_answer = False - in_spoiler = False - answer_lines = [] - for line in lines: - if line.strip().startswith('```spoiler-block:'): - in_spoiler = True - continue - if in_spoiler: - if line.strip() == '```': - break - stripped = line.strip() - if stripped: - answer_lines.append(stripped) + for node in parsed.nodes: + if node.type == "block_code" and node.attrs.get("info") == "spoiler-block:": + answer_text = node.raw.strip() - # Process collected answer lines - if answer_lines: - full_answer = ' '.join(answer_lines) - - # Check for TODO - if 'TODO' in full_answer.upper(): - has_answer = False - else: - has_answer = True - - # For MCQ/SCQ: Extract capital letters - if question_type in ['mcq', 'scq']: - letters = re.findall(r'\b([A-Z])\b', full_answer) - if letters: - correct_answer = 
def _find_item_index(label, items):
    """Return the index of the entry in *items* that matches *label*, or None.

    Matching is case-insensitive. An exact match is preferred; only when no
    item is exactly equal does the lookup fall back to a substring match in
    either direction (the answer text is often an abbreviated or expanded
    form of the list item). Empty labels and empty items never match, since
    the empty string is a substring of everything.
    """
    wanted = label.strip().casefold()
    if not wanted:
        return None

    candidates = [item.casefold() for item in items]

    # Exact match first, so "Femur head" is not captured by an earlier "Femur".
    for idx, candidate in enumerate(candidates):
        if candidate == wanted:
            return idx

    for idx, candidate in enumerate(candidates):
        if candidate and (wanted in candidate or candidate in wanted):
            return idx

    return None


def parse_matching_question_from_nodes(nodes: "list[Node]", tags: list) -> Tuple[bool, dict]:
    """
    Parse matching question from parsed nodes.

    Expected format:
    - Two bullet lists (the first two "list" nodes are used)
    - First list = left column items (rows)
    - Second list = top row items (columns)
    - Answer format: "LeftItem: TopItem" pairs, one per line, inside a
      code block whose info string is "spoiler-block:"

    Args:
        nodes: Parsed markdown nodes of the question body.
        tags: Tags to pass through unchanged into the result.

    Returns:
        (is_matching, question_data). The first element is always True.
        ``question_data['correct_pairs']`` is a list of
        ``[left_index, top_index]`` pairs (0-based). ``has_answer`` is False
        when there is no spoiler block or it contains "TODO".
    """
    # Question text: first paragraph that still has text once inline
    # ![[...]] embeds are stripped; bold markers are removed.
    question_text = None
    for node in nodes:
        if node.type != "paragraph":
            continue
        text = re.sub(r'!\[\[.*?\]\]', '', node.text.strip()).strip()
        if not text:
            continue
        question_text = text.replace('**', '')
        break

    if not question_text:
        return True, {
            'text': None,
            'left_items': [],
            'top_items': [],
            'correct_pairs': [],
            'has_answer': False,
            'question_type': 'matching',
            'tags': tags
        }

    # The first two lists hold the row and column labels respectively.
    left_items = []
    top_items = []
    list_nodes = [node for node in nodes if node.type == "list"]
    if len(list_nodes) >= 2:
        left_items = [
            item.text.strip()
            for item in list_nodes[0].children
            if item.type == "list_item"
        ]
        top_items = [
            item.text.strip()
            for item in list_nodes[1].children
            if item.type == "list_item"
        ]

    # Only the first spoiler block is considered.
    correct_pairs = []
    has_answer = False
    spoiler = next(
        (
            node for node in nodes
            if node.type == "block_code"
            and node.attrs.get("info") == "spoiler-block:"
        ),
        None,
    )

    if spoiler is not None:
        answer_text = spoiler.raw.strip()
        # A TODO marker means the answer has not been written yet.
        if 'TODO' not in answer_text.upper():
            has_answer = True
            for line in answer_text.splitlines():
                left_part, separator, top_part = line.strip().partition(':')
                if not separator:
                    continue
                left_idx = _find_item_index(left_part, left_items)
                top_idx = _find_item_index(top_part, top_items)
                # Lines that cannot be resolved on both sides are skipped.
                if left_idx is not None and top_idx is not None:
                    correct_pairs.append([left_idx, top_idx])

    return True, {
        'text': question_text,
        'left_items': left_items,
        'top_items': top_items,
        'correct_pairs': correct_pairs,
        'has_answer': has_answer,
        'question_type': 'matching',
        'tags': tags
    }
__all__ = ["obsidian_embed"]

# https://help.obsidian.md/embeds

# Supported:
# ![[image-4.png|292x316]]
# ![[image-4.png|292]]
# ![[image-4.png]]


def _parse_dimension(value):
    """Return *value* as an int, or None when it is empty or not a number."""
    try:
        return int(value)
    except ValueError:
        return None


def parse_embed(inline, match, state):
    """Turn one ``![[target|size]]`` match into an ``embed`` token.

    Args:
        inline: The inline parser invoking this handler (unused).
        match: Regex match exposing the named group ``filename``, which holds
            everything between ``![[`` and ``]]``.
        state: Inline state; the token is added through ``state.append_token``.

    Returns:
        The end offset of the match, so parsing resumes after the embed.

    The token's ``attrs`` always contain ``filename``. ``width`` and/or
    ``height`` are added when the part after ``|`` is ``W``, ``WxH``, ``Wx``
    or ``xH``. A size part that is not numeric is ignored rather than
    raising, so one odd embed cannot abort parsing of the whole document.
    """
    target = match.group("filename")
    filename, _, size = target.partition("|")
    attrs = {"filename": filename}

    if size:
        # "292x316" -> ("292", "316"); a bare "292" -> ("292", "").
        width_text, _, height_text = size.partition("x")
        for key, text in (("width", width_text), ("height", height_text)):
            dimension = _parse_dimension(text)
            if dimension is not None:
                attrs[key] = dimension

    state.append_token({"type": "embed", "attrs": attrs})
    return match.end()


INLINE_EMBED_PATTERN = (
    r'!\[\['                 # begins with ![[
    r'(?!\s)'                # first character is not whitespace
    r'(?P<filename>.+?)'     # content between `![[xx]]`
    r'(?!\s)'                # lookahead kept from the original pattern
    r'\]\]'                  # closing ]]
)


def obsidian_embed(md: "Markdown") -> None:
    """Register the Obsidian ``![[...]]`` embed rule on a mistune Markdown instance.

    The rule is registered under the name ``embed`` and placed before the
    ``link`` rule.
    """
    md.inline.register('embed', INLINE_EMBED_PATTERN, parse_embed, before="link")
0000000..6dd750d --- /dev/null +++ b/quiz/quiz/utils/question_parser.py @@ -0,0 +1,89 @@ +import dataclasses +import pathlib + +import frontmatter +import mistune + +from quiz.utils.obsidian_embed_plugin import obsidian_embed + +markdown = mistune.create_markdown(renderer="ast", plugins=[obsidian_embed]) + + +class Node: + def __init__(self, token): + self.type = token["type"] + self.raw = token.get("raw", "") + self.attrs = token.get("attrs", {}) + self.children = [Node(token=child) for child in token.get("children", [])] + + def __repr__(self) -> str: + attrs = [] + if self.raw: + attrs.append(f"raw={self.raw!r}") + if self.attrs: + attrs.append(f"attrs={self.attrs!r}") + if self.children: + attrs.append(f"children={self.children!r}") + # block_text -> BlockText + pretty = self.type.replace("_", " ").title().replace(" ", "") + return f"{pretty}(" + ", ".join(attrs) + ")" + + @property + def text(self) -> str: + if self.type == "text": + return self.raw + texts = [] + for child in self.children: + texts.append(child.text) + return "".join(texts) + + +@dataclasses.dataclass +class ParsedQuestion: + metadata: dict = dataclasses.field(default_factory=dict) + raw_content: str = "" + nodes: list[Node] = dataclasses.field(default_factory=list) + + +def parse_question(path: pathlib.Path): + raw = path.read_text(encoding="utf-8") + return parse_question_from_content(raw) + + +def parse_question_from_content(content_str: str): + """Parse question from a content string instead of a file.""" + metadata, content = frontmatter.parse(content_str) + tokens = markdown(content) + question = ParsedQuestion( + metadata=metadata, + raw_content=content, + nodes=[Node(token=token) for token in tokens], + ) + return question + + +def main(): + root = pathlib.Path(__file__).parent.parent.parent.parent + print(root) + exams = root / "content" / "Anatomi & Histologi 2" / "Gamla tentor" + for file in sorted(exams.glob("*/*.md")): + if len(file.stem) > 2: + continue + question = 
parse_question(file) + print(question.metadata, repr(question.raw_content)) + continue + for node in question.nodes: + match node.type: + case "heading": + print("Heading:", repr(node.text)) + case "paragraph": + print("Paragraph:", repr(node.text)) + case "list": + print("List:") + for child in node.children: + print(" - List item:", repr(child.text)) + case "block_code" if node.attrs["info"] == "spoiler-block:": + print("Spoiler:", repr(node.raw.rstrip())) + +if __name__ == "__main__": + main() \ No newline at end of file