# medical-notes/stroma/quiz/utils/tests/test_question_parser.py

"""
Comprehensive test suite for the question_parser module.

This test suite uses pytest's parametrize decorator to test multiple scenarios
with minimal code duplication. It covers:

1. Node class:
   - Initialization with different token types
   - Attribute handling
   - Children node processing
   - String representation (__repr__)
   - Text extraction from nested structures

2. parse_question function:
   - Metadata parsing (tags, dates, etc.)
   - Raw content extraction
   - Different question types (MCQ, SCQ, text field, matching)
   - Questions with images
   - Edge cases (empty content, missing frontmatter)
   - Document structure preservation

3. ParsedQuestion dataclass:
   - Default values
   - Initialization with custom values

4. Real exam questions:
   - Parsing actual exam questions from the content directory
   - Validation of all short-named question files

Test execution:
    pytest tests/test_question_parser.py -v              # Verbose output
    pytest tests/test_question_parser.py -k "mcq"        # Run only MCQ tests
    pytest tests/test_question_parser.py --collect-only  # List all tests
"""

import pathlib
import tempfile
import pytest
from quiz.utils.question_parser import Node, ParsedQuestion, parse_question


@pytest.fixture
def temp_dir():
    """Yield a fresh temporary directory as a ``pathlib.Path``.

    The directory is cleaned up once the generator resumes after the yield.
    """
    scratch = tempfile.TemporaryDirectory()
    try:
        yield pathlib.Path(scratch.name)
    finally:
        # Mirrors what leaving a ``with TemporaryDirectory()`` block would do.
        scratch.cleanup()


@pytest.fixture
def create_question_file(temp_dir):
    """Return a factory that writes question files into ``temp_dir``.

    The factory takes a filename and the text to store, writes the text
    UTF-8 encoded, and returns the path of the written file.
    """
    def _write_question(filename: str, content: str) -> pathlib.Path:
        target = temp_dir.joinpath(filename)
        target.write_text(content, encoding="utf-8")
        return target

    return _write_question


class TestNode:
    """Unit tests for building ``Node`` objects directly from token dicts."""

    @pytest.mark.parametrize(
        ("token", "expected_type", "expected_raw"),
        [
            ({"type": "paragraph"}, "paragraph", ""),
            ({"type": "heading", "raw": "Test Heading"}, "heading", "Test Heading"),
            ({"type": "text", "raw": "Some text"}, "text", "Some text"),
            ({"type": "list"}, "list", ""),
        ],
    )
    def test_node_initialization(self, token, expected_type, expected_raw):
        """``type`` mirrors the token; ``raw`` is expected to be '' when absent."""
        built = Node(token)
        assert built.type == expected_type
        assert built.raw == expected_raw

    @pytest.mark.parametrize(
        ("token", "expected_attrs"),
        [
            (
                {"type": "block_code", "attrs": {"info": "spoiler-block:"}},
                {"info": "spoiler-block:"},
            ),
            ({"type": "paragraph"}, {}),
            ({"type": "heading", "attrs": {"level": 2}}, {"level": 2}),
        ],
    )
    def test_node_attributes(self, token, expected_attrs):
        """``attrs`` mirrors the token; an empty dict is expected when absent."""
        built = Node(token)
        assert built.attrs == expected_attrs

    def test_node_children(self):
        """Child tokens are expected to become nodes, in their original order."""
        parent = Node(
            {
                "type": "paragraph",
                "children": [
                    {"type": "text", "raw": "Hello "},
                    {"type": "text", "raw": "World"},
                ],
            }
        )
        kids = parent.children
        assert len(kids) == 2
        for position, wanted_raw in enumerate(("Hello ", "World")):
            child = kids[position]
            assert child.type == "text"
            assert child.raw == wanted_raw

    @pytest.mark.parametrize(
        ("token", "expected_repr_contains"),
        [
            ({"type": "text", "raw": "test"}, "Text(raw='test')"),
            ({"type": "paragraph"}, "Paragraph()"),
            (
                {"type": "block_code", "attrs": {"info": "python"}},
                "BlockCode(attrs={'info': 'python'})",
            ),
        ],
    )
    def test_node_repr(self, token, expected_repr_contains):
        """Check ``repr(Node)``.

        Despite the parameter name, the comparison is exact equality, not a
        substring match.
        """
        rendered = repr(Node(token))
        assert rendered == expected_repr_contains

    @pytest.mark.parametrize(
        ("token", "expected_text"),
        [
            # Leaf text token.
            ({"type": "text", "raw": "Simple text"}, "Simple text"),
            # Paragraph with two sibling text tokens.
            (
                {
                    "type": "paragraph",
                    "children": [
                        {"type": "text", "raw": "Hello "},
                        {"type": "text", "raw": "World"},
                    ],
                },
                "Hello World",
            ),
            # Paragraph whose middle child is itself a container.
            (
                {
                    "type": "paragraph",
                    "children": [
                        {"type": "text", "raw": "Nested "},
                        {
                            "type": "strong",
                            "children": [{"type": "text", "raw": "bold"}],
                        },
                        {"type": "text", "raw": " text"},
                    ],
                },
                "Nested bold text",
            ),
        ],
    )
    def test_node_text_property(self, token, expected_text):
        """``text`` is expected to join the raw text of all descendants."""
        built = Node(token)
        assert built.text == expected_text


class TestParseQuestion:
    """Tests for ``parse_question`` run against files written to a temp dir."""

    @staticmethod
    def _parse_text(write_file, filename, text):
        """Write *text* to *filename* via the factory fixture and parse that file."""
        return parse_question(write_file(filename, text))

    @staticmethod
    def _nodes_of_type(parsed, kind):
        """Return the top-level nodes of *parsed* whose ``type`` equals *kind*."""
        return [node for node in parsed.nodes if node.type == kind]

    @pytest.mark.parametrize(
        ("content", "expected_tags"),
        [
            # Inline (flow-style) tag list.
            (
                "---\n"
                "tags: [ah2, provfråga, frågetyp/mcq]\n"
                "date: 2022-01-15\n"
                "---\n"
                "Question content",
                ["ah2", "provfråga", "frågetyp/mcq"],
            ),
            # Block-style tag list.
            (
                "---\n"
                "tags:\n"
                "  - ah2\n"
                "  - provfråga\n"
                "  - frågetyp/scq\n"
                "date: 2023-05-31\n"
                "---\n"
                "Question content",
                ["ah2", "provfråga", "frågetyp/scq"],
            ),
        ],
    )
    def test_parse_metadata_tags(self, create_question_file, content, expected_tags):
        """Both tag notations are expected to yield the same kind of tag list."""
        parsed = self._parse_text(create_question_file, "test.md", content)
        assert parsed.metadata["tags"] == expected_tags

    @pytest.mark.parametrize(
        ("content", "expected_date"),
        [
            (
                "---\n"
                "tags: [ah2]\n"
                "date: 2022-01-15\n"
                "---\n"
                "Content",
                "2022-01-15",
            ),
            (
                "---\n"
                "tags: [ah2]\n"
                "date: 2023-05-31\n"
                "---\n"
                "Content",
                "2023-05-31",
            ),
        ],
    )
    def test_parse_metadata_date(self, create_question_file, content, expected_date):
        """The ``date`` metadata value, converted with ``str``, matches the source."""
        parsed = self._parse_text(create_question_file, "test.md", content)
        # Compared via str() so the check does not depend on the parsed type.
        assert str(parsed.metadata["date"]) == expected_date

    @pytest.mark.parametrize(
        ("content", "expected_raw"),
        [
            (
                "---\n"
                "tags: [ah2]\n"
                "---\n"
                "Simple question",
                "Simple question",
            ),
            (
                "---\n"
                "tags: [ah2]\n"
                "---\n"
                "Question with **bold** text",
                "Question with **bold** text",
            ),
        ],
    )
    def test_parse_raw_content(self, create_question_file, content, expected_raw):
        """``raw_content`` (whitespace-stripped) is the body after the frontmatter."""
        parsed = self._parse_text(create_question_file, "test.md", content)
        assert parsed.raw_content.strip() == expected_raw

    def test_parse_mcq_question(self, create_question_file):
        """A full multiple-choice question yields paragraph, list and spoiler nodes."""
        content = """---
tags: [ah2, provfråga, frågetyp/mcq, cerebrum]
date: 2022-01-15
---
Vilka av följande räknas till storhjärnans basala kärnor?

**Välj två alternativ**
- A: Putamen
- B: Nucleus Ruber
- C: Substantia nigra
- D: Nucleus caudatus

```spoiler-block:
A och D
```
"""
        parsed = self._parse_text(create_question_file, "mcq.md", content)

        wanted_tags = ["ah2", "provfråga", "frågetyp/mcq", "cerebrum"]
        assert parsed.metadata["tags"] == wanted_tags
        assert len(parsed.nodes) > 0

        # At least one paragraph and one list must be present at the top level.
        assert len(self._nodes_of_type(parsed, "paragraph")) > 0
        assert len(self._nodes_of_type(parsed, "list")) > 0

        # The first fenced block is the spoiler holding the answer.
        fenced = self._nodes_of_type(parsed, "block_code")
        assert len(fenced) > 0
        answer_block = fenced[0]
        assert answer_block.attrs.get("info") == "spoiler-block:"
        assert "A och D" in answer_block.raw

    def test_parse_scq_question(self, create_question_file):
        """A single-choice question keeps its tag and produces a list node."""
        content = """---
tags: [ah2, provfråga, frågetyp/scq, histologi]
date: 2022-06-01
---
Vilken del av CNS syns i bild?
- A: Cerebellum
- B: Diencephalon
- C: Medulla spinalis
- D: Cerebrum
- E: Pons

```spoiler-block:
A
```
"""
        parsed = self._parse_text(create_question_file, "scq.md", content)

        assert "frågetyp/scq" in parsed.metadata["tags"]
        assert len(self._nodes_of_type(parsed, "list")) > 0

    def test_parse_text_field_question(self, create_question_file):
        """A text-field question keeps its tag and produces at least one node."""
        content = """---
tags: [ah2, provfråga, frågetyp/textfält, öga, anatomi]
date: 2022-01-15
---
![[image-2.png|301x248]]
**Fyll i rätt siffra!**

(0.5p per rätt svar, inga avdrag för fel svar):

a) Vilken siffra pekar på gula fläcken?
b) Vilken siffra pekar på choroidea?

```spoiler-block:
a) 7
b) 6
```
"""
        parsed = self._parse_text(create_question_file, "textfield.md", content)

        assert "frågetyp/textfält" in parsed.metadata["tags"]
        assert len(parsed.nodes) > 0

    def test_parse_matching_question(self, create_question_file):
        """A matching question keeps its tag and produces a list node."""
        content = """---
tags: [ah2, provfråga, frågetyp/matching, histologi]
date: 2023-05-31
---
Vilka av följande stödjeceller finns i CNS? Markera JA eller NEJ för varje angiven celltyp:
(1p för alla rätt, inga delpoäng)

- a) oligodendrocyter
- b) Astrocyter
- c) satellitceller
- d) ependymceller
- e) mikroglia
- f) Schwannceller

- JA, finn i CNS
- NEJ, finns inte i CNS

```spoiler-block:
a) JA, finn i CNS
b) JA, finn i CNS
c) NEJ, finns inte i CNS
d) JA, finn i CNS
e) JA, finn i CNS
f) NEJ, finns inte i CNS
```
"""
        parsed = self._parse_text(create_question_file, "matching.md", content)

        assert "frågetyp/matching" in parsed.metadata["tags"]
        assert len(self._nodes_of_type(parsed, "list")) > 0

    def test_parse_question_with_image(self, create_question_file):
        """A leading ``![[file|WxH]]`` embed becomes an ``embed`` node with attrs."""
        content = """---
tags: [ah2, provfråga, frågetyp/textfält, öra, anatomi, bild]
date: 2022-01-15
---
![[image-4.png|292x316]]
**Fyll i rätt siffra !**

(0.5p per rätt svar, inga avdrag för fel svar):

a) Vilken siffra pekar på incus? (1..19)
b) Vilken siffra pekar på tuba auditiva? (1..19)

```spoiler-block:
a) 7
b) 18
```
"""
        parsed = self._parse_text(create_question_file, "image_q.md", content)

        assert "bild" in parsed.metadata["tags"]
        assert "![[image-4.png" in parsed.raw_content

        # The embed is expected as the first child of the first top-level node.
        first_block = parsed.nodes[0]
        image_node = first_block.children[0]
        assert image_node.type == "embed"
        assert image_node.attrs == dict(filename="image-4.png", width=292, height=316)

    @pytest.mark.parametrize(
        "invalid_content",
        [
            "",  # Empty content
            "No frontmatter",  # No frontmatter
            "---\n---\n",  # Empty frontmatter
        ],
    )
    def test_parse_edge_cases(self, create_question_file, invalid_content):
        """Degenerate inputs must still come back as a ``ParsedQuestion``."""
        parsed = self._parse_text(create_question_file, "edge.md", invalid_content)
        assert isinstance(parsed, ParsedQuestion)

    def test_parse_question_preserves_structure(self, create_question_file):
        """Every block kind in the source shows up among the top-level node types."""
        content = """---
tags: [ah2]
---
# Heading

Paragraph text

- List item 1
- List item 2

```spoiler-block:
Answer
```
"""
        parsed = self._parse_text(create_question_file, "structure.md", content)

        seen_types = [node.type for node in parsed.nodes]
        for wanted in ("heading", "paragraph", "list", "block_code"):
            assert wanted in seen_types


class TestParsedQuestionDataclass:
    """Tests for constructing ``ParsedQuestion`` directly."""

    def test_parsed_question_defaults(self):
        """With no arguments, every field is expected to be empty."""
        blank = ParsedQuestion()
        assert blank.metadata == {}
        assert blank.raw_content == ""
        assert blank.nodes == []

    def test_parsed_question_initialization(self):
        """Keyword arguments are expected to be stored on the matching fields."""
        supplied = {
            "metadata": {"tags": ["test"], "date": "2022-01-15"},
            "raw_content": "Test content",
            "nodes": [Node({"type": "paragraph"})],
        }

        built = ParsedQuestion(**supplied)

        # Checked in the same order as the fields were supplied.
        for field_name, value in supplied.items():
            assert getattr(built, field_name) == value


class TestRealQuestions:
    """Tests that parse real exam question files, skipping when they are absent."""

    @pytest.fixture
    def exam_dir(self):
        """Return the old-exams directory, or skip the test if it does not exist.

        The directory is looked up under ``content`` three directory levels
        above this test file.
        """
        here = pathlib.Path(__file__)
        base = here.parent.parent.parent
        candidate = base / "content" / "Anatomi & Histologi 2" / "Gamla tentor"
        if not candidate.exists():
            pytest.skip("Exam directory not found")
        return candidate

    @pytest.mark.parametrize(
        ("exam_date", "question_num"),
        [
            ("2022-01-15", "1"),
            ("2022-01-15", "2"),
            ("2022-01-15", "3"),
            ("2022-01-15", "4"),
            ("2022-06-01", "8"),
        ],
    )
    def test_parse_real_exam_questions(self, exam_dir, exam_date, question_num):
        """Parse ``<exam_date>/<question_num>.md`` and check tags and content."""
        question_path = exam_dir / exam_date / f"{question_num}.md"
        if not question_path.exists():
            pytest.skip(f"Question file {question_path} not found")

        parsed = parse_question(question_path)

        # Metadata must carry a tag list that includes the two required tags.
        assert "tags" in parsed.metadata
        tags = parsed.metadata["tags"]
        assert isinstance(tags, list)
        assert "ah2" in tags
        assert "provfråga" in tags

        # Both the raw body and the parsed node list must be non-empty.
        assert len(parsed.raw_content) > 0
        assert len(parsed.nodes) > 0

    def test_parse_all_short_named_questions(self, exam_dir):
        """Parse every ``*/N.md`` file whose stem is one or two digits."""
        numbered_files = [
            path
            for path in sorted(exam_dir.glob("*/*.md"))
            if path.stem.isdigit() and len(path.stem) <= 2
        ]

        for path in numbered_files:
            parsed = parse_question(path)
            assert isinstance(parsed, ParsedQuestion)
            assert "tags" in parsed.metadata

        # Guard against the loop silently checking nothing.
        assert len(numbered_files) > 0, "No exam questions found to test"


class TestNodeTextExtraction:
    """Tests for ``Node.text`` on flat, nested and empty token trees."""

    @pytest.mark.parametrize(
        ("token", "expected_text"),
        [
            # Simple text
            ({"type": "text", "raw": "Hello"}, "Hello"),
            # Paragraph with multiple text children
            (
                {
                    "type": "paragraph",
                    "children": [
                        {"type": "text", "raw": "A "},
                        {"type": "text", "raw": "B "},
                        {"type": "text", "raw": "C"},
                    ],
                },
                "A B C",
            ),
            # Nested formatting
            (
                {
                    "type": "paragraph",
                    "children": [
                        {"type": "text", "raw": "Normal "},
                        {
                            "type": "emphasis",
                            "children": [{"type": "text", "raw": "italic"}],
                        },
                        {"type": "text", "raw": " "},
                        {
                            "type": "strong",
                            "children": [{"type": "text", "raw": "bold"}],
                        },
                    ],
                },
                "Normal italic bold",
            ),
            # Empty node
            ({"type": "paragraph", "children": []}, ""),
        ],
    )
    def test_complex_text_extraction(self, token, expected_text):
        """``text`` is expected to be the concatenated raw text of the whole tree."""
        tree = Node(token)
        assert tree.text == expected_text