# medical-notes/quiz/quiz/utils/importer.py

import re
from pathlib import Path
from collections import defaultdict
from typing import Tuple
from quiz.models import Question, Option


class ImportStats:
    """Counters collected during an import run, overall and per exam folder."""

    def __init__(self):
        # Overall counters; callers increment these directly.
        self.total_files = 0
        self.mcq_questions = 0
        self.non_mcq_skipped = 0
        self.questions_with_answers = 0
        self.questions_with_todo = 0
        self.created = 0
        self.updated = 0
        self.errors = 0
        # Per-folder counters, created with zeroed values on first access.
        self.by_folder = defaultdict(
            lambda: {'total': 0, 'mcq': 0, 'answered': 0, 'todo': 0}
        )

    def format_output(self) -> str:
        """Render the collected counters as a multi-line console report.

        The error line is only included when at least one error was recorded,
        and the completion percentages are only included when the relevant
        MCQ count is positive (which also avoids dividing by zero).
        """
        heavy_rule = "=" * 70
        light_rule = "-" * 70

        report = [
            "\n" + heavy_rule,
            "QUESTION IMPORT STATISTICS",
            heavy_rule,
            f"Total .md files found:     {self.total_files}",
            f"MCQ questions found:       {self.mcq_questions}",
            f"Non-MCQ skipped:           {self.non_mcq_skipped}",
            f"Questions with answers:    {self.questions_with_answers}",
            f"Questions with TODO:       {self.questions_with_todo}",
            f"Created in database:       {self.created}",
            f"Updated in database:       {self.updated}",
        ]

        if self.errors > 0:
            report.append(f"Errors:                    {self.errors}")

        if self.mcq_questions > 0:
            overall = self.questions_with_answers / self.mcq_questions * 100
            report.append(f"Overall completion:        {overall:.1f}%")

        report += ["\n" + light_rule, "COMPLETION BY EXAM FOLDER", light_rule]

        # Folders are listed alphabetically; folders without MCQs are omitted.
        for folder, counts in sorted(self.by_folder.items()):
            if counts['mcq'] <= 0:
                continue
            share = counts['answered'] / counts['mcq'] * 100
            report.append(
                f"{folder:20} {counts['answered']:3}/{counts['mcq']:3} MCQ ({share:5.1f}%)"
            )

        report.append(heavy_rule + "\n")
        return "\n".join(report)


# Frontmatter tags that mark a note as a multiple/single-choice question.
_MCQ_TAGS = ('frågetyp/mcq', 'frågetyp/scq')
# "- A: text", "- A:" or a bare "- A" (applied to the stripped line).
_OPTION_PATTERN = re.compile(r'^-\s*([A-Z])(?::\s*(.*))?$')
# A usable spoiler answer is exactly one uppercase letter.
_ANSWER_PATTERN = re.compile(r'^([A-Z])$')
_SPOILER_OPEN = '```spoiler-block:'
_FENCE_CLOSE = '```'


def _split_frontmatter(lines):
    """Split lines into (frontmatter, body) using only the first '---' pair."""
    delimiters = (idx for idx, line in enumerate(lines) if line.strip() == '---')
    start = next(delimiters, None)
    if start is None:
        return [], []
    end = next(delimiters, None)
    if end is None:
        # Unterminated frontmatter: there is no body to parse.
        return lines[start + 1:], []
    return lines[start + 1:end], lines[end + 1:]


def parse_markdown_question(file_path: Path, content: str) -> Tuple[bool, dict]:
    """
    Parse a markdown file and extract question data.

    Only the first ``---`` ... ``---`` pair is treated as frontmatter; later
    ``---`` lines are ordinary horizontal rules in the body. Content inside
    spoiler blocks is used solely to determine the answer, so it can never be
    mistaken for the question text or for an option.

    Args:
        file_path: Path of the file being parsed (currently unused; kept for
            interface compatibility).
        content: Full text of the markdown file.

    Returns:
        (is_mcq, question_data). ``is_mcq`` is True when the frontmatter
        contains an MCQ/SCQ tag. ``question_data`` is empty when the file is
        not an MCQ, has no question text, or has fewer than two options;
        otherwise it contains:
        - text: question text (first suitable body line, ``**`` removed)
        - options: list of (letter, text) tuples
        - correct_answer: single uppercase answer letter, or None
        - has_answer: whether a valid answer was found (not TODO/missing)
    """
    frontmatter, body = _split_frontmatter(content.split('\n'))

    if not any(tag in line for line in frontmatter for tag in _MCQ_TAGS):
        return False, {}

    question_text = None
    options_data = []
    correct_answer = None
    in_spoiler = False
    answer_resolved = False  # only the first spoiler block decides the answer

    for raw_line in body:
        stripped = raw_line.strip()

        if in_spoiler:
            if stripped == _FENCE_CLOSE:
                in_spoiler = False
                answer_resolved = True
            elif stripped and not answer_resolved:
                # The first non-empty line is the answer; anything that is not
                # a single letter (e.g. "TODO") means "no answer yet".
                answer_match = _ANSWER_PATTERN.match(stripped)
                if answer_match:
                    correct_answer = answer_match.group(1)
                answer_resolved = True
            continue

        if stripped.startswith(_SPOILER_OPEN):
            in_spoiler = True
            continue

        option_match = _OPTION_PATTERN.match(stripped)
        if option_match:
            option_text = option_match.group(2) or ""
            options_data.append((option_match.group(1), option_text.strip()))
            continue

        if question_text is not None or not stripped:
            continue
        # Skip image embeds, list items / rules and code fences.
        if stripped.startswith(('![[', '-', '```')):
            continue
        # Skip "Välj ett/två alternativ:" instruction lines.
        if 'Välj' in stripped and 'alternativ' in stripped:
            continue
        question_text = stripped.replace('**', '')

    if not question_text or len(options_data) < 2:
        return True, {}

    return True, {
        'text': question_text,
        'options': options_data,
        'correct_answer': correct_answer,
        'has_answer': correct_answer is not None
    }


def import_question_file(file_path: Path, base_path: Path, stats: ImportStats):
    """Parse one markdown file and store it as a Question when it is answered.

    Counters on ``stats`` are updated along the way. Files without an MCQ tag
    are counted as skipped. MCQ-tagged files that yield no usable question
    data count toward the MCQ totals and toward ``non_mcq_skipped``. Questions
    without an answer are counted as TODO and are not written to the database.
    Any exception is counted in ``stats.errors`` and reported on stdout
    instead of being propagated.
    """
    try:
        raw_markdown = file_path.read_text(encoding='utf-8')
        is_mcq, parsed = parse_markdown_question(file_path, raw_markdown)

        # The first path component below base_path is the exam folder; files
        # located directly in base_path are grouped under 'root'.
        parts = file_path.relative_to(base_path).parts
        folder_name = 'root' if len(parts) <= 1 else parts[0]
        folder_stats = stats.by_folder[folder_name]
        folder_stats['total'] += 1

        if not is_mcq:
            stats.non_mcq_skipped += 1
            return

        stats.mcq_questions += 1
        folder_stats['mcq'] += 1

        if not (parsed and parsed.get('text')):
            stats.non_mcq_skipped += 1
            return

        if parsed['has_answer']:
            stats.questions_with_answers += 1
            folder_stats['answered'] += 1
        else:
            stats.questions_with_todo += 1
            folder_stats['todo'] += 1
            return  # Unanswered questions are not imported

        # The stored path is relative to the parent of base_path.
        stored_path = str(file_path.relative_to(base_path.parent))
        question_fields = {
            'text': parsed['text'],
            'correct_answer': parsed['correct_answer'],
        }
        question, was_created = Question.objects.update_or_create(
            file_path=stored_path,
            defaults=question_fields,
        )

        if was_created:
            stats.created += 1
        else:
            stats.updated += 1

        # Replace the option set wholesale so removed options do not linger.
        question.options.all().delete()
        for letter, option_text in parsed['options']:
            Option.objects.create(question=question, letter=letter, text=option_text)

    except Exception as exc:
        stats.errors += 1
        print(f"Error importing {file_path}: {exc}")


def import_questions(folder_path: Path, base_path: Path = None) -> ImportStats:
    """
    Recursively import every ``*.md`` file found under a folder.

    Args:
        folder_path: Folder that is searched recursively for markdown files
        base_path: Base used for relative path calculations; when omitted,
            folder_path itself is used

    Returns:
        ImportStats object describing what was found and imported
    """
    stats = ImportStats()
    effective_base = folder_path if base_path is None else base_path

    for markdown_file in folder_path.rglob('*.md'):
        stats.total_files += 1
        import_question_file(markdown_file, effective_base, stats)

    return stats


def delete_question_by_path(file_path: Path, base_path: Path):
    """Remove the Question rows stored for the given file.

    The lookup key is the file path relative to the parent of base_path.
    Failures are reported on stdout instead of being raised.
    """
    try:
        stored_path = str(file_path.relative_to(base_path.parent))
        matching_questions = Question.objects.filter(file_path=stored_path)
        matching_questions.delete()
        print(f"Deleted question: {stored_path}")
    except Exception as exc:
        print(f"Error deleting question {file_path}: {exc}")