import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Tuple

import django.db.utils
from django.conf import settings
from django.utils.text import slugify

from quiz.models import Course, Exam, Question, Option, Tag
from quiz.utils.question_parser import parse_question_from_content, Node


class ImportStats:
    """Track import statistics by exam folder."""

    def __init__(self):
        self.total_files = 0
        self.mcq_questions = 0
        self.non_mcq_skipped = 0
        self.questions_with_answers = 0
        self.questions_with_todo = 0
        self.created = 0
        self.updated = 0
        self.errors = 0
        # Per-folder counters: files seen, MCQs found, answered, TODO-marked.
        self.by_folder = defaultdict(lambda: {
            'total': 0,
            'mcq': 0,
            'answered': 0,
            'todo': 0
        })

    def has_changes(self) -> bool:
        """Check if there were any actual changes (creates, updates, or errors)."""
        return self.created > 0 or self.updated > 0 or self.errors > 0

    def format_output(self, show_if_no_changes: bool = True) -> str:
        """
        Format statistics for console output.

        Args:
            show_if_no_changes: If False, returns empty string when no changes.

        Returns:
            A multi-line summary string (or "" when suppressed).
        """
        if not show_if_no_changes and not self.has_changes():
            return ""

        lines = []
        lines.append("\n" + "=" * 70)
        lines.append("QUESTION IMPORT STATISTICS")
        lines.append("=" * 70)
        lines.append(f"Total .md files found: {self.total_files}")
        lines.append(f"MCQ questions found: {self.mcq_questions}")
        lines.append(f"Non-MCQ skipped: {self.non_mcq_skipped}")
        lines.append(f"Questions with answers: {self.questions_with_answers}")
        lines.append(f"Questions with TODO: {self.questions_with_todo}")
        lines.append(f"Created in database: {self.created}")
        lines.append(f"Updated in database: {self.updated}")
        if self.errors > 0:
            lines.append(f"Errors: {self.errors}")
        if self.mcq_questions > 0:
            completion_pct = (self.questions_with_answers / self.mcq_questions * 100)
            lines.append(f"Overall completion: {completion_pct:.1f}%")
        lines.append("\n" + "-" * 70)
        lines.append("COMPLETION BY EXAM FOLDER")
        lines.append("-" * 70)
        sorted_folders = sorted(self.by_folder.items())
        for folder, stats in sorted_folders:
            if stats['mcq'] > 0:
                pct = (stats['answered'] / stats['mcq'] * 100)
                lines.append(f"{folder:20} {stats['answered']:3}/{stats['mcq']:3} MCQ ({pct:5.1f}%)")
        lines.append("=" * 70 + "\n")
        return "\n".join(lines)


def parse_markdown_question(file_path: Path, content: str) -> Tuple[bool, dict]:
    """
    Parse a markdown file and extract question data using the question_parser.

    Args:
        file_path: Path of the source file (kept for interface compatibility;
            parsing operates on ``content`` alone).
        content: Raw markdown text of the question file.

    Returns:
        (is_question, question_data) where question_data contains:
            - text: question text
            - options: list of (letter, text) tuples
            - correct_answer: the correct answer letter(s) or answer text
            - has_answer: whether it has an answer (not TODO)
            - tags: list of tag strings
            - question_type: type of question (mcq, scq, matching, etc.)
    """
    # Parse from content string (works for both test cases and real files).
    parsed = parse_question_from_content(content)

    # Extract metadata
    metadata = parsed.metadata
    tags = metadata.get('tags', [])

    # Determine the question type from "frågetyp/..." tags; any such tag
    # also marks the file as a question at all.
    question_type = None
    is_question = False
    for tag in tags:
        if 'frågetyp/' in tag:
            is_question = True
        if 'frågetyp/mcq' in tag:
            question_type = 'mcq'
        elif 'frågetyp/scq' in tag:
            question_type = 'scq'
        elif 'frågetyp/matching' in tag:
            question_type = 'matching'
        elif 'frågetyp/textalternativ' in tag:
            question_type = 'textalternativ'
        elif 'frågetyp/textfält' in tag:
            question_type = 'textfält'

    if not is_question:
        return False, {}

    # Matching questions have their own structure (two lists + pair answers).
    if question_type == 'matching':
        return parse_matching_question_from_nodes(parsed.nodes, tags)

    # Question text = first non-empty paragraph that is not only an image
    # reference and not a "Välj X alternativ" instruction.
    question_text = None
    for node in parsed.nodes:
        if node.type != "paragraph":
            continue
        text = node.text.strip()
        # Skip empty paragraphs
        if not text:
            continue
        # Remove inline images from text first
        text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
        # Skip if paragraph was only an image reference
        if not text:
            continue
        # Skip "Välj X alternativ" instructions
        if 'Välj' in text and 'alternativ' in text:
            continue
        # Clean up bold markers
        text = text.replace('**', '')
        if text:
            question_text = text
            break

    if not question_text:
        return True, {
            'text': None,
            'options': [],
            'correct_answer': '',
            'has_answer': False,
            'question_type': question_type,
            'tags': tags
        }

    # Extract options from list nodes: "A: text", bare "A", or free text
    # items for text-based question types.
    options_data = []
    for node in parsed.nodes:
        if node.type != "list":
            continue
        for item in node.children:
            if item.type != "list_item":
                continue
            item_text = item.text.strip()
            # Match "A: text" or just "A"
            match = re.match(r'^([A-Z]):\s*(.*)$', item_text)
            if match:
                letter = match.group(1)
                text = match.group(2).strip()
                options_data.append((letter, text))
            elif re.match(r'^([A-Z])$', item_text):
                letter = item_text
                options_data.append((letter, ''))
            elif question_type in ['textalternativ', 'textfält']:
                # For text-based questions, use incrementing letters;
                # skip sub-question markers like "a)".
                if not re.match(r'^[a-z]\)', item_text):
                    letter = chr(ord('A') + len(options_data))
                    options_data.append((letter, item_text))

    # For text-based questions, options are optional
    if not options_data:
        options_data = [('A', '')]
    elif len(options_data) < 2 and question_type in ['mcq', 'scq']:
        # A choice question needs at least two options to be usable.
        return True, {
            'text': question_text,
            'options': options_data,
            'correct_answer': '',
            'has_answer': False,
            'question_type': question_type,
            'tags': tags
        }

    # The answer lives in a "spoiler-block:" fenced code block.
    correct_answer = None
    has_answer = False
    for node in parsed.nodes:
        if node.type == "block_code" and node.attrs.get("info") == "spoiler-block:":
            answer_text = node.raw.strip()
            # A TODO marker means the answer has not been filled in yet.
            if 'TODO' in answer_text.upper():
                has_answer = False
            else:
                has_answer = True
                if question_type in ['mcq', 'scq']:
                    # Extract capital letters, e.g. "A and C" -> "A,C"
                    letters = re.findall(r'\b([A-Z])\b', answer_text)
                    if letters:
                        correct_answer = ','.join(sorted(set(letters)))
                else:
                    # For text-based questions: store the full answer text.
                    correct_answer = answer_text[:200]  # Limit to 200 chars for database field
            break

    return True, {
        'text': question_text,
        'options': options_data,
        'correct_answer': correct_answer,
        'has_answer': has_answer,
        'question_type': question_type,
        'tags': tags
    }


def parse_matching_question_from_nodes(nodes: list[Node], tags: list) -> Tuple[bool, dict]:
    """
    Parse a matching question from parsed nodes.

    Expected format:
        - Two consecutive bullet lists
        - First list = left column items (rows)
        - Second list = top row items (columns)
        - Answer format: "LeftItem: TopItem" pairs in the spoiler block

    Returns:
        (is_matching, question_data)
    """
    # Question text = first paragraph that is not only an inline image.
    question_text = None
    for node in nodes:
        if node.type == "paragraph":
            text = node.text.strip()
            # Remove inline images
            text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
            # Skip if empty after removing images
            if not text:
                continue
            question_text = text.replace('**', '')
            break

    if not question_text:
        return True, {
            'text': None,
            'left_items': [],
            'top_items': [],
            'correct_pairs': [],
            'has_answer': False,
            'question_type': 'matching',
            'tags': tags
        }

    # Extract the two item columns from the first two lists.
    left_items = []
    top_items = []
    list_nodes = [node for node in nodes if node.type == "list"]
    if len(list_nodes) >= 2:
        # First list = left items
        for item in list_nodes[0].children:
            if item.type == "list_item":
                left_items.append(item.text.strip())
        # Second list = top items
        for item in list_nodes[1].children:
            if item.type == "list_item":
                top_items.append(item.text.strip())

    # Parse "Left: Top" answer pairs from the spoiler block into index pairs.
    correct_pairs = []
    has_answer = False
    for node in nodes:
        if node.type == "block_code" and node.attrs.get("info") == "spoiler-block:":
            answer_text = node.raw.strip()
            # A TODO marker means the answer has not been filled in yet.
            if 'TODO' in answer_text.upper():
                has_answer = False
                break
            has_answer = True
            # Parse "Item: Match" format, one pair per line.
            answer_lines = answer_text.split('\n')
            for line in answer_lines:
                line = line.strip()
                if ':' not in line:
                    continue
                left_part, top_part = line.split(':', 1)
                left_part = left_part.strip()
                top_part = top_part.strip()
                # Resolve each side to an index by case-insensitive
                # substring match in either direction.
                left_idx = None
                top_idx = None
                for idx, item in enumerate(left_items):
                    if left_part.lower() in item.lower() or item.lower() in left_part.lower():
                        left_idx = idx
                        break
                for idx, item in enumerate(top_items):
                    if top_part.lower() in item.lower() or item.lower() in top_part.lower():
                        top_idx = idx
                        break
                if left_idx is not None and top_idx is not None:
                    correct_pairs.append([left_idx, top_idx])
            break

    return True, {
        'text': question_text,
        'left_items': left_items,
        'top_items': top_items,
        'correct_pairs': correct_pairs,
        'has_answer': has_answer,
        'question_type': 'matching',
        'tags': tags
    }


def import_question_file(file_path: Path, base_path: Path, stats: ImportStats,
                         force: bool = False):
    """
    Import a single question file, checking modification time to avoid
    unnecessary updates.

    Args:
        file_path: Path to the question file.
        base_path: Base path for relative calculations.
        stats: ImportStats object to track statistics.
        force: If True, import regardless of mtime (for initial import).

    Returns:
        A status string: 'imported', 'updated', 'error', or one of the
        'skipped_*' values.
    """
    try:
        # Get file modification time
        file_mtime = file_path.stat().st_mtime

        # Store paths relative to the project root so database keys are
        # stable; fall back to base_path for files outside the project.
        project_root = settings.BASE_DIR.parent
        try:
            file_path_str = str(file_path.relative_to(project_root))
        except ValueError:
            file_path_str = str(file_path.relative_to(base_path))

        # Skip files whose stored mtime shows no change, unless forced.
        if not force:
            try:
                existing_question = Question.objects.get(file_path=file_path_str)
                if existing_question.file_mtime and existing_question.file_mtime >= file_mtime:
                    # File hasn't changed, skip
                    return 'skipped_unchanged'
            except Question.DoesNotExist:
                pass  # New file, will import

        content = file_path.read_text(encoding='utf-8')
        is_mcq, question_data = parse_markdown_question(file_path, content)

        # Track folder stats (first path component under base_path).
        relative_path = file_path.relative_to(base_path)
        folder_name = relative_path.parts[0] if len(relative_path.parts) > 1 else 'root'
        stats.by_folder[folder_name]['total'] += 1

        if not is_mcq:
            stats.non_mcq_skipped += 1
            return 'skipped_not_mcq'

        stats.mcq_questions += 1
        stats.by_folder[folder_name]['mcq'] += 1

        if not question_data or not question_data.get('text'):
            stats.non_mcq_skipped += 1
            return 'skipped_invalid'

        if not question_data['has_answer']:
            stats.questions_with_todo += 1
            stats.by_folder[folder_name]['todo'] += 1
            return 'skipped_todo'

        stats.questions_with_answers += 1
        stats.by_folder[folder_name]['answered'] += 1

        # Extract exam information from the folder structure.
        # Expected path: content/Anatomi & Histologi 2/Gamla tentor/2022-01-15/question.md
        exam = None
        path_parts = relative_path.parts
        if len(path_parts) >= 2:
            # Parent folder name should be the exam date (e.g., "2022-01-15").
            exam_folder = path_parts[-2]
            if exam_folder and '-' in exam_folder:
                try:
                    exam_date = datetime.strptime(exam_folder, '%Y-%m-%d').date()

                    # Get or create course (default to "Anatomi & Histologi 2");
                    # override with the matching path segment if present.
                    course_name = "Anatomi & Histologi 2"
                    if len(path_parts) >= 3 and 'Anatomi' in ' '.join(path_parts):
                        for part in path_parts:
                            if 'Anatomi' in part or 'Histologi' in part:
                                course_name = part
                                break
                    course, _ = Course.objects.get_or_create(
                        name=course_name,
                        defaults={'code': 'AH2'}
                    )

                    # Get or create exam
                    exam, _ = Exam.objects.get_or_create(
                        course=course,
                        date=exam_date,
                        defaults={
                            'name': exam_folder,
                            'folder_path': '/'.join(path_parts[:-1])
                        }
                    )
                except ValueError:
                    # Unparseable date folder: exam remains None.
                    pass

        # Import to database with mtime tracking.
        defaults = {
            'exam': exam,
            'text': question_data['text'],
            'correct_answer': question_data.get('correct_answer', ''),
            'file_mtime': file_mtime,
            'question_type': question_data.get('question_type', 'mcq'),
        }

        # Matching questions additionally carry their grid data.
        if question_data.get('question_type') == 'matching':
            defaults['matching_data'] = {
                'left_items': question_data.get('left_items', []),
                'top_items': question_data.get('top_items', []),
                'correct_pairs': question_data.get('correct_pairs', [])
            }

        question, created = Question.objects.update_or_create(
            file_path=file_path_str,
            defaults=defaults
        )
        if created:
            stats.created += 1
        else:
            stats.updated += 1

        # Rebuild tag links from the parsed tag list.
        question.tags.clear()
        for tag_name in question_data.get('tags', []):
            tag_slug = slugify(tag_name)
            tag, _ = Tag.objects.get_or_create(
                slug=tag_slug,
                defaults={'name': tag_name}
            )
            question.tags.add(tag)

        # Rebuild options (matching questions store their data in
        # matching_data instead); deduplicate by letter, keeping the first.
        if question_data.get('question_type') not in ['matching']:
            question.options.all().delete()
            seen_letters = set()
            for letter, text in question_data.get('options', []):
                if letter not in seen_letters:
                    Option.objects.create(question=question, letter=letter, text=text)
                    seen_letters.add(letter)

        return 'imported' if created else 'updated'

    except (OSError, ValueError, django.db.utils.Error) as e:
        stats.errors += 1
        print(f"Error importing {file_path}: {e}")
        return 'error'


def import_questions(folder_path: Path, base_path: Path = None, force: bool = False) -> ImportStats:
    """
    Recursively import every .md question file under folder_path.

    Args:
        folder_path: Directory to scan for .md files.
        base_path: Base for relative folder statistics; defaults to folder_path.
        force: If True, re-import files regardless of modification time.

    Returns:
        An ImportStats object with the aggregated results.
    """
    if base_path is None:
        base_path = folder_path
    stats = ImportStats()
    for md_file in folder_path.rglob('*.md'):
        stats.total_files += 1
        import_question_file(md_file, base_path, stats, force=force)
    return stats


def delete_question_by_path(file_path: Path):
    """
    Delete the Question whose stored file_path matches the given file.

    Best-effort: errors are reported to stdout, never raised.

    Returns:
        True if a question was deleted, False otherwise.
    """
    try:
        project_root = settings.BASE_DIR.parent
        file_path_str = str(file_path.relative_to(project_root))
        deleted_count, _ = Question.objects.filter(file_path=file_path_str).delete()
        if deleted_count > 0:
            print(f"[Auto-delete] ✓ Deleted question: {file_path.name}")
        return deleted_count > 0
    # ValueError: file_path outside project_root (relative_to fails).
    except (OSError, ValueError, django.db.utils.Error) as e:
        print(f"[Auto-delete] ✗ Error deleting question {file_path}: {e}")
        return False