From 32782b32600311e8323e333a1b91048c3ca68f51 Mon Sep 17 00:00:00 2001 From: Yudi Wu Date: Thu, 21 May 2026 21:14:38 +1000 Subject: [PATCH 1/2] Fix missing prerequisite edges for units storing requisites in enrolment_rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 211 units (MTH2021, MTH2010, and others across Science, Pharmacy, and Education) store their PREREQUISITE and PROHIBITION relationships as HTML prose in enrolment_rules rather than the structured requisites field. This left requisite_refs empty for those units, so the tree graph showed no edges and "what does X unlock" views omitted them entirely. Fix the ingest pipeline to also extract unit code refs from enrolment_rules descriptions that carry a PREREQUISITE or PROHIBITION label with handbook unit links. Add a data migration (0007) to backfill all existing years — inserting 4,496 new prerequisite refs and 1,683 prohibition refs. Also fix a bug in requisite-tree-view where the units map title lookup was computed but leaf.academic_item_name was still rendered instead of the resolved name. Co-Authored-By: Claude Sonnet 4.6 --- .../0007_backfill_enrolment_rule_refs.sql | 38 + packages/db/drizzle/meta/0007_snapshot.json | 1582 +++++++++++++++++ packages/db/drizzle/meta/_journal.json | 7 + packages/ingest/src/parse.ts | 27 + .../planner/requisite-tree-view.tsx | 18 +- .../planner/unit-detail-popover.tsx | 2 + pnpm-lock.yaml | 14 +- pnpm-workspace.yaml | 7 + 8 files changed, 1683 insertions(+), 12 deletions(-) create mode 100644 packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql create mode 100644 packages/db/drizzle/meta/0007_snapshot.json diff --git a/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql b/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql new file mode 100644 index 0000000..241ece4 --- /dev/null +++ b/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql @@ -0,0 +1,38 @@ +-- Backfill requisite_refs for units that store their PREREQUISITE / +-- PROHIBITION relationships in enrolment_rules HTML rather than in the +-- structured requisites field. +-- +-- Pattern: entries whose description contains a PREREQUISITE +-- or PROHIBITION label followed by handbook unit links. +-- We extract every /units/CODE href from those descriptions and insert a +-- matching requisite_refs edge. ON CONFLICT is a no-op so this is safe +-- to re-run, and structured-requisite rows are never duplicated because +-- the two sources are disjoint (verified: zero overlap in 2026 corpus). + +--> statement-breakpoint + +-- Prerequisites from enrolment_rules HTML +INSERT INTO requisite_refs (year, unit_code, requisite_type, requires_unit_code) +SELECT DISTINCT + er.year, + er.unit_code, + 'prerequisite'::requisite_type, + upper(m[1]) +FROM enrolment_rules er, + regexp_matches(er.description, 'handbook\.monash\.edu/[^"]+/units/([A-Za-z][A-Za-z0-9-]*)', 'g') m +WHERE er.description ~* '\s*PREREQUISITE' +ON CONFLICT (year, unit_code, requisite_type, requires_unit_code) DO NOTHING; + +--> statement-breakpoint + +-- Prohibitions from enrolment_rules HTML +INSERT INTO requisite_refs (year, unit_code, requisite_type, requires_unit_code) +SELECT DISTINCT + er.year, + er.unit_code, + 'prohibition'::requisite_type, + upper(m[1]) +FROM enrolment_rules er, + regexp_matches(er.description, 'handbook\.monash\.edu/[^"]+/units/([A-Za-z][A-Za-z0-9-]*)', 'g') m +WHERE er.description ~* '\s*PROHIBITION' +ON CONFLICT (year, unit_code, requisite_type, requires_unit_code) DO NOTHING; diff --git a/packages/db/drizzle/meta/0007_snapshot.json b/packages/db/drizzle/meta/0007_snapshot.json new file mode 100644 index 0000000..55bb0b4 --- /dev/null +++ b/packages/db/drizzle/meta/0007_snapshot.json @@ -0,0 +1,1582 @@ +{ + "id": "bede56c5-1991-4cf2-8666-828d3a69998f", + "prevId": "81ad2814-39f3-4dc0-857b-bf976b5bf52e", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.account": { + "name": "account", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "account_id": { + "name": "account_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider_id": { + "name": "provider_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "refresh_token": { + "name": "refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "id_token": { + "name": "id_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "access_token_expires_at": { + "name": "access_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "refresh_token_expires_at": { + "name": "refresh_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "scope": { + "name": "scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "password": { + "name": "password", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "account_user_id_idx": { + "name": "account_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "account_user_id_user_id_fk": { + "name": "account_user_id_user_id_fk", + "tableFrom": "account", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.area_of_study_units": { + "name": "area_of_study_units", + "schema": "", + "columns": { + "aos_year": { + "name": "aos_year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "aos_code": { + "name": "aos_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "grouping": { + "name": "grouping", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "aos_units_aos_idx": { + "name": "aos_units_aos_idx", + "columns": [ + { + "expression": "aos_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "aos_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "aos_units_unit_idx": { + "name": "aos_units_unit_idx", + "columns": [ + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "area_of_study_units_aos_year_aos_code_unit_code_grouping_pk": { + "name": "area_of_study_units_aos_year_aos_code_unit_code_grouping_pk", + "columns": [ + "aos_year", + "aos_code", + "unit_code", + "grouping" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.areas_of_study": { + "name": "areas_of_study", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code": { + "name": "code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "study_level": { + "name": "study_level", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "credit_points": { + "name": "credit_points", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "school": { + "name": "school", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "academic_org": { + "name": "academic_org", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "handbook_description": { + "name": "handbook_description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "curriculum_structure": { + "name": "curriculum_structure", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "aos_title_idx": { + "name": "aos_title_idx", + "columns": [ + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "areas_of_study_year_code_pk": { + "name": "areas_of_study_year_code_pk", + "columns": [ + "year", + "code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.course_areas_of_study": { + "name": "course_areas_of_study", + "schema": "", + "columns": { + "course_year": { + "name": "course_year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "course_code": { + "name": "course_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "aos_year": { + "name": "aos_year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "aos_code": { + "name": "aos_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "kind": { + "name": "kind", + "type": "aos_relationship_kind", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "relationship_label": { + "name": "relationship_label", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "course_aos_course_idx": { + "name": "course_aos_course_idx", + "columns": [ + { + "expression": "course_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "course_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "course_aos_aos_idx": { + "name": "course_aos_aos_idx", + "columns": [ + { + "expression": "aos_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "aos_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "course_aos_kind_idx": { + "name": "course_aos_kind_idx", + "columns": [ + { + "expression": "course_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "course_code", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "kind", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "course_areas_of_study_course_year_course_code_aos_year_aos_code_relationship_label_pk": { + "name": "course_areas_of_study_course_year_course_code_aos_year_aos_code_relationship_label_pk", + "columns": [ + "course_year", + "course_code", + "aos_year", + "aos_code", + "relationship_label" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.courses": { + "name": "courses", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code": { + "name": "code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "abbreviated_name": { + "name": "abbreviated_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "aqf_level": { + "name": "aqf_level", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "credit_points": { + "name": "credit_points", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "school": { + "name": "school", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "cricos_code": { + "name": "cricos_code", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "overview": { + "name": "overview", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "on_campus": { + "name": "on_campus", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "online": { + "name": "online", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "full_time": { + "name": "full_time", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "part_time": { + "name": "part_time", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "curriculum_structure": { + "name": "curriculum_structure", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "requirement_groups": { + "name": "requirement_groups", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "embedded_specialisations": { + "name": "embedded_specialisations", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "sub_course_refs": { + "name": "sub_course_refs", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "component_labels": { + "name": "component_labels", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "courses_title_idx": { + "name": "courses_title_idx", + "columns": [ + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "courses_title_trgm_idx": { + "name": "courses_title_trgm_idx", + "columns": [ + { + "expression": "title gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "courses_code_trgm_idx": { + "name": "courses_code_trgm_idx", + "columns": [ + { + "expression": "code gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "courses_year_code_pk": { + "name": "courses_year_code_pk", + "columns": [ + "year", + "code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.enrolment_rules": { + "name": "enrolment_rules", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "rule_type": { + "name": "rule_type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "enrolment_rules_unit_idx": { + "name": "enrolment_rules_unit_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.requisite_refs": { + "name": "requisite_refs", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "requisite_type": { + "name": "requisite_type", + "type": "requisite_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "requires_unit_code": { + "name": "requires_unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "requisite_refs_forward_idx": { + "name": "requisite_refs_forward_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "requisite_refs_reverse_idx": { + "name": "requisite_refs_reverse_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "requires_unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "requisite_refs_year_unit_code_requisite_type_requires_unit_code_pk": { + "name": "requisite_refs_year_unit_code_requisite_type_requires_unit_code_pk", + "columns": [ + "year", + "unit_code", + "requisite_type", + "requires_unit_code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.requisites": { + "name": "requisites", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "requisite_type": { + "name": "requisite_type", + "type": "requisite_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "rule": { + "name": "rule", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "requisites_unit_idx": { + "name": "requisites_unit_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.session": { + "name": "session", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "token": { + "name": "token", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "ip_address": { + "name": "ip_address", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_agent": { + "name": "user_agent", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "session_user_id_idx": { + "name": "session_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "session_user_id_user_id_fk": { + "name": "session_user_id_user_id_fk", + "tableFrom": "session", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "session_token_unique": { + "name": "session_token_unique", + "nullsNotDistinct": false, + "columns": [ + "token" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.unit_offerings": { + "name": "unit_offerings", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "display_name": { + "name": "display_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "teaching_period": { + "name": "teaching_period", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "location": { + "name": "location", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "attendance_mode": { + "name": "attendance_mode", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "attendance_mode_code": { + "name": "attendance_mode_code", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "offered": { + "name": "offered", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": true + } + }, + "indexes": { + "offerings_unit_idx": { + "name": "offerings_unit_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "offerings_slot_idx": { + "name": "offerings_slot_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "teaching_period", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "location", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "offerings_mode_idx": { + "name": "offerings_mode_idx", + "columns": [ + { + "expression": "attendance_mode_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.units": { + "name": "units", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code": { + "name": "code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "credit_points": { + "name": "credit_points", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "level": { + "name": "level", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "undergrad_postgrad": { + "name": "undergrad_postgrad", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "school": { + "name": "school", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "academic_org": { + "name": "academic_org", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "handbook_synopsis": { + "name": "handbook_synopsis", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "units_title_idx": { + "name": "units_title_idx", + "columns": [ + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "units_school_idx": { + "name": "units_school_idx", + "columns": [ + { + "expression": "school", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "units_title_trgm_idx": { + "name": "units_title_trgm_idx", + "columns": [ + { + "expression": "title gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "units_code_trgm_idx": { + "name": "units_code_trgm_idx", + "columns": [ + { + "expression": "code gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "units_year_code_pk": { + "name": "units_year_code_pk", + "columns": [ + "year", + "code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user": { + "name": "user", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email_verified": { + "name": "email_verified", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "image": { + "name": "image", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "user_email_unique": { + "name": "user_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user_grade": { + "name": "user_grade", + "schema": "", + "columns": { + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "mark": { + "name": "mark", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "user_grade_user_id_idx": { + "name": "user_grade_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "user_grade_user_id_user_id_fk": { + "name": "user_grade_user_id_user_id_fk", + "tableFrom": "user_grade", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "user_grade_user_id_unit_code_pk": { + "name": "user_grade_user_id_unit_code_pk", + "columns": [ + "user_id", + "unit_code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user_plan": { + "name": "user_plan", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "state": { + "name": "state", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "user_plan_user_id_idx": { + "name": "user_plan_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "user_plan_user_id_user_id_fk": { + "name": "user_plan_user_id_user_id_fk", + "tableFrom": "user_plan", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.verification": { + "name": "verification", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "identifier": { + "name": "identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "value": { + "name": "value", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "verification_identifier_idx": { + "name": "verification_identifier_idx", + "columns": [ + { + "expression": "identifier", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": { + "public.aos_relationship_kind": { + "name": "aos_relationship_kind", + "schema": "public", + "values": [ + "major", + "extended_major", + "minor", + "specialisation", + "elective", + "other" + ] + }, + "public.requisite_type": { + "name": "requisite_type", + "schema": "public", + "values": [ + "prerequisite", + "corequisite", + "prohibition", + "permission", + "other" + ] + } + }, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json index 2ca298d..1bcc8c1 100644 --- a/packages/db/drizzle/meta/_journal.json +++ b/packages/db/drizzle/meta/_journal.json @@ -50,6 +50,13 @@ "when": 1778818974464, "tag": "0006_smooth_living_lightning", "breakpoints": true + }, + { + "idx": 7, + "version": "7", + "when": 1778819074464, + "tag": "0007_backfill_enrolment_rule_refs", + "breakpoints": true } ] } \ No newline at end of file diff --git a/packages/ingest/src/parse.ts b/packages/ingest/src/parse.ts index dfa5dcb..231408a 100644 --- a/packages/ingest/src/parse.ts +++ b/packages/ingest/src/parse.ts @@ -286,6 +286,33 @@ export function parseUnit(year: string, raw: UnitContent): UnitRows { typeof e["description"] === "string" ? (e["description"] as string) : null, })); + // Some units (e.g. MTH2021, MTH2010) store PREREQUISITE / PROHIBITION info + // in enrolment_rules HTML instead of the structured requisites field. Extract + // unit code refs from those HTML descriptions and add them to refSet so the + // graph edges and "what does X unlock" views reflect reality. + for (const rule of enrolmentRules) { + if (!rule.description) continue; + const desc = rule.description; + const isPrereq = /\s*PREREQUISITE/i.test(desc); + const isProhibition = !isPrereq && /\s*PROHIBITION/i.test(desc); + if (!isPrereq && !isProhibition) continue; + const rType: RequisiteType = isPrereq ? "prerequisite" : "prohibition"; + const unitLinkRe = /handbook\.monash\.edu\/[^"]+\/units\/([A-Za-z][A-Za-z0-9-]*)/g; + let m; + while ((m = unitLinkRe.exec(desc)) !== null) { + const upper = m[1]!.toUpperCase(); + const key = `${rType}|${upper}`; + if (!refSet.has(key)) { + refSet.set(key, { + year, + unitCode: code, + requisiteType: rType, + requiresUnitCode: upper, + }); + } + } + } + return { unit: { year, diff --git a/packages/webapp/components/planner/requisite-tree-view.tsx b/packages/webapp/components/planner/requisite-tree-view.tsx index ab75702..940d554 100644 --- a/packages/webapp/components/planner/requisite-tree-view.tsx +++ b/packages/webapp/components/planner/requisite-tree-view.tsx @@ -23,10 +23,12 @@ export function RequisiteTreeView({ rule, completed, isProhibition = false, + units, }: { rule: RequisiteRule | null | undefined completed: ReadonlySet isProhibition?: boolean + units?: ReadonlyMap }) { if (!rule || rule.length === 0) { return ( @@ -44,6 +46,7 @@ export function RequisiteTreeView({ depth={0} completed={completed} isProhibition={isProhibition} + units={units} /> ))} @@ -55,11 +58,13 @@ function ContainerNode({ depth, completed, isProhibition, + units, }: { container: RequisiteContainer depth: number completed: ReadonlySet isProhibition: boolean + units?: ReadonlyMap }) { const connector = (container.parent_connector?.value ?? "AND").toUpperCase() const children = [ @@ -70,6 +75,7 @@ function ContainerNode({ depth={depth + 1} completed={completed} isProhibition={isProhibition} + units={units} /> )), ...(container.relationships ?? []).map((l, i) => ( @@ -78,6 +84,7 @@ function ContainerNode({ leaf={l} completed={completed} isProhibition={isProhibition} + units={units} /> )), ] @@ -104,16 +111,21 @@ function LeafNode({ leaf, completed, isProhibition, + units, }: { leaf: RequisiteLeaf completed: ReadonlySet isProhibition: boolean + units?: ReadonlyMap }) { const taken = completed.has(leaf.academic_item_code) // Prohibitions invert: "taken" is a problem, not a check. const good = isProhibition ? !taken : taken + const name = + units?.get(leaf.academic_item_code)?.title ?? leaf.academic_item_name + return (
{leaf.academic_item_code} - {leaf.academic_item_name ? ( - - {leaf.academic_item_name} - + {name ? ( + {name} ) : null}
) diff --git a/packages/webapp/components/planner/unit-detail-popover.tsx b/packages/webapp/components/planner/unit-detail-popover.tsx index 39d05da..0a80a8a 100644 --- a/packages/webapp/components/planner/unit-detail-popover.tsx +++ b/packages/webapp/components/planner/unit-detail-popover.tsx @@ -468,6 +468,7 @@ function RequisiteBlockView({ block: RequisiteBlock completed: ReadonlySet }) { + const { units } = usePlanner() const label = block.requisiteType[0].toUpperCase() + block.requisiteType.slice(1) + "s" return ( @@ -486,6 +487,7 @@ function RequisiteBlockView({ rule={block.rule} completed={completed} isProhibition={block.requisiteType === "prohibition"} + units={units} /> ) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2e9e5db..c896b0a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -4,9 +4,6 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false -overrides: - drizzle-orm: 0.38.4 - importers: .: @@ -36,7 +33,7 @@ importers: specifier: ^16.4.7 version: 16.6.1 drizzle-orm: - specifier: 0.38.4 + specifier: ^0.38.0 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/react@19.2.14)(kysely@0.28.17)(postgres@3.4.9)(react@19.2.5) postgres: specifier: ^3.4.5 @@ -55,7 +52,7 @@ importers: specifier: workspace:* version: link:../scraper drizzle-orm: - specifier: 0.38.4 + specifier: ^0.38.0 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/react@19.2.14)(kysely@0.28.17)(postgres@3.4.9)(react@19.2.5) devDependencies: tsx: @@ -104,7 +101,7 @@ importers: specifier: ^16.4.7 version: 16.6.1 drizzle-orm: - specifier: 0.38.4 + specifier: ^0.38.0 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/react@19.2.14)(kysely@0.28.17)(postgres@3.4.9)(react@19.2.5) embla-carousel-react: specifier: ^8.6.0 @@ -389,7 +386,7 @@ packages: peerDependencies: '@better-auth/core': ^1.6.9 '@better-auth/utils': 0.4.0 - drizzle-orm: 0.38.4 + drizzle-orm: ^0.45.2 peerDependenciesMeta: drizzle-orm: optional: true @@ -2171,7 +2168,7 @@ packages: '@tanstack/solid-start': ^1.0.0 better-sqlite3: ^12.0.0 drizzle-kit: '>=0.31.4' - drizzle-orm: 0.38.4 + drizzle-orm: ^0.45.2 mongodb: ^6.0.0 || ^7.0.0 mysql2: ^3.0.0 next: ^14.0.0 || ^15.0.0 || ^16.0.0 @@ -2577,6 +2574,7 @@ packages: dompurify@3.4.4: resolution: {integrity: sha512-r8K7KGKEcztXfA/nfabSYB2hg9tDphORJTdf8xprN/luSLGmNhOBN8dm1/SYjqLLet6YUFEXOcrdTuwryp/Bew==} + deprecated: Fixed a security issue introduced in 3.4.4 dotenv@16.6.1: resolution: {integrity: sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index dee51e9..0636aba 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -1,2 +1,9 @@ packages: - "packages/*" +allowBuilds: + core-js: true + esbuild: true + msw: true + protobufjs: true + sharp: true + unrs-resolver: true From 9948bf58616489db64eb057d4f1d4cd0594ef74a Mon Sep 17 00:00:00 2001 From: jason301c Date: Thu, 4 Jun 2026 13:16:57 +1000 Subject: [PATCH 2/2] Fix enrolment_rules requisite extraction: per-label sections, all URL hosts, ingest/SQL parity Validated the first pass against the live corpus (31,098 enrolment_rules rows, 2020-2026) and found three correctness gaps: - Whole-description classification mislabeled edges. 121 descriptions mix PREREQUISITE + PROHIBITION (81/32 also mix CO-REQUISITE), so labelling the whole blob by its first tag put ~126 links under the wrong type (e.g. a prohibited unit shown as a prerequisite). Now split each description at every label and classify per section. - The migration and the ingest parser used different logic, so a re-ingest would rewrite the backfilled rows. They now share identical extraction -- verified byte-identical over the whole corpus (6,624 refs, 0 drift). - Only handbook.monash.edu unit URLs were matched. Recover legacy hosts (www[3].monash.edu/pubs/.../units/CODE.html); still ignore /courses/ and /aos/ links that appear in the same prose, and drop self-references (105 artifacts like CHM3990's own corequisite). Also extract CO-REQUISITE (591 edges). Migration dry-run (rolled back) inserts 6,623 rows, near-disjoint from the structured requisite_refs (1 incidental overlap, no-op under ON CONFLICT). Also: - Extract the parser into a tested extractEnrolmentRuleRefs() with regression cases for each gotcha (CIV4283 mixed labels, MTH2010 course-link exclusion, CHM3990 self-coreq, plain-text codes left unparsed). - Revert the unrelated pnpm-lock.yaml / pnpm-workspace.yaml churn (drizzle-orm pin relaxation + allowBuilds block) that rode along from the fork's main. - Document the enrolment_rules quirk in docs/handbook-internals.md. --- docs/handbook-internals.md | 30 ++++-- .../0007_backfill_enrolment_rule_refs.sql | 65 +++++++------ packages/ingest/src/parse.test.ts | 87 ++++++++++++++++- packages/ingest/src/parse.ts | 97 ++++++++++++++----- pnpm-lock.yaml | 14 +-- pnpm-workspace.yaml | 7 -- 6 files changed, 228 insertions(+), 72 deletions(-) diff --git a/docs/handbook-internals.md b/docs/handbook-internals.md index fa43a69..b17db66 100644 --- a/docs/handbook-internals.md +++ b/docs/handbook-internals.md @@ -47,17 +47,35 @@ same thing: structured AND/OR tree of unit-code references in `rule` JSONB. The `description` field is **empty 99.9% of the time** — do not render it. The rule tree is the authoritative source. -- **`enrolment_rules`** are program-level constraints ("must be +- **`enrolment_rules`** are mostly program-level constraints ("must be enrolled in Bachelor of IT", "must have 48cp in Art, Design and Architecture"). They ship as HTML prose only — no structured tree — - and they always have a populated `description`. You can't evaluate - these programmatically without NLP; just render the HTML. + and they always have a populated `description`. Most you can't + evaluate programmatically without NLP; just render the HTML. + + **The leaky exception:** ~2,340 unit-years (Science, Engineering, + Pharmacy, Education) put their *unit-level* PREREQUISITE / + PROHIBITION / CO-REQUISITE refs *here* instead of in `requisites`, + as `PREREQUISITE: …` + prose. So a unit with an empty `requisites` tree is **not** + necessarily requisite-free — check `enrolment_rules` too. The ingest + extractor (`packages/ingest/src/parse.ts`) and migration `0007` + pull these into `requisite_refs`. Gotchas that bit the first pass: + one description can carry several labels (121 mix PREREQ + + PROHIBITION), the unit links use *both* the `handbook.monash.edu` + and legacy `www.monash.edu/pubs/.../units/CODE.html` hosts, the same + prose links to `/courses/` and `/aos/` (which must **not** become + unit edges), and some units list themselves. Extraction is + anchor-only and per-``-section; plain-text codes ("…or + MTH1040") are deliberately left unparsed (NLP-only; risks reading + course codes like `4531`/`M6011` as units). For graph-shaped queries on requisites ("what requires X?", "what unlocks after X?"), use `requisite_refs` — it's the flat edge view of -the trees. Use `requisites.rule` only when you need AND/OR semantics -for validation ("does this student's set of completed units satisfy -this block?"). +the trees, **plus** the `enrolment_rules`-derived edges above. Use +`requisites.rule` only when you need AND/OR semantics for validation +("does this student's set of completed units satisfy this block?") — +note the rule tree does *not* include the `enrolment_rules` edges. ## Graph shape: what references what diff --git a/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql b/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql index 241ece4..299036b 100644 --- a/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql +++ b/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql @@ -1,38 +1,49 @@ --- Backfill requisite_refs for units that store their PREREQUISITE / --- PROHIBITION relationships in enrolment_rules HTML rather than in the --- structured requisites field. +-- Backfill requisite_refs for units that record their PREREQUISITE, +-- PROHIBITION, or CO-REQUISITE relationships as HTML prose in +-- enrolment_rules rather than in the structured requisites field. +-- (~2,340 unit-years across Science, Engineering, Pharmacy, Education and +-- others, all seven handbook years.) -- --- Pattern: entries whose description contains a PREREQUISITE --- or PROHIBITION label followed by handbook unit links. --- We extract every /units/CODE href from those descriptions and insert a --- matching requisite_refs edge. ON CONFLICT is a no-op so this is safe --- to re-run, and structured-requisite rows are never duplicated because --- the two sources are disjoint (verified: zero overlap in 2026 corpus). - ---> statement-breakpoint - --- Prerequisites from enrolment_rules HTML -INSERT INTO requisite_refs (year, unit_code, requisite_type, requires_unit_code) -SELECT DISTINCT - er.year, - er.unit_code, - 'prerequisite'::requisite_type, - upper(m[1]) -FROM enrolment_rules er, - regexp_matches(er.description, 'handbook\.monash\.edu/[^"]+/units/([A-Za-z][A-Za-z0-9-]*)', 'g') m -WHERE er.description ~* '\s*PREREQUISITE' -ON CONFLICT (year, unit_code, requisite_type, requires_unit_code) DO NOTHING; +-- Extraction is anchor-based and high-precision: +-- * The description is split into sections at each label, so a +-- description carrying several labels attributes each unit link to its +-- OWN section rather than the whole blob. This matters: 121 descriptions +-- mix PREREQUISITE and PROHIBITION, and 81/32 mix CO-REQUISITE with +-- PREREQUISITE/PROHIBITION -- classifying the whole blob would mislabel +-- ~126 edges (e.g. tag a prohibited unit as a prerequisite). +-- * Only /units/CODE hrefs are taken, across every handbook URL host the +-- corpus uses (handbook.monash.edu//units/CODE plus the legacy +-- www[3].monash.edu/pubs/.../units/CODE.html). The /courses/ and /aos/ +-- links that appear in the same prose ("incompatible with course +-- versions E3001, ...") are intentionally ignored. +-- * Self-references are dropped (a unit listing itself, e.g. CHM3990's own +-- corequisite -- 105 such artifacts in the corpus). +-- +-- NOT extracted: plain-text codes with no anchor ("...or MTH1040", +-- "LAW1100 or LAW1101"). Parsing those needs NLP and would mistake course +-- codes (4531, M6011) for units. See docs/handbook-internals.md. +-- +-- This is kept in lockstep with the ingest extractor in +-- packages/ingest/src/parse.ts so a re-ingest reproduces exactly these rows. +-- ON CONFLICT is a no-op, so it is safe to re-run and never duplicates a +-- structured-requisite row (the two sources are disjoint: a single incidental +-- overlap across the whole 2020-2026 corpus). --> statement-breakpoint --- Prohibitions from enrolment_rules HTML INSERT INTO requisite_refs (year, unit_code, requisite_type, requires_unit_code) SELECT DISTINCT er.year, er.unit_code, - 'prohibition'::requisite_type, + (CASE + WHEN seg ~* '^]*>\s*PREREQUISITE' THEN 'prerequisite' + WHEN seg ~* '^]*>\s*PROHIBITION' THEN 'prohibition' + WHEN seg ~* '^]*>\s*CO-?REQUISITE' THEN 'corequisite' + END)::requisite_type, upper(m[1]) FROM enrolment_rules er, - regexp_matches(er.description, 'handbook\.monash\.edu/[^"]+/units/([A-Za-z][A-Za-z0-9-]*)', 'g') m -WHERE er.description ~* '\s*PROHIBITION' + regexp_split_to_table(er.description, '(?=]*>\s*(PREREQUISITE|PROHIBITION|CO-?REQUISITE)' + AND er.unit_code <> upper(m[1]) ON CONFLICT (year, unit_code, requisite_type, requires_unit_code) DO NOTHING; diff --git a/packages/ingest/src/parse.test.ts b/packages/ingest/src/parse.test.ts index ebb1602..811da8a 100644 --- a/packages/ingest/src/parse.test.ts +++ b/packages/ingest/src/parse.test.ts @@ -1,7 +1,12 @@ import { test } from "node:test" import assert from "node:assert/strict" -import { collectCodeRefs, extractCourseAosRefs, extractAosUnitRefs } from "./parse.ts" +import { + collectCodeRefs, + extractCourseAosRefs, + extractAosUnitRefs, + extractEnrolmentRuleRefs, +} from "./parse.ts" /* ------------------------------------------------------------------ * * Fixtures @@ -200,3 +205,83 @@ test("AoS→unit: unknown unit codes are filtered out", () => { const refs = extractAosUnitRefs("2026", "AOS01", structure, new Set(["U1"])) assert.deepEqual(refs.map((r) => r.unitCode), ["U1"]) }) + +/* ------------------------------------------------------------------ * + * extractEnrolmentRuleRefs — prose requisites in enrolment_rules + * ------------------------------------------------------------------ */ + +const erDesc = (s: string) => [{ description: s }] +const erKey = (r: { requisiteType: string; requiresUnitCode: string }) => + `${r.requisiteType}:${r.requiresUnitCode}` + +test("enrolment refs: a single description carrying both PREREQUISITE and PROHIBITION attributes each link to its own section (CIV4283 regression)", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "CIV4283", + erDesc( + '

Prerequisite: CIV2282

' + + '

Prohibitions: CIV4293

', + ), + ) + assert.deepEqual(refs.map(erKey).sort(), [ + "prerequisite:CIV2282", + "prohibition:CIV4293", + ]) +}) + +test("enrolment refs: ignores /courses/ and /aos/ links, keeps only /units/ (MTH2010 regression)", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "MTH2010", + erDesc( + '

PROHIBITION: ENG2005, ' + + 'MTH2015 and incompatible with course versions ' + + 'E3001.

' + + '

PREREQUISITE: You must have passed ' + + 'MTH1030, or MTH1040

', + ), + ) + // E3001 (/courses/) dropped; plain-text "MTH1040" (no anchor) not parsed. + assert.deepEqual(refs.map(erKey).sort(), [ + "prerequisite:MTH1030", + "prohibition:ENG2005", + "prohibition:MTH2015", + ]) +}) + +test("enrolment refs: extracts CO-REQUISITE but drops a unit listed as its own corequisite (CHM3990 regression)", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "CHM3990", + erDesc( + '

Co-requisites: ' + + 'CHM3990, ' + + 'CHM3911

', + ), + ) + assert.deepEqual(refs.map(erKey), ["corequisite:CHM3911"]) +}) + +test("enrolment refs: prose with no requisite label yields nothing", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "ABC1000", + erDesc( + '

Must be enrolled in S6002.

', + ), + ) + assert.deepEqual(refs, []) +}) + +test("enrolment refs: de-dupes a unit repeated within the same section", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "ABC1000", + erDesc( + '

Prerequisites: ' + + 'MTH1030 or ' + + 'MTH1030

', + ), + ) + assert.deepEqual(refs.map(erKey), ["prerequisite:MTH1030"]) +}) diff --git a/packages/ingest/src/parse.ts b/packages/ingest/src/parse.ts index 231408a..5607ac9 100644 --- a/packages/ingest/src/parse.ts +++ b/packages/ingest/src/parse.ts @@ -231,6 +231,71 @@ export interface UnitRows { }>; } +/** + * Some units (Science, Engineering, Pharmacy, Education — ~2,340 unit-years) + * record their PREREQUISITE / PROHIBITION / CO-REQUISITE relationships as HTML + * prose in `enrolment_rules` instead of the structured `requisites` tree, e.g. + *

Prerequisite: MTH1030

+ *

Prohibitions: MTH2015

+ * Pull the unit-code refs out so the graph edges and "what does X unlock" + * views reflect reality. + * + * High-precision, anchor-based extraction: + * - Split each description at every `` label, so a description that + * carries several labels attributes each unit link to its OWN section + * rather than the whole blob (121 descriptions mix PREREQUISITE + + * PROHIBITION; a whole-blob classify would mislabel ~126 edges). + * - Take only `/units/CODE` anchors, across every handbook host the corpus + * uses (`handbook.monash.edu//units/` and the legacy + * `www[3].monash.edu/pubs/.../units/CODE.html`). `/courses/` and `/aos/` + * links in the same prose are ignored — unit edges only reference units. + * - Drop self-references (a unit listing itself, e.g. CHM3990's corequisite). + * - Plain-text codes with no anchor ("…or MTH1040") are deliberately NOT + * parsed: that needs NLP and would read course codes (4531, M6011) as units. + * + * Kept in lockstep with migration + * `packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql`. + */ +export function extractEnrolmentRuleRefs( + year: string, + unitCode: string, + rules: ReadonlyArray<{ description: string | null }>, +): UnitRows["requisiteRefs"] { + const out = new Map(); + const selfCode = unitCode.toUpperCase(); + for (const rule of rules) { + if (!rule.description) continue; + for (const seg of rule.description.split(/(?=]*>\s*PREREQUISITE/i.test( + seg, + ) + ? "prerequisite" + : /^]*>\s*PROHIBITION/i.test(seg) + ? "prohibition" + : /^]*>\s*CO-?REQUISITE/i.test(seg) + ? "corequisite" + : null; + if (!rType) continue; + const unitLinkRe = /\/units\/([A-Za-z][A-Za-z0-9]+)/g; + let m: RegExpExecArray | null; + while ((m = unitLinkRe.exec(seg)) !== null) { + const upper = m[1]!.toUpperCase(); + if (upper === selfCode) continue; // drop self-references + const key = `${rType}|${upper}`; + if (!out.has(key)) { + out.set(key, { + year, + unitCode, + requisiteType: rType, + requiresUnitCode: upper, + }); + } + } + } + } + return [...out.values()]; +} + export function parseUnit(year: string, raw: UnitContent): UnitRows { const code = raw.code; @@ -286,31 +351,13 @@ export function parseUnit(year: string, raw: UnitContent): UnitRows { typeof e["description"] === "string" ? (e["description"] as string) : null, })); - // Some units (e.g. MTH2021, MTH2010) store PREREQUISITE / PROHIBITION info - // in enrolment_rules HTML instead of the structured requisites field. Extract - // unit code refs from those HTML descriptions and add them to refSet so the - // graph edges and "what does X unlock" views reflect reality. - for (const rule of enrolmentRules) { - if (!rule.description) continue; - const desc = rule.description; - const isPrereq = /\s*PREREQUISITE/i.test(desc); - const isProhibition = !isPrereq && /\s*PROHIBITION/i.test(desc); - if (!isPrereq && !isProhibition) continue; - const rType: RequisiteType = isPrereq ? "prerequisite" : "prohibition"; - const unitLinkRe = /handbook\.monash\.edu\/[^"]+\/units\/([A-Za-z][A-Za-z0-9-]*)/g; - let m; - while ((m = unitLinkRe.exec(desc)) !== null) { - const upper = m[1]!.toUpperCase(); - const key = `${rType}|${upper}`; - if (!refSet.has(key)) { - refSet.set(key, { - year, - unitCode: code, - requisiteType: rType, - requiresUnitCode: upper, - }); - } - } + // Fold in the refs that some units record as HTML prose in enrolment_rules + // instead of the structured requisites field (see extractEnrolmentRuleRefs). + // Structured refs added above win on key collision; the two sources are + // effectively disjoint in practice. + for (const ref of extractEnrolmentRuleRefs(year, code, enrolmentRules)) { + const key = `${ref.requisiteType}|${ref.requiresUnitCode}`; + if (!refSet.has(key)) refSet.set(key, ref); } return { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c896b0a..2e9e5db 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -4,6 +4,9 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false +overrides: + drizzle-orm: 0.38.4 + importers: .: @@ -33,7 +36,7 @@ importers: specifier: ^16.4.7 version: 16.6.1 drizzle-orm: - specifier: ^0.38.0 + specifier: 0.38.4 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/react@19.2.14)(kysely@0.28.17)(postgres@3.4.9)(react@19.2.5) postgres: specifier: ^3.4.5 @@ -52,7 +55,7 @@ importers: specifier: workspace:* version: link:../scraper drizzle-orm: - specifier: ^0.38.0 + specifier: 0.38.4 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/react@19.2.14)(kysely@0.28.17)(postgres@3.4.9)(react@19.2.5) devDependencies: tsx: @@ -101,7 +104,7 @@ importers: specifier: ^16.4.7 version: 16.6.1 drizzle-orm: - specifier: ^0.38.0 + specifier: 0.38.4 version: 0.38.4(@opentelemetry/api@1.9.1)(@types/react@19.2.14)(kysely@0.28.17)(postgres@3.4.9)(react@19.2.5) embla-carousel-react: specifier: ^8.6.0 @@ -386,7 +389,7 @@ packages: peerDependencies: '@better-auth/core': ^1.6.9 '@better-auth/utils': 0.4.0 - drizzle-orm: ^0.45.2 + drizzle-orm: 0.38.4 peerDependenciesMeta: drizzle-orm: optional: true @@ -2168,7 +2171,7 @@ packages: '@tanstack/solid-start': ^1.0.0 better-sqlite3: ^12.0.0 drizzle-kit: '>=0.31.4' - drizzle-orm: ^0.45.2 + drizzle-orm: 0.38.4 mongodb: ^6.0.0 || ^7.0.0 mysql2: ^3.0.0 next: ^14.0.0 || ^15.0.0 || ^16.0.0 @@ -2574,7 +2577,6 @@ packages: dompurify@3.4.4: resolution: {integrity: sha512-r8K7KGKEcztXfA/nfabSYB2hg9tDphORJTdf8xprN/luSLGmNhOBN8dm1/SYjqLLet6YUFEXOcrdTuwryp/Bew==} - deprecated: Fixed a security issue introduced in 3.4.4 dotenv@16.6.1: resolution: {integrity: sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==} diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 0636aba..dee51e9 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -1,9 +1,2 @@ packages: - "packages/*" -allowBuilds: - core-js: true - esbuild: true - msw: true - protobufjs: true - sharp: true - unrs-resolver: true