diff --git a/docs/handbook-internals.md b/docs/handbook-internals.md index fa43a69..b17db66 100644 --- a/docs/handbook-internals.md +++ b/docs/handbook-internals.md @@ -47,17 +47,35 @@ same thing: structured AND/OR tree of unit-code references in `rule` JSONB. The `description` field is **empty 99.9% of the time** — do not render it. The rule tree is the authoritative source. -- **`enrolment_rules`** are program-level constraints ("must be +- **`enrolment_rules`** are mostly program-level constraints ("must be enrolled in Bachelor of IT", "must have 48cp in Art, Design and Architecture"). They ship as HTML prose only — no structured tree — - and they always have a populated `description`. You can't evaluate - these programmatically without NLP; just render the HTML. + and they always have a populated `description`. Most you can't + evaluate programmatically without NLP; just render the HTML. + + **The leaky exception:** ~2,340 unit-years (Science, Engineering, + Pharmacy, Education) put their *unit-level* PREREQUISITE / + PROHIBITION / CO-REQUISITE refs *here* instead of in `requisites`, + as `PREREQUISITE: …` + prose. So a unit with an empty `requisites` tree is **not** + necessarily requisite-free — check `enrolment_rules` too. The ingest + extractor (`packages/ingest/src/parse.ts`) and migration `0007` + pull these into `requisite_refs`. Gotchas that bit the first pass: + one description can carry several labels (121 mix PREREQ + + PROHIBITION), the unit links use *both* the `handbook.monash.edu` + and legacy `www.monash.edu/pubs/.../units/CODE.html` hosts, the same + prose links to `/courses/` and `/aos/` (which must **not** become + unit edges), and some units list themselves. Extraction is + anchor-only and per-label-section; plain-text codes ("…or + MTH1040") are deliberately left unparsed (NLP-only; risks reading + course codes like `4531`/`M6011` as units). 
For graph-shaped queries on requisites ("what requires X?", "what unlocks after X?"), use `requisite_refs` — it's the flat edge view of -the trees. Use `requisites.rule` only when you need AND/OR semantics -for validation ("does this student's set of completed units satisfy -this block?"). +the trees, **plus** the `enrolment_rules`-derived edges above. Use +`requisites.rule` only when you need AND/OR semantics for validation +("does this student's set of completed units satisfy this block?") — +note the rule tree does *not* include the `enrolment_rules` edges. ## Graph shape: what references what diff --git a/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql b/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql new file mode 100644 index 0000000..299036b --- /dev/null +++ b/packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql @@ -0,0 +1,49 @@ +-- Backfill requisite_refs for units that record their PREREQUISITE, +-- PROHIBITION, or CO-REQUISITE relationships as HTML prose in +-- enrolment_rules rather than in the structured requisites field. +-- (~2,340 unit-years across Science, Engineering, Pharmacy, Education and +-- others, all seven handbook years.) +-- +-- Extraction is anchor-based and high-precision: +-- * The description is split into sections at each label, so a +-- description carrying several labels attributes each unit link to its +-- OWN section rather than the whole blob. This matters: 121 descriptions +-- mix PREREQUISITE and PROHIBITION, and 81/32 mix CO-REQUISITE with +-- PREREQUISITE/PROHIBITION -- classifying the whole blob would mislabel +-- ~126 edges (e.g. tag a prohibited unit as a prerequisite). +-- * Only /units/CODE hrefs are taken, across every handbook URL host the +-- corpus uses (handbook.monash.edu//units/CODE plus the legacy +-- www[3].monash.edu/pubs/.../units/CODE.html). The /courses/ and /aos/ +-- links that appear in the same prose ("incompatible with course +-- versions E3001, ...") are intentionally ignored. 
+-- * Self-references are dropped (a unit listing itself, e.g. CHM3990's own +-- corequisite -- 105 such artifacts in the corpus). +-- +-- NOT extracted: plain-text codes with no anchor ("...or MTH1040", +-- "LAW1100 or LAW1101"). Parsing those needs NLP and would mistake course +-- codes (4531, M6011) for units. See docs/handbook-internals.md. +-- +-- This is kept in lockstep with the ingest extractor in +-- packages/ingest/src/parse.ts so a re-ingest reproduces exactly these rows. +-- ON CONFLICT is a no-op, so it is safe to re-run and never duplicates a +-- structured-requisite row (the two sources are disjoint: a single incidental +-- overlap across the whole 2020-2026 corpus). + +--> statement-breakpoint + +INSERT INTO requisite_refs (year, unit_code, requisite_type, requires_unit_code) +SELECT DISTINCT + er.year, + er.unit_code, + (CASE + WHEN seg ~* '^]*>\s*PREREQUISITE' THEN 'prerequisite' + WHEN seg ~* '^]*>\s*PROHIBITION' THEN 'prohibition' + WHEN seg ~* '^]*>\s*CO-?REQUISITE' THEN 'corequisite' + END)::requisite_type, + upper(m[1]) +FROM enrolment_rules er, + regexp_split_to_table(er.description, '(?=]*>\s*(PREREQUISITE|PROHIBITION|CO-?REQUISITE)' + AND er.unit_code <> upper(m[1]) +ON CONFLICT (year, unit_code, requisite_type, requires_unit_code) DO NOTHING; diff --git a/packages/db/drizzle/meta/0007_snapshot.json b/packages/db/drizzle/meta/0007_snapshot.json new file mode 100644 index 0000000..55bb0b4 --- /dev/null +++ b/packages/db/drizzle/meta/0007_snapshot.json @@ -0,0 +1,1582 @@ +{ + "id": "bede56c5-1991-4cf2-8666-828d3a69998f", + "prevId": "81ad2814-39f3-4dc0-857b-bf976b5bf52e", + "version": "7", + "dialect": "postgresql", + "tables": { + "public.account": { + "name": "account", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "account_id": { + "name": "account_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "provider_id": { + "name": "provider_id", + 
"type": "text", + "primaryKey": false, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "access_token": { + "name": "access_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "refresh_token": { + "name": "refresh_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "id_token": { + "name": "id_token", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "access_token_expires_at": { + "name": "access_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "refresh_token_expires_at": { + "name": "refresh_token_expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "scope": { + "name": "scope", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "password": { + "name": "password", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "account_user_id_idx": { + "name": "account_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "account_user_id_user_id_fk": { + "name": "account_user_id_user_id_fk", + "tableFrom": "account", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.area_of_study_units": { + "name": "area_of_study_units", + "schema": "", + 
"columns": { + "aos_year": { + "name": "aos_year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "aos_code": { + "name": "aos_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "grouping": { + "name": "grouping", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "aos_units_aos_idx": { + "name": "aos_units_aos_idx", + "columns": [ + { + "expression": "aos_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "aos_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "aos_units_unit_idx": { + "name": "aos_units_unit_idx", + "columns": [ + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "area_of_study_units_aos_year_aos_code_unit_code_grouping_pk": { + "name": "area_of_study_units_aos_year_aos_code_unit_code_grouping_pk", + "columns": [ + "aos_year", + "aos_code", + "unit_code", + "grouping" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.areas_of_study": { + "name": "areas_of_study", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code": { + "name": "code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "study_level": { + "name": "study_level", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "credit_points": { + "name": "credit_points", + "type": "integer", + "primaryKey": false, 
+ "notNull": false + }, + "school": { + "name": "school", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "academic_org": { + "name": "academic_org", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "handbook_description": { + "name": "handbook_description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "curriculum_structure": { + "name": "curriculum_structure", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "aos_title_idx": { + "name": "aos_title_idx", + "columns": [ + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "areas_of_study_year_code_pk": { + "name": "areas_of_study_year_code_pk", + "columns": [ + "year", + "code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.course_areas_of_study": { + "name": "course_areas_of_study", + "schema": "", + "columns": { + "course_year": { + "name": "course_year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "course_code": { + "name": "course_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "aos_year": { + "name": "aos_year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "aos_code": { + "name": "aos_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "kind": { + "name": "kind", + "type": "aos_relationship_kind", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "relationship_label": { + "name": "relationship_label", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "course_aos_course_idx": { + "name": "course_aos_course_idx", + "columns": [ 
+ { + "expression": "course_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "course_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "course_aos_aos_idx": { + "name": "course_aos_aos_idx", + "columns": [ + { + "expression": "aos_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "aos_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "course_aos_kind_idx": { + "name": "course_aos_kind_idx", + "columns": [ + { + "expression": "course_year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "course_code", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "kind", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "course_areas_of_study_course_year_course_code_aos_year_aos_code_relationship_label_pk": { + "name": "course_areas_of_study_course_year_course_code_aos_year_aos_code_relationship_label_pk", + "columns": [ + "course_year", + "course_code", + "aos_year", + "aos_code", + "relationship_label" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.courses": { + "name": "courses", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code": { + "name": "code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "abbreviated_name": { + "name": "abbreviated_name", + "type": "text", + "primaryKey": false, 
+ "notNull": false + }, + "aqf_level": { + "name": "aqf_level", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "credit_points": { + "name": "credit_points", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "school": { + "name": "school", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "cricos_code": { + "name": "cricos_code", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "overview": { + "name": "overview", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "on_campus": { + "name": "on_campus", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "online": { + "name": "online", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "full_time": { + "name": "full_time", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "part_time": { + "name": "part_time", + "type": "boolean", + "primaryKey": false, + "notNull": false + }, + "curriculum_structure": { + "name": "curriculum_structure", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "requirement_groups": { + "name": "requirement_groups", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "embedded_specialisations": { + "name": "embedded_specialisations", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "sub_course_refs": { + "name": "sub_course_refs", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "component_labels": { + "name": "component_labels", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "courses_title_idx": { + "name": "courses_title_idx", + "columns": [ + { + 
"expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "courses_title_trgm_idx": { + "name": "courses_title_trgm_idx", + "columns": [ + { + "expression": "title gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "courses_code_trgm_idx": { + "name": "courses_code_trgm_idx", + "columns": [ + { + "expression": "code gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "courses_year_code_pk": { + "name": "courses_year_code_pk", + "columns": [ + "year", + "code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.enrolment_rules": { + "name": "enrolment_rules", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "rule_type": { + "name": "rule_type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "enrolment_rules_unit_idx": { + "name": "enrolment_rules_unit_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + 
"compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.requisite_refs": { + "name": "requisite_refs", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "requisite_type": { + "name": "requisite_type", + "type": "requisite_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "requires_unit_code": { + "name": "requires_unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "requisite_refs_forward_idx": { + "name": "requisite_refs_forward_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "requisite_refs_reverse_idx": { + "name": "requisite_refs_reverse_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "requires_unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "requisite_refs_year_unit_code_requisite_type_requires_unit_code_pk": { + "name": "requisite_refs_year_unit_code_requisite_type_requires_unit_code_pk", + "columns": [ + "year", + "unit_code", + "requisite_type", + "requires_unit_code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.requisites": { + "name": "requisites", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + 
"notNull": true + }, + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "requisite_type": { + "name": "requisite_type", + "type": "requisite_type", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "rule": { + "name": "rule", + "type": "jsonb", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "requisites_unit_idx": { + "name": "requisites_unit_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.session": { + "name": "session", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "token": { + "name": "token", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "ip_address": { + "name": "ip_address", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_agent": { + "name": "user_agent", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "user_id": { + "name": "user_id", + "type": 
"text", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "session_user_id_idx": { + "name": "session_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "session_user_id_user_id_fk": { + "name": "session_user_id_user_id_fk", + "tableFrom": "session", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "session_token_unique": { + "name": "session_token_unique", + "nullsNotDistinct": false, + "columns": [ + "token" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.unit_offerings": { + "name": "unit_offerings", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "serial", + "primaryKey": true, + "notNull": true + }, + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "display_name": { + "name": "display_name", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "teaching_period": { + "name": "teaching_period", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "location": { + "name": "location", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "attendance_mode": { + "name": "attendance_mode", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "attendance_mode_code": { + "name": "attendance_mode_code", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "offered": { + "name": "offered", + "type": "boolean", + "primaryKey": false, + "notNull": 
true, + "default": true + } + }, + "indexes": { + "offerings_unit_idx": { + "name": "offerings_unit_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "unit_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "offerings_slot_idx": { + "name": "offerings_slot_idx", + "columns": [ + { + "expression": "year", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "teaching_period", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "location", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "offerings_mode_idx": { + "name": "offerings_mode_idx", + "columns": [ + { + "expression": "attendance_mode_code", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.units": { + "name": "units", + "schema": "", + "columns": { + "year": { + "name": "year", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "code": { + "name": "code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "credit_points": { + "name": "credit_points", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "level": { + "name": "level", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "status": { + "name": "status", + "type": "text", + 
"primaryKey": false, + "notNull": false + }, + "undergrad_postgrad": { + "name": "undergrad_postgrad", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "school": { + "name": "school", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "academic_org": { + "name": "academic_org", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "handbook_synopsis": { + "name": "handbook_synopsis", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "raw": { + "name": "raw", + "type": "jsonb", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "units_title_idx": { + "name": "units_title_idx", + "columns": [ + { + "expression": "title", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "units_school_idx": { + "name": "units_school_idx", + "columns": [ + { + "expression": "school", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "units_title_trgm_idx": { + "name": "units_title_trgm_idx", + "columns": [ + { + "expression": "title gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + }, + "units_code_trgm_idx": { + "name": "units_code_trgm_idx", + "columns": [ + { + "expression": "code gin_trgm_ops", + "asc": true, + "isExpression": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "gin", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "units_year_code_pk": { + "name": "units_year_code_pk", + "columns": [ + "year", + "code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user": { + "name": "user", + "schema": "", + "columns": { + "id": { + "name": "id", + 
"type": "text", + "primaryKey": true, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "email_verified": { + "name": "email_verified", + "type": "boolean", + "primaryKey": false, + "notNull": true, + "default": false + }, + "image": { + "name": "image", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": { + "user_email_unique": { + "name": "user_email_unique", + "nullsNotDistinct": false, + "columns": [ + "email" + ] + } + }, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user_grade": { + "name": "user_grade", + "schema": "", + "columns": { + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "unit_code": { + "name": "unit_code", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "mark": { + "name": "mark", + "type": "integer", + "primaryKey": false, + "notNull": true + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "user_grade_user_id_idx": { + "name": "user_grade_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "user_grade_user_id_user_id_fk": { + "name": "user_grade_user_id_user_id_fk", + "tableFrom": "user_grade", + "tableTo": "user", + 
"columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": { + "user_grade_user_id_unit_code_pk": { + "name": "user_grade_user_id_unit_code_pk", + "columns": [ + "user_id", + "unit_code" + ] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.user_plan": { + "name": "user_plan", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "state": { + "name": "state", + "type": "jsonb", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "user_plan_user_id_idx": { + "name": "user_plan_user_id_idx", + "columns": [ + { + "expression": "user_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": { + "user_plan_user_id_user_id_fk": { + "name": "user_plan_user_id_user_id_fk", + "tableFrom": "user_plan", + "tableTo": "user", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "public.verification": { + "name": "verification", + "schema": "", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true + }, + "identifier": { + 
"name": "identifier", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "value": { + "name": "value", + "type": "text", + "primaryKey": false, + "notNull": true + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + } + }, + "indexes": { + "verification_identifier_idx": { + "name": "verification_identifier_idx", + "columns": [ + { + "expression": "identifier", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": { + "public.aos_relationship_kind": { + "name": "aos_relationship_kind", + "schema": "public", + "values": [ + "major", + "extended_major", + "minor", + "specialisation", + "elective", + "other" + ] + }, + "public.requisite_type": { + "name": "requisite_type", + "schema": "public", + "values": [ + "prerequisite", + "corequisite", + "prohibition", + "permission", + "other" + ] + } + }, + "schemas": {}, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} \ No newline at end of file diff --git a/packages/db/drizzle/meta/_journal.json b/packages/db/drizzle/meta/_journal.json index 2ca298d..1bcc8c1 100644 --- a/packages/db/drizzle/meta/_journal.json +++ b/packages/db/drizzle/meta/_journal.json @@ -50,6 +50,13 @@ "when": 1778818974464, "tag": "0006_smooth_living_lightning", "breakpoints": true + }, + { + "idx": 7, + "version": "7", + "when": 1778819074464, + "tag": 
"0007_backfill_enrolment_rule_refs", + "breakpoints": true } ] } \ No newline at end of file diff --git a/packages/ingest/src/parse.test.ts b/packages/ingest/src/parse.test.ts index ebb1602..811da8a 100644 --- a/packages/ingest/src/parse.test.ts +++ b/packages/ingest/src/parse.test.ts @@ -1,7 +1,12 @@ import { test } from "node:test" import assert from "node:assert/strict" -import { collectCodeRefs, extractCourseAosRefs, extractAosUnitRefs } from "./parse.ts" +import { + collectCodeRefs, + extractCourseAosRefs, + extractAosUnitRefs, + extractEnrolmentRuleRefs, +} from "./parse.ts" /* ------------------------------------------------------------------ * * Fixtures @@ -200,3 +205,83 @@ test("AoS→unit: unknown unit codes are filtered out", () => { const refs = extractAosUnitRefs("2026", "AOS01", structure, new Set(["U1"])) assert.deepEqual(refs.map((r) => r.unitCode), ["U1"]) }) + +/* ------------------------------------------------------------------ * + * extractEnrolmentRuleRefs — prose requisites in enrolment_rules + * ------------------------------------------------------------------ */ + +const erDesc = (s: string) => [{ description: s }] +const erKey = (r: { requisiteType: string; requiresUnitCode: string }) => + `${r.requisiteType}:${r.requiresUnitCode}` + +test("enrolment refs: a single description carrying both PREREQUISITE and PROHIBITION attributes each link to its own section (CIV4283 regression)", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "CIV4283", + erDesc( + '' + + '

Prohibitions: CIV4293

', + ), + ) + assert.deepEqual(refs.map(erKey).sort(), [ + "prerequisite:CIV2282", + "prohibition:CIV4293", + ]) +}) + +test("enrolment refs: ignores /courses/ and /aos/ links, keeps only /units/ (MTH2010 regression)", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "MTH2010", + erDesc( + '

PROHIBITION: ENG2005, ' + + 'MTH2015 and incompatible with course versions ' + + 'E3001.

' + + '

PREREQUISITE: You must have passed ' + + 'MTH1030, or MTH1040

', + ), + ) + // E3001 (/courses/) dropped; plain-text "MTH1040" (no anchor) not parsed. + assert.deepEqual(refs.map(erKey).sort(), [ + "prerequisite:MTH1030", + "prohibition:ENG2005", + "prohibition:MTH2015", + ]) +}) + +test("enrolment refs: extracts CO-REQUISITE but drops a unit listed as its own corequisite (CHM3990 regression)", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "CHM3990", + erDesc( + '

Co-requisites: ' + + 'CHM3990, ' + + 'CHM3911

', + ), + ) + assert.deepEqual(refs.map(erKey), ["corequisite:CHM3911"]) +}) + +test("enrolment refs: prose with no requisite label yields nothing", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "ABC1000", + erDesc( + '

Must be enrolled in S6002.

', + ), + ) + assert.deepEqual(refs, []) +}) + +test("enrolment refs: de-dupes a unit repeated within the same section", () => { + const refs = extractEnrolmentRuleRefs( + "2026", + "ABC1000", + erDesc( + '

Prerequisites: ' + + 'MTH1030 or ' + + 'MTH1030

', + ), + ) + assert.deepEqual(refs.map(erKey), ["prerequisite:MTH1030"]) +}) diff --git a/packages/ingest/src/parse.ts b/packages/ingest/src/parse.ts index dfa5dcb..5607ac9 100644 --- a/packages/ingest/src/parse.ts +++ b/packages/ingest/src/parse.ts @@ -231,6 +231,71 @@ export interface UnitRows { }>; } +/** + * Some units (Science, Engineering, Pharmacy, Education — ~2,340 unit-years) + * record their PREREQUISITE / PROHIBITION / CO-REQUISITE relationships as HTML + * prose in `enrolment_rules` instead of the structured `requisites` tree, e.g. + *

Prerequisite: MTH1030

+ *

Prohibitions: MTH2015

+ * Pull the unit-code refs out so the graph edges and "what does X unlock" + * views reflect reality. + * + * High-precision, anchor-based extraction: + * - Split each description at every `` label, so a description that + * carries several labels attributes each unit link to its OWN section + * rather than the whole blob (121 descriptions mix PREREQUISITE + + * PROHIBITION; a whole-blob classify would mislabel ~126 edges). + * - Take only `/units/CODE` anchors, across every handbook host the corpus + * uses (`handbook.monash.edu//units/` and the legacy + * `www[3].monash.edu/pubs/.../units/CODE.html`). `/courses/` and `/aos/` + * links in the same prose are ignored — unit edges only reference units. + * - Drop self-references (a unit listing itself, e.g. CHM3990's corequisite). + * - Plain-text codes with no anchor ("…or MTH1040") are deliberately NOT + * parsed: that needs NLP and would read course codes (4531, M6011) as units. + * + * Kept in lockstep with migration + * `packages/db/drizzle/0007_backfill_enrolment_rule_refs.sql`. + */ +export function extractEnrolmentRuleRefs( + year: string, + unitCode: string, + rules: ReadonlyArray<{ description: string | null }>, +): UnitRows["requisiteRefs"] { + const out = new Map(); + const selfCode = unitCode.toUpperCase(); + for (const rule of rules) { + if (!rule.description) continue; + for (const seg of rule.description.split(/(?=]*>\s*PREREQUISITE/i.test( + seg, + ) + ? "prerequisite" + : /^]*>\s*PROHIBITION/i.test(seg) + ? "prohibition" + : /^]*>\s*CO-?REQUISITE/i.test(seg) + ? 
"corequisite" + : null; + if (!rType) continue; + const unitLinkRe = /\/units\/([A-Za-z][A-Za-z0-9]+)/g; + let m: RegExpExecArray | null; + while ((m = unitLinkRe.exec(seg)) !== null) { + const upper = m[1]!.toUpperCase(); + if (upper === selfCode) continue; // drop self-references + const key = `${rType}|${upper}`; + if (!out.has(key)) { + out.set(key, { + year, + unitCode, + requisiteType: rType, + requiresUnitCode: upper, + }); + } + } + } + } + return [...out.values()]; +} + export function parseUnit(year: string, raw: UnitContent): UnitRows { const code = raw.code; @@ -286,6 +351,15 @@ export function parseUnit(year: string, raw: UnitContent): UnitRows { typeof e["description"] === "string" ? (e["description"] as string) : null, })); + // Fold in the refs that some units record as HTML prose in enrolment_rules + // instead of the structured requisites field (see extractEnrolmentRuleRefs). + // Structured refs added above win on key collision; the two sources are + // effectively disjoint in practice. 
+ for (const ref of extractEnrolmentRuleRefs(year, code, enrolmentRules)) { + const key = `${ref.requisiteType}|${ref.requiresUnitCode}`; + if (!refSet.has(key)) refSet.set(key, ref); + } + return { unit: { year, diff --git a/packages/webapp/components/planner/requisite-tree-view.tsx b/packages/webapp/components/planner/requisite-tree-view.tsx index ab75702..940d554 100644 --- a/packages/webapp/components/planner/requisite-tree-view.tsx +++ b/packages/webapp/components/planner/requisite-tree-view.tsx @@ -23,10 +23,12 @@ export function RequisiteTreeView({ rule, completed, isProhibition = false, + units, }: { rule: RequisiteRule | null | undefined completed: ReadonlySet isProhibition?: boolean + units?: ReadonlyMap }) { if (!rule || rule.length === 0) { return ( @@ -44,6 +46,7 @@ export function RequisiteTreeView({ depth={0} completed={completed} isProhibition={isProhibition} + units={units} /> ))} @@ -55,11 +58,13 @@ function ContainerNode({ depth, completed, isProhibition, + units, }: { container: RequisiteContainer depth: number completed: ReadonlySet isProhibition: boolean + units?: ReadonlyMap }) { const connector = (container.parent_connector?.value ?? "AND").toUpperCase() const children = [ @@ -70,6 +75,7 @@ function ContainerNode({ depth={depth + 1} completed={completed} isProhibition={isProhibition} + units={units} /> )), ...(container.relationships ?? []).map((l, i) => ( @@ -78,6 +84,7 @@ function ContainerNode({ leaf={l} completed={completed} isProhibition={isProhibition} + units={units} /> )), ] @@ -104,16 +111,21 @@ function LeafNode({ leaf, completed, isProhibition, + units, }: { leaf: RequisiteLeaf completed: ReadonlySet isProhibition: boolean + units?: ReadonlyMap }) { const taken = completed.has(leaf.academic_item_code) // Prohibitions invert: "taken" is a problem, not a check. const good = isProhibition ? !taken : taken + const name = + units?.get(leaf.academic_item_code)?.title ?? leaf.academic_item_name + return (
{leaf.academic_item_code} - {leaf.academic_item_name ? ( - - {leaf.academic_item_name} - + {name ? ( + {name} ) : null}
) diff --git a/packages/webapp/components/planner/unit-detail-popover.tsx b/packages/webapp/components/planner/unit-detail-popover.tsx index 39d05da..0a80a8a 100644 --- a/packages/webapp/components/planner/unit-detail-popover.tsx +++ b/packages/webapp/components/planner/unit-detail-popover.tsx @@ -468,6 +468,7 @@ function RequisiteBlockView({ block: RequisiteBlock completed: ReadonlySet }) { + const { units } = usePlanner() const label = block.requisiteType[0].toUpperCase() + block.requisiteType.slice(1) + "s" return ( @@ -486,6 +487,7 @@ function RequisiteBlockView({ rule={block.rule} completed={completed} isProhibition={block.requisiteType === "prohibition"} + units={units} /> )

Prerequisite: CIV2282