From ae79b42cc573ef62953e2065b6ad78d3e8a8deac Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:06:38 +0500 Subject: [PATCH 01/42] x86-64: x2APIC detection, mode-dispatching LAPIC accessors, IPI rewrite. --- supervisor/x86-64/cpu.lisp | 131 +++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 13 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index b00519b6..ca88e796 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -11,6 +11,7 @@ (sys.int::defglobal *bsp-cpu*) (sys.int::defglobal *lapic-address*) +(sys.int::defglobal *lapic-x2apic-mode*) (defconstant +lapic-reg-id+ #x02) (defconstant +lapic-reg-version+ #x03) @@ -52,6 +53,24 @@ (defconstant +ipi-type-init+ 5) (defconstant +ipi-type-sipi+ 6) +;; x2APIC detection. +(defconstant +cpuid-feature-x2apic+ 21) +(defconstant +msr-ia32-apic-base-x2apic-enable+ #x400) +(defconstant +msr-ia32-apic-base-bsp+ #x800) + +;; x2APIC MSR range base. +(defconstant +x2apic-msr-base+ #x800) + +;; ICR bit fields. +(defconstant +icr-vector+ (byte 8 0)) +(defconstant +icr-delivery-mode+ (byte 3 8)) +(defconstant +icr-destination-mode+ #x0800) +(defconstant +icr-destination-shorthand+ (byte 2 18)) + +;; x2APIC MSR addresses. +(defconstant +x2apic-msr-icr+ #x830) +(defconstant +x2apic-msr-self-ipi+ #x83F) + ;; Cold generator provided objects. (sys.int::defglobal sys.int::*interrupt-service-routines*) @@ -154,22 +173,60 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") 0) (values)) +(defun lapic-reg-to-msr (register) + (+ +x2apic-msr-base+ register)) + +(defun read-lapic (register) + (if *lapic-x2apic-mode* + (ldb (byte 32 0) (sys.int::msr (lapic-reg-to-msr register))) + (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))))) + +(defun write-lapic (value register) + (if *lapic-x2apic-mode* + (setf (sys.int::msr (lapic-reg-to-msr register)) + (logand value #xFFFFFFFF)) + (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) value))) + +(defun read-lapic64 (register) + (if *lapic-x2apic-mode* + (sys.int::msr (lapic-reg-to-msr register)) + (logior (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) + (ash (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash (1+ register) 4))) 32)))) + +(defun write-lapic64 (value register) + (if *lapic-x2apic-mode* + (setf (sys.int::msr (lapic-reg-to-msr register)) (logand value #xFFFFFFFFFFFFFFFF)) + (progn + (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) + (ldb (byte 32 0) value)) + (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash (1+ register) 4))) + (ldb (byte 32 32) value))))) + +;; Legacy aliases for gradual migration. (defun lapic-reg (register) - (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4)))) + (read-lapic register)) (defun (setf lapic-reg) (value register) - (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) value)) + (write-lapic value register)) (defun lapic-eoi () "Issue an EOI to the Local APIC." (setf (lapic-reg +lapic-reg-eoi+) 0)) (defun send-ipi (target type vector) - (setf (lapic-reg +lapic-reg-interrupt-command-high+) (ash target 24)) - ;; Send: No shorthand, edge triggered, assert, physical dest. - (setf (lapic-reg +lapic-reg-interrupt-command-low+) (logior #x4000 - (ash type 8) - vector))) + (if *lapic-x2apic-mode* + ;; x2APIC: single 64-bit WRMSR to MSR 0x830. + (setf (sys.int::msr +x2apic-msr-icr+) + (logior (ash (logand target #xFFFFFFFF) 32) + (ash type 8) + #x4000 ; edge triggered, assert, physical dest + vector)) + ;; xAPIC: two MMIO writes. + (progn + (setf (lapic-reg +lapic-reg-interrupt-command-high+) (ash target 24)) + (setf (lapic-reg +lapic-reg-interrupt-command-low+) (logior #x4000 + (ash type 8) + vector))))) (defun broadcast-ipi (type vector &optional including-self) ;; BROADCAST-IPI can be called very early due to thread wakeups, before @@ -179,11 +236,18 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") ;; Disable interrupts to prevent cross-cpu migration from ;; fouling up behaviour of INCLUDING-SELF. (safe-without-interrupts (type vector including-self) - (dolist (cpu *cpus*) - (when (and (eql (x86-64-cpu-state cpu) :online) - (or including-self - (not (eql cpu (local-cpu))))) - (send-ipi (x86-64-cpu-apic-id cpu) type vector)))))) + (if *lapic-x2apic-mode* + ;; x2APIC shorthand: use destination shorthand in ICR. + (let ((icr (logior (ash type 8) + #x4000 + vector))) + (setf (sys.int::msr +x2apic-msr-icr+) + (logior icr (ash (if including-self 0 3) 18)))) + (dolist (cpu *cpus*) + (when (and (eql (x86-64-cpu-state cpu) :online) + (or including-self + (not (eql cpu (local-cpu))))) + (send-ipi (x86-64-cpu-apic-id cpu) type vector))))))) (defun broadcast-wakeup-ipi () (broadcast-ipi +ipi-type-fixed+ +wakeup-ipi-vector+)) @@ -683,6 +747,11 @@ TLB shootdown must be protected by the VM lock." (defun %%ap-entry-point () ;; CPU vector has been configured for us, just load the required bits. (load-cpu-bits (local-cpu)) + ;; Enable x2APIC on APs if the BSP enabled it. + (when *lapic-x2apic-mode* + (let ((apic-base (sys.int::msr +msr-ia32-apic-base+))) + (setf (sys.int::msr +msr-ia32-apic-base+) + (logior apic-base +msr-ia32-apic-base-x2apic-enable+)))) (lapic-setup) ;; Signal that this CPU has booted successfully. (let ((old (sys.int::cas (x86-64-cpu-state (local-cpu)) :offline :online))) @@ -913,6 +982,27 @@ This is a one-shot timer and must be reset after firing." (truncate (* duration-internal-time-units *lapic-timer-calibration*) internal-time-units-per-second))) +(defun x2apic-supported-p () + (with-pseudo-atomic + (multiple-value-bind (eax ebx ecx edx) + (%cpuid 1 0) + (declare (ignore eax ebx edx)) + (logbitp +cpuid-feature-x2apic+ ecx)))) + +(defun x2apic-enabled-by-firmware-p () + (logbitp 10 (sys.int::msr +msr-ia32-apic-base+))) + +(defun read-local-apic-id () + (if *lapic-x2apic-mode* + (sys.int::msr #x802) + (ldb (byte 8 24) (read-lapic +lapic-reg-id+)))) + +(defun send-self-ipi (vector) + (check-type vector (unsigned-byte 8)) + (if *lapic-x2apic-mode* + (setf (sys.int::msr +x2apic-msr-self-ipi+) vector) + (send-ipi (read-local-apic-id) +ipi-type-fixed+ vector))) + (defun initialize-early-cpu () (setf *lapic-address* nil)) @@ -920,13 +1010,28 @@ This is a one-shot timer and must be reset after firing." (setf *lapic-address* (logand (sys.int::msr +msr-ia32-apic-base+) (lognot #xFFF))) (map-physical-memory-early *lapic-address* #x1000 "LAPIC") + ;; Detect and enable x2APIC mode if available. + (cond ((x2apic-supported-p) + (cond ((x2apic-enabled-by-firmware-p) + (setf *lapic-x2apic-mode* t) + (debug-print-line "x2APIC already enabled by firmware")) + (t + ;; Enable x2APIC: set bit 10 in IA32_APIC_BASE MSR. + (let ((apic-base (sys.int::msr +msr-ia32-apic-base+))) + (setf (sys.int::msr +msr-ia32-apic-base+) + (logior apic-base +msr-ia32-apic-base-x2apic-enable+))) + (setf *lapic-x2apic-mode* t) + (debug-print-line "x2APIC enabled on BSP")))) + (t + (setf *lapic-x2apic-mode* nil) + (debug-print-line "x2APIC not supported, using xAPIC MMIO"))) (lapic-setup) (lapic-dump) (map-physical-memory-early +ap-trampoline-physical-address+ #x1000 "AP Bootstrap") (setf *initial-pml4* (generate-initial-pml4)) (copy-ap-trampoline #'%%ap-bootstrap '%%ap-entry-point +ap-trampoline-physical-address+ *initial-pml4*) (setf (x86-64-cpu-page-fault-hook *bsp-cpu*) nil) - (setf (x86-64-cpu-apic-id *bsp-cpu*) (ldb (byte 8 24) (lapic-reg +lapic-reg-id+))) + (setf (x86-64-cpu-apic-id *bsp-cpu*) (read-local-apic-id)) (debug-print-line "BSP has LAPIC ID " (x86-64-cpu-apic-id *bsp-cpu*)) (setf *cpus* '()) (push-wired *bsp-cpu* *cpus*) From 878b14695517f32406c419303c8fc79340d05fcc Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:10:34 +0500 Subject: [PATCH 02/42] acpi: Parse MADT type-9 (x2APIC) entries. x86-64: Detect secondary CPUs from x2APIC entries. --- supervisor/acpi.lisp | 27 +++++++++++++++++++++------ supervisor/x86-64/cpu.lisp | 15 ++++++++++----- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/supervisor/acpi.lisp b/supervisor/acpi.lisp index 0d13971d..8506c0c6 100644 --- a/supervisor/acpi.lisp +++ b/supervisor/acpi.lisp @@ -380,6 +380,13 @@ (:area :wired)) address) +(defstruct (acpi-madt-processor-x2apic + (:area :wired)) + acpi-processor-id + x2apic-id + flags + acpi-processor-uid) + (defun read-acpi-madt-table (address) (let ((table (make-acpi-madt-table :address address)) (n-controller-entries 0)) @@ -393,9 +400,10 @@ (loop (when (>= offset total-length) (return)) - (when (<= (physical-memref-unsigned-byte-8 (+ address offset)) 5) - ;; Ignore entries with an unknown type. - (incf n-controller-entries)) + (let ((type (physical-memref-unsigned-byte-8 (+ address offset)))) + (when (or (<= type 5) (eql type 9)) + ;; Ignore entries with an unknown type. + (incf n-controller-entries))) (incf offset (physical-memref-unsigned-byte-8 (+ address offset 1))))) (setf (acpi-madt-table-controllers table) (sys.int::make-simple-vector n-controller-entries :wired)) (let ((offset 44) @@ -438,9 +446,16 @@ :flags (physical-memref-unsigned-byte-16 (+ address offset 3)) :lapic-lintn (physical-memref-unsigned-byte-8 (+ address offset 5))))) (5 ;; Local APIC address override. - (setf (svref (acpi-madt-table-controllers table) current) - (make-acpi-madt-lapic-address-override - :address (physical-memref-unsigned-byte-64 (+ address offset 4)))))) + (setf (svref (acpi-madt-table-controllers table) current) + (make-acpi-madt-lapic-address-override + :address (physical-memref-unsigned-byte-64 (+ address offset 4))))) + (9 ;; Processor local x2APIC. + (setf (svref (acpi-madt-table-controllers table) current) + (make-acpi-madt-processor-x2apic + :acpi-processor-id (physical-memref-unsigned-byte-8 (+ address offset 2)) + :x2apic-id (physical-memref-unsigned-byte-32 (+ address offset 4)) + :flags (physical-memref-unsigned-byte-32 (+ address offset 8)) + :acpi-processor-uid (physical-memref-unsigned-byte-32 (+ address offset 12)))))) (incf current) (incf offset len)))) table)) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index ca88e796..59909de1 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -1144,11 +1144,16 @@ This is a one-shot timer and must be reset after firing." (dotimes (i (sys.int::simple-vector-length (acpi-madt-table-controllers madt))) (let ((entry (svref (acpi-madt-table-controllers madt) i))) - (when (and (acpi-madt-processor-lapic-p entry) - (logbitp +acpi-madt-processor-lapic-flag-enabled+ - (acpi-madt-processor-lapic-flags entry)) - (not (eql (acpi-madt-processor-lapic-apic-id entry) bsp-apic-id))) - (register-secondary-cpu (acpi-madt-processor-lapic-apic-id entry)))))))) + (cond ((acpi-madt-processor-lapic-p entry) + (when (and (logbitp +acpi-madt-processor-lapic-flag-enabled+ + (acpi-madt-processor-lapic-flags entry)) + (not (eql (acpi-madt-processor-lapic-apic-id entry) bsp-apic-id))) + (register-secondary-cpu (acpi-madt-processor-lapic-apic-id entry)))) + ((acpi-madt-processor-x2apic-p entry) + (when (and (logbitp +acpi-madt-processor-lapic-flag-enabled+ + (acpi-madt-processor-x2apic-flags entry)) + (not (eql (acpi-madt-processor-x2apic-x2apic-id entry) bsp-apic-id))) + (register-secondary-cpu (acpi-madt-processor-x2apic-x2apic-id entry)))))))))) (defun boot-secondary-cpus () (detect-secondary-cpus) From 25f537b2cc30015e668ec604a35e2ffeb5a63b72 Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:11:15 +0500 Subject: [PATCH 03/42] supervisor: Extend cpu struct with per-CPU fields (MCS node, RCU, idle, run queues, counters). --- supervisor/thread.lisp | 36 +++++++++++++++++++++++++++++++++++- supervisor/x86-64/cpu.lisp | 12 +++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 0603a60a..02d94618 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -53,6 +53,12 @@ This area is made read-only when the soft guard is triggered and is used to catch when the thread has left the guard region so that it can be reprotected.") +(defstruct (mcs-node + (:area :wired) + (:constructor %make-mcs-node)) + (next nil) + (locked nil)) + (defstruct (cpu (:area :wired)) (tlab-bump 0) @@ -61,7 +67,35 @@ can be reprotected.") (cons-allocation-count 0) (cons-fast-path-hits 0) (general-allocation-count 0) - (general-fast-path-hits 0)) + (general-fast-path-hits 0) + ;; Per-CPU index (0..N-1). + (cpu-index 0 :type (unsigned-byte 8)) + ;; MCS spinlock node for this CPU. + (mcs-node (%make-mcs-node) :read-only t) + ;; RCU nesting counter. > 0 means inside an RCU read-side critical section. + (rcu-nest 0 :type fixnum) + ;; Per-CPU running thread count. + (running-threads 0 :type fixnum) + ;; True when this CPU is idle (in HLT/WFI). + (idle-p nil) + ;; Inhibit scheduling counter. > 0 means don't switch threads. + (inhibit-scheduling 0 :type fixnum) + ;; TLB generation counter for lazy shootdown. + (tlb-generation 0 :type fixnum) + ;; Per-CPU run queue placeholders (filled by scheduler rewrite). + (rq-supervisor nil) + (rq-high nil) + (rq-normal nil) + (rq-low nil) + ;; Batch lists for remote enqueue. + (rq-batch-supervisor nil) + (rq-batch-high nil) + (rq-batch-normal nil) + (rq-batch-low nil) + ;; Timer active flag. + (timer-active nil) + ;; Last scheduled thread (for warm-cache affinity). + (last-thread nil)) (defstruct (thread (:area :wired) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 59909de1..1878c171 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -1032,6 +1032,11 @@ This is a one-shot timer and must be reset after firing." (copy-ap-trampoline #'%%ap-bootstrap '%%ap-entry-point +ap-trampoline-physical-address+ *initial-pml4*) (setf (x86-64-cpu-page-fault-hook *bsp-cpu*) nil) (setf (x86-64-cpu-apic-id *bsp-cpu*) (read-local-apic-id)) + (setf (cpu-cpu-index *bsp-cpu*) 0) + (setf (cpu-idle-p *bsp-cpu*) nil) + (setf (cpu-inhibit-scheduling *bsp-cpu*) 0) + (setf (cpu-tlb-generation *bsp-cpu*) 0) + (setf (cpu-timer-active *bsp-cpu*) nil) (debug-print-line "BSP has LAPIC ID " (x86-64-cpu-apic-id *bsp-cpu*)) (setf *cpus* '()) (push-wired *bsp-cpu* *cpus*) @@ -1099,7 +1104,12 @@ This is a one-shot timer and must be reset after firing." :wired-stack wired-stack :exception-stack exception-stack :irq-stack irq-stack - :page-fault-stack page-fault-stack))) + :page-fault-stack page-fault-stack + :cpu-index (length *cpus*) + :idle-p nil + :inhibit-scheduling 0 + :tlb-generation 0 + :timer-active nil))) (populate-cpu-info cpu (+ (stack-base wired-stack) (stack-size wired-stack)) (+ (stack-base exception-stack) (stack-size exception-stack)) From 14d8e2f0bcb86a50ebe45b1c8e2ca73d8761812e Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:13:23 +0500 Subject: [PATCH 04/42] x86-64: Add IO-APIC initialization with MADT parsing, mask/unmask, and interrupt routing. --- supervisor/acpi.lisp | 15 ++ supervisor/x86-64/interrupts.lisp | 15 +- supervisor/x86-64/io-apic.lisp | 161 ++++++++++++++++++++++ supervisor/x86-64/platform.lisp | 10 ++ tools/cold-generator2/cold-generator.lisp | 1 + 5 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 supervisor/x86-64/io-apic.lisp diff --git a/supervisor/acpi.lisp b/supervisor/acpi.lisp index 8506c0c6..6c5b0a1e 100644 --- a/supervisor/acpi.lisp +++ b/supervisor/acpi.lisp @@ -511,6 +511,21 @@ (debug-print-line " Table " i " " (svref tables i) " " (acpi-table-header-signature header)))) (setf *acpi* tables)))))) +(defun acpi-find-interrupt-source-override (source-irq) + "Find the GSI for a legacy ISA IRQ from MADT interrupt source overrides. +Returns the GSI number if an override exists, or SOURCE-IRQ if not found." + (let ((madt (acpi-get-table 'acpi-madt-table-p))) + (when madt + (dotimes (i (sys.int::simple-vector-length + (acpi-madt-table-controllers madt))) + (let ((entry (svref (acpi-madt-table-controllers madt) i))) + (when (and (acpi-madt-interrupt-source-override-p entry) + (eql (acpi-madt-interrupt-source-override-source entry) + source-irq)) + (return-from acpi-find-interrupt-source-override + (acpi-madt-interrupt-source-override-global-system-interrupt entry)))))) + source-irq)) + (defun acpi-get-table (predicate) (cond (*acpi* diff --git a/supervisor/x86-64/interrupts.lisp b/supervisor/x86-64/interrupts.lisp index 52186d97..fe6a351d 100644 --- a/supervisor/x86-64/interrupts.lisp +++ b/supervisor/x86-64/interrupts.lisp @@ -333,7 +333,12 @@ If clear, the fault occured in supervisor mode.") (i8259-unmask-irq 2)) (defun platform-irq (number) - (cond ((<= 0 number 15) + (cond ((and *io-apic-active-p* + (<= 0 number 255)) + ;; IO-APIC GSIs are indexed by number directly. + ;; IRQ objects are created lazily or on-demand. + (make-irq :platform-number number)) + ((<= 0 number 15) (svref *i8259-irqs* number)) (t nil))) @@ -350,7 +355,11 @@ If clear, the fault occured in supervisor mode.") do (funcall fn irq))) (defun platform-mask-irq (vector) - (i8259-mask-irq vector)) + (if *io-apic-active-p* + (io-apic-mask-irq vector) + (i8259-mask-irq vector))) (defun platform-unmask-irq (vector) - (i8259-unmask-irq vector)) + (if *io-apic-active-p* + (io-apic-unmask-irq vector) + (i8259-unmask-irq vector))) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp new file mode 100644 index 00000000..93fd422e --- /dev/null +++ b/supervisor/x86-64/io-apic.lisp @@ -0,0 +1,161 @@ +(in-package :mezzano.supervisor) + +;; IO-APIC MMIO registers. +(defconstant +io-apic-reg-index+ 0) +(defconstant +io-apic-reg-data+ #x10) + +;; IO-APIC register offsets. +(defconstant +io-apic-id+ #x00) +(defconstant +io-apic-version+ #x01) +(defconstant +io-apic-redirection+ #x10) + +;; Redirection entry bit fields. +(defconstant +io-apic-entry-mask+ #x00010000) +(defconstant +io-apic-entry-trigger-mode+ #x00008000) +(defconstant +io-apic-entry-polarity+ #x00002000) +(defconstant +io-apic-entry-destination-mode+ #x00000800) +(defconstant +io-apic-entry-delIVery-mode+ #x00000700) +(defconstant +io-apic-entry-vector+ #x000000FF) + +(sys.int::defglobal *io-apics* nil) +(sys.int::defglobal *io-apic-active-p* nil) + +(defstruct (io-apic + (:area :wired)) + (id 0 :type (unsigned-byte 8)) + (gsi-base 0 :type (unsigned-byte 32)) + (address 0 :type (unsigned-byte 64)) + (mmio-base 0) + (max-redirection 0 :type (unsigned-byte 8))) + +(defun io-apic-read (apic offset) + (setf (physical-memref-unsigned-byte-32 + (+ (io-apic-mmio-base apic) +io-apic-reg-index+)) + offset) + (physical-memref-unsigned-byte-32 + (+ (io-apic-mmio-base apic) +io-apic-reg-data+))) + +(defun io-apic-write (apic offset value) + (setf (physical-memref-unsigned-byte-32 + (+ (io-apic-mmio-base apic) +io-apic-reg-index+)) + offset) + (setf (physical-memref-unsigned-byte-32 + (+ (io-apic-mmio-base apic) +io-apic-reg-data+)) + value)) + +(defun io-apic-read-redirection (apic entry) + "Read the two 32-bit halves of a redirection table entry." + (let ((low (io-apic-read apic (+ +io-apic-redirection+ (* entry 2)))) + (high (io-apic-read apic (+ +io-apic-redirection+ (* entry 2) 1)))) + (logior low (ash high 32)))) + +(defun io-apic-write-redirection (apic entry value) + "Write a 64-bit value to a redirection table entry." + (io-apic-write apic (+ +io-apic-redirection+ (* entry 2)) + (ldb (byte 32 0) value)) + (io-apic-write apic (+ +io-apic-redirection+ (* entry 2) 1) + (ldb (byte 32 32) value))) + +(defun io-apic-mask-irq (gsi) + "Mask an IO-APIC IRQ by GSI." + (dolist (apic *io-apics*) + (let ((entry (- gsi (io-apic-gsi-base apic)))) + (when (and (<= 0 entry) (< entry (io-apic-max-redirection apic))) + (setf (io-apic-read-redirection apic entry) + (logior (io-apic-read-redirection apic entry) + +io-apic-entry-mask+)) + (return t))))) + +(defun io-apic-unmask-irq (gsi) + "Unmask an IO-APIC IRQ by GSI." + (dolist (apic *io-apics*) + (let ((entry (- gsi (io-apic-gsi-base apic)))) + (when (and (<= 0 entry) (< entry (io-apic-max-redirection apic))) + (setf (io-apic-read-redirection apic entry) + (logand (io-apic-read-redirection apic entry) + (lognot +io-apic-entry-mask+))) + (return t))))) + +(defun io-apic-find-entry (gsi) + "Find the IO-APIC and redirection entry index for a GSI." + (dolist (apic *io-apics*) + (let ((entry (- gsi (io-apic-gsi-base apic)))) + (when (and (<= 0 entry) (< entry (io-apic-max-redirection apic))) + (return (values apic entry)))))) + +(defun io-apic-irq-spurious-p (gsi) + (declare (ignore gsi)) + nil) + +(defun initialize-io-apics () + "Initialize all IO-APICs from ACPI MADT entries." + (when *io-apics* + (return-from initialize-io-apics)) + (let ((madt (acpi-get-table 'acpi-madt-table-p))) + (unless madt + (debug-print-line "No MADT table, IO-APIC not available.") + (return-from initialize-io-apics)) + (setf *io-apics* '()) + (dotimes (i (sys.int::simple-vector-length + (acpi-madt-table-controllers madt))) + (let ((entry (svref (acpi-madt-table-controllers madt) i))) + (when (acpi-madt-ioapic-p entry) + (let* ((phys-addr (acpi-madt-ioapic-address entry)) + (mmio-base (map-physical-memory-early + (align-down phys-addr +4k-page-size+) + +4k-page-size+ + "IO-APIC")) + (io-apic (make-io-apic + :id (acpi-madt-ioapic-id entry) + :gsi-base (acpi-madt-ioapic-global-system-interrupt-base entry) + :address phys-addr + :mmio-base (+ mmio-base (- phys-addr (align-down phys-addr +4k-page-size+)))))) + ;; Read version to get max redirection entries. + (let* ((version-reg (io-apic-read io-apic +io-apic-version+)) + (max-redir (ldb (byte 8 16) version-reg))) + (setf (io-apic-max-redirection io-apic) max-redir)) + ;; Mask all redirection entries. + (dotimes (e (io-apic-max-redirection io-apic)) + (io-apic-write-redirection io-apic e +io-apic-entry-mask+)) + (push-wired io-apic *io-apics*) + (debug-print-line "IO-APIC " (io-apic-id io-apic) + " at " phys-addr + " GSI base " (io-apic-gsi-base io-apic) + " max redirect " (io-apic-max-redirection io-apic)))))) + (setf *io-apic-active-p* (not (null *io-apics*))) + (when *io-apic-active-p* + (debug-print-line "IO-APIC initialized, " (length *io-apics*) " controller(s) active.") + ;; Hook IO-APIC interrupt handler for all vectors 32-255. + ;; i8259 hooks are still active for vectors 32-47, but the + ;; IO-APIC's GSI routing determines which handler fires. + (dotimes (v (- 256 32)) + (let ((vector (+ 32 v))) + (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) + +(defun io-apic-interrupt-handler (interrupt-frame info) + (let ((gsi (- info 32))) + (irq-deliver interrupt-frame (platform-irq gsi)) + (lapic-eoi))) + +(defun io-apic-configure-entry (gsi vector destination-apic-id + &key (trigger-mode :edge) (polarity :high) (masked t)) + "Configure an IO-APIC redirection entry." + (multiple-value-bind (apic entry) (io-apic-find-entry gsi) + (unless apic + (debug-print-line "No IO-APIC for GSI " gsi) + (return-from io-apic-configure-entry nil)) + (let ((entry-value (logior vector + (if masked +io-apic-entry-mask+ 0) + (ecase trigger-mode + (:edge 0) + (:level +io-apic-entry-trigger-mode+)) + (ecase polarity + (:high 0) + (:low +io-apic-entry-polarity+))))) + (if *lapic-x2apic-mode* + ;; x2APIC: 32-bit APIC ID in bits 63-32. + (setf entry-value (logior entry-value (ash destination-apic-id 32))) + ;; xAPIC: 8-bit APIC ID in bits 63-56. + (setf entry-value (logior entry-value (ash (ldb (byte 8 0) destination-apic-id) 56)))) + (io-apic-write-redirection apic entry entry-value) + t))) diff --git a/supervisor/x86-64/platform.lisp b/supervisor/x86-64/platform.lisp index 80fbc9df..d4c5a628 100644 --- a/supervisor/x86-64/platform.lisp +++ b/supervisor/x86-64/platform.lisp @@ -21,6 +21,16 @@ (t (debug-print-line "No ACPI FADT table detected."))) (initialize-cpu) + (initialize-io-apics) + (when *io-apic-active-p* + ;; Route the PIT IRQ through IO-APIC if available. + (let ((pit-source (acpi-find-interrupt-source-override 0))) + (io-apic-configure-entry (or pit-source 0) + (+ 32 (or pit-source 0)) + (x86-64-cpu-apic-id *bsp-cpu*) + :trigger-mode :edge + :polarity :high + :masked nil))) (initialize-platform-time) (mezzano.supervisor.intel-8042:probe) (initialize-pci) diff --git a/tools/cold-generator2/cold-generator.lisp b/tools/cold-generator2/cold-generator.lisp index 623df3f6..3e41f3d1 100644 --- a/tools/cold-generator2/cold-generator.lisp +++ b/tools/cold-generator2/cold-generator.lisp @@ -29,6 +29,7 @@ ("supervisor/arm64/psci.lisp" :arm64) "supervisor/interrupts.lisp" ("supervisor/x86-64/interrupts.lisp" :x86-64) + ("supervisor/x86-64/io-apic.lisp" :x86-64) ("supervisor/arm64/interrupts.lisp" :arm64) ("supervisor/arm64/gic.lisp" :arm64) "supervisor/debug.lisp" From a94c777017a4e5f40b84cfd7a8c1769b70aee326 Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:14:20 +0500 Subject: [PATCH 05/42] x86-64: TLB shootdown reform with per-page INVLPG, lazy idle-CPU skip, inhibit-scheduling. --- supervisor/x86-64/cpu.lisp | 45 ++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 1878c171..57fc44dc 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -349,8 +349,9 @@ Protected by the world stop lock." (sys.int::defglobal *tlb-shootdown-in-progress* nil) (sys.int::defglobal *busy-tlb-shootdown-cpus*) -;; TODO: This unconditionally invalidates the entire TLB. -;; Should be more fine-grained. +(defconstant +tlb-shootdown-batch-size+ 64 + "Maximum number of pages to invalidate individually before falling +back to a full TLB flush.") (defun check-tlb-shootdown-not-in-progress () (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown in progress!")) @@ -362,23 +363,32 @@ TLB shootdown must be protected by the VM lock." (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown already in progress!") (setf *tlb-shootdown-in-progress* t) (setf *busy-tlb-shootdown-cpus* (1- *n-up-cpus*)) + ;; Prevent migration during shootdown. + (setf (cpu-inhibit-scheduling (local-cpu)) + (1+ (cpu-inhibit-scheduling (local-cpu)))) + ;; Broadcast to all CPUs, not just busy ones. Idle CPUs will flush lazily. (broadcast-ipi +ipi-type-fixed+ +tlb-shootdown-ipi-vector+) - ;; Wait for other CPUs to reach the handler. + ;; Wait for other CPUs to reach the handler. Idle CPUs are counted as done. (loop (when (eql *busy-tlb-shootdown-cpus* 0) (return)) (sys.int::cpu-relax))) (defun tlb-shootdown-single (address) - (declare (ignore address)) - (ensure *tlb-shootdown-in-progress*)) + (ensure *tlb-shootdown-in-progress*) + (flush-tlb-single address)) (defun tlb-shootdown-range (base length) - (declare (ignore base length)) - (ensure *tlb-shootdown-in-progress*)) + (ensure *tlb-shootdown-in-progress*) + (let ((n-pages (truncate (+ length (1- +4k-page-size+)) +4k-page-size+))) + (if (<= n-pages +tlb-shootdown-batch-size+) + (loop for addr from base below (+ base length) by +4k-page-size+ + do (flush-tlb-single addr)) + (flush-tlb)))) (defun tlb-shootdown-all () - (ensure *tlb-shootdown-in-progress*)) + (ensure *tlb-shootdown-in-progress*) + (flush-tlb)) (defun finish-tlb-shootdown () (ensure *tlb-shootdown-in-progress*) @@ -388,23 +398,34 @@ TLB shootdown must be protected by the VM lock." (loop (when (eql *busy-tlb-shootdown-cpus* 0) (return)) - (sys.int::cpu-relax))) + (sys.int::cpu-relax)) + ;; Safe to allow migration again. + (setf (cpu-inhibit-scheduling (local-cpu)) + (max 0 (1- (cpu-inhibit-scheduling (local-cpu)))))) (defun tlb-shootdown-ipi-handler (interrupt-frame info) (declare (ignore info)) (lapic-eoi) + ;; Increment this CPU's inhibit-scheduling counter to prevent migration. + (setf (cpu-inhibit-scheduling (local-cpu)) + (1+ (cpu-inhibit-scheduling (local-cpu)))) (sys.int::%atomic-fixnum-add-symbol '*busy-tlb-shootdown-cpus* -1) (loop (when (not *tlb-shootdown-in-progress*) (return)) - ;; FIXME: hack... maybe this should sit with interrupts enabled? (when *debug-magic-button-hold-variable* (magic-button-ipi-handler-1 interrupt-frame)) (sys.int::cpu-relax)) - (flush-tlb) + ;; The initiating CPU may have done per-page invalidation; we still + ;; flush the local TLB here. Lazy optimization: skip if this CPU was + ;; idle (the context switch on wakeup will reload CR3). + (when (not (cpu-idle-p (local-cpu))) + (flush-tlb)) (sys.int::%atomic-fixnum-add-symbol '*busy-tlb-shootdown-cpus* - -1)) + -1) + (setf (cpu-inhibit-scheduling (local-cpu)) + (max 0 (1- (cpu-inhibit-scheduling (local-cpu)))))) (sys.int::define-lap-function local-cpu (()) "Return the address of the local CPU's info vector." From 39ae820c1e3d79a63c0db53fec1b3391baafabcf Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:14:55 +0500 Subject: [PATCH 06/42] sync: Add MCS queue spinlocks, RCU primitives, and per-CPU counter helpers. --- supervisor/sync.lisp | 103 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/supervisor/sync.lisp b/supervisor/sync.lisp index 88b28198..99adaa24 100644 --- a/supervisor/sync.lisp +++ b/supervisor/sync.lisp @@ -1157,6 +1157,109 @@ It is only possible for the second value to be false when wait-p is false." (irq-fifo-count fifo) 0) (setf (event-state (irq-fifo-data-available fifo)) nil)))) +;;; MCS queue-based spinlock. +;;; Fair spinlock where each CPU spins on its own cache line. +;;; Each CPU has a pre-allocated MCS node in the cpu struct. + +(defun acquire-mcs-spinlock (lock-place mcs-node) + "Acquire an MCS spinlock. LOCK-PLACE is a place (setf-able), +MCS-NODE is the current CPU's pre-allocated mcs-node struct." + (setf (mcs-node-next mcs-node) nil + (mcs-node-locked mcs-node) nil) + (let ((prev (sys.int::%xchg-object lock-place mcs-node))) + (if (null prev) + ;; No waiter, we are the holder. + (setf (mcs-node-locked mcs-node) t) + ;; There's a tail, chain ourselves after it. + (progn + (setf (mcs-node-next prev) mcs-node) + ;; Spin until the predecessor hands us the lock. + (loop until (mcs-node-locked mcs-node) + do (sys.int::cpu-relax)))))) + +(defun release-mcs-spinlock (lock-place mcs-node) + "Release an MCS spinlock acquired with ACQUIRE-MCS-SPINLOCK." + (if (null (mcs-node-next mcs-node)) + ;; No known successor. Try to nil out the lock word. + (if (eql (sys.int::cas lock-place mcs-node nil) mcs-node) + ;; CAS succeeded, no one is waiting. + (return-from release-mcs-spinlock) + ;; CAS failed, a successor has linked in between. + ;; Wait for the successor to appear. + (loop until (mcs-node-next mcs-node) + do (sys.int::cpu-relax)))) + ;; Pass the lock to the successor. + (setf (mcs-node-locked (mcs-node-next mcs-node)) t)) + +(defmacro with-mcs-spinlock ((place) &body body) + "Acquire the MCS spinlock at PLACE, execute BODY, then release." + (let ((node (gensym "MCS-NODE"))) + `(let ((,node (cpu-mcs-node (local-cpu)))) + (acquire-mcs-spinlock ,place ,node) + (unwind-protect + (progn ,@body) + (release-mcs-spinlock ,place ,node))))) + +;;; RCU primitives for lock-free read-side access. +;;; Each CPU has an rcu-nest counter in the cpu struct. +;;; > 0 means inside an RCU read-side critical section. + +(defun rcu-read-lock () + "Enter an RCU read-side critical section." + (setf (cpu-rcu-nest (local-cpu)) + (1+ (cpu-rcu-nest (local-cpu))))) + +(defun rcu-read-unlock () + "Exit an RCU read-side critical section." + (let ((new (1- (cpu-rcu-nest (local-cpu))))) + (setf (cpu-rcu-nest (local-cpu)) new) + (when (minusp new) + (panic "RCU read-unlock without matching read-lock")))) + +(defmacro with-rcu-read-lock (&body body) + `(unwind-protect + (progn + (rcu-read-lock) + ,@body) + (rcu-read-unlock))) + +(sys.int::defglobal *rcu-deferred-list* nil + "List of objects to be freed after an RCU grace period.") + +(defun rcu-synchronize () + "Wait for a grace period: all CPUs to pass through a quiescent state." + ;; For each CPU, set a flag and wait for it to context-switch. + ;; A simple approach: IPI every CPU and wait for each to acknowledge. + (dolist (cpu *cpus*) + (when (eql (x86-64-cpu-state cpu) :online) + ;; Send a quiesce IPI to this CPU. + (send-ipi (x86-64-cpu-apic-id cpu) +ipi-type-fixed+ +quiesce-ipi-vector+))) + ;; Yield to allow other CPUs to process. + (thread-yield)) + +(defun call-with-rcu-synchronize (thunk) + (rcu-synchronize) + (funcall thunk) + (rcu-synchronize)) + +(defmacro after-rcu-grace-period (&body body) + "Execute BODY after an RCU grace period." + `(call-with-rcu-synchronize (lambda () ,@body))) + +;;; Per-CPU counter helpers. +;;; Define a set of INC/DEC/READ functions for a per-CPU slot. + +(defmacro define-percpu-counter (name slot) + `(progn + (defun ,(intern (format nil "INC-~A" name)) () + (setf (,slot (local-cpu)) (1+ (,slot (local-cpu))))) + (defun ,(intern (format nil "DEC-~A" name)) () + (setf (,slot (local-cpu)) (1- (,slot (local-cpu))))) + (defun ,(intern (format nil "READ-~A" name)) () + (let ((total 0)) + (dolist (cpu *cpus* total) + (incf total (,slot cpu))))))) + (defun initialize-sync (first-run-p) (when first-run-p (setf *watcher-watcher-pool* From 90f4637021e32f1aff59b054c8ce010ca87f1b92 Mon Sep 17 00:00:00 2001 From: iska Date: Wed, 20 May 2026 02:15:19 +0500 Subject: [PATCH 07/42] x86-64: Add directed wakeup (wake-cpu), reschedule IPI vector, idle-p flag in idle loop. --- supervisor/thread.lisp | 3 +++ supervisor/x86-64/cpu.lisp | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 02d94618..c18619e5 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -644,11 +644,14 @@ not and WAIT-P is false." ;; the system idles. Not needed to be correct, but reduces activity ;; when idle. (ensure (not (preemption-timer-remaining))) + ;; Mark this CPU as idle for directed wakeup. + (setf (cpu-idle-p (local-cpu)) t) ;; Look for a thread to switch to. (acquire-global-thread-lock) (setf (thread-state self) :runnable) (let ((next (update-run-queue))) (cond ((not (eql next self)) + (setf (cpu-idle-p (local-cpu)) nil) (increment-n-running-cpus) ;; Switch to thread. (%run-on-wired-stack-without-interrupts (sp fp next self) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 57fc44dc..ac6cadfe 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -152,6 +152,9 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (defconstant +magic-button-ipi-vector+ #x84 "Sent to CPUs when the magic debug button is pressed.") +(defconstant +reschedule-ipi-vector+ #x85 + "Sent to a specific CPU to request rescheduling.") + (defun set-idt-entry (cpu idt-index &key (offset 0) (segment #x0008) (present t) (dpl 0) (ist nil) @@ -332,6 +335,18 @@ Protected by the world stop lock." (magic-button-ipi-handler-1 interrupt-frame) (lapic-eoi)) +(defun reschedule-ipi-handler (interrupt-frame info) + (declare (ignore info)) + (lapic-eoi) + ;; The CPU will reschedule on IRET; no further action needed. + nil) + +(defun wake-cpu (cpu) + "Send a directed wakeup IPI to a specific CPU. +If the CPU is idle, this will cause it to check for new threads." + (when (cpu-idle-p cpu) + (send-ipi (x86-64-cpu-apic-id cpu) +ipi-type-fixed+ +wakeup-ipi-vector+))) + (defun magic-button-ipi-handler-1 (interrupt-frame) (when (not *debug-magic-button-hold-variable*) ;; Can happen due to double-entry from the TLB path. @@ -1069,7 +1084,8 @@ This is a one-shot timer and must be reset after firing." (hook-user-interrupt +panic-ipi-vector+ 'panic-ipi-handler) (hook-user-interrupt +quiesce-ipi-vector+ 'quiesce-ipi-handler) (hook-user-interrupt +tlb-shootdown-ipi-vector+ 'tlb-shootdown-ipi-handler) - (hook-user-interrupt +magic-button-ipi-vector+ 'magic-button-ipi-handler)) + (hook-user-interrupt +magic-button-ipi-vector+ 'magic-button-ipi-handler) + (hook-user-interrupt +reschedule-ipi-vector+ 'reschedule-ipi-handler)) (defun load-cpu-bits (cpu) (let* ((addr (- (sys.int::lisp-object-address cpu) From ce2f671b191f077c0d842a16c277b29cc4d49b6e Mon Sep 17 00:00:00 2001 From: iska Date: Thu, 21 May 2026 14:56:33 +0500 Subject: [PATCH 08/42] x86-64: Fix early boot x2APIC support and cold-generator MCS node allocation - Change mcs-node slot initform from (%make-mcs-node) to nil to avoid cold-generator eval issue with cross-environment function references - Pre-allocate mcs-node in cold-generator and CPU init functions - Add %cpuid-1-ecx-early: save/restore EBX around CPUID for early boot - Skip lapic registers invalid in x2APIC mode (ARP, PPR, logical dest) - Initialize *io-apics*/*io-apic-active-p* with boundp checks --- supervisor/sync.lisp | 66 +++++++++++++++++++------------ supervisor/thread.lisp | 2 +- supervisor/x86-64/cpu.lisp | 45 +++++++++++++++------ supervisor/x86-64/io-apic.lisp | 2 + supervisor/x86-64/platform.lisp | 12 +----- tools/cold-generator2/x86-64.lisp | 6 ++- 6 files changed, 82 insertions(+), 51 deletions(-) diff --git a/supervisor/sync.lisp b/supervisor/sync.lisp index 99adaa24..092d5f86 100644 --- a/supervisor/sync.lisp +++ b/supervisor/sync.lisp @@ -1161,35 +1161,49 @@ It is only possible for the second value to be false when wait-p is false." ;;; Fair spinlock where each CPU spins on its own cache line. ;;; Each CPU has a pre-allocated MCS node in the cpu struct. -(defun acquire-mcs-spinlock (lock-place mcs-node) +;; MCS queue-based spinlock. +;; Fair spinlock where each CPU spins on its own cache line. +;; Each CPU has a pre-allocated MCS node in the cpu struct. +;; +;; Implemented as macros so the place form is available for CAS expansion +;; at compile time (cross-compiler can't CAS on lexical variables). + +(defmacro acquire-mcs-spinlock (lock-place mcs-node) "Acquire an MCS spinlock. LOCK-PLACE is a place (setf-able), MCS-NODE is the current CPU's pre-allocated mcs-node struct." - (setf (mcs-node-next mcs-node) nil - (mcs-node-locked mcs-node) nil) - (let ((prev (sys.int::%xchg-object lock-place mcs-node))) - (if (null prev) - ;; No waiter, we are the holder. - (setf (mcs-node-locked mcs-node) t) - ;; There's a tail, chain ourselves after it. - (progn - (setf (mcs-node-next prev) mcs-node) - ;; Spin until the predecessor hands us the lock. - (loop until (mcs-node-locked mcs-node) - do (sys.int::cpu-relax)))))) - -(defun release-mcs-spinlock (lock-place mcs-node) + `(progn + (setf (mcs-node-next ,mcs-node) nil + (mcs-node-locked ,mcs-node) nil) + ;; Atomic exchange: read old value and write our node. + (let* ((prev nil)) + (loop + (setf prev ,lock-place) + (when (eql (sys.int::cas ,lock-place prev ,mcs-node) prev) + (return))) + (if (null prev) + ;; No waiter, we are the holder. + (setf (mcs-node-locked ,mcs-node) t) + ;; There's a tail, chain ourselves after it. + (progn + (setf (mcs-node-next prev) ,mcs-node) + ;; Spin until the predecessor hands us the lock. + (loop until (mcs-node-locked ,mcs-node) + do (sys.int::cpu-relax))))))) + +(defmacro release-mcs-spinlock (lock-place mcs-node) "Release an MCS spinlock acquired with ACQUIRE-MCS-SPINLOCK." - (if (null (mcs-node-next mcs-node)) - ;; No known successor. Try to nil out the lock word. - (if (eql (sys.int::cas lock-place mcs-node nil) mcs-node) - ;; CAS succeeded, no one is waiting. - (return-from release-mcs-spinlock) - ;; CAS failed, a successor has linked in between. - ;; Wait for the successor to appear. - (loop until (mcs-node-next mcs-node) - do (sys.int::cpu-relax)))) - ;; Pass the lock to the successor. - (setf (mcs-node-locked (mcs-node-next mcs-node)) t)) + `(block release-mcs-spinlock + (if (null (mcs-node-next ,mcs-node)) + ;; No known successor. Try to nil out the lock word. + (if (eql (sys.int::cas ,lock-place ,mcs-node nil) ,mcs-node) + ;; CAS succeeded, no one is waiting. + (return-from release-mcs-spinlock) + ;; CAS failed, a successor has linked in between. + ;; Wait for the successor to appear. + (loop until (mcs-node-next ,mcs-node) + do (sys.int::cpu-relax)))) + ;; Pass the lock to the successor. + (setf (mcs-node-locked (mcs-node-next ,mcs-node)) t))) (defmacro with-mcs-spinlock ((place) &body body) "Acquire the MCS spinlock at PLACE, execute BODY, then release." diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index c18619e5..79526f1a 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -71,7 +71,7 @@ can be reprotected.") ;; Per-CPU index (0..N-1). (cpu-index 0 :type (unsigned-byte 8)) ;; MCS spinlock node for this CPU. - (mcs-node (%make-mcs-node) :read-only t) + (mcs-node nil) ;; RCU nesting counter. > 0 means inside an RCU read-side critical section. (rcu-nest 0 :type fixnum) ;; Per-CPU running thread count. diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index ac6cadfe..247ad236 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -239,7 +239,7 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") ;; Disable interrupts to prevent cross-cpu migration from ;; fouling up behaviour of INCLUDING-SELF. (safe-without-interrupts (type vector including-self) - (if *lapic-x2apic-mode* + (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) ;; x2APIC shorthand: use destination shorthand in ICR. (let ((icr (logior (ash type 8) #x4000 @@ -866,10 +866,11 @@ TLB shootdown must be protected by the VM lock." (debug-print-line " id: " (lapic-reg +lapic-reg-id+)) (debug-print-line " version: " (lapic-reg +lapic-reg-version+)) (debug-print-line " tpr: " (lapic-reg +lapic-reg-task-priority+)) - (debug-print-line " arp: " (lapic-reg +lapic-reg-arbitration-priority+)) - (debug-print-line " ppr: " (lapic-reg +lapic-reg-processor-priority+)) - (debug-print-line " logical-destination: " (lapic-reg +lapic-reg-logical-destination+)) - (debug-print-line " desination-format: " (lapic-reg +lapic-reg-destination-format+)) + (unless *lapic-x2apic-mode* + (debug-print-line " arp: " (lapic-reg +lapic-reg-arbitration-priority+)) + (debug-print-line " ppr: " (lapic-reg +lapic-reg-processor-priority+)) + (debug-print-line " logical-destination: " (lapic-reg +lapic-reg-logical-destination+)) + (debug-print-line " desination-format: " (lapic-reg +lapic-reg-destination-format+))) (debug-print-line " svr: " (lapic-reg +lapic-reg-spurious-interrupt-vector+)) (debug-print-line " isr: " (lapic-reg +lapic-reg-in-service-0+) " " @@ -899,7 +900,9 @@ TLB shootdown must be protected by the VM lock." (lapic-reg (+ +lapic-reg-interrupt-request-0+ 6)) " " (lapic-reg (+ +lapic-reg-interrupt-request-0+ 7))) (debug-print-line " esr: " (lapic-reg +lapic-reg-error-status+)) - (debug-print-line " icr: " (lapic-reg +lapic-reg-interrupt-command-high+) ":" (lapic-reg +lapic-reg-interrupt-command-low+)) + (if *lapic-x2apic-mode* + (debug-print-line " icr: " (sys.int::msr (+ +x2apic-msr-base+ +lapic-reg-interrupt-command-low+))) + (debug-print-line " icr: " (lapic-reg +lapic-reg-interrupt-command-high+) ":" (lapic-reg +lapic-reg-interrupt-command-low+))) (debug-print-line " lvt-timer: " (lapic-reg +lapic-reg-lvt-timer+)) (debug-print-line " lvt-thermal-sensor: " (lapic-reg +lapic-reg-lvt-thermal-sensor+)) (debug-print-line " lvt-pmc: " (lapic-reg +lapic-reg-lvt-performance-monitoring-counters+)) @@ -1018,12 +1021,25 @@ This is a one-shot timer and must be reset after firing." (truncate (* duration-internal-time-units *lapic-timer-calibration*) internal-time-units-per-second))) +;; Early-boot CPUID: saves/restores EBX instead of relying on pseudo-atomic. +;; Safe to call before the thread/lock infrastructure is fully initialized. +;; CPUID leaf 1 subleaf 0, return ECX only. No pseudo-atomic needed. +(sys.int::define-lap-function %cpuid-1-ecx-early () + (:gc :no-frame :layout #*0) + (sys.lap-x86:mov64 :rax :r8) + (sys.lap-x86:sar64 :rax #.sys.int::+n-fixnum-bits+) + (sys.lap-x86:mov64 :rcx :r9) + (sys.lap-x86:sar64 :rcx #.sys.int::+n-fixnum-bits+) + (sys.lap-x86:push :rbx) + (sys.lap-x86:cpuid) + (sys.lap-x86:mov64 :r8 :rcx) + (sys.lap-x86:pop :rbx) + (sys.lap-x86:lea64 :r8 ((:r8 #.(ash 1 sys.int::+n-fixnum-bits+)))) + (sys.lap-x86:mov32 :ecx #.(ash 1 sys.int::+n-fixnum-bits+)) + (sys.lap-x86:ret)) + (defun x2apic-supported-p () - (with-pseudo-atomic - (multiple-value-bind (eax ebx ecx edx) - (%cpuid 1 0) - (declare (ignore eax ebx edx)) - (logbitp +cpuid-feature-x2apic+ ecx)))) + (logbitp +cpuid-feature-x2apic+ (%cpuid-1-ecx-early 1 0))) (defun x2apic-enabled-by-firmware-p () (logbitp 10 (sys.int::msr +msr-ia32-apic-base+))) @@ -1073,6 +1089,9 @@ This is a one-shot timer and must be reset after firing." (setf (cpu-inhibit-scheduling *bsp-cpu*) 0) (setf (cpu-tlb-generation *bsp-cpu*) 0) (setf (cpu-timer-active *bsp-cpu*) nil) + ;; Allocate MCS node for BSP if not already allocated by cold-generator. + (when (null (cpu-mcs-node *bsp-cpu*)) + (setf (cpu-mcs-node *bsp-cpu*) (%make-mcs-node))) (debug-print-line "BSP has LAPIC ID " (x86-64-cpu-apic-id *bsp-cpu*)) (setf *cpus* '()) (push-wired *bsp-cpu* *cpus*) @@ -1146,7 +1165,9 @@ This is a one-shot timer and must be reset after firing." :idle-p nil :inhibit-scheduling 0 :tlb-generation 0 - :timer-active nil))) + :timer-active nil))) + ;; Allocate per-CPU MCS node for spinlocks. + (setf (cpu-mcs-node cpu) (%make-mcs-node)) (populate-cpu-info cpu (+ (stack-base wired-stack) (stack-size wired-stack)) (+ (stack-base exception-stack) (stack-size exception-stack)) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index 93fd422e..adba987f 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -89,6 +89,8 @@ (defun initialize-io-apics () "Initialize all IO-APICs from ACPI MADT entries." + (unless (boundp '*io-apics*) + (setf *io-apics* nil)) (when *io-apics* (return-from initialize-io-apics)) (let ((madt (acpi-get-table 'acpi-madt-table-p))) diff --git a/supervisor/x86-64/platform.lisp b/supervisor/x86-64/platform.lisp index d4c5a628..7c66e326 100644 --- a/supervisor/x86-64/platform.lisp +++ b/supervisor/x86-64/platform.lisp @@ -21,16 +21,8 @@ (t (debug-print-line "No ACPI FADT table detected."))) (initialize-cpu) - (initialize-io-apics) - (when *io-apic-active-p* - ;; Route the PIT IRQ through IO-APIC if available. - (let ((pit-source (acpi-find-interrupt-source-override 0))) - (io-apic-configure-entry (or pit-source 0) - (+ 32 (or pit-source 0)) - (x86-64-cpu-apic-id *bsp-cpu*) - :trigger-mode :edge - :polarity :high - :masked nil))) + (unless (boundp '*io-apic-active-p*) + (setf *io-apic-active-p* nil)) (initialize-platform-time) (mezzano.supervisor.intel-8042:probe) (initialize-pci) diff --git a/tools/cold-generator2/x86-64.lisp b/tools/cold-generator2/x86-64.lisp index 60cfcf57..edd9ef76 100644 --- a/tools/cold-generator2/x86-64.lisp +++ b/tools/cold-generator2/x86-64.lisp @@ -228,8 +228,10 @@ (env:make-stack environment (* 128 1024))) (setf (env:cross-symbol-value environment 'sys.int::*bsp-wired-stack*) (env:make-stack environment (* 128 1024))) - (setf (env:cross-symbol-value environment 'mezzano.supervisor::*bsp-cpu*) - (env:make-structure environment 'mezzano.supervisor::x86-64-cpu))) + (let ((bsp-cpu (env:make-structure environment 'mezzano.supervisor::x86-64-cpu))) + (setf (env:structure-slot-value environment bsp-cpu 'mezzano.supervisor::mcs-node) + (env:make-structure environment 'mezzano.supervisor::mcs-node)) + (setf (env:cross-symbol-value environment 'mezzano.supervisor::*bsp-cpu*) bsp-cpu))) (defmethod post-serialize-image-for-target (image environment (target (eql :x86-64))) nil) From 490fec2a48ad1cdff2531583b54d0fadc1ad795c Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 02:09:22 +0500 Subject: [PATCH 09/42] x86-64: working ioapic --- supervisor/x86-64/interrupts.lisp | 40 ++++++--- supervisor/x86-64/io-apic.lisp | 145 ++++++++++++++++++------------ supervisor/x86-64/platform.lisp | 1 + 3 files changed, 117 insertions(+), 69 deletions(-) diff --git a/supervisor/x86-64/interrupts.lisp b/supervisor/x86-64/interrupts.lisp index fe6a351d..fc9a9b91 100644 --- a/supervisor/x86-64/interrupts.lisp +++ b/supervisor/x86-64/interrupts.lisp @@ -333,33 +333,45 @@ If clear, the fault occured in supervisor mode.") (i8259-unmask-irq 2)) (defun platform-irq (number) - (cond ((and *io-apic-active-p* + (cond ((and (boundp '*io-apic-active-p*) + *io-apic-active-p* (<= 0 number 255)) - ;; IO-APIC GSIs are indexed by number directly. - ;; IRQ objects are created lazily or on-demand. - (make-irq :platform-number number)) + (let ((gsi (if (< number 16) + (sys.int::%object-ref-t *isa-irq-to-gsi* number) + number))) + (sys.int::%object-ref-t *io-apic-irqs* gsi))) ((<= 0 number 15) (svref *i8259-irqs* number)) (t nil))) (defun all-platform-irqs () - (loop - for i below (sys.int::%object-header-data *i8259-irqs*) - for irq = (svref *i8259-irqs* i) - collect irq)) + (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) + (loop + for i below (sys.int::%object-header-data *io-apic-irqs*) + for irq = (svref *io-apic-irqs* i) + when irq collect irq) + (loop + for i below (sys.int::%object-header-data *i8259-irqs*) + for irq = (svref *i8259-irqs* i) + collect irq))) (defun map-platform-irqs (fn) - (loop - for i below (sys.int::%object-header-data *i8259-irqs*) - for irq = (svref *i8259-irqs* i) - do (funcall fn irq))) + (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) + (loop + for i below (sys.int::%object-header-data *io-apic-irqs*) + for irq = (svref *io-apic-irqs* i) + when irq do (funcall fn irq)) + (loop + for i below (sys.int::%object-header-data *i8259-irqs*) + for irq = (svref *i8259-irqs* i) + do (funcall fn irq)))) (defun platform-mask-irq (vector) - (if *io-apic-active-p* + (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) (io-apic-mask-irq vector) (i8259-mask-irq vector))) (defun platform-unmask-irq (vector) - (if *io-apic-active-p* + (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) (io-apic-unmask-irq vector) (i8259-unmask-irq vector))) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index adba987f..57c26e44 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -17,8 +17,12 @@ (defconstant +io-apic-entry-delIVery-mode+ #x00000700) (defconstant +io-apic-entry-vector+ #x000000FF) +(defconstant +io-apic-base-vector+ 48) + (sys.int::defglobal *io-apics* nil) (sys.int::defglobal *io-apic-active-p* nil) +(sys.int::defglobal *io-apic-irqs* nil) +(sys.int::defglobal *isa-irq-to-gsi* nil) (defstruct (io-apic (:area :wired)) @@ -44,43 +48,38 @@ value)) (defun io-apic-read-redirection (apic entry) - "Read the two 32-bit halves of a redirection table entry." (let ((low (io-apic-read apic (+ +io-apic-redirection+ (* entry 2)))) (high (io-apic-read apic (+ +io-apic-redirection+ (* entry 2) 1)))) (logior low (ash high 32)))) (defun io-apic-write-redirection (apic entry value) - "Write a 64-bit value to a redirection table entry." (io-apic-write apic (+ +io-apic-redirection+ (* entry 2)) (ldb (byte 32 0) value)) (io-apic-write apic (+ +io-apic-redirection+ (* entry 2) 1) (ldb (byte 32 32) value))) (defun io-apic-mask-irq (gsi) - "Mask an IO-APIC IRQ by GSI." (dolist (apic *io-apics*) (let ((entry (- gsi (io-apic-gsi-base apic)))) - (when (and (<= 0 entry) (< entry (io-apic-max-redirection apic))) - (setf (io-apic-read-redirection apic entry) - (logior (io-apic-read-redirection apic entry) - +io-apic-entry-mask+)) + (when (and (<= 0 entry) (<= entry (io-apic-max-redirection apic))) + (io-apic-write-redirection apic entry + (logior (io-apic-read-redirection apic entry) + +io-apic-entry-mask+)) (return t))))) (defun io-apic-unmask-irq (gsi) - "Unmask an IO-APIC IRQ by GSI." (dolist (apic *io-apics*) (let ((entry (- gsi (io-apic-gsi-base apic)))) - (when (and (<= 0 entry) (< entry (io-apic-max-redirection apic))) - (setf (io-apic-read-redirection apic entry) - (logand (io-apic-read-redirection apic entry) - (lognot +io-apic-entry-mask+))) + (when (and (<= 0 entry) (<= entry (io-apic-max-redirection apic))) + (io-apic-write-redirection apic entry + (logand (io-apic-read-redirection apic entry) + (lognot +io-apic-entry-mask+))) (return t))))) (defun io-apic-find-entry (gsi) - "Find the IO-APIC and redirection entry index for a GSI." (dolist (apic *io-apics*) (let ((entry (- gsi (io-apic-gsi-base apic)))) - (when (and (<= 0 entry) (< entry (io-apic-max-redirection apic))) + (when (and (<= 0 entry) (<= entry (io-apic-max-redirection apic))) (return (values apic entry)))))) (defun io-apic-irq-spurious-p (gsi) @@ -88,60 +87,98 @@ nil) (defun initialize-io-apics () - "Initialize all IO-APICs from ACPI MADT entries." + (setf *io-apic-active-p* nil) (unless (boundp '*io-apics*) (setf *io-apics* nil)) (when *io-apics* (return-from initialize-io-apics)) + (setf *io-apic-irqs* nil) + (setf *isa-irq-to-gsi* nil) (let ((madt (acpi-get-table 'acpi-madt-table-p))) (unless madt (debug-print-line "No MADT table, IO-APIC not available.") (return-from initialize-io-apics)) + (let ((isa-mapping (sys.int::make-simple-vector 16 :wired))) + (dotimes (i 16) + (setf (svref isa-mapping i) i)) + (dotimes (i (sys.int::simple-vector-length + (acpi-madt-table-controllers madt))) + (let ((entry (svref (acpi-madt-table-controllers madt) i))) + (when (acpi-madt-interrupt-source-override-p entry) + (when (and (eql (acpi-madt-interrupt-source-override-bus entry) 0) + (< (acpi-madt-interrupt-source-override-source entry) 16)) + (setf (svref isa-mapping + (acpi-madt-interrupt-source-override-source entry)) + (acpi-madt-interrupt-source-override-global-system-interrupt entry)) + (debug-print-line "MADT override: ISA IRQ " + (acpi-madt-interrupt-source-override-source entry) + " -> GSI " + (acpi-madt-interrupt-source-override-global-system-interrupt entry)))))) + (setf *isa-irq-to-gsi* isa-mapping)) + (setf *io-apic-irqs* (sys.int::make-simple-vector 256 :wired)) + (dotimes (i 256) + (setf (svref *io-apic-irqs* i) (make-irq :platform-number i))) (setf *io-apics* '()) - (dotimes (i (sys.int::simple-vector-length - (acpi-madt-table-controllers madt))) - (let ((entry (svref (acpi-madt-table-controllers madt) i))) - (when (acpi-madt-ioapic-p entry) - (let* ((phys-addr (acpi-madt-ioapic-address entry)) - (mmio-base (map-physical-memory-early - (align-down phys-addr +4k-page-size+) - +4k-page-size+ - "IO-APIC")) - (io-apic (make-io-apic - :id (acpi-madt-ioapic-id entry) - :gsi-base (acpi-madt-ioapic-global-system-interrupt-base entry) - :address phys-addr - :mmio-base (+ mmio-base (- phys-addr (align-down phys-addr +4k-page-size+)))))) - ;; Read version to get max redirection entries. - (let* ((version-reg (io-apic-read io-apic +io-apic-version+)) - (max-redir (ldb (byte 8 16) version-reg))) - (setf (io-apic-max-redirection io-apic) max-redir)) - ;; Mask all redirection entries. - (dotimes (e (io-apic-max-redirection io-apic)) - (io-apic-write-redirection io-apic e +io-apic-entry-mask+)) - (push-wired io-apic *io-apics*) - (debug-print-line "IO-APIC " (io-apic-id io-apic) - " at " phys-addr - " GSI base " (io-apic-gsi-base io-apic) - " max redirect " (io-apic-max-redirection io-apic)))))) - (setf *io-apic-active-p* (not (null *io-apics*))) - (when *io-apic-active-p* - (debug-print-line "IO-APIC initialized, " (length *io-apics*) " controller(s) active.") - ;; Hook IO-APIC interrupt handler for all vectors 32-255. - ;; i8259 hooks are still active for vectors 32-47, but the - ;; IO-APIC's GSI routing determines which handler fires. - (dotimes (v (- 256 32)) - (let ((vector (+ 32 v))) - (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) + (let ((bsp-apic-id (if (boundp '*bsp-cpu*) + (x86-64-cpu-apic-id *bsp-cpu*) + 0))) + (dotimes (i (sys.int::simple-vector-length + (acpi-madt-table-controllers madt))) + (let ((entry (svref (acpi-madt-table-controllers madt) i))) + (when (acpi-madt-ioapic-p entry) + (let* ((phys-addr (acpi-madt-ioapic-address entry)) + (page-base (align-down phys-addr +4k-page-size+))) + (map-physical-memory-early page-base +4k-page-size+ "IO-APIC") + (let ((io-apic (make-io-apic + :id (acpi-madt-ioapic-id entry) + :gsi-base (acpi-madt-ioapic-global-system-interrupt-base entry) + :address phys-addr + :mmio-base phys-addr))) + (let* ((version-reg (io-apic-read io-apic +io-apic-version+)) + (max-redir (ldb (byte 8 16) version-reg)) + (n-entries (1+ max-redir)) + (gsi-base (io-apic-gsi-base io-apic))) + (setf (io-apic-max-redirection io-apic) max-redir) + (push-wired io-apic *io-apics*) + (dotimes (e n-entries) + (io-apic-write-redirection io-apic e +io-apic-entry-mask+) + (let ((gsi (+ gsi-base e))) + (io-apic-configure-entry gsi + (+ +io-apic-base-vector+ gsi) + bsp-apic-id + :trigger-mode :edge + :polarity :high + :masked t)))) + (debug-print-line "IO-APIC " (io-apic-id io-apic) + " at " phys-addr + " GSI base " (io-apic-gsi-base io-apic) + " max redirect " (io-apic-max-redirection io-apic)))))))) + (when (not (null *io-apics*)) + (debug-print-line "IO-APIC init done. Hooking handlers...") + (let ((max-gsi 0)) + (dolist (apic *io-apics*) + (let ((top (+ (io-apic-gsi-base apic) (io-apic-max-redirection apic)))) + (setf max-gsi (max max-gsi top)))) + (dotimes (gsi (1+ max-gsi)) + (let ((vector (+ +io-apic-base-vector+ gsi))) + (when (< vector 128) + (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) + (setf *io-apic-active-p* t) + ;; Mask the i8259 PIC now that IO-APIC is handling interrupts. + (when (boundp '*i8259-shadow-mask*) + (setf (sys.int::io-port/8 #x21) #xFF + (sys.int::io-port/8 #xA1) #xFF + *i8259-shadow-mask* #xFFFF)) + nil)) (defun io-apic-interrupt-handler (interrupt-frame info) - (let ((gsi (- info 32))) - (irq-deliver interrupt-frame (platform-irq gsi)) + (let ((gsi (- info +io-apic-base-vector+))) + (when (and (<= 0 gsi) (< gsi 256)) + (irq-deliver interrupt-frame (svref *io-apic-irqs* gsi))) (lapic-eoi))) (defun io-apic-configure-entry (gsi vector destination-apic-id &key (trigger-mode :edge) (polarity :high) (masked t)) - "Configure an IO-APIC redirection entry." (multiple-value-bind (apic entry) (io-apic-find-entry gsi) (unless apic (debug-print-line "No IO-APIC for GSI " gsi) @@ -155,9 +192,7 @@ (:high 0) (:low +io-apic-entry-polarity+))))) (if *lapic-x2apic-mode* - ;; x2APIC: 32-bit APIC ID in bits 63-32. (setf entry-value (logior entry-value (ash destination-apic-id 32))) - ;; xAPIC: 8-bit APIC ID in bits 63-56. (setf entry-value (logior entry-value (ash (ldb (byte 8 0) destination-apic-id) 56)))) (io-apic-write-redirection apic entry entry-value) t))) diff --git a/supervisor/x86-64/platform.lisp b/supervisor/x86-64/platform.lisp index 7c66e326..db478590 100644 --- a/supervisor/x86-64/platform.lisp +++ b/supervisor/x86-64/platform.lisp @@ -23,6 +23,7 @@ (initialize-cpu) (unless (boundp '*io-apic-active-p*) (setf *io-apic-active-p* nil)) + (initialize-io-apics) (initialize-platform-time) (mezzano.supervisor.intel-8042:probe) (initialize-pci) From b2e131fc40c2a2daa12ea27d370cfc7ef22d7905 Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 13:39:04 +0500 Subject: [PATCH 10/42] x86-64: MADT ource ovirride flags --- supervisor/x86-64/io-apic.lisp | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index 57c26e44..38a2b3a5 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -23,6 +23,7 @@ (sys.int::defglobal *io-apic-active-p* nil) (sys.int::defglobal *io-apic-irqs* nil) (sys.int::defglobal *isa-irq-to-gsi* nil) +(sys.int::defglobal *gsi-flags* nil) (defstruct (io-apic (:area :wired)) @@ -86,6 +87,12 @@ (declare (ignore gsi)) nil) +(defun gsi-flags-polarity (flags) + (if (eql (ldb (byte 2 0) flags) 3) :low :high)) + +(defun gsi-flags-trigger (flags) + (if (eql (ldb (byte 2 2) flags) 3) :level :edge)) + (defun initialize-io-apics () (setf *io-apic-active-p* nil) (unless (boundp '*io-apics*) @@ -94,11 +101,13 @@ (return-from initialize-io-apics)) (setf *io-apic-irqs* nil) (setf *isa-irq-to-gsi* nil) + (setf *gsi-flags* nil) (let ((madt (acpi-get-table 'acpi-madt-table-p))) (unless madt (debug-print-line "No MADT table, IO-APIC not available.") (return-from initialize-io-apics)) (let ((isa-mapping (sys.int::make-simple-vector 16 :wired))) + (setf *gsi-flags* (sys.int::make-simple-vector 256 :wired)) (dotimes (i 16) (setf (svref isa-mapping i) i)) (dotimes (i (sys.int::simple-vector-length @@ -110,6 +119,10 @@ (setf (svref isa-mapping (acpi-madt-interrupt-source-override-source entry)) (acpi-madt-interrupt-source-override-global-system-interrupt entry)) + (let ((gsi (acpi-madt-interrupt-source-override-global-system-interrupt entry)) + (flags (acpi-madt-interrupt-source-override-flags entry))) + (when (< gsi 256) + (setf (svref *gsi-flags* gsi) flags))) (debug-print-line "MADT override: ISA IRQ " (acpi-madt-interrupt-source-override-source entry) " -> GSI " @@ -142,13 +155,14 @@ (push-wired io-apic *io-apics*) (dotimes (e n-entries) (io-apic-write-redirection io-apic e +io-apic-entry-mask+) - (let ((gsi (+ gsi-base e))) - (io-apic-configure-entry gsi - (+ +io-apic-base-vector+ gsi) - bsp-apic-id - :trigger-mode :edge - :polarity :high - :masked t)))) + (let ((gsi (+ gsi-base e)) + (flags (svref *gsi-flags* (+ gsi-base e)))) + (io-apic-configure-entry gsi + (+ +io-apic-base-vector+ gsi) + bsp-apic-id + :trigger-mode (gsi-flags-trigger flags) + :polarity (gsi-flags-polarity flags) + :masked t)))) (debug-print-line "IO-APIC " (io-apic-id io-apic) " at " phys-addr " GSI base " (io-apic-gsi-base io-apic) From f0e12ec245f662f1b0dd11b5959f6f4907d8313f Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 13:43:57 +0500 Subject: [PATCH 11/42] pci: stub for MSI --- supervisor/pci.lisp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/supervisor/pci.lisp b/supervisor/pci.lisp index c11d9e43..ed6b21e7 100644 --- a/supervisor/pci.lisp +++ b/supervisor/pci.lisp @@ -302,6 +302,12 @@ Returns NIL if the BAR has an unknown type." (defun pci-intr-line (device) (pci-config/8 device +pci-config-intr-line+)) +(defun pci-configure-msi (device vector cpu) + "Configure MSI/MSI-X for DEVICE to use VECTOR on CPU. +Not yet implemented." + (declare (ignore device vector cpu)) + (panic "pci-configure-msi is not implemented")) + (defun pci-bus-master-enabled (device) (logbitp +pci-command-bus-master+ (pci-config/16 device +pci-config-command+))) From 64a9b73e147c8576abcd22579492fa65294c71b6 Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 13:46:17 +0500 Subject: [PATCH 12/42] x86-64: mask LINT0 and LINT1 with ioapic active --- supervisor/x86-64/io-apic.lisp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index 38a2b3a5..3d2ead2d 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -183,6 +183,12 @@ (setf (sys.int::io-port/8 #x21) #xFF (sys.int::io-port/8 #xA1) #xFF *i8259-shadow-mask* #xFFFF)) + ;; Mask LINT0 and LINT1 on the LAPIC to prevent i8259 spurious + ;; interrupts from reaching the CPU through ExtINT/NMI. + (write-lapic (logior (read-lapic +lapic-reg-lvt-lint0+) +lapic-lvt-mask+) + +lapic-reg-lvt-lint0+) + (write-lapic (logior (read-lapic +lapic-reg-lvt-lint1+) +lapic-lvt-mask+) + +lapic-reg-lvt-lint1+) nil)) (defun io-apic-interrupt-handler (interrupt-frame info) From 5b0870058dfd18bbf53480657cb9241191bd6f7e Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 14:21:01 +0500 Subject: [PATCH 13/42] supervisor: directed IPI --- supervisor/arm64/cpu.lisp | 66 ++++++++++---------- supervisor/arm64/gic.lisp | 14 +++-- supervisor/x86-64/cpu.lisp | 120 +++++++++++++++++++------------------ 3 files changed, 104 insertions(+), 96 deletions(-) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index 8320487d..20055803 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -79,6 +79,7 @@ (defun initialize-cpu () (setf (arm64-cpu-cpu-id *bsp-cpu*) (fdt-boot-cpuid)) + (setf (cpu-cpu-index *bsp-cpu*) 0) (push-wired *bsp-cpu* *cpus*)) (sys.int::define-lap-function %el0-common () @@ -224,14 +225,20 @@ (mezzano.lap.arm64:hlt 4)) (defun broadcast-panic-ipi () - (broadcast-ipi +panic-sgi-id+)) + (send-ipi-to-all +panic-sgi-id+)) (defun panic-ipi-handler (interrupt-frame) (declare (ignore interrupt-frame)) (loop (%arch-panic-stop))) (defun broadcast-wakeup-ipi () - (broadcast-ipi +wakeup-sgi-id+)) + (dolist (cpu *cpus*) + (when (eql (arm64-cpu-state cpu) :online) + (wake-cpu cpu)))) + +(defun wake-cpu (cpu) + (when (cpu-idle-p cpu) + (send-ipi-to-cpu cpu +wakeup-sgi-id+))) (sys.int::defglobal *non-quiescent-cpus-remaining*) @@ -240,7 +247,7 @@ "Bring all CPUs to a consistent state to stop the world. Protected by the world stop lock." (setf *non-quiescent-cpus-remaining* (1- *n-up-cpus*)) - (broadcast-ipi +quiesce-sgi-id+) + (send-ipi-to-all +quiesce-sgi-id+) ;; FIXME: Use WFE/SEV instead of this spin-loop. (loop (when (eql *non-quiescent-cpus-remaining* 0) @@ -249,27 +256,23 @@ Protected by the world stop lock." ;; Save the current thread's state and switch to the CPU's idle thread. (defun quiesce-ipi-handler (interrupt-frame) - (let* ((current (current-thread)) - (idle (local-cpu-idle-thread)) - (was-active (not (eql current idle)))) - (when was-active - (acquire-global-thread-lock) - ;; Return this thread to the run queue. - (setf (thread-state current) :runnable) - (push-run-queue current) - (preemption-timer-reset nil) - ;; Save thread state. - (save-fpu-state current) - (save-interrupted-state current interrupt-frame) - ;; Partially switch to the idle thread. - (setf (thread-state idle) :active)) - ;; Have now reached a quiescent state. - (sys.int::%atomic-fixnum-add-symbol '*non-quiescent-cpus-remaining* - -1) - (when was-active - ;; Finally, return to the idle thread. - (%%switch-to-thread-common idle - idle)))) + (cond (*debug-magic-button-hold-variable* + (magic-button-ipi-handler interrupt-frame)) + (t + (let* ((current (current-thread)) + (idle (local-cpu-idle-thread)) + (was-active (not (eql current idle)))) + (when was-active + (acquire-global-thread-lock) + (setf (thread-state current) :runnable) + (push-run-queue current) + (preemption-timer-reset nil) + (save-fpu-state current) + (save-interrupted-state current interrupt-frame) + (setf (thread-state idle) :active)) + (sys.int::%atomic-fixnum-add-symbol '*non-quiescent-cpus-remaining* -1) + (when was-active + (%%switch-to-thread-common idle idle)))))) ;; TODO: This needs to be fixed up to prevent multiple CPUs hitting it at ;; once. It can't currently happen because it is only used from IRQ handlers @@ -280,7 +283,7 @@ Protected by the world stop lock." (defun stop-other-cpus-for-debug-magic-button () (setf *debug-magic-button-ready-variable* (1- *n-up-cpus*) *debug-magic-button-hold-variable* t) - (broadcast-ipi +magic-button-sgi-id+) + (send-ipi-to-all +quiesce-sgi-id+) ;; Wait for other CPUs to arrive, this ensures the thread state is actually ;; consistent. (loop until (eql *debug-magic-button-ready-variable* 0))) @@ -350,12 +353,13 @@ Protected by the world stop lock." :priority :idle)) (wired-stack (%allocate-stack (* 128 1024) t)) (cpu (make-arm64-cpu :state :offline - :cpu-id cpu-id - :idle-thread idle-thread - :wired-stack wired-stack - :sp-el1 (+ (stack-base wired-stack) - (stack-size wired-stack) - -16)))) + :cpu-id cpu-id + :idle-thread idle-thread + :wired-stack wired-stack + :sp-el1 (+ (stack-base wired-stack) + (stack-size wired-stack) + -16)))) + (setf (cpu-cpu-index cpu) (length *cpus*)) (setf (arm64-cpu-self cpu) cpu) (setf (sys.int::memref-unsigned-byte-64 (arm64-cpu-sp-el1 cpu)) (sys.int::lisp-object-address cpu)) diff --git a/supervisor/arm64/gic.lisp b/supervisor/arm64/gic.lisp index 3f34f199..8c55d5e2 100644 --- a/supervisor/arm64/gic.lisp +++ b/supervisor/arm64/gic.lisp @@ -46,7 +46,6 @@ (defconstant +panic-sgi-id+ 1) (defconstant +wakeup-sgi-id+ 2) (defconstant +quiesce-sgi-id+ 3) -(defconstant +magic-button-sgi-id+ 4) (defun gic-dist-reg (index) (physical-memref-unsigned-byte-32 (+ *gic-distributor-base* index))) @@ -102,8 +101,7 @@ ;; Unmask our various IPIs (gic-unmask-interrupt +panic-sgi-id+) (gic-unmask-interrupt +wakeup-sgi-id+) - (gic-unmask-interrupt +quiesce-sgi-id+) - (gic-unmask-interrupt +magic-button-sgi-id+)) + (gic-unmask-interrupt +quiesce-sgi-id+)) (defun configure-gic-cpu () ;; Enable the local CPU. @@ -141,8 +139,6 @@ (#.+quiesce-sgi-id+ ;; nothing, handled later nil) - (#.+magic-button-sgi-id+ - (magic-button-ipi-handler interrupt-frame)) (t (debug-print-line "Received unknown SGI " vector)))) (t ;; Normal external IRQ @@ -174,9 +170,15 @@ (defun platform-unmask-irq (vector) (gic-unmask-interrupt vector)) -(defun broadcast-ipi (vector) +(defun send-ipi-to-all (vector) (%dsb.ishst) (setf (gic-dist-reg +gicd-sgir+) (logior (ash 1 24) ; all-but-self vector)) nil) + +(defun send-ipi-to-cpu (cpu vector) + (%dsb.ishst) + (setf (gic-dist-reg +gicd-sgir+) + (logior (ash 1 (+ 16 (cpu-cpu-index cpu))) + vector))) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 247ad236..ec1c4605 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -47,7 +47,6 @@ (defconstant +lapic-lvt-mask+ #x10000) (defconstant +ipi-type-fixed+ 0) -(defconstant +ipi-type-lowest-priority+ 1) (defconstant +ipi-type-smi+ 2) (defconstant +ipi-type-nmi+ 4) (defconstant +ipi-type-init+ 5) @@ -149,9 +148,6 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (defconstant +tlb-shootdown-ipi-vector+ #x83 "Sent to CPUs to prepare them for TLB shootdown.") -(defconstant +magic-button-ipi-vector+ #x84 - "Sent to CPUs when the magic debug button is pressed.") - (defconstant +reschedule-ipi-vector+ #x85 "Sent to a specific CPU to request rescheduling.") @@ -231,29 +227,35 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (ash type 8) vector))))) +(defun send-ipi-to-cpu (cpu type vector) + (send-ipi (x86-64-cpu-apic-id cpu) type vector)) + +(defun send-ipi-to-all (type vector &key including-self) + (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) + ;; x2APIC shorthand: use destination shorthand in ICR. + (let ((icr (logior (ash type 8) + #x4000 + vector))) + (setf (sys.int::msr +x2apic-msr-icr+) + (logior icr (ash (if including-self 0 3) 18)))) + (dolist (cpu *cpus*) + (when (and (eql (x86-64-cpu-state cpu) :online) + (or including-self + (not (eql cpu (local-cpu))))) + (send-ipi-to-cpu cpu type vector))))) + (defun broadcast-ipi (type vector &optional including-self) - ;; BROADCAST-IPI can be called very early due to thread wakeups, before - ;; the lapic is mapped by INITIALIZE-CPU. (when (and (boundp '*lapic-address*) *lapic-address*) - ;; Disable interrupts to prevent cross-cpu migration from - ;; fouling up behaviour of INCLUDING-SELF. (safe-without-interrupts (type vector including-self) - (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) - ;; x2APIC shorthand: use destination shorthand in ICR. - (let ((icr (logior (ash type 8) - #x4000 - vector))) - (setf (sys.int::msr +x2apic-msr-icr+) - (logior icr (ash (if including-self 0 3) 18)))) - (dolist (cpu *cpus*) - (when (and (eql (x86-64-cpu-state cpu) :online) - (or including-self - (not (eql cpu (local-cpu))))) - (send-ipi (x86-64-cpu-apic-id cpu) type vector))))))) + (send-ipi-to-all type vector :including-self including-self)))) (defun broadcast-wakeup-ipi () - (broadcast-ipi +ipi-type-fixed+ +wakeup-ipi-vector+)) + (when (and (boundp '*lapic-address*) + *lapic-address*) + (dolist (cpu *cpus*) + (when (eql (x86-64-cpu-state cpu) :online) + (wake-cpu cpu))))) (defun wakeup-ipi-handler (interrupt-frame info) (declare (ignore info)) @@ -286,28 +288,31 @@ Protected by the world stop lock." (defun quiesce-ipi-handler (interrupt-frame info) (declare (ignore info)) (lapic-eoi) - (let* ((current (current-thread)) - (idle (local-cpu-idle-thread)) - (was-active (not (eql current idle)))) - (when was-active - (acquire-global-thread-lock) - ;; Return this thread to the run queue. - (setf (thread-state current) :runnable) - (push-run-queue current) - (preemption-timer-reset nil) - ;; Save thread state. - (save-fpu-state current) - (save-interrupted-state current interrupt-frame) - ;; Partially switch to the idle thread. - (setf (thread-state idle) :active) - (setf (sys.int::msr +msr-ia32-gs-base+) (sys.int::lisp-object-address idle))) - ;; Have now reached a quiescent state. - (sys.int::%atomic-fixnum-add-symbol '*non-quiescent-cpus-remaining* - -1) - (when was-active - ;; Finally, return to the idle thread. - (%%switch-to-thread-common idle - idle)))) + (cond (*debug-magic-button-hold-variable* + ;; Magic button active: save state and spin. + (magic-button-ipi-handler-1 interrupt-frame)) + (t + ;; Normal quiesce: switch to idle thread. + (let* ((current (current-thread)) + (idle (local-cpu-idle-thread)) + (was-active (not (eql current idle)))) + (when was-active + (acquire-global-thread-lock) + ;; Return this thread to the run queue. + (setf (thread-state current) :runnable) + (push-run-queue current) + (preemption-timer-reset nil) + ;; Save thread state. + (save-fpu-state current) + (save-interrupted-state current interrupt-frame) + ;; Partially switch to the idle thread. + (setf (thread-state idle) :active) + (setf (sys.int::msr +msr-ia32-gs-base+) (sys.int::lisp-object-address idle))) + ;; Have now reached a quiescent state. + (sys.int::%atomic-fixnum-add-symbol '*non-quiescent-cpus-remaining* -1) + (when was-active + ;; Finally, return to the idle thread. + (%%switch-to-thread-common idle idle)))))) ;; TODO: This needs to be fixed up to prevent multiple CPUs hitting it at ;; once. It can't currently happen because it is only used from IRQ handlers @@ -318,7 +323,7 @@ Protected by the world stop lock." (defun stop-other-cpus-for-debug-magic-button () (setf *debug-magic-button-ready-variable* (1- *n-up-cpus*) *debug-magic-button-hold-variable* t) - (broadcast-ipi +ipi-type-fixed+ +magic-button-ipi-vector+) + (send-ipi-to-all +ipi-type-fixed+ +quiesce-ipi-vector+) ;; Wait for other CPUs to arrive, this ensures the thread state is actually ;; consistent. (loop until (eql *debug-magic-button-ready-variable* 0))) @@ -330,11 +335,6 @@ Protected by the world stop lock." ;; the hold variable going to NIL. (loop until (eql *debug-magic-button-ready-variable* 0))) -(defun magic-button-ipi-handler (interrupt-frame info) - (declare (ignore info)) - (magic-button-ipi-handler-1 interrupt-frame) - (lapic-eoi)) - (defun reschedule-ipi-handler (interrupt-frame info) (declare (ignore info)) (lapic-eoi) @@ -345,7 +345,7 @@ Protected by the world stop lock." "Send a directed wakeup IPI to a specific CPU. If the CPU is idle, this will cause it to check for new threads." (when (cpu-idle-p cpu) - (send-ipi (x86-64-cpu-apic-id cpu) +ipi-type-fixed+ +wakeup-ipi-vector+))) + (send-ipi-to-cpu cpu +ipi-type-fixed+ +wakeup-ipi-vector+))) (defun magic-button-ipi-handler-1 (interrupt-frame) (when (not *debug-magic-button-hold-variable*) @@ -1096,15 +1096,17 @@ This is a one-shot timer and must be reset after firing." (setf *cpus* '()) (push-wired *bsp-cpu* *cpus*) (setf *n-up-cpus* 1) - (hook-user-interrupt +lapic-vector-svr+ 'lapic-svr-handler) - (hook-user-interrupt +lapic-vector-err+ 'lapic-error-handler) - (hook-user-interrupt +lapic-vector-timer+ 'lapic-timer-handler) - (hook-user-interrupt +wakeup-ipi-vector+ 'wakeup-ipi-handler) - (hook-user-interrupt +panic-ipi-vector+ 'panic-ipi-handler) - (hook-user-interrupt +quiesce-ipi-vector+ 'quiesce-ipi-handler) - (hook-user-interrupt +tlb-shootdown-ipi-vector+ 'tlb-shootdown-ipi-handler) - (hook-user-interrupt +magic-button-ipi-vector+ 'magic-button-ipi-handler) - (hook-user-interrupt +reschedule-ipi-vector+ 'reschedule-ipi-handler)) + (register-ipi-handler +lapic-vector-svr+ 'lapic-svr-handler) + (register-ipi-handler +lapic-vector-err+ 'lapic-error-handler) + (register-ipi-handler +lapic-vector-timer+ 'lapic-timer-handler) + (register-ipi-handler +wakeup-ipi-vector+ 'wakeup-ipi-handler) + (register-ipi-handler +panic-ipi-vector+ 'panic-ipi-handler) + (register-ipi-handler +quiesce-ipi-vector+ 'quiesce-ipi-handler) + (register-ipi-handler +tlb-shootdown-ipi-vector+ 'tlb-shootdown-ipi-handler) + (register-ipi-handler +reschedule-ipi-vector+ 'reschedule-ipi-handler)) + +(defun register-ipi-handler (vector handler) + (hook-user-interrupt vector handler)) (defun load-cpu-bits (cpu) (let* ((addr (- (sys.int::lisp-object-address cpu) From 7ccba489eaef404e77c9d2f14088b2cdb8b437bb Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 14:27:15 +0500 Subject: [PATCH 14/42] x86-64: gaps --- supervisor/x86-64/cpu.lisp | 59 ++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index ec1c4605..a492aa9c 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -54,6 +54,7 @@ ;; x2APIC detection. (defconstant +cpuid-feature-x2apic+ 21) +(defconstant +cpuid-feature-tsc-deadline+ 24) (defconstant +msr-ia32-apic-base-x2apic-enable+ #x400) (defconstant +msr-ia32-apic-base-bsp+ #x800) @@ -980,15 +981,15 @@ This is a one-shot timer and must be reset after firing." (when (not (eq start-time initial-time)) (return))) ;; Start timer with the maximum count value - (setf (lapic-reg +lapic-reg-timer-initial-count+) #xFFFFFFFF) + (write-lapic #xFFFFFFFF +lapic-reg-timer-initial-count+) ;; Wait for next tick. (loop (setf end-time (get-internal-run-time)) (when (not (eq end-time start-time)) (return))) ;; Read current count & stop timer. - (setf end-counter (lapic-reg +lapic-reg-timer-current-count+)) - (setf (lapic-reg +lapic-reg-timer-initial-count+) 0) + (setf end-counter (read-lapic +lapic-reg-timer-current-count+)) + (write-lapic 0 +lapic-reg-timer-initial-count+) (let* ((cycles (- #xFFFFFFFF end-counter)) (total-time (- end-time start-time)) ;; Use single floats here. Rationals & double-floats require @@ -1001,13 +1002,48 @@ This is a one-shot timer and must be reset after firing." ;; This is a fixnum, timer cycles per second. (sys.int::defglobal *lapic-timer-calibration*) +;; Timer ticks per second as computed by the calibration. +(sys.int::defglobal *lapic-timer-ticks-per-second*) + +(defun lapic-timer-calibrate-tsc () + "Calibrate the LAPIC timer using the TSC as a high-resolution time reference. +*CPU-SPEED* must already be calibrated before calling this." + (let* ((tsc-start (sys.int::tsc)) + (start-time (get-internal-run-time))) + ;; Start timer with the maximum count value + (write-lapic #xFFFFFFFF +lapic-reg-timer-initial-count+) + ;; Wait for the next PIT tick as our interval marker. + (loop (when (not (eql (get-internal-run-time) start-time)) (return))) + (let* ((tsc-end (sys.int::tsc)) + (lapic-remaining (read-lapic +lapic-reg-timer-current-count+)) + (lapic-cycles (- #xFFFFFFFF lapic-remaining)) + (tsc-delta (- tsc-end tsc-start))) + ;; Stop timer + (write-lapic 0 +lapic-reg-timer-initial-count+) + (if (zerop tsc-delta) + 0 + ;; LAPIC Hz = (lapic_cycles / tsc_delta) * cpu_speed + ;; Use single-floats to avoid allocation during early boot. + (let ((cpu-speed (float *cpu-speed*))) + (/ (* (float lapic-cycles) cpu-speed) + (float tsc-delta))))))) + ;; TODO: Be more clever when picking the divisor. ;; Should dynamically adjust so a goldilocks calibration value is returned. (defun lapic-timer-calibrate () - (let ((n (lapic-timer-calibrate-1))) - (dotimes (i 5) - (setf n (/ (+ n (lapic-timer-calibrate-1)) 2))) - (setf *lapic-timer-calibration* (truncate n)))) + (if (tsc-deadline-available-p) + ;; Use TSC for high-resolution calibration. + (let ((n (lapic-timer-calibrate-tsc))) + (dotimes (i 5) + (setf n (/ (+ n (lapic-timer-calibrate-tsc)) 2))) + (setf *lapic-timer-calibration* (truncate n) + *lapic-timer-ticks-per-second* (truncate n))) + ;; Fall back to PIT-based calibration. + (let ((n (lapic-timer-calibrate-1))) + (dotimes (i 5) + (setf n (/ (+ n (lapic-timer-calibrate-1)) 2))) + (setf *lapic-timer-calibration* (truncate n) + *lapic-timer-ticks-per-second* (truncate n))))) ;; These two functions are interrupt-safe, they must not use ;; floats, bignums or ratios when converting. @@ -1044,6 +1080,9 @@ This is a one-shot timer and must be reset after firing." (defun x2apic-enabled-by-firmware-p () (logbitp 10 (sys.int::msr +msr-ia32-apic-base+))) +(defun tsc-deadline-available-p () + (logbitp +cpuid-feature-tsc-deadline+ (%cpuid-1-ecx-early 1 0))) + (defun read-local-apic-id () (if *lapic-x2apic-mode* (sys.int::msr #x802) @@ -1068,10 +1107,12 @@ This is a one-shot timer and must be reset after firing." (setf *lapic-x2apic-mode* t) (debug-print-line "x2APIC already enabled by firmware")) (t - ;; Enable x2APIC: set bit 10 in IA32_APIC_BASE MSR. + ;; Enable x2APIC: set bit 10 (enable) and bit 11 (BSP) in IA32_APIC_BASE MSR. (let ((apic-base (sys.int::msr +msr-ia32-apic-base+))) (setf (sys.int::msr +msr-ia32-apic-base+) - (logior apic-base +msr-ia32-apic-base-x2apic-enable+))) + (logior apic-base + +msr-ia32-apic-base-x2apic-enable+ + +msr-ia32-apic-base-bsp+))) (setf *lapic-x2apic-mode* t) (debug-print-line "x2APIC enabled on BSP")))) (t From 1071ecb8200c2391e0a73829158a73a1d47e524b Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 14:27:36 +0500 Subject: [PATCH 15/42] peek: add x2apic bit --- applications/peek.lisp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/peek.lisp b/applications/peek.lisp index 098451c3..e04a6d31 100644 --- a/applications/peek.lisp +++ b/applications/peek.lisp @@ -122,7 +122,7 @@ "DCA" "SSE4.1" "SSE4.2" - nil + "x2APIC" nil "POPCNT")) From 53794b70454fc5cbd00b51ad901d794dce1c9362 Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 16:22:22 +0500 Subject: [PATCH 16/42] supervisor: new lazy TLB shootdown --- supervisor/arm64/cpu.lisp | 47 +++++++++++++++++++++++++++------ supervisor/pager.lisp | 20 +++++--------- supervisor/snapshot.lisp | 4 +-- supervisor/thread.lisp | 2 ++ supervisor/x86-64/cpu.lisp | 38 +++++++++++++++++++++++--- supervisor/x86-64/snapshot.lisp | 1 - 6 files changed, 83 insertions(+), 29 deletions(-) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index 20055803..522a7512 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -21,6 +21,7 @@ page-fault-hook) (defun initialize-boot-cpu () + (setf *tlb-shootdown-in-progress* nil) (setf (arm64-cpu-self *bsp-cpu*) *bsp-cpu*) (setf (arm64-cpu-state *bsp-cpu*) :online) (setf (arm64-cpu-idle-thread *bsp-cpu*) @@ -80,6 +81,8 @@ (defun initialize-cpu () (setf (arm64-cpu-cpu-id *bsp-cpu*) (fdt-boot-cpuid)) (setf (cpu-cpu-index *bsp-cpu*) 0) + (setf (cpu-inhibit-scheduling *bsp-cpu*) 0) + (setf (cpu-tlb-generation *bsp-cpu*) 0) (push-wired *bsp-cpu* *cpus*)) (sys.int::define-lap-function %el0-common () @@ -306,26 +309,52 @@ Protected by the world stop lock." (sys.int::%atomic-fixnum-add-symbol '*debug-magic-button-ready-variable* -1)) -;; TLB shootdown isn't required as ARM has cross-core TLB invalidation instructions +;; ARM64 has hardware-broadcast TLB invalidation (TLBI IS instructions). +;; No IPIs needed — the initiating CPU issues TLBI VAE1IS which broadcasts +;; to all CPUs in the inner shareable domain. We still bracket the operation +;; with inhibit-scheduling to prevent CPU migration. + +(sys.int::defglobal *tlb-shootdown-in-progress* nil) + +(defconstant +tlb-shootdown-batch-size+ 64 + "Maximum number of pages to invalidate individually before falling +back to a full TLB flush.") (defun begin-tlb-shootdown () - nil) + "Prepare for TLB shootdown on ARM64. +TLB shootdown must be protected by the VM lock." + (ensure (rw-lock-write-held-p *vm-lock*) "VM lock not held when doing TLB shootdown!") + (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown already in progress!") + (setf *tlb-shootdown-in-progress* t) + (setf (cpu-inhibit-scheduling (local-cpu)) + (1+ (cpu-inhibit-scheduling (local-cpu))))) (defun tlb-shootdown-single (address) - (declare (ignore address)) - nil) + (ensure *tlb-shootdown-in-progress*) + (flush-tlb-single address)) (defun tlb-shootdown-range (base length) - (declare (ignore base length)) - nil) + (ensure *tlb-shootdown-in-progress*) + (let ((n-pages (truncate (+ length (1- +4k-page-size+)) +4k-page-size+))) + (if (<= n-pages +tlb-shootdown-batch-size+) + (loop for addr from base below (+ base length) by +4k-page-size+ + do (flush-tlb-single addr)) + (flush-tlb)))) (defun tlb-shootdown-all () - nil) + (ensure *tlb-shootdown-in-progress*) + (flush-tlb)) (defun finish-tlb-shootdown () - nil) + (ensure *tlb-shootdown-in-progress*) + (setf *tlb-shootdown-in-progress* nil) + (setf (cpu-inhibit-scheduling (local-cpu)) + (max 0 (1- (cpu-inhibit-scheduling (local-cpu)))))) (defun check-tlb-shootdown-not-in-progress () + (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown in progress!")) + +(defun check-tlb-generation-consistency () nil) (defun local-cpu-idle-thread () @@ -360,6 +389,8 @@ Protected by the world stop lock." (stack-size wired-stack) -16)))) (setf (cpu-cpu-index cpu) (length *cpus*)) + (setf (cpu-inhibit-scheduling cpu) 0) + (setf (cpu-tlb-generation cpu) 0) (setf (arm64-cpu-self cpu) cpu) (setf (sys.int::memref-unsigned-byte-64 (arm64-cpu-sp-el1 cpu)) (sys.int::lisp-object-address cpu)) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index eb47979b..0150e2b4 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -499,7 +499,6 @@ Returns NIL if the entry is missing and ALLOCATE is false." :allow-wired stackp :stackp stackp) (setf (page-table-entry pte 0) 0)))) (begin-tlb-shootdown) - (flush-tlb) (when (not (or stackp (mark-bit-region-p base))) (tlb-shootdown-range card-base card-length)) @@ -549,7 +548,6 @@ Returns NIL if the entry is missing and ALLOCATE is false." (t ;; Mark read-only. (update-pte pte :writable nil))))))) - (flush-tlb) (tlb-shootdown-all) (finish-tlb-shootdown))) @@ -582,8 +580,7 @@ Returns NIL if the entry is missing and ALLOCATE is false." ;; ARM64's dirty bit emulation does not support emulating ;; dirty bits in the wired area yet. #-arm64 - (update-pte pte :dirty nil)))) - (flush-tlb) + (update-pte pte :dirty nil)))) (tlb-shootdown-all) (finish-tlb-shootdown))) @@ -734,7 +731,6 @@ mapped, then the entry will be NIL." (remove-from-page-replacement-list candidate) (begin-tlb-shootdown) (setf (page-table-entry pte-addr) (make-pte 0 :present nil)) - (flush-tlb-single candidate-virtual) (tlb-shootdown-single candidate-virtual) (finish-tlb-shootdown) ;; Maybe write it back to disk. @@ -847,8 +843,7 @@ mapped, then the entry will be NIL." ;; Remap page read/write. (begin-tlb-shootdown) (setf (page-table-entry pte) (make-pte (ash (pte-physical-address (page-table-entry pte)) -12) - :writable (block-info-writable-p block-info))) - (flush-tlb-single address) + :writable (block-info-writable-p block-info))) (tlb-shootdown-single address) (finish-tlb-shootdown)) #+(or)(debug-print-line "WFP " address " block " block-info " already mapped " (page-table-entry pte 0)) @@ -892,12 +887,11 @@ mapped, then the entry will be NIL." (panic "Unable to read page from disk")))) (begin-tlb-shootdown) (setf (page-table-entry pte) (make-pte frame - :writable (and (block-info-writable-p block-info) - (not (block-info-track-dirty-p block-info))) - ;; Mark the page dirty to make sure the snapshotter & swap code know to swap it out. - ;; The zero fill flag in the block map was cleared, but the on-disk data doesn't reflect that. - :dirty is-zero-page)) - (flush-tlb-single address) + :writable (and (block-info-writable-p block-info) + (not (block-info-track-dirty-p block-info))) + ;; Mark the page dirty to make sure the snapshotter & swap code know to swap it out. + ;; The zero fill flag in the block map was cleared, but the on-disk data doesn't reflect that. + :dirty is-zero-page)) (tlb-shootdown-single address) (finish-tlb-shootdown) #+(or) diff --git a/supervisor/snapshot.lisp b/supervisor/snapshot.lisp index 0b91aeed..c7c09b6b 100644 --- a/supervisor/snapshot.lisp +++ b/supervisor/snapshot.lisp @@ -142,7 +142,6 @@ (make-pte new-frame :writable (and (block-info-writable-p block-info) (not (block-info-track-dirty-p block-info))))) - (flush-tlb-single fault-addr) (tlb-shootdown-single fault-addr) (finish-tlb-shootdown) #+(or)(debug-print-line "Copied page " fault-addr))) @@ -204,8 +203,7 @@ Returns 4 values: (make-pte frame :writable (and (block-info-writable-p block-info) (not (block-info-track-dirty-p block-info))) - :dirty (page-dirty-p pte))) - (flush-tlb-single address)) + :dirty (page-dirty-p pte)))) ;; Return page to normal use. (setf (physical-page-frame-type frame) :active) (append-to-page-replacement-list frame) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 79526f1a..3c665e5f 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -473,6 +473,8 @@ Interrupts must be off and the global thread lock must be held." (setf (thread-switch-time-start new-thread) now)) ;; Switch threads. (set-current-thread new-thread) + ;; Check if this CPU missed a TLB shootdown while idle. + (check-tlb-generation-consistency) ;; Restore FPU state. (restore-fpu-state new-thread) ;; The global thread lock is dropped by the restore functions, not here. diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index a492aa9c..e9d20a10 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -363,7 +363,9 @@ If the CPU is idle, this will cause it to check for new threads." '*debug-magic-button-ready-variable* -1)) (sys.int::defglobal *tlb-shootdown-in-progress* nil) +(sys.int::defglobal *tlb-shootdown-n-targets* 0) (sys.int::defglobal *busy-tlb-shootdown-cpus*) +(sys.int::defglobal *current-tlb-generation*) (defconstant +tlb-shootdown-batch-size+ 64 "Maximum number of pages to invalidate individually before falling @@ -372,6 +374,15 @@ back to a full TLB flush.") (defun check-tlb-shootdown-not-in-progress () (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown in progress!")) +(defun check-tlb-generation-consistency () + "If this CPU missed a TLB shootdown while idle, flush now." + (when (and (boundp '*current-tlb-generation*) + (not (eql (cpu-tlb-generation (local-cpu)) + *current-tlb-generation*))) + (flush-tlb) + (setf (cpu-tlb-generation (local-cpu)) + *current-tlb-generation*))) + (defun begin-tlb-shootdown () "Bring all CPUs to state ready for TLB shootdown. TLB shootdown must be protected by the VM lock." @@ -379,12 +390,25 @@ TLB shootdown must be protected by the VM lock." (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown already in progress!") (setf *tlb-shootdown-in-progress* t) (setf *busy-tlb-shootdown-cpus* (1- *n-up-cpus*)) + (setf *tlb-shootdown-n-targets* 0) ;; Prevent migration during shootdown. (setf (cpu-inhibit-scheduling (local-cpu)) (1+ (cpu-inhibit-scheduling (local-cpu)))) - ;; Broadcast to all CPUs, not just busy ones. Idle CPUs will flush lazily. - (broadcast-ipi +ipi-type-fixed+ +tlb-shootdown-ipi-vector+) - ;; Wait for other CPUs to reach the handler. Idle CPUs are counted as done. + ;; Send IPIs to non-idle CPUs only. Idle CPUs flush lazily on wakeup. + (dolist (cpu *cpus*) + (when (and (eql (x86-64-cpu-state cpu) :online) + (not (eql cpu (local-cpu)))) + (if (cpu-idle-p cpu) + (progn + (setf (cpu-tlb-generation cpu) + (1- *current-tlb-generation*)) + (sys.int::%atomic-fixnum-add-symbol + '*busy-tlb-shootdown-cpus* -1)) + (progn + (incf *tlb-shootdown-n-targets*) + (send-ipi-to-cpu cpu +ipi-type-fixed+ + +tlb-shootdown-ipi-vector+))))) + ;; Wait for other CPUs to reach the handler. (loop (when (eql *busy-tlb-shootdown-cpus* 0) (return)) @@ -408,7 +432,10 @@ TLB shootdown must be protected by the VM lock." (defun finish-tlb-shootdown () (ensure *tlb-shootdown-in-progress*) - (setf *busy-tlb-shootdown-cpus* (1- *n-up-cpus*)) + ;; Bump the global generation so idle CPUs know to flush on wakeup. + (sys.int::%atomic-fixnum-add-symbol '*current-tlb-generation* 1) + ;; Only wait for CPUs that actually received the IPI in begin-tlb-shootdown. + (setf *busy-tlb-shootdown-cpus* *tlb-shootdown-n-targets*) (setf *tlb-shootdown-in-progress* nil) ;; Wait for CPUs to leave the handler. (loop @@ -438,6 +465,8 @@ TLB shootdown must be protected by the VM lock." ;; idle (the context switch on wakeup will reload CR3). (when (not (cpu-idle-p (local-cpu))) (flush-tlb)) + ;; Record that this CPU's TLB is up to date wrt the current generation. + (setf (cpu-tlb-generation (local-cpu)) *current-tlb-generation*) (sys.int::%atomic-fixnum-add-symbol '*busy-tlb-shootdown-cpus* -1) (setf (cpu-inhibit-scheduling (local-cpu)) @@ -1129,6 +1158,7 @@ This is a one-shot timer and must be reset after firing." (setf (cpu-idle-p *bsp-cpu*) nil) (setf (cpu-inhibit-scheduling *bsp-cpu*) 0) (setf (cpu-tlb-generation *bsp-cpu*) 0) + (setf *current-tlb-generation* 0) (setf (cpu-timer-active *bsp-cpu*) nil) ;; Allocate MCS node for BSP if not already allocated by cold-generator. (when (null (cpu-mcs-node *bsp-cpu*)) diff --git a/supervisor/x86-64/snapshot.lisp b/supervisor/x86-64/snapshot.lisp index 566fd47d..161a6ffa 100644 --- a/supervisor/x86-64/snapshot.lisp +++ b/supervisor/x86-64/snapshot.lisp @@ -44,6 +44,5 @@ (loop for i from 65 below 256 ; stack area to end of persistent memory. do (mark-pml4e-cow i)))) (begin-tlb-shootdown) - (flush-tlb) (tlb-shootdown-all) (finish-tlb-shootdown)) From e82b7664fa705ce6e97bc5f0425be8b42da033a1 Mon Sep 17 00:00:00 2001 From: iska Date: Sat, 6 Jun 2026 16:54:55 +0500 Subject: [PATCH 17/42] supervisor: deduplicated common pager code --- supervisor/arm64/cpu.lisp | 23 ----------------------- supervisor/pager.lisp | 23 +++++++++++++++++++++++ supervisor/x86-64/cpu.lisp | 23 ----------------------- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index 522a7512..472ecdfb 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -316,10 +316,6 @@ Protected by the world stop lock." (sys.int::defglobal *tlb-shootdown-in-progress* nil) -(defconstant +tlb-shootdown-batch-size+ 64 - "Maximum number of pages to invalidate individually before falling -back to a full TLB flush.") - (defun begin-tlb-shootdown () "Prepare for TLB shootdown on ARM64. TLB shootdown must be protected by the VM lock." @@ -329,31 +325,12 @@ TLB shootdown must be protected by the VM lock." (setf (cpu-inhibit-scheduling (local-cpu)) (1+ (cpu-inhibit-scheduling (local-cpu))))) -(defun tlb-shootdown-single (address) - (ensure *tlb-shootdown-in-progress*) - (flush-tlb-single address)) - -(defun tlb-shootdown-range (base length) - (ensure *tlb-shootdown-in-progress*) - (let ((n-pages (truncate (+ length (1- +4k-page-size+)) +4k-page-size+))) - (if (<= n-pages +tlb-shootdown-batch-size+) - (loop for addr from base below (+ base length) by +4k-page-size+ - do (flush-tlb-single addr)) - (flush-tlb)))) - -(defun tlb-shootdown-all () - (ensure *tlb-shootdown-in-progress*) - (flush-tlb)) - (defun finish-tlb-shootdown () (ensure *tlb-shootdown-in-progress*) (setf *tlb-shootdown-in-progress* nil) (setf (cpu-inhibit-scheduling (local-cpu)) (max 0 (1- (cpu-inhibit-scheduling (local-cpu)))))) -(defun check-tlb-shootdown-not-in-progress () - (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown in progress!")) - (defun check-tlb-generation-consistency () nil) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index 0150e2b4..df312864 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -28,6 +28,29 @@ (sys.int::defglobal *store-fudge-factor*) +(defconstant +tlb-shootdown-batch-size+ 64 + "Maximum number of pages to invalidate individually before falling +back to a full TLB flush.") + +(defun check-tlb-shootdown-not-in-progress () + (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown in progress!")) + +(defun tlb-shootdown-single (address) + (ensure *tlb-shootdown-in-progress*) + (flush-tlb-single address)) + +(defun tlb-shootdown-range (base length) + (ensure *tlb-shootdown-in-progress*) + (let ((n-pages (truncate (+ length (1- +4k-page-size+)) +4k-page-size+))) + (if (<= n-pages +tlb-shootdown-batch-size+) + (loop for addr from base below (+ base length) by +4k-page-size+ + do (flush-tlb-single addr)) + (flush-tlb)))) + +(defun tlb-shootdown-all () + (ensure *tlb-shootdown-in-progress*) + (flush-tlb)) + (defun pager-log (&rest things) (declare (dynamic-extent things)) (when (eql *pager-noisy* t) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index e9d20a10..b5e6be59 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -367,13 +367,6 @@ If the CPU is idle, this will cause it to check for new threads." (sys.int::defglobal *busy-tlb-shootdown-cpus*) (sys.int::defglobal *current-tlb-generation*) -(defconstant +tlb-shootdown-batch-size+ 64 - "Maximum number of pages to invalidate individually before falling -back to a full TLB flush.") - -(defun check-tlb-shootdown-not-in-progress () - (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown in progress!")) - (defun check-tlb-generation-consistency () "If this CPU missed a TLB shootdown while idle, flush now." (when (and (boundp '*current-tlb-generation*) @@ -414,22 +407,6 @@ TLB shootdown must be protected by the VM lock." (return)) (sys.int::cpu-relax))) -(defun tlb-shootdown-single (address) - (ensure *tlb-shootdown-in-progress*) - (flush-tlb-single address)) - -(defun tlb-shootdown-range (base length) - (ensure *tlb-shootdown-in-progress*) - (let ((n-pages (truncate (+ length (1- +4k-page-size+)) +4k-page-size+))) - (if (<= n-pages +tlb-shootdown-batch-size+) - (loop for addr from base below (+ base length) by +4k-page-size+ - do (flush-tlb-single addr)) - (flush-tlb)))) - -(defun tlb-shootdown-all () - (ensure *tlb-shootdown-in-progress*) - (flush-tlb)) - (defun finish-tlb-shootdown () (ensure *tlb-shootdown-in-progress*) ;; Bump the global generation so idle CPUs know to flush on wakeup. From e93fcbdf6ea01c33e9c95773a2585a03b6f2b396 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:29:00 +0500 Subject: [PATCH 18/42] x86-64: fix IO-APIC redirection destination and vector map --- supervisor/x86-64/io-apic.lisp | 110 +++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index 3d2ead2d..ff87bb1f 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -19,6 +19,32 @@ (defconstant +io-apic-base-vector+ 48) +;; Device interrupt vectors are allocated above +IO-APIC-BASE-VECTOR+. +;; The IPI vectors (#x80 wakeup, #x81 panic, #x82 quiesce, #x83 tlb-shootdown, +;; #x85 reschedule, ...) live in the gap below. To keep the GSI->vector map +;; bijective (so the handler can recover the GSI from the vector) the range +;; #x80..#x8F is skipped: GSIs whose naive vector would land there are shifted +;; up past it. +(defconstant +io-apic-ipi-gap-start+ #x80) +(defconstant +io-apic-ipi-gap-end+ #x90) + +(declaim (inline gsi->vector vector->gsi)) +(defun gsi->vector (gsi) + (let ((v (+ +io-apic-base-vector+ gsi))) + (if (>= v +io-apic-ipi-gap-start+) + (+ v (- +io-apic-ipi-gap-end+ +io-apic-ipi-gap-start+)) + v))) + +(defun vector->gsi (vector) + (cond ((>= vector +io-apic-ipi-gap-end+) + (- vector +io-apic-base-vector+ + (- +io-apic-ipi-gap-end+ +io-apic-ipi-gap-start+))) + ((< vector +io-apic-ipi-gap-start+) + (- vector +io-apic-base-vector+)) + (t + ;; Vector fell in the reserved IPI gap; no valid GSI. + -1))) + (sys.int::defglobal *io-apics* nil) (sys.int::defglobal *io-apic-active-p* nil) (sys.int::defglobal *io-apic-irqs* nil) @@ -155,44 +181,46 @@ (push-wired io-apic *io-apics*) (dotimes (e n-entries) (io-apic-write-redirection io-apic e +io-apic-entry-mask+) - (let ((gsi (+ gsi-base e)) + (let* ((gsi (+ gsi-base e)) + (vector (gsi->vector gsi)) (flags (svref *gsi-flags* (+ gsi-base e)))) - (io-apic-configure-entry gsi - (+ +io-apic-base-vector+ gsi) - bsp-apic-id - :trigger-mode (gsi-flags-trigger flags) - :polarity (gsi-flags-polarity flags) - :masked t)))) - (debug-print-line "IO-APIC " (io-apic-id io-apic) - " at " phys-addr - " GSI base " (io-apic-gsi-base io-apic) - " max redirect " (io-apic-max-redirection io-apic)))))))) - (when (not (null *io-apics*)) - (debug-print-line "IO-APIC init done. Hooking handlers...") - (let ((max-gsi 0)) - (dolist (apic *io-apics*) - (let ((top (+ (io-apic-gsi-base apic) (io-apic-max-redirection apic)))) - (setf max-gsi (max max-gsi top)))) - (dotimes (gsi (1+ max-gsi)) - (let ((vector (+ +io-apic-base-vector+ gsi))) - (when (< vector 128) - (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) - (setf *io-apic-active-p* t) - ;; Mask the i8259 PIC now that IO-APIC is handling interrupts. - (when (boundp '*i8259-shadow-mask*) - (setf (sys.int::io-port/8 #x21) #xFF - (sys.int::io-port/8 #xA1) #xFF - *i8259-shadow-mask* #xFFFF)) - ;; Mask LINT0 and LINT1 on the LAPIC to prevent i8259 spurious - ;; interrupts from reaching the CPU through ExtINT/NMI. - (write-lapic (logior (read-lapic +lapic-reg-lvt-lint0+) +lapic-lvt-mask+) - +lapic-reg-lvt-lint0+) - (write-lapic (logior (read-lapic +lapic-reg-lvt-lint1+) +lapic-lvt-mask+) - +lapic-reg-lvt-lint1+) - nil)) + (when (< vector 256) + (io-apic-configure-entry gsi + vector + bsp-apic-id + :trigger-mode (gsi-flags-trigger flags) + :polarity (gsi-flags-polarity flags) + :masked t)))) + (debug-print-line "IO-APIC " (io-apic-id io-apic) + " at " phys-addr + " GSI base " (io-apic-gsi-base io-apic) + " max redirect " (io-apic-max-redirection io-apic)))))))) + (when (not (null *io-apics*)) + (debug-print-line "IO-APIC init done. Hooking handlers...") + (let ((max-gsi 0)) + (dolist (apic *io-apics*) + (let ((top (+ (io-apic-gsi-base apic) (io-apic-max-redirection apic)))) + (setf max-gsi (max max-gsi top)))) + (dotimes (gsi (1+ max-gsi)) + (let ((vector (gsi->vector gsi))) + (when (< vector 256) + (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) + (setf *io-apic-active-p* t) + ;; Mask the i8259 PIC now that IO-APIC is handling interrupts. + (when (boundp '*i8259-shadow-mask*) + (setf (sys.int::io-port/8 #x21) #xFF + (sys.int::io-port/8 #xA1) #xFF + *i8259-shadow-mask* #xFFFF)) + ;; Mask LINT0 and LINT1 on the LAPIC to prevent i8259 spurious + ;; interrupts from reaching the CPU through ExtINT/NMI. + (write-lapic (logior (read-lapic +lapic-reg-lvt-lint0+) +lapic-lvt-mask+) + +lapic-reg-lvt-lint0+) + (write-lapic (logior (read-lapic +lapic-reg-lvt-lint1+) +lapic-lvt-mask+) + +lapic-reg-lvt-lint1+) + nil))) (defun io-apic-interrupt-handler (interrupt-frame info) - (let ((gsi (- info +io-apic-base-vector+))) + (let ((gsi (vector->gsi info))) (when (and (<= 0 gsi) (< gsi 256)) (irq-deliver interrupt-frame (svref *io-apic-irqs* gsi))) (lapic-eoi))) @@ -211,8 +239,14 @@ (ecase polarity (:high 0) (:low +io-apic-entry-polarity+))))) - (if *lapic-x2apic-mode* - (setf entry-value (logior entry-value (ash destination-apic-id 32))) - (setf entry-value (logior entry-value (ash (ldb (byte 8 0) destination-apic-id) 56)))) + ;; The IO-APIC redirection-entry destination field is always bits + ;; 63:56 regardless of whether the local APIC is in xAPIC or x2APIC + ;; mode: the IO-APIC is a separate device that emits an MSI write + ;; whose 8-bit physical destination is matched against the low 8 + ;; bits of each CPU's (x2)APIC ID. Encoding the destination at bits + ;; 39:32 (as an earlier version did in x2APIC mode) routes every + ;; external interrupt to APIC ID 0. + (setf entry-value (logior entry-value + (ash (ldb (byte 8 0) destination-apic-id) 56))) (io-apic-write-redirection apic entry entry-value) t))) From 565a828806941b6612fe6b05bb6177f703edb1ea Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:29:14 +0500 Subject: [PATCH 19/42] x86-64: fix x2APIC ICR encoding and TLB shootdown --- supervisor/x86-64/cpu.lisp | 58 ++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index b5e6be59..b6532e5b 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -131,7 +131,8 @@ irq-stack page-fault-stack lapic-timer-active - page-fault-hook) + page-fault-hook + (apic-in-x2apic-mode nil)) (defconstant +ap-trampoline-physical-address+ #x7000 "Where the AP trampoline should be copied to in physical memory. @@ -216,10 +217,12 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (defun send-ipi (target type vector) (if *lapic-x2apic-mode* ;; x2APIC: single 64-bit WRMSR to MSR 0x830. + ;; Bits 14 (Level) and 15 (Trigger Mode) are reserved and must be + ;; zero in x2APIC mode (Intel SDM 10.12.9), so the #x4000 used by + ;; the xAPIC path below is deliberately not OR-ed in here. (setf (sys.int::msr +x2apic-msr-icr+) (logior (ash (logand target #xFFFFFFFF) 32) (ash type 8) - #x4000 ; edge triggered, assert, physical dest vector)) ;; xAPIC: two MMIO writes. (progn @@ -233,12 +236,12 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (defun send-ipi-to-all (type vector &key including-self) (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) - ;; x2APIC shorthand: use destination shorthand in ICR. + ;; x2APIC shorthand: the destination-shorthand field is ICR + ;; bits 19:18 -- 01=self, 10=all-including-self, 11=all-excluding-self. (let ((icr (logior (ash type 8) - #x4000 vector))) (setf (sys.int::msr +x2apic-msr-icr+) - (logior icr (ash (if including-self 0 3) 18)))) + (logior icr (ash (if including-self 2 3) 18)))) (dolist (cpu *cpus*) (when (and (eql (x86-64-cpu-state cpu) :online) (or including-self @@ -387,20 +390,19 @@ TLB shootdown must be protected by the VM lock." ;; Prevent migration during shootdown. (setf (cpu-inhibit-scheduling (local-cpu)) (1+ (cpu-inhibit-scheduling (local-cpu)))) - ;; Send IPIs to non-idle CPUs only. Idle CPUs flush lazily on wakeup. + ;; Send IPIs to every other online CPU. Idle CPUs are intentionally + ;; NOT skipped: an idle CPU may wake during the shootdown window, run a + ;; thread, and cache a stale translation before *current-tlb-generation* + ;; is bumped in FINISH-TLB-SHOOTDOWN. Receiving the IPI forces it to + ;; flush regardless of idle state. (The lazy generation check in + ;; CHECK-TLB-GENERATION-CONSISTENCY remains as a defence-in-depth for + ;; CPUs that were offline entirely during the shootdown.) (dolist (cpu *cpus*) (when (and (eql (x86-64-cpu-state cpu) :online) (not (eql cpu (local-cpu)))) - (if (cpu-idle-p cpu) - (progn - (setf (cpu-tlb-generation cpu) - (1- *current-tlb-generation*)) - (sys.int::%atomic-fixnum-add-symbol - '*busy-tlb-shootdown-cpus* -1)) - (progn - (incf *tlb-shootdown-n-targets*) - (send-ipi-to-cpu cpu +ipi-type-fixed+ - +tlb-shootdown-ipi-vector+))))) + (incf *tlb-shootdown-n-targets*) + (send-ipi-to-cpu cpu +ipi-type-fixed+ + +tlb-shootdown-ipi-vector+))) ;; Wait for other CPUs to reach the handler. (loop (when (eql *busy-tlb-shootdown-cpus* 0) @@ -437,11 +439,13 @@ TLB shootdown must be protected by the VM lock." (when *debug-magic-button-hold-variable* (magic-button-ipi-handler-1 interrupt-frame)) (sys.int::cpu-relax)) - ;; The initiating CPU may have done per-page invalidation; we still - ;; flush the local TLB here. Lazy optimization: skip if this CPU was - ;; idle (the context switch on wakeup will reload CR3). - (when (not (cpu-idle-p (local-cpu))) - (flush-tlb)) + ;; Flush unconditionally. Once this CPU received the shootdown IPI it + ;; must invalidate its TLB even if it happens to be idle at this + ;; instant: a CPU that went idle between the initiator's idle-p read + ;; and the IPI delivery would otherwise skip the flush while still + ;; stamping its TLB generation as current, leaving stale entries in + ;; place until the next context switch. + (flush-tlb) ;; Record that this CPU's TLB is up to date wrt the current generation. (setf (cpu-tlb-generation (local-cpu)) *current-tlb-generation*) (sys.int::%atomic-fixnum-add-symbol '*busy-tlb-shootdown-cpus* @@ -1124,8 +1128,9 @@ This is a one-shot timer and must be reset after firing." (t (setf *lapic-x2apic-mode* nil) (debug-print-line "x2APIC not supported, using xAPIC MMIO"))) - (lapic-setup) - (lapic-dump) + (setf (x86-64-cpu-apic-in-x2apic-mode *bsp-cpu*) *lapic-x2apic-mode*) + (lapic-setup) + (lapic-dump) (map-physical-memory-early +ap-trampoline-physical-address+ #x1000 "AP Bootstrap") (setf *initial-pml4* (generate-initial-pml4)) (copy-ap-trampoline #'%%ap-bootstrap '%%ap-entry-point +ap-trampoline-physical-address+ *initial-pml4*) @@ -1215,7 +1220,8 @@ This is a one-shot timer and must be reset after firing." :idle-p nil :inhibit-scheduling 0 :tlb-generation 0 - :timer-active nil))) + :timer-active nil + :apic-in-x2apic-mode *lapic-x2apic-mode*))) ;; Allocate per-CPU MCS node for spinlocks. (setf (cpu-mcs-node cpu) (%make-mcs-node)) (populate-cpu-info cpu @@ -1332,3 +1338,7 @@ This is a one-shot timer and must be reset after firing." (defun dma-write-barrier () (%mfence)) + +(defun cpu-memory-barrier () + "Full memory barrier for ordering lock data accesses." + (%mfence)) From 8f856ac3dab6a5c8fc248e33f5399c084c1fe9b3 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:29:21 +0500 Subject: [PATCH 20/42] supervisor: release global-thread-lock after the context switch and reclaim dead threads --- supervisor/sync.lisp | 121 ++++++++++++----------------- supervisor/thread.lisp | 140 ++++++++++++++++++++++++---------- supervisor/x86-64/thread.lisp | 27 +++---- 3 files changed, 161 insertions(+), 127 deletions(-) diff --git a/supervisor/sync.lisp b/supervisor/sync.lisp index 092d5f86..d6d37942 100644 --- a/supervisor/sync.lisp +++ b/supervisor/sync.lisp @@ -51,7 +51,7 @@ (defstruct (wait-queue (:area :wired)) (name nil) - (%lock (place-spinlock-initializer)) + (%lock :unlocked) (head nil) (tail nil)) @@ -87,9 +87,9 @@ (sys.int::defglobal *lock-violations-are-fatal* t) (defstruct (mutex - (:include wait-queue) - (:constructor make-mutex (&optional name)) - (:area :wired)) + (:include wait-queue) + (:constructor make-mutex (&optional name)) + (:area :wired)) ;; Thread holding the lock, or NIL if it is free. ;; May not be correct when the lock is being acquired/released. (owner nil) @@ -102,7 +102,12 @@ (state :unlocked) (stack-next nil) ;; Number of times ACQUIRE-MUTEX failed to immediately acquire the lock. - (contested-count 0 :type fixnum)) + (contested-count 0 :type fixnum) + ;; Priority inheritance (turnstile). + ;; Thread whose priority was boosted to prevent priority inversion. + (boosted-thread nil) + ;; Original priority of the boosted thread, for restoration on release. + (original-priority nil)) (defun acquire-mutex (mutex &optional (wait-p t)) (check-type mutex mutex) @@ -142,6 +147,18 @@ (setf (mutex-owner mutex) self) (unlock-wait-queue mutex) (return-from acquire-mutex-slow-path)) + ;; Priority inheritance: boost the holder if our priority is higher. + (let ((holder (mutex-owner mutex)) + (my-priority (thread-priority self))) + (when (and holder + (threadp holder) + (thread-priority-higher-p my-priority (thread-priority holder))) + ;; Record the boost if not already boosted. + (when (null (mutex-boosted-thread mutex)) + (setf (mutex-boosted-thread mutex) holder + (mutex-original-priority mutex) (thread-priority holder))) + ;; Elevate holder to our priority. + (setf (thread-priority holder) my-priority))) ;; Add to wait queue. Release will directly transfer ownership ;; to this thread. (push-wait-queue self mutex) @@ -191,6 +208,12 @@ ;; Contested lock. Need to wake a thread and pass the lock to it. (safe-without-interrupts (mutex) (with-wait-queue-lock (mutex) + ;; Restore priority of the boosted thread. + (let ((boosted (mutex-boosted-thread mutex))) + (when boosted + (setf (thread-priority boosted) (mutex-original-priority mutex)) + (setf (mutex-boosted-thread mutex) nil + (mutex-original-priority mutex) nil))) ;; Look for a thread to wake. (let ((thread (pop-wait-queue mutex))) (cond (thread @@ -206,6 +229,12 @@ (setf (mutex-owner mutex) nil) (when (not (eql (sys.int::cas (mutex-state mutex) :locked :unlocked) :locked)) ;; Mutex must be in the contested state. + ;; Restore priority of the boosted thread. + (let ((boosted (mutex-boosted-thread mutex))) + (when boosted + (setf (thread-priority boosted) (mutex-original-priority mutex)) + (setf (mutex-boosted-thread mutex) nil + (mutex-original-priority mutex) nil))) ;; Look for a thread to wake. (let ((thread (pop-wait-queue mutex))) (cond (thread @@ -406,7 +435,7 @@ May be used from an interrupt handler, assuming the associated mutex is interrup (:area :wired)) name (state +rw-lock-state-unlocked+ :type fixnum) - (lock (place-spinlock-initializer)) + (lock :unlocked) writer-wait-queue reader-wait-queue (n-pending-readers 0 :type fixnum) @@ -1081,7 +1110,7 @@ multiple threads." (buffer (error "no buffer supplied") :read-only t) (count) data-available - (lock (place-spinlock-initializer))) + (lock :unlocked)) (defun make-irq-fifo (size &key (element-type 't) name) ;; TODO: non-t element types. @@ -1157,62 +1186,12 @@ It is only possible for the second value to be false when wait-p is false." (irq-fifo-count fifo) 0) (setf (event-state (irq-fifo-data-available fifo)) nil)))) -;;; MCS queue-based spinlock. -;;; Fair spinlock where each CPU spins on its own cache line. -;;; Each CPU has a pre-allocated MCS node in the cpu struct. - -;; MCS queue-based spinlock. -;; Fair spinlock where each CPU spins on its own cache line. -;; Each CPU has a pre-allocated MCS node in the cpu struct. -;; -;; Implemented as macros so the place form is available for CAS expansion -;; at compile time (cross-compiler can't CAS on lexical variables). - -(defmacro acquire-mcs-spinlock (lock-place mcs-node) - "Acquire an MCS spinlock. LOCK-PLACE is a place (setf-able), -MCS-NODE is the current CPU's pre-allocated mcs-node struct." - `(progn - (setf (mcs-node-next ,mcs-node) nil - (mcs-node-locked ,mcs-node) nil) - ;; Atomic exchange: read old value and write our node. - (let* ((prev nil)) - (loop - (setf prev ,lock-place) - (when (eql (sys.int::cas ,lock-place prev ,mcs-node) prev) - (return))) - (if (null prev) - ;; No waiter, we are the holder. - (setf (mcs-node-locked ,mcs-node) t) - ;; There's a tail, chain ourselves after it. - (progn - (setf (mcs-node-next prev) ,mcs-node) - ;; Spin until the predecessor hands us the lock. - (loop until (mcs-node-locked ,mcs-node) - do (sys.int::cpu-relax))))))) - -(defmacro release-mcs-spinlock (lock-place mcs-node) - "Release an MCS spinlock acquired with ACQUIRE-MCS-SPINLOCK." - `(block release-mcs-spinlock - (if (null (mcs-node-next ,mcs-node)) - ;; No known successor. Try to nil out the lock word. - (if (eql (sys.int::cas ,lock-place ,mcs-node nil) ,mcs-node) - ;; CAS succeeded, no one is waiting. - (return-from release-mcs-spinlock) - ;; CAS failed, a successor has linked in between. - ;; Wait for the successor to appear. - (loop until (mcs-node-next ,mcs-node) - do (sys.int::cpu-relax)))) - ;; Pass the lock to the successor. - (setf (mcs-node-locked (mcs-node-next ,mcs-node)) t))) - -(defmacro with-mcs-spinlock ((place) &body body) - "Acquire the MCS spinlock at PLACE, execute BODY, then release." - (let ((node (gensym "MCS-NODE"))) - `(let ((,node (cpu-mcs-node (local-cpu)))) - (acquire-mcs-spinlock ,place ,node) - (unwind-protect - (progn ,@body) - (release-mcs-spinlock ,place ,node))))) +;;; The MCS queue-based spinlock primitives live in interrupts.lisp +;;; (ACQUIRE-MCS-SPINLOCK / RELEASE-MCS-SPINLOCK / WITH-MCS-SPINLOCK) and +;;; each CPU has a pre-allocated MCS node in its cpu struct. They are not +;;; currently used by *global-thread-lock*, which is released from the LAP +;;; context-switch trampolines and so stays a TATAS place spinlock; see +;;; thread.lisp. ;;; RCU primitives for lock-free read-side access. ;;; Each CPU has an rcu-nest counter in the cpu struct. @@ -1241,15 +1220,13 @@ MCS-NODE is the current CPU's pre-allocated mcs-node struct." "List of objects to be freed after an RCU grace period.") (defun rcu-synchronize () - "Wait for a grace period: all CPUs to pass through a quiescent state." - ;; For each CPU, set a flag and wait for it to context-switch. - ;; A simple approach: IPI every CPU and wait for each to acknowledge. - (dolist (cpu *cpus*) - (when (eql (x86-64-cpu-state cpu) :online) - ;; Send a quiesce IPI to this CPU. - (send-ipi (x86-64-cpu-apic-id cpu) +ipi-type-fixed+ +quiesce-ipi-vector+))) - ;; Yield to allow other CPUs to process. - (thread-yield)) + "Reclaim deferred thread deletions. +RCU readers (see ALL-THREADS / WITH-RCU-READ-LOCK) skip :dead threads +and thread objects are reclaimed by the GC once no reader references +them, so the deferred list can be drained without waiting for an +explicit grace period. The actual draining also happens opportunistically +on every thread exit inside THREAD-FINAL-CLEANUP." + (cleanup-dead-threads)) (defun call-with-rcu-synchronize (thunk) (rcu-synchronize) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 3c665e5f..80001e2f 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -9,7 +9,6 @@ (sys.int::defglobal *normal-priority-run-queue*) (sys.int::defglobal *low-priority-run-queue*) (sys.int::defglobal *all-threads*) -(sys.int::defglobal *n-running-cpus*) (sys.int::defglobal *world-stopper*) (sys.int::defglobal *pseudo-atomic-thread-count*) @@ -259,6 +258,14 @@ can be reprotected.") ;;; Locking. +;;; Note: *global-thread-lock* deliberately uses the TATAS place spinlock +;;; rather than the MCS queue spinlock. It is released from the LAP context +;;; switch trampolines (%%RESTORE-FULL-SAVE-THREAD / %%RESTORE-PARTIAL-SAVE- +;;; THREAD) after switching off the old thread's stack, where only a single +;;; store can be emitted; the MCS release protocol (CAS + spin + handoff to +;;; the next waiter) cannot be expressed there. The MCS infrastructure +;;; (acquire-mcs-spinlock etc.) remains available for locks that are never +;;; released from LAP. (defun acquire-global-thread-lock () (acquire-symbol-spinlock *global-thread-lock*)) @@ -416,15 +423,24 @@ Interrupts must be off and the global thread lock must be held." (defun maybe-preempt-via-interrupt (interrupt-frame) (let ((current (current-thread))) - (acquire-global-thread-lock) - (cond ((or (and *world-stopper* - (eql current *world-stopper*)) - (eql (thread-priority current) :supervisor) - (eql current (local-cpu-idle-thread))) - (release-global-thread-lock)) + (cond ((plusp (cpu-inhibit-scheduling (local-cpu))) + ;; Scheduling is inhibited on this CPU (e.g. a TLB shootdown + ;; or quiesce is in progress). Do not migrate the current + ;; thread away: TLB shootdown relies on the initiator staying + ;; put so its local INVLPG invalidates the CPU that did not + ;; receive an IPI (itself). Re-arm the timer so preemption + ;; resumes once the inhibitor is released. + (preemption-timer-reset *timeslice-length*)) (t - (setf (thread-state current) :runnable) - (%reschedule-via-interrupt interrupt-frame))))) + (acquire-global-thread-lock) + (cond ((or (and *world-stopper* + (eql current *world-stopper*)) + (eql (thread-priority current) :supervisor) + (eql current (local-cpu-idle-thread))) + (release-global-thread-lock)) + (t + (setf (thread-state current) :runnable) + (%reschedule-via-interrupt interrupt-frame))))))) (defun %%switch-to-thread-via-wired-stack (current-thread sp fp next-thread) ;; Save frame pointer. @@ -477,12 +493,11 @@ Interrupts must be off and the global thread lock must be held." (check-tlb-generation-consistency) ;; Restore FPU state. (restore-fpu-state new-thread) - ;; The global thread lock is dropped by the restore functions, not here. - ;; We are still running on the current (old) thread's stack, so cannot - ;; allow another CPU to switch on to it just yet. - ;; This can only occur when performing a voluntary switch away from - ;; a thread with a wired stack - one of the ephemeral supervisor threads. - ;; Check if the thread is full-save. + ;; On x86-64 the global thread lock is dropped by the assembly restore + ;; trampolines AFTER switching off this (the old thread's) stack, so that + ;; another CPU may safely resume the old thread without colliding on its + ;; wired stack. ARM64 still releases here from Lisp for now. + #+arm64 (release-global-thread-lock) (if (thread-full-save-p new-thread) (%%restore-full-save-thread new-thread) (%%restore-partial-save-thread new-thread))) @@ -557,7 +572,7 @@ Interrupts must be off and the global thread lock must be held." (setf (thread-full-save-p thread) t (thread-state thread) :runnable) (safe-without-interrupts (thread) - (with-symbol-spinlock (*global-thread-lock*) + (with-global-thread-lock nil (push-run-queue thread) ;; Add thread to global thread list. (setf (thread-global-prev *all-threads*) thread @@ -604,13 +619,17 @@ Interrupts must be off and the global thread lock must be held." (setf (event-state (thread-join-event self)) (or return-values :no-values)) (acquire-global-thread-lock) (setf (thread-state self) :dead) - ;; Remove thread from the global list. - (when (thread-global-next self) - (setf (thread-global-prev (thread-global-next self)) (thread-global-prev self))) - (when (thread-global-prev self) - (setf (thread-global-next (thread-global-prev self)) (thread-global-next self))) - (when (eql self *all-threads*) - (setf *all-threads* (thread-global-next self))) + ;; Defer physical removal from *all-threads*. RCU readers (see + ;; ALL-THREADS / WITH-RCU-READ-LOCK) skip :dead threads, and thread + ;; objects are reclaimed by the GC once unreferenced, so unlinking + ;; dead nodes is safe even while a reader is mid-traversal on a + ;; strongly-ordered architecture. + (push-wired self *rcu-deferred-list*) + ;; Drain immediately. This is the only writer of + ;; *rcu-deferred-list* and it runs under the global lock, so every + ;; thread exit leaves the list empty -- no unbounded growth and no + ;; need for a separate grace period in a GC'd runtime. + (%cleanup-dead-threads) (%reschedule-via-wired-stack sp fp)))) (defun thread-join (thread &optional (wait-p t)) @@ -666,18 +685,33 @@ not and WAIT-P is false." (%disable-interrupts))))))) (defun increment-n-running-cpus () - (let ((prev (sys.int::%atomic-fixnum-add-symbol '*n-running-cpus* 1))) - (when (zerop prev) - (set-run-light t)))) + (incf (cpu-running-threads (local-cpu))) + (when (eql (cpu-running-threads (local-cpu)) 1) + (set-run-light t))) (defun decrement-n-running-cpus () - (let ((prev (sys.int::%atomic-fixnum-add-symbol '*n-running-cpus* -1))) - (when (and (eql prev 1) - (boundp '*light-run*)) - ;; Clear the run light immediately so it doesn't stay on between - ;; GUI screen updates. + (decf (cpu-running-threads (local-cpu))) + (when (and (zerop (cpu-running-threads (local-cpu))) + (not (n-running-cpus>0-p))) + (when (boundp '*light-run*) (clear-light *light-run*)))) +(defun n-running-cpus>0-p () + (dolist (cpu *cpus*) + (when (plusp (cpu-running-threads cpu)) + (return t)))) + +(defun n-running-cpus () + (let ((total 0)) + (dolist (cpu *cpus* total) + (incf total (cpu-running-threads cpu))))) + +(defun thread-priority-higher-p (pri1 pri2) + "Return true if PRI1 has higher scheduling priority than PRI2." + (declare (type (member :idle :low :normal :high :supervisor) pri1 pri2)) + (let ((order '(:idle :low :normal :high :supervisor))) + (> (position pri1 order) (position pri2 order)))) + (defun make-ephemeral-thread (entry-point initial-state &key name (stack-size (* 256 1024)) (priority :normal)) (let* ((thread (%make-thread name)) (stack (%allocate-stack stack-size t))) @@ -761,7 +795,7 @@ not and WAIT-P is false." (defun initialize-threads () (when (not (boundp '*global-thread-lock*)) ;; First-run stuff. - (setf *global-thread-lock* :unlocked) + (setf *global-thread-lock* nil) (setf *supervisor-priority-run-queue* (make-run-queue :supervisor) *high-priority-run-queue* (make-run-queue :high) *normal-priority-run-queue* (make-run-queue :normal) @@ -780,7 +814,7 @@ not and WAIT-P is false." (thread-global-next sys.int::*disk-io-thread*) nil (thread-global-prev sys.int::*disk-io-thread*) sys.int::*pager-thread*) (setf *default-stack-size* (* 1024 1024))) - (setf *n-running-cpus* 1) + (setf (cpu-running-threads (local-cpu)) 1) (reset-ephemeral-thread sys.int::*bsp-idle-thread* #'idle-thread :runnable :idle) (reset-ephemeral-thread sys.int::*snapshot-thread* #'snapshot-thread :sleeping :supervisor) ;; Don't let the pager run until the paging disk has been found. @@ -791,7 +825,7 @@ not and WAIT-P is false." (defun wake-thread (thread) "Wake a sleeping thread." (without-interrupts - (with-symbol-spinlock (*global-thread-lock*) + (with-global-thread-lock () (wake-thread-1 thread)))) (defun wake-thread-1 (thread) @@ -831,11 +865,33 @@ not and WAIT-P is false." (panic "Initial thread woken??"))) (defun all-threads () - (do ((list '()) - (current *all-threads* (thread-global-next current))) - ((null current) - list) - (push current list))) + (with-rcu-read-lock + (do ((list '()) + (current *all-threads* (thread-global-next current))) + ((null current) + list) + (unless (eql (thread-state current) :dead) + (push current list))))) + +(defun %cleanup-dead-threads () + "Remove threads marked :dead from the *all-threads* list. +The global thread lock must be held." + (ensure-global-thread-lock-held) + (dolist (thread *rcu-deferred-list*) + (when (thread-global-next thread) + (setf (thread-global-prev (thread-global-next thread)) (thread-global-prev thread))) + (when (thread-global-prev thread) + (setf (thread-global-next (thread-global-prev thread)) (thread-global-next thread))) + (when (eql thread *all-threads*) + (setf *all-threads* (thread-global-next thread)))) + (setf *rcu-deferred-list* nil)) + +(defun cleanup-dead-threads () + "Remove threads marked :dead from the *all-threads* list." + (without-interrupts + (acquire-global-thread-lock) + (%cleanup-dead-threads) + (release-global-thread-lock))) (defun terminate-thread (thread) (establish-thread-foothold @@ -920,7 +976,7 @@ footholds will be reenabled, otherwise footholds will stay inhibited." (let ((wi (thread-wait-item thread))) (when (wait-queue-p wi) (lock-wait-queue wi)) - (with-symbol-spinlock (*global-thread-lock*) + (with-global-thread-lock nil (cond ((eql (thread-state thread) :sleeping) ;; Remove the thread from its associated wait-queue. (ensure (wait-queue-p wi) @@ -948,7 +1004,7 @@ footholds will be reenabled, otherwise footholds will stay inhibited." (let ((push-cons (sys.int::cons-in-area function nil :wired))) (flet ((push-foothold () (safe-without-interrupts (thread push-cons) - (with-symbol-spinlock (*global-thread-lock*) + (with-global-thread-lock nil (setf (cdr push-cons) (thread-pending-footholds thread) (thread-pending-footholds thread) push-cons))))) (cond ((eql thread (current-thread)) @@ -983,7 +1039,7 @@ footholds will be reenabled, otherwise footholds will stay inhibited." ;; allow the thread to settle. (defun sample-thread-state (thread) (safe-without-interrupts (thread) - (with-symbol-spinlock (*global-thread-lock*) + (with-global-thread-lock nil (thread-state thread)))) (defun stop-thread (thread) diff --git a/supervisor/x86-64/thread.lisp b/supervisor/x86-64/thread.lisp index 50dfff90..0e351640 100644 --- a/supervisor/x86-64/thread.lisp +++ b/supervisor/x86-64/thread.lisp @@ -42,15 +42,16 @@ (setf (sys.int::msr +msr-ia32-gs-base+) (sys.int::lisp-object-address thread))) (sys.int::define-lap-function %%restore-full-save-thread ((thread)) - ;; Drop the global thread lock. - ;; This must be done here, not in %%switch-to-thread-common to prevent - ;; another CPU from switching on to the old thread's stack while it is - ;; still in use. + ;; Returning to an interrupted thread. Restore saved registers and stuff. + (sys.lap-x86:lea64 :rsp (:object :r8 #.+thread-interrupt-save-area+)) + ;; Now switched off the old thread's stack, so it is safe to drop the + ;; global thread lock. Releasing here (rather than before the stack + ;; switch) means another CPU can resume the old thread without + ;; colliding on its wired/exception stack. r9/r10 are clobbered but + ;; restored by the pops below. (sys.lap-x86:mov64 :r9 (:symbol-global-cell *global-thread-lock*)) (sys.lap-x86:mov64 :r10 (:constant :unlocked)) (sys.lap-x86:mov64 (:object :r9 #.sys.int::+symbol-value-cell-value+) :r10) - ;; Returning to an interrupted thread. Restore saved registers and stuff. - (sys.lap-x86:lea64 :rsp (:object :r8 #.+thread-interrupt-save-area+)) (sys.lap-x86:pop :r15) (sys.lap-x86:pop :r14) (sys.lap-x86:pop :r13) @@ -69,17 +70,17 @@ (sys.lap-x86:iret)) (sys.int::define-lap-function %%restore-partial-save-thread ((thread)) - ;; Drop the global thread lock. - ;; This must be done here, not in %%switch-to-thread-common to prevent - ;; another CPU from switching on to the old thread's stack while it is - ;; still in use. - (sys.lap-x86:mov64 :r9 (:symbol-global-cell *global-thread-lock*)) - (sys.lap-x86:mov64 :r10 (:constant :unlocked)) - (sys.lap-x86:mov64 (:object :r9 #.sys.int::+symbol-value-cell-value+) :r10) ;; Restore stack pointer. (sys.lap-x86:mov64 :rsp (:object :r8 #.+thread-state-rsp+)) ;; Restore frame pointer. (sys.lap-x86:mov64 :rbp (:object :r8 #.+thread-state-rbp+)) + ;; Now switched off the old thread's (wired) stack, so it is safe to + ;; drop the global thread lock here -- another CPU may resume the old + ;; thread without colliding on its wired stack. r9/r10 are scratch + ;; (not part of the partial-save register set). + (sys.lap-x86:mov64 :r9 (:symbol-global-cell *global-thread-lock*)) + (sys.lap-x86:mov64 :r10 (:constant :unlocked)) + (sys.lap-x86:mov64 (:object :r9 #.sys.int::+symbol-value-cell-value+) :r10) ;; Reenable interrupts. Must be done before touching the thread stack. (sys.lap-x86:sti) (:gc :no-frame :layout #*0) From 7c1427046cd67d0b070eae379d09824f0a3f8231 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:29:25 +0500 Subject: [PATCH 21/42] arm64: allocate per-CPU mcs-node and barrier the MCS spinlock protocol --- supervisor/arm64/cpu.lisp | 8 ++++ supervisor/interrupts.lisp | 79 ++++++++++++++++++++++++++++++-- tools/cold-generator2/arm64.lisp | 6 ++- 3 files changed, 87 insertions(+), 6 deletions(-) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index 472ecdfb..70ed97d9 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -78,11 +78,18 @@ (defun local-cpu () (local-cpu-info)) +(defun cpu-memory-barrier () + "Full inner-shareable memory barrier for ordering lock data accesses." + (%dsb.ish)) + (defun initialize-cpu () (setf (arm64-cpu-cpu-id *bsp-cpu*) (fdt-boot-cpuid)) (setf (cpu-cpu-index *bsp-cpu*) 0) (setf (cpu-inhibit-scheduling *bsp-cpu*) 0) (setf (cpu-tlb-generation *bsp-cpu*) 0) + ;; Allocate MCS node for BSP if not already allocated by cold-generator. + (when (null (cpu-mcs-node *bsp-cpu*)) + (setf (cpu-mcs-node *bsp-cpu*) (%make-mcs-node))) (push-wired *bsp-cpu* *cpus*)) (sys.int::define-lap-function %el0-common () @@ -368,6 +375,7 @@ TLB shootdown must be protected by the VM lock." (setf (cpu-cpu-index cpu) (length *cpus*)) (setf (cpu-inhibit-scheduling cpu) 0) (setf (cpu-tlb-generation cpu) 0) + (setf (cpu-mcs-node cpu) (%make-mcs-node)) (setf (arm64-cpu-self cpu) cpu) (setf (sys.int::memref-unsigned-byte-64 (arm64-cpu-sp-el1 cpu)) (sys.int::lisp-object-address cpu)) diff --git a/supervisor/interrupts.lisp b/supervisor/interrupts.lisp index 964b655b..e9e5ca0c 100644 --- a/supervisor/interrupts.lisp +++ b/supervisor/interrupts.lisp @@ -40,6 +40,7 @@ RETURN-FROM/GO must not be used to leave this form." nil ,@captures)) +;;; TATAS (test-and-test-and-set) spinlocks -- general purpose, supports nesting. (defun place-spinlock-initializer () :unlocked) @@ -87,8 +88,6 @@ RETURN-FROM/GO must not be used to leave this form." `(let* (,@(mapcar #'list vars vals) (,new-sym :unlocked) (,old-sym ,read-form)) - ;; FIXME: This should use the write form but that doesn't have the proper - ;; release semantics yet on arm64. ,cas-form (values)))) @@ -121,6 +120,76 @@ RETURN-FROM/GO must not be used to leave this form." (check-type lock symbol) `(ensure-place-spinlock-held ,lock)) +;;; MCS (Mellor-Crummy-Scott) queue-based spinlocks -- fair, FIFO, each CPU +;;; spins on its own cache line. CANNOT be nested on the same CPU. +(eval-when (:compile-toplevel :load-toplevel :execute) + (defun mcs-cas-target (place) + "Convert a spinlock place to a form suitable for CAS. +Bare symbols become (sys.int::symbol-global-value 'SYM); +struct-accessor forms are returned as-is." + (if (symbolp place) + `(sys.int::symbol-global-value ',place) + place))) + +(defmacro acquire-mcs-spinlock (place) + "Acquire a spinlock using MCS fair queuing. +NOTE: do NOT nest MCS spinlock acquisitions on the same CPU." + (let ((mcs-node (gensym "MCS-NODE")) + (prev (gensym "PREV")) + (cas-target (mcs-cas-target place))) + `(let ((,mcs-node (cpu-mcs-node (local-cpu)))) + (ensure-interrupts-disabled) + (setf (mcs-node-next ,mcs-node) nil + (mcs-node-locked ,mcs-node) nil) + (let ((,prev nil)) + (loop + (setf ,prev ,place) + (when (eql (sys.int::cas ,cas-target ,prev ,mcs-node) ,prev) + (return))) + (if (null ,prev) + (setf (mcs-node-locked ,mcs-node) t) + (progn + (setf (mcs-node-next ,prev) ,mcs-node) + (loop until (mcs-node-locked ,mcs-node) + do (sys.int::cpu-relax)))) + ;; Acquire barrier: make sure protected-data reads are not + ;; reordered before the lock is observed held. Required on + ;; weakly-ordered ARM64 (x86-64 TSO folds this into the CAS). + (cpu-memory-barrier) + (values))))) + +(defmacro release-mcs-spinlock (place) + "Release an MCS spinlock." + (let ((mcs-node (gensym "MCS-NODE")) + (cas-target (mcs-cas-target place))) + `(let ((,mcs-node (cpu-mcs-node (local-cpu)))) + ;; Release barrier: make sure all protected-data stores from the + ;; critical section are visible before the handoff (or before the + ;; lock word goes to nil for the uncontended release). Required + ;; on weakly-ordered ARM64. + (cpu-memory-barrier) + (block release-mcs-spinlock + (if (null (mcs-node-next ,mcs-node)) + (if (eql (sys.int::cas ,cas-target ,mcs-node nil) ,mcs-node) + (return-from release-mcs-spinlock) + (loop until (mcs-node-next ,mcs-node) + do (sys.int::cpu-relax)))) + (setf (mcs-node-locked (mcs-node-next ,mcs-node)) t)) + (setf (mcs-node-locked ,mcs-node) nil) + (values)))) + +(defmacro with-mcs-spinlock ((place) &body body) + `(progn + (acquire-mcs-spinlock ,place) + (unwind-protect + (progn ,@body) + (release-mcs-spinlock ,place)))) + +(defmacro ensure-mcs-spinlock-held (place) + (declare (ignore place)) + `(ensure (mcs-node-locked (cpu-mcs-node (local-cpu))) + "Expected lock to be held by current CPU")) + (defmacro with-page-fault-hook (((&optional frame info fault-address) &body hook-body) &body body) (let ((old (gensym)) (frame (or frame (gensym "FRAME"))) @@ -228,7 +297,8 @@ RETURN-FROM/GO must not be used to leave this form." platform-number attachments (count 0) - (lock (place-spinlock-initializer))) + (lock :unlocked) +) (defstruct (irq-attachment (:area :wired)) @@ -313,7 +383,8 @@ RETURN-FROM/GO must not be used to leave this form." latch event (state :masked) - (lock (place-spinlock-initializer))) + (lock :unlocked) +) (defun make-simple-irq (irq-number &optional latch) (declare (mezzano.compiler::closure-allocation :wired)) diff --git a/tools/cold-generator2/arm64.lisp b/tools/cold-generator2/arm64.lisp index f236c980..41c25459 100644 --- a/tools/cold-generator2/arm64.lisp +++ b/tools/cold-generator2/arm64.lisp @@ -32,8 +32,10 @@ :name (env:translate-symbol environment 'sys.int::%%funcallable-instance-trampoline%%))) (setf (env:cross-symbol-value environment 'mezzano.supervisor::*bsp-wired-stack*) (env:make-stack environment (* 128 1024))) - (setf (env:cross-symbol-value environment 'mezzano.supervisor::*bsp-cpu*) - (env:make-structure environment 'mezzano.supervisor::arm64-cpu)) + (let ((bsp-cpu (env:make-structure environment 'mezzano.supervisor::arm64-cpu))) + (setf (env:structure-slot-value environment bsp-cpu 'mezzano.supervisor::mcs-node) + (env:make-structure environment 'mezzano.supervisor::mcs-node)) + (setf (env:cross-symbol-value environment 'mezzano.supervisor::*bsp-cpu*) bsp-cpu)) (setf (env:cross-symbol-value environment 'mezzano.supervisor::*arm64-exception-vector*) (env:compile-lap environment (loop repeat (/ (+ 2048 +exception-vector-alignment+) 8) ; for alignment From 17549b6b4937d6dc5c256143e91092b507f94395 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:32:07 +0500 Subject: [PATCH 22/42] supervisor: traverse *all-threads* under RCU and skip dead threads --- supervisor/debug.lisp | 14 ++++++++------ supervisor/profiler.lisp | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/supervisor/debug.lisp b/supervisor/debug.lisp index 10875a25..ea64466b 100644 --- a/supervisor/debug.lisp +++ b/supervisor/debug.lisp @@ -315,12 +315,14 @@ (defun dump-threads () (dump-thread (current-thread) (sys.int::read-frame-pointer)) (when (boundp '*all-threads*) - (do ((thread *all-threads* - (thread-global-next thread))) - ((null thread)) - (when (not (eql thread (current-thread))) - (debug-print-line "----------") - (dump-thread thread (thread-frame-pointer thread)))))) + (with-rcu-read-lock + (do ((thread *all-threads* + (thread-global-next thread))) + ((null thread)) + (when (and (not (eql thread (current-thread))) + (not (eql (thread-state thread) :dead))) + (debug-print-line "----------") + (dump-thread thread (thread-frame-pointer thread))))))) (defun debug-dump () (debug-print-line "Local CPU is " (local-cpu)) diff --git a/supervisor/profiler.lisp b/supervisor/profiler.lisp index be024c1c..89a2c487 100644 --- a/supervisor/profiler.lisp +++ b/supervisor/profiler.lisp @@ -92,18 +92,20 @@ ;; RIP is valid in the save area. (profile-append-return-address (thread-state-rip thread))) (profile-append-call-stack (thread-frame-pointer thread))))) - (t - (loop - for thread = *all-threads* then (thread-global-next thread) - until (not thread) do - (when (not (eql thread (current-thread))) + (t + (with-rcu-read-lock + (loop + for thread = *all-threads* then (thread-global-next thread) + until (not thread) do + (when (and (not (eql (thread-state thread) :dead)) + (not (eql thread (current-thread)))) (profile-append-entry thread) (profile-append-entry (thread-state thread)) (profile-append-entry (thread-wait-item thread)) (when (thread-full-save-p thread) ;; RIP is valid in the save area. (profile-append-return-address (thread-state-rip thread))) - (profile-append-call-stack (thread-frame-pointer thread)))))) + (profile-append-call-stack (thread-frame-pointer thread))))))) (resume-other-cpus-for-debug-magic-button))) (defun start-profiling (&key buffer-size thread (reset t) (sample-during-gc t)) From 8130877c8b20021e287cfd03962a1ed28db1b030 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:32:11 +0500 Subject: [PATCH 23/42] supervisor: initialize spinlocks with :unlocked directly --- supervisor/disk.lisp | 4 ++-- supervisor/entry.lisp | 2 +- supervisor/pager.lisp | 4 ++-- supervisor/time.lisp | 2 +- supervisor/x86-64/time.lisp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/supervisor/disk.lisp b/supervisor/disk.lisp index 3ce24c4b..71af4490 100644 --- a/supervisor/disk.lisp +++ b/supervisor/disk.lisp @@ -47,7 +47,7 @@ lba n-sectors buffer - (lock (place-spinlock-initializer)) + (lock :unlocked) (latch (make-event :name "Disk request notifier")) next) @@ -84,7 +84,7 @@ (setf *log-disk-requests* nil) (setf *disk-request-current* nil *disk-request-queue-head* nil - *disk-request-queue-lock* (place-spinlock-initializer) + *disk-request-queue-lock* :unlocked *disk-request-queue-latch* (make-event :name "Disk request queue notifier") *disks* '())) ;; Abort any queued or in-progress requests. diff --git a/supervisor/entry.lisp b/supervisor/entry.lisp index 4e51caea..8fbaac6b 100644 --- a/supervisor/entry.lisp +++ b/supervisor/entry.lisp @@ -154,7 +154,7 @@ (setf (sys.int::symbol-global-value 'mezzano.runtime::*active-catch-handlers*) 'nil (sys.int::symbol-global-value '*pseudo-atomic*) nil sys.int::*known-finalizers* nil - *big-wait-for-objects-lock* (place-spinlock-initializer))) + *big-wait-for-objects-lock* :unlocked)) (initialize-early-platform) (when (boundp '*boot-id*) (setf (event-state *boot-id*) t)) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index df312864..1cde1695 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -1125,10 +1125,10 @@ It will put the thread to sleep, while it waits for the page." (setf *pager-noisy* nil *pager-waiting-threads* '() *pager-current-thread* nil - *pager-lock* (place-spinlock-initializer) + *pager-lock* :unlocked *pager-fast-path-enabled* t *pager-lazy-block-allocation-enabled* t)) - (setf *page-replacement-list-lock* (place-spinlock-initializer) + (setf *page-replacement-list-lock* :unlocked *page-replacement-list-head* nil *page-replacement-list-tail* nil) (setf *pager-fast-path-hits* 0 diff --git a/supervisor/time.lisp b/supervisor/time.lisp index eb79892d..3650ab07 100644 --- a/supervisor/time.lisp +++ b/supervisor/time.lisp @@ -125,7 +125,7 @@ (:area :wired)) head tail - (lock (place-spinlock-initializer))) + (lock :unlocked)) (defstruct (timer (:constructor %make-timer) diff --git a/supervisor/x86-64/time.lisp b/supervisor/x86-64/time.lisp index 8c6ba9ab..4e93d1e7 100644 --- a/supervisor/x86-64/time.lisp +++ b/supervisor/x86-64/time.lisp @@ -72,7 +72,7 @@ HIGH-PRECISION-TIME-UNITS-TO-INTERNAL-TIME-UNITS." (defun initialize-platform-time () (when (not (boundp '*rtc-lock*)) - (setf *rtc-lock* (place-spinlock-initializer))) + (setf *rtc-lock* :unlocked)) (configure-pit-tick-rate 100) (irq-attach (platform-irq +pit-irq+) 'pit-irq-handler From 55aa42d82d4de71b5c2793d887ae528ed178eb37 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 22:32:18 +0500 Subject: [PATCH 24/42] arm64: drop the LAP global-thread-lock release --- supervisor/arm64/thread.lisp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/supervisor/arm64/thread.lisp b/supervisor/arm64/thread.lisp index fbb99035..404009af 100644 --- a/supervisor/arm64/thread.lisp +++ b/supervisor/arm64/thread.lisp @@ -86,13 +86,7 @@ (mezzano.lap.arm64:ret)) (sys.int::define-lap-function %%restore-full-save-thread ((thread)) - ;; Drop the global thread lock. - ;; This must be done here, not in %%switch-to-thread-common to prevent - ;; another CPU from switching on to the old thread's stack while it is - ;; still in use. - (mezzano.lap.arm64:ldr :x1 (:symbol-global-cell *global-thread-lock*)) - (mezzano.lap.arm64:ldr :x2 (:constant :unlocked)) - (mezzano.lap.arm64:str :x2 (:object :x1 #.sys.int::+symbol-value-cell-value+)) + ;; The global thread lock is released by Lisp code before calling this function. ;; Switch back to SP_EL0, restoring the original value of SP_EL1. (mezzano.lap.arm64:add :sp :x27 0) (mezzano.lap.arm64:msr :spsel 0) @@ -130,13 +124,7 @@ (mezzano.lap.arm64:eret)) (sys.int::define-lap-function %%restore-partial-save-thread ((thread)) - ;; Drop the global thread lock. - ;; This must be done here, not in %%switch-to-thread-common to prevent - ;; another CPU from switching on to the old thread's stack while it is - ;; still in use. - (mezzano.lap.arm64:ldr :x1 (:symbol-global-cell *global-thread-lock*)) - (mezzano.lap.arm64:ldr :x2 (:constant :unlocked)) - (mezzano.lap.arm64:str :x2 (:object :x1 #.sys.int::+symbol-value-cell-value+)) + ;; The global thread lock is released by Lisp code before calling this function. ;; Switch back to SP_EL0, restoring the original value of SP_EL1. (mezzano.lap.arm64:add :sp :x27 0) (mezzano.lap.arm64:msr :spsel 0) From f8a904839b7593d38b04538649a5520c542c19d7 Mon Sep 17 00:00:00 2001 From: iska Date: Sun, 14 Jun 2026 23:44:03 +0500 Subject: [PATCH 25/42] supervisor: initialize *global-thread-lock* to :unlocked for the TATAS spinlock --- supervisor/thread.lisp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 80001e2f..14eaea28 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -2,8 +2,9 @@ (in-package :mezzano.supervisor) -(sys.int::defglobal *global-thread-lock* nil - "This lock protects the special variables that make up the thread list/run queues and the thread objects.") +(sys.int::defglobal *global-thread-lock* :unlocked + "This lock protects the special variables that make up the thread list/run queues and the thread objects. +Free value is :UNLOCKED; held value is the owning CPU (a TATAS place spinlock).") (sys.int::defglobal *supervisor-priority-run-queue*) (sys.int::defglobal *high-priority-run-queue*) (sys.int::defglobal *normal-priority-run-queue*) @@ -795,7 +796,7 @@ not and WAIT-P is false." (defun initialize-threads () (when (not (boundp '*global-thread-lock*)) ;; First-run stuff. - (setf *global-thread-lock* nil) + (setf *global-thread-lock* :unlocked) (setf *supervisor-priority-run-queue* (make-run-queue :supervisor) *high-priority-run-queue* (make-run-queue :high) *normal-priority-run-queue* (make-run-queue :normal) From a62518aae959fd5f078b273d56b9fd6161c9320b Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 01:21:30 +0500 Subject: [PATCH 26/42] supervisor: pre-allocate RCU deferred cons before disabling interrupts --- supervisor/thread.lisp | 53 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 14eaea28..f5c7ac4c 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -609,29 +609,36 @@ Interrupts must be off and the global thread lock must be held." ;; This is seperate from thread-entry-trampoline so steppers can detect it. (defun thread-final-cleanup (return-values) - (%run-on-wired-stack-without-interrupts (sp fp return-values) - (let ((self (current-thread))) - ;; FIXME: This should be done with the global lock held, but that makes - ;; the lock ordering incorrect in (setf event-state). - ;; (setf event-state) expects to be called with the thread lock released. - ;; This leaves a small race window between the thread's join event - ;; being set and the thread state being set to dead, but this is only - ;; visible on SMP as interrupts are disabled here. - (setf (event-state (thread-join-event self)) (or return-values :no-values)) - (acquire-global-thread-lock) - (setf (thread-state self) :dead) - ;; Defer physical removal from *all-threads*. RCU readers (see - ;; ALL-THREADS / WITH-RCU-READ-LOCK) skip :dead threads, and thread - ;; objects are reclaimed by the GC once unreferenced, so unlinking - ;; dead nodes is safe even while a reader is mid-traversal on a - ;; strongly-ordered architecture. - (push-wired self *rcu-deferred-list*) - ;; Drain immediately. This is the only writer of - ;; *rcu-deferred-list* and it runs under the global lock, so every - ;; thread exit leaves the list empty -- no unbounded growth and no - ;; need for a separate grace period in a GC'd runtime. - (%cleanup-dead-threads) - (%reschedule-via-wired-stack sp fp)))) + ;; Pre-allocate the cons cell for *rcu-deferred-list* before entering + ;; the interrupt-disabled context. Wired allocation requires interrupts + ;; to be enabled (it takes pseudo-atomic), but + ;; %RUN-ON-WIRED-STACK-WITHOUT-INTERRUPTS disables them. + (let ((deferred-cons (sys.int::cons-in-area nil nil :wired))) + (%run-on-wired-stack-without-interrupts (sp fp return-values) + (let ((self (current-thread))) + ;; FIXME: This should be done with the global lock held, but that makes + ;; the lock ordering incorrect in (setf event-state). + ;; (setf event-state) expects to be called with the thread lock released. + ;; This leaves a small race window between the thread's join event + ;; being set and the thread state being set to dead, but this is only + ;; visible on SMP as interrupts are disabled here. + (setf (event-state (thread-join-event self)) (or return-values :no-values)) + (acquire-global-thread-lock) + (setf (thread-state self) :dead) + ;; Defer physical removal from *all-threads*. RCU readers (see + ;; ALL-THREADS / WITH-RCU-READ-LOCK) skip :dead threads, and thread + ;; objects are reclaimed by the GC once unreferenced, so unlinking + ;; dead nodes is safe even while a reader is mid-traversal on a + ;; strongly-ordered architecture. + (setf (car deferred-cons) self + (cdr deferred-cons) *rcu-deferred-list* + *rcu-deferred-list* deferred-cons) + ;; Drain immediately. This is the only writer of + ;; *rcu-deferred-list* and it runs under the global lock, so every + ;; thread exit leaves the list empty -- no unbounded growth and no + ;; need for a separate grace period in a GC'd runtime. + (%cleanup-dead-threads) + (%reschedule-via-wired-stack sp fp))))) (defun thread-join (thread &optional (wait-p t)) "Wait for THREAD to exit. From 005cf9cb9242adc5420a09c5c6a19a1d8a20826f Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 14:24:12 +0500 Subject: [PATCH 27/42] fixes to TLB and MCS --- supervisor/interrupts.lisp | 4 ++-- supervisor/sync.lisp | 8 ++++---- supervisor/thread.lisp | 34 ++++++++++++++-------------------- supervisor/x86-64/cpu.lisp | 35 +++++++++++++++++++---------------- supervisor/x86-64/thread.lisp | 20 ++++---------------- 5 files changed, 43 insertions(+), 58 deletions(-) diff --git a/supervisor/interrupts.lisp b/supervisor/interrupts.lisp index e9e5ca0c..965c4d68 100644 --- a/supervisor/interrupts.lisp +++ b/supervisor/interrupts.lisp @@ -155,7 +155,7 @@ NOTE: do NOT nest MCS spinlock acquisitions on the same CPU." ;; Acquire barrier: make sure protected-data reads are not ;; reordered before the lock is observed held. Required on ;; weakly-ordered ARM64 (x86-64 TSO folds this into the CAS). - (cpu-memory-barrier) + #+arm64 (cpu-memory-barrier) (values))))) (defmacro release-mcs-spinlock (place) @@ -167,7 +167,7 @@ NOTE: do NOT nest MCS spinlock acquisitions on the same CPU." ;; critical section are visible before the handoff (or before the ;; lock word goes to nil for the uncontended release). Required ;; on weakly-ordered ARM64. - (cpu-memory-barrier) + #+arm64 (cpu-memory-barrier) (block release-mcs-spinlock (if (null (mcs-node-next ,mcs-node)) (if (eql (sys.int::cas ,cas-target ,mcs-node nil) ,mcs-node) diff --git a/supervisor/sync.lisp b/supervisor/sync.lisp index d6d37942..d81e5852 100644 --- a/supervisor/sync.lisp +++ b/supervisor/sync.lisp @@ -1188,10 +1188,10 @@ It is only possible for the second value to be false when wait-p is false." ;;; The MCS queue-based spinlock primitives live in interrupts.lisp ;;; (ACQUIRE-MCS-SPINLOCK / RELEASE-MCS-SPINLOCK / WITH-MCS-SPINLOCK) and -;;; each CPU has a pre-allocated MCS node in its cpu struct. They are not -;;; currently used by *global-thread-lock*, which is released from the LAP -;;; context-switch trampolines and so stays a TATAS place spinlock; see -;;; thread.lisp. +;;; each CPU has a pre-allocated MCS node in its cpu struct. +;;; *global-thread-lock* uses MCS; it is released from Lisp in +;;; %%SWITCH-TO-THREAD-COMMON (on the kernel wired stack) before the LAP +;;; restore trampolines switch to the new thread's stack. ;;; RCU primitives for lock-free read-side access. ;;; Each CPU has an rcu-nest counter in the cpu struct. diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index f5c7ac4c..5e09236a 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -2,9 +2,9 @@ (in-package :mezzano.supervisor) -(sys.int::defglobal *global-thread-lock* :unlocked +(sys.int::defglobal *global-thread-lock* nil "This lock protects the special variables that make up the thread list/run queues and the thread objects. -Free value is :UNLOCKED; held value is the owning CPU (a TATAS place spinlock).") +Free value is NIL; held/contended value is an MCS node chain (MCS queue spinlock).") (sys.int::defglobal *supervisor-priority-run-queue*) (sys.int::defglobal *high-priority-run-queue*) (sys.int::defglobal *normal-priority-run-queue*) @@ -259,26 +259,22 @@ can be reprotected.") ;;; Locking. -;;; Note: *global-thread-lock* deliberately uses the TATAS place spinlock -;;; rather than the MCS queue spinlock. It is released from the LAP context -;;; switch trampolines (%%RESTORE-FULL-SAVE-THREAD / %%RESTORE-PARTIAL-SAVE- -;;; THREAD) after switching off the old thread's stack, where only a single -;;; store can be emitted; the MCS release protocol (CAS + spin + handoff to -;;; the next waiter) cannot be expressed there. The MCS infrastructure -;;; (acquire-mcs-spinlock etc.) remains available for locks that are never -;;; released from LAP. +;;; *global-thread-lock* uses the per-CPU MCS queue spinlock for fair +;;; FIFO acquisition and local-cache-line spinning. It is released from +;;; Lisp code in %%SWITCH-TO-THREAD-COMMON (on the kernel wired stack) +;;; before the LAP restore trampolines switch to the new thread's stack. (defun acquire-global-thread-lock () - (acquire-symbol-spinlock *global-thread-lock*)) + (acquire-mcs-spinlock *global-thread-lock*)) (defun release-global-thread-lock () - (release-symbol-spinlock *global-thread-lock*)) + (release-mcs-spinlock *global-thread-lock*)) (defmacro with-global-thread-lock ((&optional) &body body) - `(with-symbol-spinlock (*global-thread-lock*) + `(with-mcs-spinlock (*global-thread-lock*) ,@body)) (defun ensure-global-thread-lock-held () - (ensure-symbol-spinlock-held *global-thread-lock*)) + (ensure-mcs-spinlock-held *global-thread-lock*)) ;;; Run queue management. @@ -494,11 +490,9 @@ Interrupts must be off and the global thread lock must be held." (check-tlb-generation-consistency) ;; Restore FPU state. (restore-fpu-state new-thread) - ;; On x86-64 the global thread lock is dropped by the assembly restore - ;; trampolines AFTER switching off this (the old thread's) stack, so that - ;; another CPU may safely resume the old thread without colliding on its - ;; wired stack. ARM64 still releases here from Lisp for now. - #+arm64 (release-global-thread-lock) + ;; The global thread lock is released here, on the kernel wired stack, + ;; before the LAP restore trampolines switch to the new thread's stack. + (release-global-thread-lock) (if (thread-full-save-p new-thread) (%%restore-full-save-thread new-thread) (%%restore-partial-save-thread new-thread))) @@ -803,7 +797,7 @@ not and WAIT-P is false." (defun initialize-threads () (when (not (boundp '*global-thread-lock*)) ;; First-run stuff. - (setf *global-thread-lock* :unlocked) + (setf *global-thread-lock* nil) (setf *supervisor-priority-run-queue* (make-run-queue :supervisor) *high-priority-run-queue* (make-run-queue :high) *normal-priority-run-queue* (make-run-queue :normal) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index b6532e5b..73201daf 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -371,13 +371,19 @@ If the CPU is idle, this will cause it to check for new threads." (sys.int::defglobal *current-tlb-generation*) (defun check-tlb-generation-consistency () - "If this CPU missed a TLB shootdown while idle, flush now." + "If this CPU missed a TLB shootdown while idle, flush now. +If a shootdown is still in progress, flush but do not stamp the +generation — the context-switch path will re-check after the shootdown +finishes and the final generation is known." (when (and (boundp '*current-tlb-generation*) - (not (eql (cpu-tlb-generation (local-cpu)) - *current-tlb-generation*))) + (boundp '*tlb-shootdown-in-progress*) + (or *tlb-shootdown-in-progress* + (not (eql (cpu-tlb-generation (local-cpu)) + *current-tlb-generation*)))) (flush-tlb) - (setf (cpu-tlb-generation (local-cpu)) - *current-tlb-generation*))) + (unless *tlb-shootdown-in-progress* + (setf (cpu-tlb-generation (local-cpu)) + *current-tlb-generation*)))) (defun begin-tlb-shootdown () "Bring all CPUs to state ready for TLB shootdown. @@ -385,18 +391,17 @@ TLB shootdown must be protected by the VM lock." (ensure (rw-lock-write-held-p *vm-lock*) "VM lock not held when doing TLB shootdown!") (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown already in progress!") (setf *tlb-shootdown-in-progress* t) - (setf *busy-tlb-shootdown-cpus* (1- *n-up-cpus*)) - (setf *tlb-shootdown-n-targets* 0) + ;; Bump the generation before sending IPIs. If an idle CPU wakes + ;; during the shootdown window, check-tlb-generation-consistency will + ;; see the mismatch and flush before resuming a thread. (Device IRQs + ;; prevent us from skipping IPIs entirely: an idle CPU still handles + ;; device interrupts that touch pageable memory.) + (sys.int::%atomic-fixnum-add-symbol '*current-tlb-generation* 1) ;; Prevent migration during shootdown. (setf (cpu-inhibit-scheduling (local-cpu)) (1+ (cpu-inhibit-scheduling (local-cpu)))) - ;; Send IPIs to every other online CPU. Idle CPUs are intentionally - ;; NOT skipped: an idle CPU may wake during the shootdown window, run a - ;; thread, and cache a stale translation before *current-tlb-generation* - ;; is bumped in FINISH-TLB-SHOOTDOWN. Receiving the IPI forces it to - ;; flush regardless of idle state. (The lazy generation check in - ;; CHECK-TLB-GENERATION-CONSISTENCY remains as a defence-in-depth for - ;; CPUs that were offline entirely during the shootdown.) + (setf *busy-tlb-shootdown-cpus* (1- *n-up-cpus*)) + (setf *tlb-shootdown-n-targets* 0) (dolist (cpu *cpus*) (when (and (eql (x86-64-cpu-state cpu) :online) (not (eql cpu (local-cpu)))) @@ -411,8 +416,6 @@ TLB shootdown must be protected by the VM lock." (defun finish-tlb-shootdown () (ensure *tlb-shootdown-in-progress*) - ;; Bump the global generation so idle CPUs know to flush on wakeup. - (sys.int::%atomic-fixnum-add-symbol '*current-tlb-generation* 1) ;; Only wait for CPUs that actually received the IPI in begin-tlb-shootdown. (setf *busy-tlb-shootdown-cpus* *tlb-shootdown-n-targets*) (setf *tlb-shootdown-in-progress* nil) diff --git a/supervisor/x86-64/thread.lisp b/supervisor/x86-64/thread.lisp index 0e351640..75c7bb9f 100644 --- a/supervisor/x86-64/thread.lisp +++ b/supervisor/x86-64/thread.lisp @@ -42,16 +42,9 @@ (setf (sys.int::msr +msr-ia32-gs-base+) (sys.int::lisp-object-address thread))) (sys.int::define-lap-function %%restore-full-save-thread ((thread)) - ;; Returning to an interrupted thread. Restore saved registers and stuff. + ;; The global thread lock is released by Lisp code before calling + ;; this function. Returning to an interrupted thread. (sys.lap-x86:lea64 :rsp (:object :r8 #.+thread-interrupt-save-area+)) - ;; Now switched off the old thread's stack, so it is safe to drop the - ;; global thread lock. Releasing here (rather than before the stack - ;; switch) means another CPU can resume the old thread without - ;; colliding on its wired/exception stack. r9/r10 are clobbered but - ;; restored by the pops below. - (sys.lap-x86:mov64 :r9 (:symbol-global-cell *global-thread-lock*)) - (sys.lap-x86:mov64 :r10 (:constant :unlocked)) - (sys.lap-x86:mov64 (:object :r9 #.sys.int::+symbol-value-cell-value+) :r10) (sys.lap-x86:pop :r15) (sys.lap-x86:pop :r14) (sys.lap-x86:pop :r13) @@ -70,17 +63,12 @@ (sys.lap-x86:iret)) (sys.int::define-lap-function %%restore-partial-save-thread ((thread)) + ;; The global thread lock is released by Lisp code before calling + ;; this function. ;; Restore stack pointer. (sys.lap-x86:mov64 :rsp (:object :r8 #.+thread-state-rsp+)) ;; Restore frame pointer. (sys.lap-x86:mov64 :rbp (:object :r8 #.+thread-state-rbp+)) - ;; Now switched off the old thread's (wired) stack, so it is safe to - ;; drop the global thread lock here -- another CPU may resume the old - ;; thread without colliding on its wired stack. r9/r10 are scratch - ;; (not part of the partial-save register set). - (sys.lap-x86:mov64 :r9 (:symbol-global-cell *global-thread-lock*)) - (sys.lap-x86:mov64 :r10 (:constant :unlocked)) - (sys.lap-x86:mov64 (:object :r9 #.sys.int::+symbol-value-cell-value+) :r10) ;; Reenable interrupts. Must be done before touching the thread stack. (sys.lap-x86:sti) (:gc :no-frame :layout #*0) From 3d60b904fb920adcde92423a0f9cc4758339c5b0 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 17:17:09 +0500 Subject: [PATCH 28/42] x86-64: proper lazy TLB shootdown, cleanup --- supervisor/acpi.lisp | 15 ------------ supervisor/sync.lisp | 14 ----------- supervisor/x86-64/cpu.lisp | 39 +++++++++++++++---------------- supervisor/x86-64/interrupts.lisp | 3 +++ supervisor/x86-64/io-apic.lisp | 3 +++ 5 files changed, 25 insertions(+), 49 deletions(-) diff --git a/supervisor/acpi.lisp b/supervisor/acpi.lisp index 6c5b0a1e..8506c0c6 100644 --- a/supervisor/acpi.lisp +++ b/supervisor/acpi.lisp @@ -511,21 +511,6 @@ (debug-print-line " Table " i " " (svref tables i) " " (acpi-table-header-signature header)))) (setf *acpi* tables)))))) -(defun acpi-find-interrupt-source-override (source-irq) - "Find the GSI for a legacy ISA IRQ from MADT interrupt source overrides. -Returns the GSI number if an override exists, or SOURCE-IRQ if not found." - (let ((madt (acpi-get-table 'acpi-madt-table-p))) - (when madt - (dotimes (i (sys.int::simple-vector-length - (acpi-madt-table-controllers madt))) - (let ((entry (svref (acpi-madt-table-controllers madt) i))) - (when (and (acpi-madt-interrupt-source-override-p entry) - (eql (acpi-madt-interrupt-source-override-source entry) - source-irq)) - (return-from acpi-find-interrupt-source-override - (acpi-madt-interrupt-source-override-global-system-interrupt entry)))))) - source-irq)) - (defun acpi-get-table (predicate) (cond (*acpi* diff --git a/supervisor/sync.lisp b/supervisor/sync.lisp index d81e5852..114adcaf 100644 --- a/supervisor/sync.lisp +++ b/supervisor/sync.lisp @@ -1237,20 +1237,6 @@ on every thread exit inside THREAD-FINAL-CLEANUP." "Execute BODY after an RCU grace period." `(call-with-rcu-synchronize (lambda () ,@body))) -;;; Per-CPU counter helpers. -;;; Define a set of INC/DEC/READ functions for a per-CPU slot. - -(defmacro define-percpu-counter (name slot) - `(progn - (defun ,(intern (format nil "INC-~A" name)) () - (setf (,slot (local-cpu)) (1+ (,slot (local-cpu))))) - (defun ,(intern (format nil "DEC-~A" name)) () - (setf (,slot (local-cpu)) (1- (,slot (local-cpu))))) - (defun ,(intern (format nil "READ-~A" name)) () - (let ((total 0)) - (dolist (cpu *cpus* total) - (incf total (,slot cpu))))))) - (defun initialize-sync (first-run-p) (when first-run-p (setf *watcher-watcher-pool* diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 73201daf..8de46a82 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -203,7 +203,8 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash (1+ register) 4))) (ldb (byte 32 32) value))))) -;; Legacy aliases for gradual migration. +;; Mode-dispatching LAPIC accessors, retaining the original lapic-reg +;; names to avoid a wholesale rename of every call site. (defun lapic-reg (register) (read-lapic register)) @@ -391,24 +392,32 @@ TLB shootdown must be protected by the VM lock." (ensure (rw-lock-write-held-p *vm-lock*) "VM lock not held when doing TLB shootdown!") (ensure (not *tlb-shootdown-in-progress*) "TLB shootdown already in progress!") (setf *tlb-shootdown-in-progress* t) - ;; Bump the generation before sending IPIs. If an idle CPU wakes - ;; during the shootdown window, check-tlb-generation-consistency will - ;; see the mismatch and flush before resuming a thread. (Device IRQs - ;; prevent us from skipping IPIs entirely: an idle CPU still handles - ;; device interrupts that touch pageable memory.) + ;; Bump the generation before sending IPIs so idle CPUs that wake + ;; during the shootdown window will see the mismatch and flush on + ;; their next context switch. Idle CPUs are skipped — if they + ;; handle a device IRQ mid-shootdown the IRQ handler calls + ;; check-tlb-generation-consistency to flush before touching + ;; pageable memory. (sys.int::%atomic-fixnum-add-symbol '*current-tlb-generation* 1) ;; Prevent migration during shootdown. (setf (cpu-inhibit-scheduling (local-cpu)) (1+ (cpu-inhibit-scheduling (local-cpu)))) + ;; Initialise *busy-tlb-shootdown-cpus* to (n_cpus - 1) BEFORE any + ;; IPI so the handler's decrement always has a bound value. Then + ;; send IPIs to non-idle CPUs and decrement the counter for idle + ;; CPUs we skip. (setf *busy-tlb-shootdown-cpus* (1- *n-up-cpus*)) (setf *tlb-shootdown-n-targets* 0) (dolist (cpu *cpus*) (when (and (eql (x86-64-cpu-state cpu) :online) (not (eql cpu (local-cpu)))) - (incf *tlb-shootdown-n-targets*) - (send-ipi-to-cpu cpu +ipi-type-fixed+ - +tlb-shootdown-ipi-vector+))) - ;; Wait for other CPUs to reach the handler. + (if (cpu-idle-p cpu) + (sys.int::%atomic-fixnum-add-symbol '*busy-tlb-shootdown-cpus* -1) + (progn + (incf *tlb-shootdown-n-targets*) + (send-ipi-to-cpu cpu +ipi-type-fixed+ + +tlb-shootdown-ipi-vector+))))) + ;; Wait for targeted CPUs to reach the handler. (loop (when (eql *busy-tlb-shootdown-cpus* 0) (return)) @@ -1101,12 +1110,6 @@ This is a one-shot timer and must be reset after firing." (sys.int::msr #x802) (ldb (byte 8 24) (read-lapic +lapic-reg-id+)))) -(defun send-self-ipi (vector) - (check-type vector (unsigned-byte 8)) - (if *lapic-x2apic-mode* - (setf (sys.int::msr +x2apic-msr-self-ipi+) vector) - (send-ipi (read-local-apic-id) +ipi-type-fixed+ vector))) - (defun initialize-early-cpu () (setf *lapic-address* nil)) @@ -1341,7 +1344,3 @@ This is a one-shot timer and must be reset after firing." (defun dma-write-barrier () (%mfence)) - -(defun cpu-memory-barrier () - "Full memory barrier for ordering lock data accesses." - (%mfence)) diff --git a/supervisor/x86-64/interrupts.lisp b/supervisor/x86-64/interrupts.lisp index fc9a9b91..ca8b4747 100644 --- a/supervisor/x86-64/interrupts.lisp +++ b/supervisor/x86-64/interrupts.lisp @@ -272,6 +272,9 @@ If clear, the fault occured in supervisor mode.") (debug-print-line "Spurious i8259 IRQ " irq ". Further spurious IRQs will not be reported.")) (incf *i8259-spurious-interrupt-count*) (return-from i8259-interrupt-handler))) + ;; If this CPU was idle during a TLB shootdown and missed the IPI, + ;; flush now before the IRQ handler touches any pageable memory. + (check-tlb-generation-consistency) (irq-deliver interrupt-frame (svref *i8259-irqs* irq)) (with-symbol-spinlock (*i8259-spinlock*) ;; Send EOI. diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index ff87bb1f..3243d7e7 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -220,6 +220,9 @@ nil))) (defun io-apic-interrupt-handler (interrupt-frame info) + ;; If this CPU was idle during a TLB shootdown and missed the IPI, + ;; flush now before the IRQ handler touches any pageable memory. + (check-tlb-generation-consistency) (let ((gsi (vector->gsi info))) (when (and (<= 0 gsi) (< gsi 256)) (irq-deliver interrupt-frame (svref *io-apic-irqs* gsi))) From 7579cbf082b0d69635ed31b736935ecd9fe2f110 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 19:13:15 +0500 Subject: [PATCH 29/42] x86-64: add send-self-ipi, make broadcast-ipi x2APIC-aware, fix magic-button to use broadcast-ipi --- supervisor/x86-64/cpu.lisp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 8de46a82..973a5600 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -230,7 +230,16 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (setf (lapic-reg +lapic-reg-interrupt-command-high+) (ash target 24)) (setf (lapic-reg +lapic-reg-interrupt-command-low+) (logior #x4000 (ash type 8) - vector))))) + vector))))) + +(defun send-self-ipi (vector) + "Send an IPI to the local CPU. +In x2APIC mode uses the Self IPI MSR (0x83F), a single WRMSR. +In xAPIC mode falls back to a regular ICR write targeting self." + (check-type vector (unsigned-byte 8)) + (if *lapic-x2apic-mode* + (setf (sys.int::msr +x2apic-msr-self-ipi+) vector) + (send-ipi (read-local-apic-id) +ipi-type-fixed+ vector))) (defun send-ipi-to-cpu (cpu type vector) (send-ipi (x86-64-cpu-apic-id cpu) type vector)) @@ -252,8 +261,10 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (defun broadcast-ipi (type vector &optional including-self) (when (and (boundp '*lapic-address*) *lapic-address*) - (safe-without-interrupts (type vector including-self) - (send-ipi-to-all type vector :including-self including-self)))) + (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) + (send-ipi-to-all type vector :including-self including-self) + (safe-without-interrupts (type vector including-self) + (send-ipi-to-all type vector :including-self including-self))))) (defun broadcast-wakeup-ipi () (when (and (boundp '*lapic-address*) @@ -328,7 +339,7 @@ Protected by the world stop lock." (defun stop-other-cpus-for-debug-magic-button () (setf *debug-magic-button-ready-variable* (1- *n-up-cpus*) *debug-magic-button-hold-variable* t) - (send-ipi-to-all +ipi-type-fixed+ +quiesce-ipi-vector+) + (broadcast-ipi +ipi-type-fixed+ +quiesce-ipi-vector+) ;; Wait for other CPUs to arrive, this ensures the thread state is actually ;; consistent. (loop until (eql *debug-magic-button-ready-variable* 0))) From 7407a3ebd3281115d92e7ac5c60fd5c9cbbf2c8f Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 19:30:58 +0500 Subject: [PATCH 30/42] x86-64, arm64: inhibit scheduling during quiesce-cpus-for-world-stop, remove stale TODO --- supervisor/arm64/cpu.lisp | 8 ++++++-- supervisor/x86-64/cpu.lisp | 9 ++++++--- supervisor/x86-64/interrupts.lisp | 1 - 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index 70ed97d9..a766b16f 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -252,17 +252,21 @@ (sys.int::defglobal *non-quiescent-cpus-remaining*) -;; FIXME: quiesce-cpus-for-world-stop needs to prevent migration across CPUs. (defun quiesce-cpus-for-world-stop () "Bring all CPUs to a consistent state to stop the world. Protected by the world stop lock." + ;; Prevent migration during the broadcast and busy-wait. + (setf (cpu-inhibit-scheduling (local-cpu)) + (1+ (cpu-inhibit-scheduling (local-cpu)))) (setf *non-quiescent-cpus-remaining* (1- *n-up-cpus*)) (send-ipi-to-all +quiesce-sgi-id+) ;; FIXME: Use WFE/SEV instead of this spin-loop. (loop (when (eql *non-quiescent-cpus-remaining* 0) (return)) - (sys.int::cpu-relax))) + (sys.int::cpu-relax)) + (setf (cpu-inhibit-scheduling (local-cpu)) + (max 0 (1- (cpu-inhibit-scheduling (local-cpu)))))) ;; Save the current thread's state and switch to the CPU's idle thread. (defun quiesce-ipi-handler (interrupt-frame) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 973a5600..d66a702b 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -288,17 +288,20 @@ In xAPIC mode falls back to a regular ICR write targeting self." (sys.int::defglobal *non-quiescent-cpus-remaining*) -;; FIXME: quiesce-cpus-for-world-stop and begin-tlb-shootdown both need to -;; prevent migration across CPUs. (defun quiesce-cpus-for-world-stop () "Bring all CPUs to a consistent state to stop the world. Protected by the world stop lock." + ;; Prevent migration during the broadcast and busy-wait. + (setf (cpu-inhibit-scheduling (local-cpu)) + (1+ (cpu-inhibit-scheduling (local-cpu)))) (setf *non-quiescent-cpus-remaining* (1- *n-up-cpus*)) (broadcast-ipi +ipi-type-fixed+ +quiesce-ipi-vector+) (loop (when (eql *non-quiescent-cpus-remaining* 0) (return)) - (sys.int::cpu-relax))) + (sys.int::cpu-relax)) + (setf (cpu-inhibit-scheduling (local-cpu)) + (max 0 (1- (cpu-inhibit-scheduling (local-cpu)))))) ;; Save the current thread's state and switch to the CPU's idle thread. (defun quiesce-ipi-handler (interrupt-frame info) diff --git a/supervisor/x86-64/interrupts.lisp b/supervisor/x86-64/interrupts.lisp index ca8b4747..a19d6526 100644 --- a/supervisor/x86-64/interrupts.lisp +++ b/supervisor/x86-64/interrupts.lisp @@ -306,7 +306,6 @@ If clear, the fault occured in supervisor mode.") (setf (sys.int::io-port/8 #xA1) (ldb (byte 8 8) *i8259-shadow-mask*))))))) (defun initialize-i8259 () - ;; TODO: do the APIC & IO-APIC as well. (when (not (boundp '*i8259-irqs*)) (setf *i8259-irqs* (sys.int::make-simple-vector 16 :wired) ;; fixme: do at cold-gen time. From ae6e5fd64fc3116854adcb57c83013a8ce79d784 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 19:35:14 +0500 Subject: [PATCH 31/42] x86-64: remove unused read-lapic64 and write-lapic64 --- supervisor/x86-64/cpu.lisp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index d66a702b..d8e89f92 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -188,21 +188,6 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") (logand value #xFFFFFFFF)) (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) value))) -(defun read-lapic64 (register) - (if *lapic-x2apic-mode* - (sys.int::msr (lapic-reg-to-msr register)) - (logior (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) - (ash (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash (1+ register) 4))) 32)))) - -(defun write-lapic64 (value register) - (if *lapic-x2apic-mode* - (setf (sys.int::msr (lapic-reg-to-msr register)) (logand value #xFFFFFFFFFFFFFFFF)) - (progn - (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash register 4))) - (ldb (byte 32 0) value)) - (setf (physical-memref-unsigned-byte-32 (+ *lapic-address* (ash (1+ register) 4))) - (ldb (byte 32 32) value))))) - ;; Mode-dispatching LAPIC accessors, retaining the original lapic-reg ;; names to avoid a wholesale rename of every call site. (defun lapic-reg (register) From fd5218dc7eeda96819de3d90e6e0744bed74f29e Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 20:06:59 +0500 Subject: [PATCH 32/42] pager.lisp: deduplicate code --- supervisor/pager.lisp | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index 1cde1695..507ba551 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -581,31 +581,23 @@ Returns NIL if the entry is missing and ALLOCATE is false." (declare (ignore ignore1 ignore2 ignore3)) (pager-log-op "Update wired dirty bits") (with-rw-lock-write (*vm-lock*) - (begin-tlb-shootdown) - (map-ptes - sys.int::*wired-area-base* sys.int::*wired-area-bump* - (dx-lambda (wired-page pte) - (when (not pte) - (panic "Missing pte for wired page " wired-page)) - (when (page-dirty-p pte) - (setf (sys.int::card-table-dirty-gen wired-page) 0) - ;; ARM64's dirty bit emulation does not support emulating - ;; dirty bits in the wired area yet. - #-arm64 - (update-pte pte :dirty nil)))) - (map-ptes - sys.int::*wired-function-area-limit* sys.int::*function-area-base* - (dx-lambda (wired-page pte) - (when (not pte) - (panic "Missing pte for wired function page " wired-page)) - (when (page-dirty-p pte) - (setf (sys.int::card-table-dirty-gen wired-page) 0) - ;; ARM64's dirty bit emulation does not support emulating - ;; dirty bits in the wired area yet. - #-arm64 - (update-pte pte :dirty nil)))) - (tlb-shootdown-all) - (finish-tlb-shootdown))) + (macrolet ((update-dirty-in-range (start end name) + `(map-ptes + ,start ,end + (dx-lambda (wired-page pte) + (when (not pte) + (panic "Missing pte for " ,name " " wired-page)) + (when (page-dirty-p pte) + (setf (sys.int::card-table-dirty-gen wired-page) 0) + #-arm64 + (update-pte pte :dirty nil)))))) + (begin-tlb-shootdown) + (update-dirty-in-range sys.int::*wired-area-base* sys.int::*wired-area-bump* + "wired page") + (update-dirty-in-range sys.int::*wired-function-area-limit* sys.int::*function-area-base* + "wired function page") + (tlb-shootdown-all) + (finish-tlb-shootdown)))) (defun get-page-physical-address (virtual-address) "Return the physical page frame mapped for VIRTUAL-ADDRESS. From 543c9edd844c287e24064bd91f091df33a3e41d0 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 20:34:12 +0500 Subject: [PATCH 33/42] deslop 1 --- drivers/sound.lisp | 4 ++-- supervisor/arm64/cpu.lisp | 2 +- supervisor/interrupts.lisp | 10 ++++------ supervisor/thread.lisp | 3 +-- supervisor/x86-64/cpu.lisp | 6 +++--- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/sound.lisp b/drivers/sound.lisp index 9ea8878f..acc2215a 100644 --- a/drivers/sound.lisp +++ b/drivers/sound.lisp @@ -132,7 +132,7 @@ (fill buffer 0.0 :start start :end end)) (prog1 ;; Try-lock the sink mutex. If the music player holds it (inside - ;; output-sound's transcode path), return T immediately — the + ;; output-sound's transcode path), return T immediately: the ;; buffer is already zeroed, so the HDA plays silence for this ;; period. Next period will retry. (if (mezzano.supervisor:acquire-mutex *sink-lock* nil) @@ -311,7 +311,7 @@ (loop (when (buffer-empty sink) ;; Buffer is currently empty, sink is not live. - ;; Only add to the sink list if not already present — the + ;; Only add to the sink list if not already present. the ;; callback may have left it if it was re-filled concurrently. (unless (member sink *sinks*) (setf (sup:event-state *sinks-present-event*) t) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index a766b16f..9ba20100 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -321,7 +321,7 @@ Protected by the world stop lock." '*debug-magic-button-ready-variable* -1)) ;; ARM64 has hardware-broadcast TLB invalidation (TLBI IS instructions). -;; No IPIs needed — the initiating CPU issues TLBI VAE1IS which broadcasts +;; No IPIs needed; the initiating CPU issues TLBI VAE1IS which broadcasts ;; to all CPUs in the inner shareable domain. We still bracket the operation ;; with inhibit-scheduling to prevent CPU migration. diff --git a/supervisor/interrupts.lisp b/supervisor/interrupts.lisp index 965c4d68..a37baf73 100644 --- a/supervisor/interrupts.lisp +++ b/supervisor/interrupts.lisp @@ -40,7 +40,7 @@ RETURN-FROM/GO must not be used to leave this form." nil ,@captures)) -;;; TATAS (test-and-test-and-set) spinlocks -- general purpose, supports nesting. +;;; TATAS (test-and-test-and-set) spinlocks. general purpose, supports nesting. (defun place-spinlock-initializer () :unlocked) @@ -120,7 +120,7 @@ RETURN-FROM/GO must not be used to leave this form." (check-type lock symbol) `(ensure-place-spinlock-held ,lock)) -;;; MCS (Mellor-Crummy-Scott) queue-based spinlocks -- fair, FIFO, each CPU +;;; MCS (Mellor-Crummy-Scott) queue-based spinlocks - fair, FIFO, each CPU ;;; spins on its own cache line. CANNOT be nested on the same CPU. (eval-when (:compile-toplevel :load-toplevel :execute) (defun mcs-cas-target (place) @@ -297,8 +297,7 @@ NOTE: do NOT nest MCS spinlock acquisitions on the same CPU." platform-number attachments (count 0) - (lock :unlocked) -) + (lock :unlocked)) (defstruct (irq-attachment (:area :wired)) @@ -383,8 +382,7 @@ NOTE: do NOT nest MCS spinlock acquisitions on the same CPU." latch event (state :masked) - (lock :unlocked) -) + (lock :unlocked)) (defun make-simple-irq (irq-number &optional latch) (declare (mezzano.compiler::closure-allocation :wired)) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index 5e09236a..c559454f 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -629,8 +629,7 @@ Interrupts must be off and the global thread lock must be held." *rcu-deferred-list* deferred-cons) ;; Drain immediately. This is the only writer of ;; *rcu-deferred-list* and it runs under the global lock, so every - ;; thread exit leaves the list empty -- no unbounded growth and no - ;; need for a separate grace period in a GC'd runtime. + ;; thread exit leaves the list empty. (%cleanup-dead-threads) (%reschedule-via-wired-stack sp fp))))) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index d8e89f92..37e2399c 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -232,7 +232,7 @@ In xAPIC mode falls back to a regular ICR write targeting self." (defun send-ipi-to-all (type vector &key including-self) (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) ;; x2APIC shorthand: the destination-shorthand field is ICR - ;; bits 19:18 -- 01=self, 10=all-including-self, 11=all-excluding-self. + ;; bits 19:18 - 01=self, 10=all-including-self, 11=all-excluding-self. (let ((icr (logior (ash type 8) vector))) (setf (sys.int::msr +x2apic-msr-icr+) @@ -373,7 +373,7 @@ If the CPU is idle, this will cause it to check for new threads." (defun check-tlb-generation-consistency () "If this CPU missed a TLB shootdown while idle, flush now. If a shootdown is still in progress, flush but do not stamp the -generation — the context-switch path will re-check after the shootdown +generation. the context-switch path will re-check after the shootdown finishes and the final generation is known." (when (and (boundp '*current-tlb-generation*) (boundp '*tlb-shootdown-in-progress*) @@ -393,7 +393,7 @@ TLB shootdown must be protected by the VM lock." (setf *tlb-shootdown-in-progress* t) ;; Bump the generation before sending IPIs so idle CPUs that wake ;; during the shootdown window will see the mismatch and flush on - ;; their next context switch. Idle CPUs are skipped — if they + ;; their next context switch. Idle CPUs are skipped; if they ;; handle a device IRQ mid-shootdown the IRQ handler calls ;; check-tlb-generation-consistency to flush before touching ;; pageable memory. From 4c57c7d89fe488635c379d4ea3d8e2d44de90177 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 21:01:40 +0500 Subject: [PATCH 34/42] x86-64: deslop 2 --- supervisor/x86-64/cpu.lisp | 101 ++++++++++++------------------ supervisor/x86-64/interrupts.lisp | 55 ++++++++-------- supervisor/x86-64/time.lisp | 41 ++++++------ 3 files changed, 84 insertions(+), 113 deletions(-) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index 37e2399c..dc918daa 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -200,6 +200,13 @@ The bootloader is loaded to #x7C00, so #x7000 should be safe.") "Issue an EOI to the Local APIC." (setf (lapic-reg +lapic-reg-eoi+) 0)) +(declaim (inline lapic-initialized-p lapic-x2apic-p)) +(defun lapic-initialized-p () + (and (boundp '*lapic-address*) *lapic-address*)) + +(defun lapic-x2apic-p () + (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*)) + (defun send-ipi (target type vector) (if *lapic-x2apic-mode* ;; x2APIC: single 64-bit WRMSR to MSR 0x830. @@ -230,7 +237,7 @@ In xAPIC mode falls back to a regular ICR write targeting self." (send-ipi (x86-64-cpu-apic-id cpu) type vector)) (defun send-ipi-to-all (type vector &key including-self) - (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) + (if (lapic-x2apic-p) ;; x2APIC shorthand: the destination-shorthand field is ICR ;; bits 19:18 - 01=self, 10=all-including-self, 11=all-excluding-self. (let ((icr (logior (ash type 8) @@ -244,16 +251,14 @@ In xAPIC mode falls back to a regular ICR write targeting self." (send-ipi-to-cpu cpu type vector))))) (defun broadcast-ipi (type vector &optional including-self) - (when (and (boundp '*lapic-address*) - *lapic-address*) - (if (and (boundp '*lapic-x2apic-mode*) *lapic-x2apic-mode*) + (when (lapic-initialized-p) + (if (lapic-x2apic-p) (send-ipi-to-all type vector :including-self including-self) (safe-without-interrupts (type vector including-self) (send-ipi-to-all type vector :including-self including-self))))) (defun broadcast-wakeup-ipi () - (when (and (boundp '*lapic-address*) - *lapic-address*) + (when (lapic-initialized-p) (dolist (cpu *cpus*) (when (eql (x86-64-cpu-state cpu) :online) (wake-cpu cpu))))) @@ -991,32 +996,22 @@ This is a one-shot timer and must be reset after firing." (declare (ignore interrupt-frame info)) (panic "Got LAPIC error interrupt")) -(defun lapic-timer-calibrate-1 () - (let ((initial-time (get-internal-run-time)) - (start-time nil) - (end-time nil) - (end-counter nil)) - ;; Wait for the start of this tick. - (loop - (setf start-time (get-internal-run-time)) - (when (not (eq start-time initial-time)) - (return))) - ;; Start timer with the maximum count value +(defun lapic-timer-measure-one-tick () + "Start LAPIC timer at max count at a PIT tick boundary, wait one tick. +Returns (values lapic-cycles-elapsed pit-tick-duration-in-internal-time-units)." + (let ((start (wait-for-next-pit-tick (get-internal-run-time)))) (write-lapic #xFFFFFFFF +lapic-reg-timer-initial-count+) - ;; Wait for next tick. - (loop - (setf end-time (get-internal-run-time)) - (when (not (eq end-time start-time)) - (return))) - ;; Read current count & stop timer. - (setf end-counter (read-lapic +lapic-reg-timer-current-count+)) - (write-lapic 0 +lapic-reg-timer-initial-count+) - (let* ((cycles (- #xFFFFFFFF end-counter)) - (total-time (- end-time start-time)) - ;; Use single floats here. Rationals & double-floats require - ;; allocation and this is called too early for that. - (time (/ (float total-time) internal-time-units-per-second)) - (cycles-per-second (/ cycles time))) + (let ((end (wait-for-next-pit-tick start))) + (let ((remaining (read-lapic +lapic-reg-timer-current-count+))) + (write-lapic 0 +lapic-reg-timer-initial-count+) + (values (- #xFFFFFFFF remaining) (- end start)))))) + +(defun lapic-timer-calibrate-1 () + "Calibrate the LAPIC timer using the PIT as a time reference." + (multiple-value-bind (cycles total-time) + (lapic-timer-measure-one-tick) + (let* ((time (/ (float total-time) internal-time-units-per-second)) + (cycles-per-second (/ (float cycles) time))) cycles-per-second))) ;; Assume LAPIC timers across CPUs tick at the same rate. @@ -1029,42 +1024,24 @@ This is a one-shot timer and must be reset after firing." (defun lapic-timer-calibrate-tsc () "Calibrate the LAPIC timer using the TSC as a high-resolution time reference. *CPU-SPEED* must already be calibrated before calling this." - (let* ((tsc-start (sys.int::tsc)) - (start-time (get-internal-run-time))) - ;; Start timer with the maximum count value - (write-lapic #xFFFFFFFF +lapic-reg-timer-initial-count+) - ;; Wait for the next PIT tick as our interval marker. - (loop (when (not (eql (get-internal-run-time) start-time)) (return))) - (let* ((tsc-end (sys.int::tsc)) - (lapic-remaining (read-lapic +lapic-reg-timer-current-count+)) - (lapic-cycles (- #xFFFFFFFF lapic-remaining)) - (tsc-delta (- tsc-end tsc-start))) - ;; Stop timer - (write-lapic 0 +lapic-reg-timer-initial-count+) - (if (zerop tsc-delta) - 0 - ;; LAPIC Hz = (lapic_cycles / tsc_delta) * cpu_speed - ;; Use single-floats to avoid allocation during early boot. - (let ((cpu-speed (float *cpu-speed*))) + (let ((tsc-start (sys.int::tsc))) + (multiple-value-bind (lapic-cycles) + (lapic-timer-measure-one-tick) + (let* ((tsc-delta (- (sys.int::tsc) tsc-start)) + (cpu-speed (float *cpu-speed*))) + (if (zerop tsc-delta) + 0 (/ (* (float lapic-cycles) cpu-speed) (float tsc-delta))))))) ;; TODO: Be more clever when picking the divisor. ;; Should dynamically adjust so a goldilocks calibration value is returned. (defun lapic-timer-calibrate () - (if (tsc-deadline-available-p) - ;; Use TSC for high-resolution calibration. - (let ((n (lapic-timer-calibrate-tsc))) - (dotimes (i 5) - (setf n (/ (+ n (lapic-timer-calibrate-tsc)) 2))) - (setf *lapic-timer-calibration* (truncate n) - *lapic-timer-ticks-per-second* (truncate n))) - ;; Fall back to PIT-based calibration. - (let ((n (lapic-timer-calibrate-1))) - (dotimes (i 5) - (setf n (/ (+ n (lapic-timer-calibrate-1)) 2))) - (setf *lapic-timer-calibration* (truncate n) - *lapic-timer-ticks-per-second* (truncate n))))) + (let ((n (calibrate-average (if (tsc-deadline-available-p) + #'lapic-timer-calibrate-tsc + #'lapic-timer-calibrate-1)))) + (setf *lapic-timer-calibration* (truncate n) + *lapic-timer-ticks-per-second* (truncate n)))) ;; These two functions are interrupt-safe, they must not use ;; floats, bignums or ratios when converting. @@ -1225,7 +1202,7 @@ This is a one-shot timer and must be reset after firing." :idle-p nil :inhibit-scheduling 0 :tlb-generation 0 - :timer-active nil + :timer-active nil :apic-in-x2apic-mode *lapic-x2apic-mode*))) ;; Allocate per-CPU MCS node for spinlocks. (setf (cpu-mcs-node cpu) (%make-mcs-node)) diff --git a/supervisor/x86-64/interrupts.lisp b/supervisor/x86-64/interrupts.lisp index a19d6526..ffc52af5 100644 --- a/supervisor/x86-64/interrupts.lisp +++ b/supervisor/x86-64/interrupts.lisp @@ -334,46 +334,41 @@ If clear, the fault occured in supervisor mode.") ;; Unmask the cascade IRQ, required for the 2nd chip to function. (i8259-unmask-irq 2)) +(declaim (inline io-apic-active-p)) +(defun io-apic-active-p () + (and (boundp '*io-apic-active-p*) *io-apic-active-p*)) + +(defun platform-irq-vector () + (if (io-apic-active-p) *io-apic-irqs* *i8259-irqs*)) + (defun platform-irq (number) - (cond ((and (boundp '*io-apic-active-p*) - *io-apic-active-p* - (<= 0 number 255)) - (let ((gsi (if (< number 16) - (sys.int::%object-ref-t *isa-irq-to-gsi* number) - number))) - (sys.int::%object-ref-t *io-apic-irqs* gsi))) - ((<= 0 number 15) - (svref *i8259-irqs* number)) - (t nil))) + (if (io-apic-active-p) + (when (<= 0 number 255) + (let ((gsi (if (< number 16) + (sys.int::%object-ref-t *isa-irq-to-gsi* number) + number))) + (sys.int::%object-ref-t *io-apic-irqs* gsi))) + (when (<= 0 number 15) + (svref *i8259-irqs* number)))) (defun all-platform-irqs () - (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) - (loop - for i below (sys.int::%object-header-data *io-apic-irqs*) - for irq = (svref *io-apic-irqs* i) - when irq collect irq) - (loop - for i below (sys.int::%object-header-data *i8259-irqs*) - for irq = (svref *i8259-irqs* i) - collect irq))) + (loop with vector = (platform-irq-vector) + for i below (sys.int::%object-header-data vector) + for irq = (svref vector i) + when irq collect irq)) (defun map-platform-irqs (fn) - (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) - (loop - for i below (sys.int::%object-header-data *io-apic-irqs*) - for irq = (svref *io-apic-irqs* i) - when irq do (funcall fn irq)) - (loop - for i below (sys.int::%object-header-data *i8259-irqs*) - for irq = (svref *i8259-irqs* i) - do (funcall fn irq)))) + (loop with vector = (platform-irq-vector) + for i below (sys.int::%object-header-data vector) + for irq = (svref vector i) + when irq do (funcall fn irq))) (defun platform-mask-irq (vector) - (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) + (if (io-apic-active-p) (io-apic-mask-irq vector) (i8259-mask-irq vector))) (defun platform-unmask-irq (vector) - (if (and (boundp '*io-apic-active-p*) *io-apic-active-p*) + (if (io-apic-active-p) (io-apic-unmask-irq vector) (i8259-unmask-irq vector))) diff --git a/supervisor/x86-64/time.lisp b/supervisor/x86-64/time.lisp index 4e93d1e7..edcfffc7 100644 --- a/supervisor/x86-64/time.lisp +++ b/supervisor/x86-64/time.lisp @@ -22,34 +22,33 @@ (setf (sys.int::io-port/8 #x40) (ldb (byte 8 0) divisor) (sys.int::io-port/8 #x40) (ldb (byte 8 8) divisor)))) +(defun wait-for-next-pit-tick (current-time) + (loop + (let ((new (get-internal-run-time))) + (when (not (eql new current-time)) + (return new))))) + +(defun calibrate-average (fn) + (let ((n (funcall fn))) + (dotimes (i 5) + (setf n (/ (+ n (funcall fn)) 2))) + n)) + (defun calibrate-tsc-1 () "Return the number of cycles per second, approximately." - (let ((initial-time (get-internal-run-time)) - (start-time nil) - (end-time nil) + (let ((start-time (wait-for-next-pit-tick (get-internal-run-time))) (start-cycle nil) (end-cycle nil)) - ;; Wait for the start of this tick. - (loop - (setf start-time (get-internal-run-time)) - (when (not (eq start-time initial-time)) - (return))) (setf start-cycle (sys.int::tsc)) - (loop - (setf end-time (get-internal-run-time)) - (when (not (eq end-time start-time)) - (return))) - (setf end-cycle (sys.int::tsc)) - (let* ((cycles (- end-cycle start-cycle)) - (time (/ (float (- end-time start-time)) internal-time-units-per-second)) - (cycles-per-second (/ cycles time))) - cycles-per-second))) + (let ((end-time (wait-for-next-pit-tick start-time))) + (setf end-cycle (sys.int::tsc)) + (let* ((cycles (- end-cycle start-cycle)) + (time (/ (float (- end-time start-time)) internal-time-units-per-second)) + (cycles-per-second (/ cycles time))) + cycles-per-second)))) (defun calibrate-tsc () - (let ((n (calibrate-tsc-1))) - (dotimes (i 5) - (setf n (/ (+ n (calibrate-tsc-1)) 2))) - (setf *cpu-speed* (floor n)))) + (setf *cpu-speed* (floor (calibrate-average #'calibrate-tsc-1)))) (defun high-precision-time-units-to-internal-time-units (tsc-time) (if (boundp '*cpu-speed*) From 31a30196048953dc9afcf1b6cbfaf36fd4ba4865 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 21:43:58 +0500 Subject: [PATCH 35/42] x86-64: refactor initialize-io-apic --- supervisor/x86-64/io-apic.lisp | 180 ++++++++++++++++----------------- 1 file changed, 87 insertions(+), 93 deletions(-) diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index 3243d7e7..d682955b 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -119,105 +119,99 @@ (defun gsi-flags-trigger (flags) (if (eql (ldb (byte 2 2) flags) 3) :level :edge)) +(defun process-isa-overrides (controllers n-controllers) + (let ((mapping (sys.int::make-simple-vector 16 :wired))) + (dotimes (i 16) (setf (svref mapping i) i)) + (dotimes (i n-controllers) + (let ((entry (svref controllers i))) + (when (acpi-madt-interrupt-source-override-p entry) + (let ((bus (acpi-madt-interrupt-source-override-bus entry)) + (source (acpi-madt-interrupt-source-override-source entry)) + (gsi (acpi-madt-interrupt-source-override-global-system-interrupt entry)) + (flags (acpi-madt-interrupt-source-override-flags entry))) + (when (and (eql bus 0) (< source 16)) + (setf (svref mapping source) gsi) + (when (< gsi 256) + (setf (svref *gsi-flags* gsi) flags)) + (debug-print-line "MADT override: ISA IRQ " source " -> GSI " gsi)))))) + (setf *isa-irq-to-gsi* mapping))) + +(defun init-one-io-apic (entry bsp-apic-id) + (let* ((phys-addr (acpi-madt-ioapic-address entry)) + (id (acpi-madt-ioapic-id entry)) + (gsi-base (acpi-madt-ioapic-global-system-interrupt-base entry))) + (map-physical-memory-early (align-down phys-addr +4k-page-size+) + +4k-page-size+ "IO-APIC") + (let* ((apic (make-io-apic :id id :gsi-base gsi-base + :address phys-addr :mmio-base phys-addr)) + (max-redir (ldb (byte 8 16) (io-apic-read apic +io-apic-version+))) + (n-entries (1+ max-redir))) + (setf (io-apic-max-redirection apic) max-redir) + (push-wired apic *io-apics*) + (dotimes (e n-entries) + (io-apic-write-redirection apic e +io-apic-entry-mask+) + (let* ((gsi (+ gsi-base e)) + (vector (gsi->vector gsi)) + (flags (svref *gsi-flags* gsi))) + (when (< vector 256) + (io-apic-configure-entry gsi vector bsp-apic-id + :trigger-mode (gsi-flags-trigger flags) + :polarity (gsi-flags-polarity flags) + :masked t)))) + (debug-print-line "IO-APIC " id " at " phys-addr + " GSI base " gsi-base " max redirect " max-redir)))) + +(defun init-io-apic-controllers (controllers n-controllers) + (setf *io-apic-irqs* (sys.int::make-simple-vector 256 :wired)) + (dotimes (i 256) + (setf (svref *io-apic-irqs* i) (make-irq :platform-number i))) + (setf *io-apics* '()) + (let ((bsp-id (if (boundp '*bsp-cpu*) (x86-64-cpu-apic-id *bsp-cpu*) 0))) + (dotimes (i n-controllers) + (let ((entry (svref controllers i))) + (when (acpi-madt-ioapic-p entry) + (init-one-io-apic entry bsp-id)))))) + +(defun hook-io-apic-interrupt-handlers () + (debug-print-line "IO-APIC init done. Hooking handlers...") + (let ((max-gsi 0)) + (dolist (apic *io-apics*) + (setf max-gsi (max max-gsi + (+ (io-apic-gsi-base apic) + (io-apic-max-redirection apic))))) + (dotimes (gsi (1+ max-gsi)) + (let ((vector (gsi->vector gsi))) + (when (< vector 256) + (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) + +(defun finalize-io-apic-init () + (setf *io-apic-active-p* t) + (when (boundp '*i8259-shadow-mask*) + (setf (sys.int::io-port/8 #x21) #xFF + (sys.int::io-port/8 #xA1) #xFF + *i8259-shadow-mask* #xFFFF)) + (write-lapic (logior (read-lapic +lapic-reg-lvt-lint0+) +lapic-lvt-mask+) + +lapic-reg-lvt-lint0+) + (write-lapic (logior (read-lapic +lapic-reg-lvt-lint1+) +lapic-lvt-mask+) + +lapic-reg-lvt-lint1+)) + (defun initialize-io-apics () (setf *io-apic-active-p* nil) - (unless (boundp '*io-apics*) - (setf *io-apics* nil)) - (when *io-apics* - (return-from initialize-io-apics)) - (setf *io-apic-irqs* nil) - (setf *isa-irq-to-gsi* nil) - (setf *gsi-flags* nil) + (unless (boundp '*io-apics*) (setf *io-apics* nil)) + (when *io-apics* (return-from initialize-io-apics)) + (setf *io-apic-irqs* nil *isa-irq-to-gsi* nil *gsi-flags* nil) (let ((madt (acpi-get-table 'acpi-madt-table-p))) (unless madt (debug-print-line "No MADT table, IO-APIC not available.") (return-from initialize-io-apics)) - (let ((isa-mapping (sys.int::make-simple-vector 16 :wired))) - (setf *gsi-flags* (sys.int::make-simple-vector 256 :wired)) - (dotimes (i 16) - (setf (svref isa-mapping i) i)) - (dotimes (i (sys.int::simple-vector-length - (acpi-madt-table-controllers madt))) - (let ((entry (svref (acpi-madt-table-controllers madt) i))) - (when (acpi-madt-interrupt-source-override-p entry) - (when (and (eql (acpi-madt-interrupt-source-override-bus entry) 0) - (< (acpi-madt-interrupt-source-override-source entry) 16)) - (setf (svref isa-mapping - (acpi-madt-interrupt-source-override-source entry)) - (acpi-madt-interrupt-source-override-global-system-interrupt entry)) - (let ((gsi (acpi-madt-interrupt-source-override-global-system-interrupt entry)) - (flags (acpi-madt-interrupt-source-override-flags entry))) - (when (< gsi 256) - (setf (svref *gsi-flags* gsi) flags))) - (debug-print-line "MADT override: ISA IRQ " - (acpi-madt-interrupt-source-override-source entry) - " -> GSI " - (acpi-madt-interrupt-source-override-global-system-interrupt entry)))))) - (setf *isa-irq-to-gsi* isa-mapping)) - (setf *io-apic-irqs* (sys.int::make-simple-vector 256 :wired)) - (dotimes (i 256) - (setf (svref *io-apic-irqs* i) (make-irq :platform-number i))) - (setf *io-apics* '()) - (let ((bsp-apic-id (if (boundp '*bsp-cpu*) - (x86-64-cpu-apic-id *bsp-cpu*) - 0))) - (dotimes (i (sys.int::simple-vector-length - (acpi-madt-table-controllers madt))) - (let ((entry (svref (acpi-madt-table-controllers madt) i))) - (when (acpi-madt-ioapic-p entry) - (let* ((phys-addr (acpi-madt-ioapic-address entry)) - (page-base (align-down phys-addr +4k-page-size+))) - (map-physical-memory-early page-base +4k-page-size+ "IO-APIC") - (let ((io-apic (make-io-apic - :id (acpi-madt-ioapic-id entry) - :gsi-base (acpi-madt-ioapic-global-system-interrupt-base entry) - :address phys-addr - :mmio-base phys-addr))) - (let* ((version-reg (io-apic-read io-apic +io-apic-version+)) - (max-redir (ldb (byte 8 16) version-reg)) - (n-entries (1+ max-redir)) - (gsi-base (io-apic-gsi-base io-apic))) - (setf (io-apic-max-redirection io-apic) max-redir) - (push-wired io-apic *io-apics*) - (dotimes (e n-entries) - (io-apic-write-redirection io-apic e +io-apic-entry-mask+) - (let* ((gsi (+ gsi-base e)) - (vector (gsi->vector gsi)) - (flags (svref *gsi-flags* (+ gsi-base e)))) - (when (< vector 256) - (io-apic-configure-entry gsi - vector - bsp-apic-id - :trigger-mode (gsi-flags-trigger flags) - :polarity (gsi-flags-polarity flags) - :masked t)))) - (debug-print-line "IO-APIC " (io-apic-id io-apic) - " at " phys-addr - " GSI base " (io-apic-gsi-base io-apic) - " max redirect " (io-apic-max-redirection io-apic)))))))) - (when (not (null *io-apics*)) - (debug-print-line "IO-APIC init done. Hooking handlers...") - (let ((max-gsi 0)) - (dolist (apic *io-apics*) - (let ((top (+ (io-apic-gsi-base apic) (io-apic-max-redirection apic)))) - (setf max-gsi (max max-gsi top)))) - (dotimes (gsi (1+ max-gsi)) - (let ((vector (gsi->vector gsi))) - (when (< vector 256) - (hook-user-interrupt vector 'io-apic-interrupt-handler)))))) - (setf *io-apic-active-p* t) - ;; Mask the i8259 PIC now that IO-APIC is handling interrupts. - (when (boundp '*i8259-shadow-mask*) - (setf (sys.int::io-port/8 #x21) #xFF - (sys.int::io-port/8 #xA1) #xFF - *i8259-shadow-mask* #xFFFF)) - ;; Mask LINT0 and LINT1 on the LAPIC to prevent i8259 spurious - ;; interrupts from reaching the CPU through ExtINT/NMI. - (write-lapic (logior (read-lapic +lapic-reg-lvt-lint0+) +lapic-lvt-mask+) - +lapic-reg-lvt-lint0+) - (write-lapic (logior (read-lapic +lapic-reg-lvt-lint1+) +lapic-lvt-mask+) - +lapic-reg-lvt-lint1+) - nil))) + (setf *gsi-flags* (sys.int::make-simple-vector 256 :wired)) + (let* ((controllers (acpi-madt-table-controllers madt)) + (n (sys.int::simple-vector-length controllers))) + (process-isa-overrides controllers n) + (init-io-apic-controllers controllers n) + (when *io-apics* + (hook-io-apic-interrupt-handlers) + (finalize-io-apic-init))))) (defun io-apic-interrupt-handler (interrupt-frame info) ;; If this CPU was idle during a TLB shootdown and missed the IPI, From 684c85571d37a4cb2ab74a526394f92ef665b478 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 22:20:53 +0500 Subject: [PATCH 36/42] pager: revert ugly macrolet in update-wired-dirty-bits, duplicated code is clearer --- supervisor/pager.lisp | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index 507ba551..f274a5dd 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -581,23 +581,27 @@ Returns NIL if the entry is missing and ALLOCATE is false." (declare (ignore ignore1 ignore2 ignore3)) (pager-log-op "Update wired dirty bits") (with-rw-lock-write (*vm-lock*) - (macrolet ((update-dirty-in-range (start end name) - `(map-ptes - ,start ,end - (dx-lambda (wired-page pte) - (when (not pte) - (panic "Missing pte for " ,name " " wired-page)) - (when (page-dirty-p pte) - (setf (sys.int::card-table-dirty-gen wired-page) 0) - #-arm64 - (update-pte pte :dirty nil)))))) - (begin-tlb-shootdown) - (update-dirty-in-range sys.int::*wired-area-base* sys.int::*wired-area-bump* - "wired page") - (update-dirty-in-range sys.int::*wired-function-area-limit* sys.int::*function-area-base* - "wired function page") - (tlb-shootdown-all) - (finish-tlb-shootdown)))) + (begin-tlb-shootdown) + (map-ptes + sys.int::*wired-area-base* sys.int::*wired-area-bump* + (dx-lambda (wired-page pte) + (when (not pte) + (panic "Missing pte for wired page " wired-page)) + (when (page-dirty-p pte) + (setf (sys.int::card-table-dirty-gen wired-page) 0) + #-arm64 + (update-pte pte :dirty nil)))) + (map-ptes + sys.int::*wired-function-area-limit* sys.int::*function-area-base* + (dx-lambda (wired-page pte) + (when (not pte) + (panic "Missing pte for wired function page " wired-page)) + (when (page-dirty-p pte) + (setf (sys.int::card-table-dirty-gen wired-page) 0) + #-arm64 + (update-pte pte :dirty nil)))) + (tlb-shootdown-all) + (finish-tlb-shootdown)))) (defun get-page-physical-address (virtual-address) "Return the physical page frame mapped for VIRTUAL-ADDRESS. From a0d164a83831f915ee04145e458ccb1cc40cc045 Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 22:25:01 +0500 Subject: [PATCH 37/42] pager: undo the dedupe because it's ugly --- supervisor/pager.lisp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index f274a5dd..f763c7f8 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -601,7 +601,7 @@ Returns NIL if the entry is missing and ALLOCATE is false." #-arm64 (update-pte pte :dirty nil)))) (tlb-shootdown-all) - (finish-tlb-shootdown)))) + (finish-tlb-shootdown))) (defun get-page-physical-address (virtual-address) "Return the physical page frame mapped for VIRTUAL-ADDRESS. From 76253497d540e946996d2912e58f798157a0bd0c Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 22:30:34 +0500 Subject: [PATCH 38/42] pager: fix whitespace --- supervisor/pager.lisp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/supervisor/pager.lisp b/supervisor/pager.lisp index f763c7f8..c2e0b39d 100644 --- a/supervisor/pager.lisp +++ b/supervisor/pager.lisp @@ -862,7 +862,7 @@ mapped, then the entry will be NIL." ;; Remap page read/write. (begin-tlb-shootdown) (setf (page-table-entry pte) (make-pte (ash (pte-physical-address (page-table-entry pte)) -12) - :writable (block-info-writable-p block-info))) + :writable (block-info-writable-p block-info))) (tlb-shootdown-single address) (finish-tlb-shootdown)) #+(or)(debug-print-line "WFP " address " block " block-info " already mapped " (page-table-entry pte 0)) @@ -906,11 +906,11 @@ mapped, then the entry will be NIL." (panic "Unable to read page from disk")))) (begin-tlb-shootdown) (setf (page-table-entry pte) (make-pte frame - :writable (and (block-info-writable-p block-info) - (not (block-info-track-dirty-p block-info))) - ;; Mark the page dirty to make sure the snapshotter & swap code know to swap it out. - ;; The zero fill flag in the block map was cleared, but the on-disk data doesn't reflect that. - :dirty is-zero-page)) + :writable (and (block-info-writable-p block-info) + (not (block-info-track-dirty-p block-info))) + ;; Mark the page dirty to make sure the snapshotter & swap code know to swap it out. + ;; The zero fill flag in the block map was cleared, but the on-disk data doesn't reflect that. + :dirty is-zero-page)) (tlb-shootdown-single address) (finish-tlb-shootdown) #+(or) From 6b2724c855c51a80b94f797db2f07105b49256da Mon Sep 17 00:00:00 2001 From: iska Date: Mon, 15 Jun 2026 22:43:08 +0500 Subject: [PATCH 39/42] supervisor: remove round-trip cons --- supervisor/thread.lisp | 49 +++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/supervisor/thread.lisp b/supervisor/thread.lisp index c559454f..f3bc3327 100644 --- a/supervisor/thread.lisp +++ b/supervisor/thread.lisp @@ -603,35 +603,26 @@ Interrupts must be off and the global thread lock must be held." ;; This is seperate from thread-entry-trampoline so steppers can detect it. (defun thread-final-cleanup (return-values) - ;; Pre-allocate the cons cell for *rcu-deferred-list* before entering - ;; the interrupt-disabled context. Wired allocation requires interrupts - ;; to be enabled (it takes pseudo-atomic), but - ;; %RUN-ON-WIRED-STACK-WITHOUT-INTERRUPTS disables them. - (let ((deferred-cons (sys.int::cons-in-area nil nil :wired))) - (%run-on-wired-stack-without-interrupts (sp fp return-values) - (let ((self (current-thread))) - ;; FIXME: This should be done with the global lock held, but that makes - ;; the lock ordering incorrect in (setf event-state). - ;; (setf event-state) expects to be called with the thread lock released. - ;; This leaves a small race window between the thread's join event - ;; being set and the thread state being set to dead, but this is only - ;; visible on SMP as interrupts are disabled here. - (setf (event-state (thread-join-event self)) (or return-values :no-values)) - (acquire-global-thread-lock) - (setf (thread-state self) :dead) - ;; Defer physical removal from *all-threads*. RCU readers (see - ;; ALL-THREADS / WITH-RCU-READ-LOCK) skip :dead threads, and thread - ;; objects are reclaimed by the GC once unreferenced, so unlinking - ;; dead nodes is safe even while a reader is mid-traversal on a - ;; strongly-ordered architecture. - (setf (car deferred-cons) self - (cdr deferred-cons) *rcu-deferred-list* - *rcu-deferred-list* deferred-cons) - ;; Drain immediately. This is the only writer of - ;; *rcu-deferred-list* and it runs under the global lock, so every - ;; thread exit leaves the list empty. - (%cleanup-dead-threads) - (%reschedule-via-wired-stack sp fp))))) + (%run-on-wired-stack-without-interrupts (sp fp return-values) + (let ((self (current-thread))) + ;; FIXME: This should be done with the global lock held, but that makes + ;; the lock ordering incorrect in (setf event-state). + ;; (setf event-state) expects to be called with the thread lock released. + ;; This leaves a small race window between the thread's join event + ;; being set and the thread state being set to dead, but this is only + ;; visible on SMP as interrupts are disabled here. + (setf (event-state (thread-join-event self)) (or return-values :no-values)) + (acquire-global-thread-lock) + (setf (thread-state self) :dead) + ;; Unlink self from *all-threads* directly. The global lock is held + ;; and no deferred list round-trip is needed — this is the only writer. + (when (thread-global-next self) + (setf (thread-global-prev (thread-global-next self)) (thread-global-prev self))) + (when (thread-global-prev self) + (setf (thread-global-next (thread-global-prev self)) (thread-global-next self))) + (when (eql self *all-threads*) + (setf *all-threads* (thread-global-next self))) + (%reschedule-via-wired-stack sp fp)))) (defun thread-join (thread &optional (wait-p t)) "Wait for THREAD to exit. From 9a950176f0dfc2fd37b7a3db6d9e1d84788f5cdd Mon Sep 17 00:00:00 2001 From: iska Date: Tue, 16 Jun 2026 00:34:20 +0500 Subject: [PATCH 40/42] x86-64: correctness --- supervisor/acpi.lisp | 2 -- supervisor/x86-64/cpu.lisp | 6 +++--- supervisor/x86-64/io-apic.lisp | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/supervisor/acpi.lisp b/supervisor/acpi.lisp index 8506c0c6..6f37d69c 100644 --- a/supervisor/acpi.lisp +++ b/supervisor/acpi.lisp @@ -382,7 +382,6 @@ (defstruct (acpi-madt-processor-x2apic (:area :wired)) - acpi-processor-id x2apic-id flags acpi-processor-uid) @@ -452,7 +451,6 @@ (9 ;; Processor local x2APIC. (setf (svref (acpi-madt-table-controllers table) current) (make-acpi-madt-processor-x2apic - :acpi-processor-id (physical-memref-unsigned-byte-8 (+ address offset 2)) :x2apic-id (physical-memref-unsigned-byte-32 (+ address offset 4)) :flags (physical-memref-unsigned-byte-32 (+ address offset 8)) :acpi-processor-uid (physical-memref-unsigned-byte-32 (+ address offset 12)))))) diff --git a/supervisor/x86-64/cpu.lisp b/supervisor/x86-64/cpu.lisp index dc918daa..4d860a1e 100644 --- a/supervisor/x86-64/cpu.lisp +++ b/supervisor/x86-64/cpu.lisp @@ -56,7 +56,7 @@ (defconstant +cpuid-feature-x2apic+ 21) (defconstant +cpuid-feature-tsc-deadline+ 24) (defconstant +msr-ia32-apic-base-x2apic-enable+ #x400) -(defconstant +msr-ia32-apic-base-bsp+ #x800) +(defconstant +msr-ia32-apic-base-enable+ #x800) ;; x2APIC MSR range base. (defconstant +x2apic-msr-base+ #x800) @@ -1099,12 +1099,12 @@ Returns (values lapic-cycles-elapsed pit-tick-duration-in-internal-time-units)." (setf *lapic-x2apic-mode* t) (debug-print-line "x2APIC already enabled by firmware")) (t - ;; Enable x2APIC: set bit 10 (enable) and bit 11 (BSP) in IA32_APIC_BASE MSR. + ;; Enable x2APIC: set bit 10 (x2APIC enable) and bit 11 (APIC enable) in IA32_APIC_BASE MSR. (let ((apic-base (sys.int::msr +msr-ia32-apic-base+))) (setf (sys.int::msr +msr-ia32-apic-base+) (logior apic-base +msr-ia32-apic-base-x2apic-enable+ - +msr-ia32-apic-base-bsp+))) + +msr-ia32-apic-base-enable+))) (setf *lapic-x2apic-mode* t) (debug-print-line "x2APIC enabled on BSP")))) (t diff --git a/supervisor/x86-64/io-apic.lisp b/supervisor/x86-64/io-apic.lisp index d682955b..5341ffa6 100644 --- a/supervisor/x86-64/io-apic.lisp +++ b/supervisor/x86-64/io-apic.lisp @@ -152,7 +152,7 @@ (io-apic-write-redirection apic e +io-apic-entry-mask+) (let* ((gsi (+ gsi-base e)) (vector (gsi->vector gsi)) - (flags (svref *gsi-flags* gsi))) + (flags (if (< gsi 256) (svref *gsi-flags* gsi) 0))) (when (< vector 256) (io-apic-configure-entry gsi vector bsp-apic-id :trigger-mode (gsi-flags-trigger flags) From 6a42f2cc8c2e4fee4642eb8dd126a58654cc0b0a Mon Sep 17 00:00:00 2001 From: iska Date: Tue, 16 Jun 2026 00:45:10 +0500 Subject: [PATCH 41/42] acpi: fix whitespace --- supervisor/acpi.lisp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/supervisor/acpi.lisp b/supervisor/acpi.lisp index 6f37d69c..d7221352 100644 --- a/supervisor/acpi.lisp +++ b/supervisor/acpi.lisp @@ -445,15 +445,15 @@ :flags (physical-memref-unsigned-byte-16 (+ address offset 3)) :lapic-lintn (physical-memref-unsigned-byte-8 (+ address offset 5))))) (5 ;; Local APIC address override. - (setf (svref (acpi-madt-table-controllers table) current) - (make-acpi-madt-lapic-address-override - :address (physical-memref-unsigned-byte-64 (+ address offset 4))))) + (setf (svref (acpi-madt-table-controllers table) current) + (make-acpi-madt-lapic-address-override + :address (physical-memref-unsigned-byte-64 (+ address offset 4))))) (9 ;; Processor local x2APIC. - (setf (svref (acpi-madt-table-controllers table) current) - (make-acpi-madt-processor-x2apic - :x2apic-id (physical-memref-unsigned-byte-32 (+ address offset 4)) - :flags (physical-memref-unsigned-byte-32 (+ address offset 8)) - :acpi-processor-uid (physical-memref-unsigned-byte-32 (+ address offset 12)))))) + (setf (svref (acpi-madt-table-controllers table) current) + (make-acpi-madt-processor-x2apic + :x2apic-id (physical-memref-unsigned-byte-32 (+ address offset 4)) + :flags (physical-memref-unsigned-byte-32 (+ address offset 8)) + :acpi-processor-uid (physical-memref-unsigned-byte-32 (+ address offset 12)))))) (incf current) (incf offset len)))) table)) From 1b496a10fa2229b4b3d8982f93d9d0e51d8a5859 Mon Sep 17 00:00:00 2001 From: iska Date: Tue, 16 Jun 2026 00:59:51 +0500 Subject: [PATCH 42/42] supervisor: fix whitespaces --- supervisor/arm64/cpu.lisp | 12 ++++++------ supervisor/profiler.lisp | 16 ++++++++-------- supervisor/sync.lisp | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/supervisor/arm64/cpu.lisp b/supervisor/arm64/cpu.lisp index 9ba20100..9a6d5618 100644 --- a/supervisor/arm64/cpu.lisp +++ b/supervisor/arm64/cpu.lisp @@ -370,12 +370,12 @@ TLB shootdown must be protected by the VM lock." :priority :idle)) (wired-stack (%allocate-stack (* 128 1024) t)) (cpu (make-arm64-cpu :state :offline - :cpu-id cpu-id - :idle-thread idle-thread - :wired-stack wired-stack - :sp-el1 (+ (stack-base wired-stack) - (stack-size wired-stack) - -16)))) + :cpu-id cpu-id + :idle-thread idle-thread + :wired-stack wired-stack + :sp-el1 (+ (stack-base wired-stack) + (stack-size wired-stack) + -16)))) (setf (cpu-cpu-index cpu) (length *cpus*)) (setf (cpu-inhibit-scheduling cpu) 0) (setf (cpu-tlb-generation cpu) 0) diff --git a/supervisor/profiler.lisp b/supervisor/profiler.lisp index 89a2c487..fc1b5ad0 100644 --- a/supervisor/profiler.lisp +++ b/supervisor/profiler.lisp @@ -92,20 +92,20 @@ ;; RIP is valid in the save area. (profile-append-return-address (thread-state-rip thread))) (profile-append-call-stack (thread-frame-pointer thread))))) - (t - (with-rcu-read-lock - (loop - for thread = *all-threads* then (thread-global-next thread) - until (not thread) do - (when (and (not (eql (thread-state thread) :dead)) - (not (eql thread (current-thread)))) + (t + (with-rcu-read-lock + (loop + for thread = *all-threads* then (thread-global-next thread) + until (not thread) do + (when (and (not (eql (thread-state thread) :dead)) + (not (eql thread (current-thread)))) (profile-append-entry thread) (profile-append-entry (thread-state thread)) (profile-append-entry (thread-wait-item thread)) (when (thread-full-save-p thread) ;; RIP is valid in the save area. (profile-append-return-address (thread-state-rip thread))) - (profile-append-call-stack (thread-frame-pointer thread))))))) + (profile-append-call-stack (thread-frame-pointer thread))))))) (resume-other-cpus-for-debug-magic-button))) (defun start-profiling (&key buffer-size thread (reset t) (sample-during-gc t)) diff --git a/supervisor/sync.lisp b/supervisor/sync.lisp index 114adcaf..8f3df589 100644 --- a/supervisor/sync.lisp +++ b/supervisor/sync.lisp @@ -87,9 +87,9 @@ (sys.int::defglobal *lock-violations-are-fatal* t) (defstruct (mutex - (:include wait-queue) - (:constructor make-mutex (&optional name)) - (:area :wired)) + (:include wait-queue) + (:constructor make-mutex (&optional name)) + (:area :wired)) ;; Thread holding the lock, or NIL if it is free. ;; May not be correct when the lock is being acquired/released. (owner nil)