diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000..2f97a41
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,23 @@
+# Dev image for the GC BBA FPGA project: Python 3.12 plus the iCE40 open-source
+# toolchain (yosys, nextpnr-ice40, icestorm) and the Claude Code CLI.
+FROM python:3.12-slim-bookworm
+
+# OS packages, one per line in alphabetical order. The apt lists are removed in
+# the same layer so they never reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    fpga-icestorm \
+    git \
+    nextpnr-ice40 \
+    nodejs \
+    npm \
+    yosys \
+    && rm -rf /var/lib/apt/lists/*
+
+# Claude Code CLI, installed globally through the npm package above.
+RUN npm install -g @anthropic-ai/claude-code
+
+WORKDIR /workspace
+
+# Python dependencies, installed without keeping pip's download cache.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
diff --git a/.devcontainer/attach-icebreaker.ps1 b/.devcontainer/attach-icebreaker.ps1
new file mode 100644
index 0000000..722a84d
--- /dev/null
+++ b/.devcontainer/attach-icebreaker.ps1
@@ -0,0 +1,52 @@
+#Requires -RunAsAdministrator
+# Attaches the IceBreaker FPGA (FTDI FT2232H, VID 0403) to WSL2 via usbipd-win.
+# Run this on the Windows host before opening the devcontainer.
+#
+# Steps: find the single connected FTDI device, share it with 'usbipd bind'
+# (the step that needs Administrator), then attach it to WSL2.
+
+$ErrorActionPreference = 'Stop'
+
+if (-not (Get-Command usbipd -ErrorAction SilentlyContinue)) {
+    Write-Error "usbipd not found. Install it from: https://github.com/dorssel/usbipd-win/releases"
+    exit 1
+}
+
+# Match only rows of the form "<BUSID>  0403:<PID>  ...". Anchoring on the
+# VID:PID column avoids false hits on a PID, GUID or device name that merely
+# contains "0403".
+$ftdiRow = '^\s*(?<busid>\S+)\s+0403:[0-9A-Fa-f]{4}\b'
+$devices = @(usbipd list | Where-Object { $_ -match $ftdiRow })
+
+if ($devices.Count -eq 0) {
+    Write-Error "No FTDI device (VID 0403) found. Is the IceBreaker plugged in?"
+    exit 1
+}
+
+if ($devices.Count -gt 1) {
+    Write-Host "Multiple FTDI devices found:"
+    $devices | ForEach-Object { Write-Host "  $_" }
+    Write-Error "Ambiguous. Unplug other FTDI devices or run 'usbipd attach --wsl --busid <BUSID>' manually."
+    exit 1
+}
+
+# Extract BUSID (first column of the row, e.g. "3-1") from the named capture.
+$busid = [regex]::Match($devices[0], $ftdiRow).Groups['busid'].Value
+
+# A device must be shared before 'usbipd attach' will accept it. Native
+# commands do not honour $ErrorActionPreference, so check the exit code.
+Write-Host "Sharing IceBreaker at bus ID $busid..."
+usbipd bind --busid $busid
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "usbipd bind failed (exit code $LASTEXITCODE)."
+    exit 1
+}
+
+Write-Host "Attaching IceBreaker at bus ID $busid to WSL2..."
+usbipd attach --wsl --busid $busid
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "usbipd attach failed (exit code $LASTEXITCODE). Is a WSL2 distribution running?"
+    exit 1
+}
+
+Write-Host "Done. You can now open the devcontainer and use iceprog."
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..f719d95
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,33 @@
+{
+    "name": "Amaranth HDL - IceBreaker",
+    "build": {
+        "dockerfile": "Dockerfile",
+        "context": ".."
+    },
+    // USB flashing (iceprog) requires the IceBreaker to be forwarded to WSL2 first.
+    // On Windows: install usbipd-win (https://github.com/dorssel/usbipd-win/releases),
+    // then run (as Administrator) before opening this devcontainer:
+    //   .devcontainer/attach-icebreaker.ps1
+    // --privileged lets the container reach the forwarded USB device.
+    "runArgs": ["--privileged"],
+    "workspaceFolder": "/workspace",
+    "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached",
+    // Share the host's Claude Code config. HOME is normally set on Linux/macOS and
+    // USERPROFILE on Windows; the unset one expands to an empty string, so the
+    // concatenation resolves to the user's home directory on either host OS.
+    "mounts": [
+        "source=${localEnv:HOME}${localEnv:USERPROFILE}/.claude,target=/root/.claude,type=bind,consistency=cached"
+    ],
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-python.vscode-pylance",
+                "anthropic.claude-code"
+            ],
+            "settings": {
+                "python.defaultInterpreterPath": "/usr/local/bin/python"
+            }
+        }
+    }
+}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..33bf949
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+# Generated FPGA build artifacts (regenerate with: python -m exi_bba.synth)
+build/
+
+# Simulation waveforms (regenerate by running the testbenches)
+*.vcd
+
+# Python
+__pycache__/
+*.pyc
+*.pyo
+.venv/
+venv/
+
+# Editor / OS cruft
+.DS_Store
+*.swp
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..3862be6
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,493 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project: GC BBA FPGA Replacement
+
+Replace the GameCube Broadband Adapter (DOL-015 / MX98730EC) with an iCEbreaker
+FPGA (Lattice iCE40UP5K) written in Amaranth HDL. The FPGA emulates the BBA
+register interface over the GameCube EXI bus and bridges to a WIZnet ethernet
+chip for real 100BASE-TX ethernet — default **W5100** (indirect parallel bus,
+reaches the EXI throughput ceiling) or **W5500** (SPI Pmod, simpler wiring but
+~12 Mbit/s). GC software (Swiss homebrew) sees an identical BBA. See "W5100 vs
+W5500 ethernet back-end".
+
+---
+
+## Development Environment
+
+**Preferred:** Use the devcontainer (`.devcontainer/`) which includes Python 3.12,
+`nextpnr-ice40`, and `fpga-icestorm` pre-installed.
+
+**Windows host + WSL2 devcontainer — USB flashing setup:**
+1. Install `usbipd-win` (https://github.com/dorssel/usbipd-win/releases)
+2. Run `.devcontainer/attach-icebreaker.ps1` as Administrator before opening the devcontainer
+3. The devcontainer runs `--privileged` to pass through the USB device
+
+**Local venv (outside devcontainer):**
+```bash
+python -m venv .venv
+source .venv/bin/activate   # Windows: .venv\Scripts\activate
+pip install -r requirements.txt
+```
+
+Yosys is bundled in `amaranth-yosys`; `nextpnr-ice40` and `iceprog` must be
+installed separately (via apt on Linux, or via the devcontainer).
+
+---
+
+## Commands
+
+**Build and flash the iCEbreaker (must run from workspace root):**
+```bash
+python rebbarb/rebbarb.py
+```
+Runs synthesis (yosys), place-and-route (nextpnr-ice40), and flashes via `iceprog`.
+Set `ICEPROG=/path/to/iceprog` env var to override the binary location.
+Note: `rebbarb/rebbarb.py` builds a 36 MHz LED blink demo. The BBA
+implementation (`exi_bba/`) uses a split-domain clock: `capture` @ 54 MHz (PLL)
+for the SPI bit engine, `exi`/`sync` @ 24 MHz (HFOSC) for everything else.
+Synthesize/flash the real design with `python -m exi_bba.synth [--flash]`.
+
+**Run a simulation:**
+```bash
+# New-API testbench style (preferred for new code):
+python rebbarb/toggle_button.py     # writes ToggleButton.vcd
+python rebbarb/pulse_button.py      # writes PulseButton.vcd
+
+# Old-API process style (reference only, do not replicate in new code):
+python examples/amaranth_cdc.py     # CDC primitives demo
+python examples/async_fifo.py       # AsyncFIFO behaviour
+python examples/icebreaker_fifo.py  # iCEbreaker-specific FIFO (Verilog dump)
+```
+Open VCD output with `gtkwave`. Simulations are the primary testing mechanism —
+there is no separate test runner.
+
+**Verify PLL parameters:**
+```bash
+icepll -i 12 -o 54    # confirms DIVR=0 DIVF=71 DIVQ=4 → 54 MHz (capture domain)
+```
+(`exi`/`sync` come from the internal SB_HFOSC ÷2 = 24 MHz — no PLL.)
+
+---
+
+## Current Implementation State
+
+The `exi_bba/` module tree is **fully implemented** with simulation testbenches.
+All modules elaborate without errors and pass their unit tests. The full design
+**synthesizes, places, routes, and meets timing** on the iCE40UP5K
+(`python -m exi_bba.synth`): `capture` closes ~70 MHz (target 54) and `exi`/
+`sync` close ~36 MHz (target 24) — both PASS.
+
+### `exi_bba/` module status
+
+| Module | File | Tests pass |
+|---|---|---|
+| `BBATop` | `exi_bba/bba_top.py` | ✅ EXI integration + full W5100→SPRAM→GC RX loop; synth PASS |
+| `ExiCapture` | `exi_bba/exi_capture.py` | ✅ rx/tx byte-stream + over-push/flush |
+| `SPIMode3Slave` | `exi_bba/spi_mode3_slave.py` | ✅ 4 tests (live-drive TX) |
+| `BBARegisterFile` | `exi_bba/bba_register_file.py` | ✅ 7 tests (proactive push + DMA stream) |
+| `SPRAMArbiter` | `exi_bba/spram_arbiter.py` | ✅ 3 tests |
+| `RXFrameAssembler` | `exi_bba/rx_frame_assembler.py` | ✅ 3 tests |
+| `TXFrameDrain` | `exi_bba/tx_frame_drain.py` | ✅ 2 tests |
+| `W5100ParallelMaster` | `exi_bba/w5100_parallel_master.py` | ✅ 5 tests (init/TX/RX vs bus model, incl. ring wrap) — **default eth back-end** |
+| `W5500SPIMaster` | `exi_bba/w5500_spi_master.py` | ✅ init/TX/RX vs SPI-slave model (alt back-end) |
+| `StatusPanel` | `exi_bba/status_panel.py` | ✅ 6 tests (heartbeat, stretched activity LEDs, debounced buttons, freeze) |
+| `EEPROMModel` | `exi_bba/eeprom_model.py` | ✅ 4 tests |
+
+**Bring-up status panel (optional):** `BBATop(status_panel=True)` adds a
+`StatusPanel` driving onboard iCEbreaker LEDs + button (dedicated pins, so it
+coexists with EXI + W5100). `synth.py` enables it: **LEDG=heartbeat**,
+**LEDR=EXI activity** (the GC is talking), **BTN_N=manual re-init**. The full
+EXI + W5100 + panel build synthesizes and meets timing (slow ~35≥24, capture
+~64≥54, 44% LC). Panel LEDs 3–5 (rx/tx/ready) exist in the module but aren't
+mapped on the iCEbreaker (only 2 discrete LEDs); the onboard RGB or a custom
+PCB can expose them.
+
+**Ethernet back-end is selectable:** `BBATop(eth="w5100")` (default — indirect
+parallel bus, reaches the ~27 Mbit/s EXI ceiling) or `BBATop(eth="w5500")` (SPI,
+~12 Mbit/s). Both masters expose the identical tx/rx/init/par streaming
+interface; only the physical pins differ. See "W5100 vs W5500" below.
+
+### Run all module testbenches (from workspace root)
+```bash
+python -m exi_bba.spi_mode3_slave
+python -m exi_bba.exi_capture
+python -m exi_bba.bba_register_file
+python -m exi_bba.spram_arbiter
+python -m exi_bba.rx_frame_assembler
+python -m exi_bba.tx_frame_drain
+python -m exi_bba.w5100_parallel_master   # 5 tests: init, TX(+wrap), RX(+wrap)
+python -m exi_bba.w5500_spi_master
+python -m exi_bba.status_panel            # 6 tests: heartbeat/activity/buttons
+python -m exi_bba.eeprom_model
+python -m exi_bba.bba_top        # end-to-end EXI integration test (W5100 RX loop)
+```
+
+### Pending work
+- **Synthesis/timing**: ✅ done — `python -m exi_bba.synth` synthesizes, P&Rs,
+  and meets timing on both clock domains (capture ~68≥54, slow ~40≥24).
+- **W5500 init/TX/RX**: ✅ done — `W5500SPIMaster` has a real Mode-0 byte engine,
+  a generic register-transaction engine (header + wbuf/stream payload), the full
+  init sequence (MR reset, SHAR, S0_MR MACRAW, S0_CR OPEN, S0_IMR), MACRAW TX
+  (read TX_WR → stream frame to TX buffer → advance TX_WR → SEND) and MACRAW RX
+  (RSR → RD → 2-byte length → stream frame out → advance RD → RECV). All verified
+  on the wire by a responding W5500 SPI-slave model in the testbench.
+- **PAR0–5 → W5500 SHAR**: ✅ done — `reg.par` wired to `w5500.par` in `BBATop`
+  (PAR0 packed in the low byte so it is the first SHAR octet).
+- **NCRA SR bit**: ✅ done — `BBARegisterFile.ncra_sr` (= NCRA[3]) gates
+  `asm.rx_enabled` in `BBATop` (was hard-wired to 1).
+- **W5500 SPI throughput**: SCK = sync÷2 = 12 MHz (~12 Mbit/s) — exceeds
+  real-world GC BBA TCP throughput (~6–10 Mbit/s) but is below the 27 Mbit/s raw
+  EXI ceiling. Pushing past 12 Mbit/s was investigated and found NOT achievable
+  on this UP5K (the W5500-operating logic is distributed ~40 MHz, not just the
+  bit-bang) — see the "Full-rate W5500 SPI" item below.
+  `W5500SPIMaster(clk_div=N)` divides SCK further if signal integrity needs it.
+- **EXI DMA bulk reads**: ✅ done — SPRAM-region reads (addr ≥ 0x100) now STREAM
+  until CS deasserts instead of stopping at the header's 2-bit length, so they
+  serve both ≤4-byte immediate reads (Swiss) AND arbitrary-length DMA reads
+  (other GC software, and a future Swiss path for loading ROMs from a network
+  file store). Implementation:
+    - `SPIMode3Slave.cs_active` (synchronised CS level) → `ExiCapture` crosses it
+      to the exi domain (FFSynchronizer) → `BBARegisterFile.cs_active`.
+    - `BBARegisterFile` SPRAM_STREAM state: auto-increments the SPRAM address,
+      prefetches up to SP_LIMIT=4 reads in flight, pushes responses to tx_fifo;
+      SPRAM_END drains the in-flight pipeline + rx dummies on CS-rise.
+    - `ExiCapture` flushes tx_fifo on CS-fall to clear prefetch over-push so a
+      truncated DMA read can't leak stale bytes into the next transaction.
+    Tested: register-file streaming read (SPRAM model, 12 bytes), ExiCapture
+    over-push/flush, AND the full BBATop loop — a W5100 bus model delivers a
+    frame → W5100 master RX → RXFrameAssembler writes the SPRAM ring → GC reads
+    RWP then DMA-reads the descriptor+frame back (verified byte-for-byte).
+    Note: a DMA read header must keep length-1 within the 2-bit field; the GC
+    driver sets it ≤3 and clocks the real length via CS (the design streams
+    until CS regardless). (EXI DMA *writes* are not implemented; the GC's
+    DMA-write engine has a 1-bit-shift bug and Swiss avoids them — see
+    design-doc §"EXI DMA bug".)
+- **S0_IR interrupt clear after RX**: ✅ done — `W5500SPIMaster` RX_CLR_IR state
+  writes Sn_IR[2]=1 after RECV so `INT_N` deasserts (else the FSM would re-enter
+  RX_CHECK forever on real hardware).
+- **Full-rate W5500 SPI (27 Mbit/s) — INVESTIGATED, NOT achievable on UP5K**:
+  the W5500 SCK is sync÷2 = 12 MHz. Raising it needs the SPI engine on a ≥54 MHz
+  clock, but a standalone synth of `W5500SPIMaster` in the capture domain closes
+  only **40 MHz** — and the slack histogram shows the failure is *distributed*
+  (~140 endpoints fail 54, incl. the `wbuf`/header mux feeding the shift
+  register), NOT a single cuttable path. So the bottleneck is the **logic that
+  operates the SPI device** (transaction FSM, byte sourcing), not the bit-bang.
+  Consequences:
+    - The "split the bit engine to capture + per-byte CDC handshake" idea nets
+      only ~14 Mbit/s — the CDC round-trip ≈ the SPI byte time — not worth it.
+    - A capture-domain "streaming executor" would still contain that distributed
+      ~40 MHz logic, so it wouldn't close 54 either.
+    - Hardware `SB_SPI` wouldn't help (it only offloads the bit-bang, which was
+      never the bottleneck) and is unsimulatable.
+    - There is no usable clock between 24 (HFOSC) and 54 (the one PLL, needed at
+      54 for the EXI front-end); PLL÷2 = 27 → SCK 13.5 MHz, a ~12% gain, not
+      worth the fabric divider.
+  Net: 12 Mbit/s is the practical W5500 ceiling on this part. It exceeds
+  real-world GC BBA TCP throughput and is fine for chunked ROM streaming.
+  Reaching 27 Mbit/s would need a faster FPGA or a much shallower W5500-operating
+  redesign (uncertain) — **OR a parallel-bus ethernet chip (see W5100 below)**,
+  which is the implemented solution for the ROM-streaming throughput target.
+
+## W5100 vs W5500 ethernet back-end
+
+The throughput insight: SPI serialises 8 bits/byte at SCK = clock÷2, so the W5500
+byte rate is (operating-logic clock)/16. That logic closes only ~40 MHz on this
+UP5K, so it runs on the 24 MHz `sync` clock → 1.5 MB/s ≈ 12 Mbit/s. A **parallel**
+bus moves a whole byte per access, so the *same* 24 MHz `sync` logic clears the
+27 Mbit/s EXI ceiling (the real hard limit — the GC EXI bus tops out there). So
+`W5100ParallelMaster` is the throughput path and is now the `BBATop` default.
+
+- **Interface:** W5100 **indirect parallel bus** (IDM). Only A[1:0] are wired
+  (board ties A[14:2]=0 so a power-up direct access at A=00 still hits MR):
+  `00`=MR, `01`=IDM_AR0(hi), `10`=IDM_AR1(lo), `11`=IDM_DR. A register/buffer
+  access = write IDM_AR (the 16-bit address) then read/write IDM_DR. With MR.AI
+  set, IDM_DR auto-increments → a multi-byte block is one address-set + a burst.
+- **Bus engine:** drives A + D with `/CS` and `/RD`|`/WR` asserted for
+  `strobe_cycles` (default 3 ≈ 125 ns at 24 MHz, ≥ the W5100's ~80 ns access).
+  DATA[7:0] is bidirectional → an SB_IO tristate (`bus_data_o`/`oe`/`i`).
+- **Pins (15):** A[1:0]=2, D[7:0]=8, /CS,/RD,/WR=3, /INT=1, /RST=1. With EXI (5)
+  + clk (1) = **21 of ~34 usable SG48 I/O** — comfortable. See `synth.py`.
+- **MR.AI requires init first:** unlike the W5500 (each SPI transaction is
+  self-framed), the W5100's multi-byte accesses depend on MR.AI, so the init
+  sequence (triggered by the GC's NCRA reset) MUST run before any TX/RX. The
+  BBATop test issues NCRA-reset before its RX loop for this reason; on hardware
+  the GC driver already does. (`BBATop(reset_cycles=N)` shrinks the MR settle
+  wait for sim.)
+- **Ring wraparound is in fabric:** the W5100 does NOT auto-wrap the IDM address
+  at the socket-buffer boundary (the W5500 did), so the streamer re-sets IDM_AR
+  to the buffer base when the running address reaches the 2 KB boundary. Handled
+  in the SW/SR/RB paths (`xfer_wrap`/`xfer_wbase`/`xfer_wend`/`cur_addr`); both
+  TX and RX wrap cases are tested.
+- **Register map differs from the W5500:** common regs at 0x0000 (MR, SHAR 0x09,
+  IMR 0x16, RMSR/TMSR 0x1A/0x1B), socket 0 at 0x0400 (S0_MR/CR/IR, TX_WR 0x424,
+  RX_RSR 0x426, RX_RD 0x428), TX buffer 0x4000, RX buffer 0x6000. MACRAW mode.
+- **Status:** init/TX/RX (with wrap) verified vs a bus model; BBATop full
+  W5100→SPRAM→GC RX loop passes byte-for-byte; synth PASS (slow ~32≥24, capture
+  ~56≥54, 42% LC). Register addresses/MR bits are from the datasheet (from
+  memory) — **confirm at hardware bring-up**.
+
+### `rebbarb/` — LED blink demo (unchanged)
+- `rebbarb.py` — blinks LEDs via a PLL (36 MHz), demonstrates `IceBreakerPlatform`
+- `debouncer.py` — `Debouncer(cycles)` — synchronous debounce, configurable hold
+- `toggle_button.py` — `ToggleButton` — edge-to-toggle state machine (wraps Debouncer)
+- `pulse_button.py` — `PulseButton` — single-cycle pulse on rising edge (wraps Debouncer)
+
+These components are reusable building blocks. The `Debouncer` and button wrappers
+will be needed for any physical input in `exi_bba/`.
+
+**Import note:** `rebbarb/` files use bare imports (`from debouncer import Debouncer`).
+Run them as `python rebbarb/<file>.py` from the workspace root so Python adds
+`rebbarb/` to `sys.path` automatically.
+
+**Simulation at module level:** `toggle_button.py` and `pulse_button.py` run
+their simulations unconditionally (no `__main__` guard) — importing either file
+triggers a VCD write. New modules should guard simulation code with
+`if __name__ == "__main__":`.
+
+`examples/amaranth_cdc.py` contains handwritten `SyncFF` and `TogglePulseSync`
+reference implementations — use `amaranth.lib.cdc` primitives (`FFSynchronizer`,
+`PulseSynchronizer`) in production code instead.
+
+`hardware/sp1_test_plug/` — KiCad project for a physical SP1 edge-connector test
+plug (schematic, PCB, custom GameCube symbol library). Used to verify pad geometry
+before ordering the interposer PCB; not part of the FPGA build.
+
+---
+
+## Amaranth Simulator API
+
+Two API generations are present in this repo:
+
+| API | Where used | Status |
+|---|---|---|
+| `sim.add_testbench(async_fn)` + `await ctx.tick()` + `Period(MHz=n)` | `rebbarb/*.py` | **Use this for new code** |
+| `sim.add_sync_process(gen_fn)` + `sim.run_until(t)` | `examples/` | Old — reference only |
+
+New modules should use the testbench API (`add_testbench`, `sim.write_vcd(ctx)`
+context manager). The old process API still works but is not idiomatic in current
+Amaranth.
+
+**Critical testbench timing rule:** `ctx.get(signal)` reads signal values AFTER
+the clock edge (post-update registered values). Combinatorial signals that depend
+on registered signals that were updated by the SAME tick will already reflect the
+new registered values. For example: if `tx_sof = tx_bytes_r_rdy & is_first` and
+`is_first` is cleared synchronously on the first byte, then reading `tx_sof` after
+the first byte's tick always returns 0 — read BEFORE the tick instead.
+
+**`ctx.set()` takes effect immediately** (combinatorial, not registered). Use it
+AFTER `await ctx.tick()` to prepare inputs for the NEXT tick.
+
+The full design specification lives in `docs/gc_bba_fpga_design.md`.
+
+---
+
+## Key Architecture Decisions
+
+- **No network stack in the FPGA.** The GC CPU runs TCP/IP. The FPGA is a dumb
+  MAC bridge.
+- **Split-domain clocking — 3 domains, 2 sources (1 PLL + 1 HFOSC):**
+  - `capture` — 54 MHz (PLL, DIVR=0 DIVF=71 DIVQ=4). Hosts ONLY the SPI Mode 3
+    bit engine inside `ExiCapture`. 54 MHz = 2× the **real 27 MHz** EXI clock —
+    the minimum oversampling for clean Mode 3. The isolated bit engine closes
+    ~91 MHz; integrated with the byte-FIFO read path the capture domain closes
+    ~62 MHz, so 54 passes with margin.
+  - `exi` — 24 MHz (HFOSC ÷2). BBA register file / transaction FSM.
+  - `sync` — 24 MHz (same HFOSC net as `exi`). SPRAM arbiter, RX/TX engines,
+    ethernet master (`W5100ParallelMaster` by default, or `W5500SPIMaster`).
+  - **Why split:** only the tiny SPI bit engine needs a fast clock to sample
+    27 MHz EXI. The bulky register-file/SPRAM/W5500 logic is routing-bound at
+    ~33–44 MHz on the UP5K and only needs the byte rate (27 MHz ÷ 8 ≈ 3.4 MHz).
+    `ExiCapture` bridges capture↔exi with rx/tx byte AsyncFIFOs.
+  - **EXI clock reality:** the GC EXI clock tops out at ~27 MHz. libogc's
+    `EXI_SPEED32MHZ` is a nominal name — the real rate is 27 MHz. The old
+    "96 MHz = 3× 32 MHz EXI" target was doubly wrong and unreachable on UP5K
+    (which caps ~44 MHz for non-trivial logic).
+  - **TX/MISO across the split:** the register file PROACTIVELY pushes read
+    responses into the tx byte FIFO during the EXI clock-idle gap (the GC pauses
+    the clock between an EXI_Imm header-write and the data-read). The bit engine
+    drives MISO live from the FIFO head; see `ExiCapture` / `SPIMode3Slave`.
+- **All CDC via `amaranth.lib.cdc`.** Never pass raw multi-bit signals across
+  domains. Use `FFSynchronizer` for slow single bits, `PulseSynchronizer` for
+  events, `AsyncFIFO` for data streams, `ResetSynchronizer` for resets.
+- **Register file lives entirely in `exi` domain.** The `sync` domain only
+  communicates through AsyncFIFOs and PulseSynchronizers — never direct register
+  reads/writes.
+
+---
+
+## Critical Protocol Notes
+
+### EXI / SPI Mode 3
+- CLK idles **HIGH** (CPOL=1, CPHA=1).
+- MOSI sampled on **falling** CLK edge. MISO driven on **rising** CLK edge.
+- Getting this wrong means the GC never enumerates the device.
+- CS is active **low**, delineates each transaction.
+
+### EXI Transaction Header (2 bytes before data)
+```
+Byte 0: [7]=write_flag  [6:0]=addr[12:6]
+Byte 1: [7:2]=addr[5:0] [1:0]=xfer_len-1  (0=1B … 3=4B)
+```
+Full address = 13 bits → 0x0000–0x1FFF.
+
+### Device ID Query
+On power-on the GC writes `0x0000` (2 bytes) then reads 4 bytes.
+Must return: `0x04 0x02 0x02 0x00`.
+
+---
+
+## Memory Map (abridged)
+
+| Range | Region |
+|---|---|
+| 0x0000–0x0033 | MAC control registers (register file, exi domain) |
+| 0x0048 | TXDATA — bulk TX data port (→ `tx_bytes` AsyncFIFO) |
+| 0x0100–0x0FFF | RX ring buffer in SPRAM (15 × 256-byte pages, pages 1–15) |
+| 0x0100–0x1FFF | any read ≥ 0x0100 streams from SPRAM (DMA path); the ring proper is pages 1–15 above |
+
+---
+
+## Key Registers
+
+| Addr | Name | Notes |
+|---|---|---|
+| 0x00 | NCRA | [0]=RESET self-clears; pulses `ncra_rst` to sync domain |
+| 0x08 | IMR | Interrupt mask |
+| 0x09 | IR | Write-1-to-clear. [1]=RI, [2]=TI. INT_N asserts when IR & IMR ≠ 0 |
+| 0x16–17 | RWP | RX write pointer — updated by sync domain via `rx_wptr` FIFO |
+| 0x18–19 | RRP | RX read pointer — GC writes after consuming frames |
+| 0x20–25 | PAR0–5 | MAC address; also forwarded to the ethernet back-end (W5100/W5500) as SHAR |
+| 0x31 | NWAYS | Hardcode **0x17** (100M full-duplex link up, autoneg complete) |
+| 0x3A | HIPR | Hardcode **0x01** (BBA present) |
+| 0x48 | TXDATA | GC streams TX frame bytes here |
+
+---
+
+## Module Breakdown
+
+| Module | Domain | File |
+|---|---|---|
+| `BBATop` | all | `exi_bba/bba_top.py` |
+| `ExiCapture` | capture (+exi FIFOs) | `exi_bba/exi_capture.py` |
+| `SPIMode3Slave` | capture (param `domain`) | `exi_bba/spi_mode3_slave.py` |
+| `BBARegisterFile` | exi (+FIFO to sync) | `exi_bba/bba_register_file.py` |
+| `SPRAMArbiter` | sync | `exi_bba/spram_arbiter.py` |
+| `RXFrameAssembler` | sync | `exi_bba/rx_frame_assembler.py` |
+| `TXFrameDrain` | sync | `exi_bba/tx_frame_drain.py` |
+| `W5100ParallelMaster` | sync | `exi_bba/w5100_parallel_master.py` (default eth) |
+| `W5500SPIMaster` | sync | `exi_bba/w5500_spi_master.py` (alt eth) |
+| `EEPROMModel` | exi | `exi_bba/eeprom_model.py` |
+
+`ExiCapture` wraps `SPIMode3Slave` (in the fast `capture` domain) plus the
+capture↔exi rx/tx byte AsyncFIFOs. `BBARegisterFile` consumes the rx byte
+stream and proactively pushes read responses into the tx byte FIFO — it no
+longer sees the per-bit SPI cadence (that lives entirely in `capture`).
+
+---
+
+## CDC Signal Inventory
+
+| Signal | Direction | Primitive |
+|---|---|---|
+| EXI CLK / MOSI / CS pins | async → capture | `FFSynchronizer` (stages=2) |
+| RX byte stream (capture→core) | capture → exi | `AsyncFIFO` 8-bit, depth=4 |
+| TX byte stream (core→capture) | exi → capture | `AsyncFIFO` 8-bit, depth=2 |
+| cs_active (transaction in progress) | capture → exi | `FFSynchronizer` (DMA read length) |
+| SPRAM read request (addr) | exi → sync | `AsyncFIFO` 16-bit, depth=4 |
+| SPRAM read result (data) | sync → exi | `AsyncFIFO` 8-bit, depth=4 |
+| TX packet bytes | exi → sync | `AsyncFIFO` 8-bit, depth=16 |
+| TX frame length | exi → sync | `AsyncFIFO` 16-bit, depth=4 |
+| RX frame bytes | sync → SPRAM | `RXFrameAssembler` → `SPRAMArbiter` (not a byte FIFO; the GC reads frames back out of SPRAM via the SPRAM read req/rsp FIFOs) |
+| RWP update | sync → exi | `AsyncFIFO` 8-bit, depth=4 |
+| RRP update | exi → sync | `AsyncFIFO` 8-bit, depth=4 |
+| RX ready (IR[RI]) | sync → exi | `PulseSynchronizer` |
+| TX done (IR[TI]) | sync → exi | `PulseSynchronizer` |
+| NCRA reset pulse | exi → sync | `PulseSynchronizer` |
+
+---
+
+## W5500 Configuration (on NCRA reset)
+
+The W5500 selects the register **block** via the BSB field of the control byte,
+NOT via the address — so register addresses below are **block offsets**, not flat
+0x4000-style addresses (see `_W5500_*` and `_CTRL_*` in `w5500_spi_master.py`).
+```
+1. Write MR     = 0x80   (common block, offset 0x0000)  software reset
+2. Wait ~1 ms
+3. Write SHAR   = MAC     (common block, offset 0x0009, 6 bytes from PAR0–5)
+4. Write S0_MR  = 0x04    (socket-0 reg block, offset 0x0000)  MACRAW
+5. Write S0_CR  = 0x01    (socket-0 reg block, offset 0x0001)  OPEN
+6. Write S0_IMR = 0x05    (socket-0 reg block, offset 0x002C)  RECV | SEND_OK  (TODO verify mask: datasheet Sn_IR has RECV=0x04, SEND_OK=0x10, CON=0x01)
+```
+
+W5500 SPI is **Mode 0** (CPOL=0 CPHA=0); SCK = **12 MHz** (the 24 MHz `sync`
+domain ÷ 2 via a toggle clock-enable). Connect W5500 `INT_N` to an FPGA input
+for low-latency RX detection. (The W5500 is the alternate back-end; the W5100
+parallel master is the default — see "W5100 vs W5500".)
+
+---
+
+## Physical Interface (SP1 Edge Connector)
+
+- PCB must be **1.2 mm thick, ENIG finish**.
+- Staggered (not mirrored) top/bottom contact rows — same geometry as PCI/ISA.
+- Derive exact pad geometry from **SP1ETH KiCad project** (silverstee1/SP1ETH),
+  cross-referenced with ETH2SP1 (LaserBear). Do not rely on YAGCD alone.
+- Add **100 µF bulk cap** on the interposer near FPGA power pins (3.3 V budget
+  is tight: iCEbreaker ~80 mA + W5500 ~150 mA ≈ 230 mA).
+- **Pin 5 is 12 V — do not connect to FPGA I/O.** Test point or leave open.
+- `EXTIN` (pin 1): tie to 3.3 V via 10 kΩ — required for GC device enumeration.
+- All signal levels are 3.3 V. No level shifting needed.
+
+---
+
+## SPRAM Notes
+
+- iCE40UP5K has 128 KB SPRAM (SB_SPRAM256KA, 16-bit wide).
+- **1-cycle synchronous read latency** — result of read at cycle N is valid at N+1.
+- Byte writes via `MASKWREN`: lower byte = `0b0011`, upper byte = `0b1100`.
+- Address to SPRAM = byte_address >> 1.
+- ETH writes take priority over EXI reads in the arbiter (safe by ring-buffer
+  invariant: GC only reads pages the ETH engine has already finished).
+
+---
+
+## GC Initialisation Sequence (Swiss/BBA driver)
+
+```
+1.  Write 0x0000 (2 bytes), read 4 B → must get 0x04020200 (device ID)
+2.  Write NCRA = 0x01            (reset, self-clears; resets W5500 + SPRAM ptrs)
+3.  Poll NCRA bit 0 until 0      (wait reset complete)
+4.  Write PAR0–5                 (MAC address)
+5.  Write MAR0–7 = 0xFF          (promiscuous multicast)
+6.  Write ANALOG = 0xD6          (enable PHY — no FPGA effect, just store)
+7.  Write NWAYC                  (autoneg config — store only)
+8.  Write IMR = 0x86             (enable RBFI | TI | RI interrupts)
+9.  Write GCA (AUTOPUB bit)
+10. Write NCRA SR bit = 0x08     (start receive)
+11. Poll NWAYS until link up     → return hardcoded 0x17 immediately
+```
+
+---
+
+## Implementation Notes & Gotchas
+
+- **`NWAYS` must return `0x17` always.** GC polls it to confirm 100 Mbps link
+  before enabling RX. Do not attempt to reflect real W5500 link status.
+- **`EEPROMModel` can be stubbed initially.** Many GC BBA drivers write their own
+  MAC to PAR0–5 rather than using the EEPROM. Pre-populate PAR0–5 reset state
+  with a valid Nintendo OUI MAC (`00:09:BF:xx:xx:xx`).
+- **`tx_load` timing in `SPIMode3Slave`:** pulses at CS assertion (first byte)
+  and after each complete received byte. Upstream must register next TX byte
+  within one `exi` clock.
+- **PLL target 54 MHz**: verify with `icepll -i 12 -o 54` (DIVR=0 DIVF=71 DIVQ=4)
+  before coding PLL parameters; the capture-domain bit engine oversamples the
+  27 MHz EXI clock 2×.
+- **TX buffer selection (NCRA ST bits):** Ignore buffer select (ST1 vs ST0).
+  Treat any non-zero ST as a TX trigger.
+- **If nextpnr fails capture-domain timing at 54 MHz:** the isolated bit engine
+  closes ~91 MHz, so 54 has margin; if a seed fails, sweep seeds
+  (`synth.py --seeds N`) or instruct users to configure Swiss to a lower EXI
+  clock index.
diff --git a/PulseButton.vcd b/PulseButton.vcd
deleted file mode 100644
index 42710e4..0000000
--- a/PulseButton.vcd
+++ /dev/null
@@ -1,195 +0,0 @@
-$comment Generated by Amaranth $end
-$date 2025-09-20 22:27:02.816595 $end
-$timescale 1 fs $end
-$scope module bench $end
-$scope module top $end
-$var wire 1 ! clk $end
-$var wire 1 " rst $end
-$var wire 1 # i $end
-$var wire 1 $ i$3 $end
-$var wire 14 % counter $end
-$var wire 1 & o $end
-$var wire 1 ' o$6 $end
-$var wire 1 ( last_seen $end
-$scope module U$0 $end
-$var wire 1 ! clk $end
-$var wire 1 " rst $end
-$var wire 1 # i $end
-$var wire 1 ' o $end
-$var wire 1 ) prevInValid $end
-$var wire 14 * count $end
-$var wire 1 + state $end
-$var wire 1 , prevIn $end
-$upscope $end
-$upscope $end
-$upscope $end
-$enddefinitions $end
-#0
-$dumpvars
-0!
-0"
-0#
-0$
-b0 %
-0&
-0'
-0(
-0)
-b10011100010000 *
-0+
-0,
-$end
-#500000000
-1!
-1)
-b0 *
-#1000000000
-0!
-#1500000000
-1!
-1$
-1#
-#2000000000
-0!
-#2500000000
-1!
-1+
-1,
-b10011100010000 *
-1'
-#3000000000
-0!
-#3500000000
-1!
-1&
-1(
-#4000000000
-0!
-#4500000000
-1!
-0&
-b10011100010000 %
-#5000000000
-0!
-#5500000000
-1!
-b10011100001111 %
-#6000000000
-0!
-#6500000000
-1!
-b10011100001110 %
-0$
-0#
-#7000000000
-0!
-#7500000000
-1!
-0,
-b10011100001111 *
-b10011100001101 %
-#8000000000
-0!
-#8500000000
-1!
-b10011100001110 *
-b10011100001100 %
-#9000000000
-0!
-#9500000000
-1!
-b10011100001101 *
-b10011100001011 %
-#10000000000
-0!
-#10500000000
-1!
-b10011100001100 *
-b10011100001010 %
-#11000000000
-0!
-#11500000000
-1!
-b10011100001011 *
-b10011100001001 %
-1$
-1#
-#12000000000
-0!
-#12500000000
-1!
-1,
-b10011100010000 *
-b10011100001000 %
-#13000000000
-0!
-#13500000000
-1!
-b10011100000111 %
-#14000000000
-0!
-#14500000000
-1!
-b10011100000110 %
-#15000000000
-0!
-#15500000000
-1!
-b10011100000101 %
-#16000000000
-0!
-#16500000000
-1!
-b10011100000100 %
-0$
-0#
-#17000000000
-0!
-#17500000000
-1!
-0,
-b10011100001111 *
-b10011100000011 %
-#18000000000
-0!
-#18500000000
-1!
-b10011100001110 *
-b10011100000010 %
-#19000000000
-0!
-#19500000000
-1!
-b10011100001101 *
-b10011100000001 %
-#20000000000
-0!
-#20500000000
-1!
-b10011100001100 *
-b10011100000000 %
-#21000000000
-0!
-#21500000000
-1!
-b10011100001011 *
-b10011011111111 %
-#22000000000
-0!
-#22500000000
-1!
-b10011100001010 *
-b10011011111110 %
-#23000000000
-0!
-#23500000000
-1!
-b10011100001001 *
-b10011011111101 %
-#24000000000
-0!
-#24500000000
-1!
-b10011100001000 *
-b10011011111100 %
-#25000000000
diff --git a/ToggleButton.vcd b/ToggleButton.vcd
deleted file mode 100644
index da8a43c..0000000
--- a/ToggleButton.vcd
+++ /dev/null
@@ -1,171 +0,0 @@
-$comment Generated by Amaranth $end
-$date 2025-09-20 22:27:02.809849 $end
-$timescale 1 fs $end
-$scope module bench $end
-$scope module top $end
-$var wire 1 ! clk $end
-$var wire 1 " rst $end
-$var wire 1 # i $end
-$var wire 1 $ i$3 $end
-$var wire 1 % o $end
-$var wire 1 & last_seen $end
-$var wire 1 ' o$6 $end
-$scope module U$0 $end
-$var wire 1 ! clk $end
-$var wire 1 " rst $end
-$var wire 1 # i $end
-$var wire 1 % o $end
-$var wire 1 ( prevInValid $end
-$var wire 14 ) count $end
-$var wire 1 * state $end
-$var wire 1 + prevIn $end
-$upscope $end
-$upscope $end
-$upscope $end
-$enddefinitions $end
-#0
-$dumpvars
-0!
-0"
-0#
-0$
-0%
-0&
-0'
-0(
-b10011100010000 )
-0*
-0+
-$end
-#500000000
-1!
-b0 )
-1(
-#1000000000
-0!
-#1500000000
-1!
-1$
-1#
-#2000000000
-0!
-#2500000000
-1!
-b10011100010000 )
-1*
-1+
-1%
-#3000000000
-0!
-#3500000000
-1!
-1&
-1'
-#4000000000
-0!
-#4500000000
-1!
-#5000000000
-0!
-#5500000000
-1!
-#6000000000
-0!
-#6500000000
-1!
-0$
-0#
-#7000000000
-0!
-#7500000000
-1!
-b10011100001111 )
-0+
-#8000000000
-0!
-#8500000000
-1!
-b10011100001110 )
-#9000000000
-0!
-#9500000000
-1!
-b10011100001101 )
-#10000000000
-0!
-#10500000000
-1!
-b10011100001100 )
-#11000000000
-0!
-#11500000000
-1!
-b10011100001011 )
-1$
-1#
-#12000000000
-0!
-#12500000000
-1!
-b10011100010000 )
-1+
-#13000000000
-0!
-#13500000000
-1!
-#14000000000
-0!
-#14500000000
-1!
-#15000000000
-0!
-#15500000000
-1!
-#16000000000
-0!
-#16500000000
-1!
-0$
-0#
-#17000000000
-0!
-#17500000000
-1!
-b10011100001111 )
-0+
-#18000000000
-0!
-#18500000000
-1!
-b10011100001110 )
-#19000000000
-0!
-#19500000000
-1!
-b10011100001101 )
-#20000000000
-0!
-#20500000000
-1!
-b10011100001100 )
-#21000000000
-0!
-#21500000000
-1!
-b10011100001011 )
-#22000000000
-0!
-#22500000000
-1!
-b10011100001010 )
-#23000000000
-0!
-#23500000000
-1!
-b10011100001001 )
-#24000000000
-0!
-#24500000000
-1!
-b10011100001000 )
-#25000000000
diff --git a/docs/.obsidian/.obsidian/app.json b/docs/.obsidian/.obsidian/app.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/docs/.obsidian/.obsidian/app.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/docs/.obsidian/.obsidian/appearance.json b/docs/.obsidian/.obsidian/appearance.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/docs/.obsidian/.obsidian/appearance.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/docs/.obsidian/.obsidian/core-plugins.json b/docs/.obsidian/.obsidian/core-plugins.json
new file mode 100644
index 0000000..0faa60d
--- /dev/null
+++ b/docs/.obsidian/.obsidian/core-plugins.json
@@ -0,0 +1,33 @@
+{
+  "file-explorer": true,
+  "global-search": true,
+  "switcher": true,
+  "graph": true,
+  "backlink": true,
+  "canvas": true,
+  "outgoing-link": true,
+  "tag-pane": true,
+  "footnotes": false,
+  "properties": false,
+  "page-preview": true,
+  "daily-notes": true,
+  "templates": true,
+  "note-composer": true,
+  "command-palette": true,
+  "slash-command": false,
+  "editor-status": true,
+  "bookmarks": true,
+  "markdown-importer": false,
+  "zk-prefixer": false,
+  "random-note": false,
+  "outline": true,
+  "word-count": true,
+  "slides": false,
+  "audio-recorder": false,
+  "workspaces": false,
+  "file-recovery": true,
+  "publish": false,
+  "sync": true,
+  "bases": true,
+  "webviewer": false
+}
\ No newline at end of file
diff --git a/docs/.obsidian/.obsidian/workspace.json b/docs/.obsidian/.obsidian/workspace.json
new file mode 100644
index 0000000..41369cc
--- /dev/null
+++ b/docs/.obsidian/.obsidian/workspace.json
@@ -0,0 +1,167 @@
+{
+  "main": {
+    "id": "6eef6b982305e97c",
+    "type": "split",
+    "children": [
+      {
+        "id": "ef28aa54abb02b7c",
+        "type": "tabs",
+        "children": [
+          {
+            "id": "dd2aafdfa4873c3e",
+            "type": "leaf",
+            "state": {
+              "type": "empty",
+              "state": {},
+              "icon": "lucide-file",
+              "title": "New tab"
+            }
+          }
+        ]
+      }
+    ],
+    "direction": "vertical"
+  },
+  "left": {
+    "id": "7dcb0dd958c47669",
+    "type": "split",
+    "children": [
+      {
+        "id": "5addbd6c8b989a49",
+        "type": "tabs",
+        "children": [
+          {
+            "id": "10f89da0d72538c0",
+            "type": "leaf",
+            "state": {
+              "type": "file-explorer",
+              "state": {
+                "sortOrder": "alphabetical",
+                "autoReveal": false
+              },
+              "icon": "lucide-folder-closed",
+              "title": "Files"
+            }
+          },
+          {
+            "id": "476834a62536c756",
+            "type": "leaf",
+            "state": {
+              "type": "search",
+              "state": {
+                "query": "",
+                "matchingCase": false,
+                "explainSearch": false,
+                "collapseAll": false,
+                "extraContext": false,
+                "sortOrder": "alphabetical"
+              },
+              "icon": "lucide-search",
+              "title": "Search"
+            }
+          },
+          {
+            "id": "ce54c42efc557a72",
+            "type": "leaf",
+            "state": {
+              "type": "bookmarks",
+              "state": {},
+              "icon": "lucide-bookmark",
+              "title": "Bookmarks"
+            }
+          }
+        ]
+      }
+    ],
+    "direction": "horizontal",
+    "width": 300
+  },
+  "right": {
+    "id": "87b1d8f1ca08108d",
+    "type": "split",
+    "children": [
+      {
+        "id": "69cbc257ba71f388",
+        "type": "tabs",
+        "children": [
+          {
+            "id": "739632e6a61f8d8e",
+            "type": "leaf",
+            "state": {
+              "type": "backlink",
+              "state": {
+                "collapseAll": false,
+                "extraContext": false,
+                "sortOrder": "alphabetical",
+                "showSearch": false,
+                "searchQuery": "",
+                "backlinkCollapsed": false,
+                "unlinkedCollapsed": true
+              },
+              "icon": "links-coming-in",
+              "title": "Backlinks"
+            }
+          },
+          {
+            "id": "e20c6e67aeb6eacb",
+            "type": "leaf",
+            "state": {
+              "type": "outgoing-link",
+              "state": {
+                "linksCollapsed": false,
+                "unlinkedCollapsed": true
+              },
+              "icon": "links-going-out",
+              "title": "Outgoing links"
+            }
+          },
+          {
+            "id": "858ad7c8f3ac4d90",
+            "type": "leaf",
+            "state": {
+              "type": "tag",
+              "state": {
+                "sortOrder": "frequency",
+                "useHierarchy": true,
+                "showSearch": false,
+                "searchQuery": ""
+              },
+              "icon": "lucide-tags",
+              "title": "Tags"
+            }
+          },
+          {
+            "id": "661ea018f1aa1171",
+            "type": "leaf",
+            "state": {
+              "type": "outline",
+              "state": {
+                "followCursor": false,
+                "showSearch": false,
+                "searchQuery": ""
+              },
+              "icon": "lucide-list",
+              "title": "Outline"
+            }
+          }
+        ]
+      }
+    ],
+    "direction": "horizontal",
+    "width": 300,
+    "collapsed": true
+  },
+  "left-ribbon": {
+    "hiddenItems": {
+      "switcher:Open quick switcher": false,
+      "graph:Open graph view": false,
+      "canvas:Create new canvas": false,
+      "daily-notes:Open today's daily note": false,
+      "templates:Insert template": false,
+      "command-palette:Open command palette": false,
+      "bases:Create new base": false
+    }
+  },
+  "active": "dd2aafdfa4873c3e",
+  "lastOpenFiles": []
+}
\ No newline at end of file
diff --git a/docs/ReBbarb.md b/docs/ReBbarb.md
deleted file mode 100644
index c0b6f9f..0000000
--- a/docs/ReBbarb.md
+++ /dev/null
@@ -1,24 +0,0 @@
-This project attempts to emulate the Gamecube BroadBand Adapter in an FPGA. The following things need to happen.
-
- - [x] [[Amaranth-Hdl project setup]]
-	 - [x] Setup venv
-	 - [x] Install packages
-	 - [x] Flash Blinky on icebreaker
- - [ ] Figuring out how to deal with [[external clocks]].
-	 - [x] How to get a clock greater than 12Mhz needed to interface with 32Mhz EXI
-		 - [x] PLL configured to 48Mhz
-		 - [ ] ~~48Mhz oscillator onboard? ~~
-	 - [ ] Check if Clock Domain Crossing is possible.
-	 - [ ] Oversampeling approach was tedious but worked
- - [ ] Interfacing with [[GameCube]]
-	 - [ ] Figuring pinout of SP1.
-		 - [ ] Unofficial gamecube docs?
- - [ ] Make sure connecting [[SP1]] to IceBreaker is safe.
-	 - [ ] Can we power the FPGA with the SP1?
-		 - [ ] How much voltage do we get from SP1.
-		 - [ ] How much current can we source?
- - [ ] Figuring out basic [[EXI protocol]]
-	 - [ ] What is the structure of the messages?
-		 - [ ] How to know how long the message is
-		 - [ ] Integrity checks?
-	 - [ ] How fast do we need to respond to a message.
\ No newline at end of file
diff --git a/docs/gc_bba_fpga_design.md b/docs/gc_bba_fpga_design.md
new file mode 100644
index 0000000..8661345
--- /dev/null
+++ b/docs/gc_bba_fpga_design.md
@@ -0,0 +1,1443 @@
+# GameCube BBA FPGA Replacement — Design Document
+
+**Target hardware:** iCEbreaker (Lattice iCE40UP5K)  
+**Target language:** Amaranth HDL (Python)  
+**Toolchain:** Yosys + nextpnr-ice40 + IceStorm  
+**Purpose:** Replace the Nintendo GameCube Broadband Adapter (DOL-015) with an
+FPGA-based implementation, exposing a W5500 100BASE-TX ethernet chip to the GC
+over the EXI (Expansion Interface) serial bus, enabling game ISO streaming via
+Swiss homebrew.
+
+---
+
+## Table of Contents
+
+1. [System Overview](#1-system-overview)
+2. [Protocol References](#2-protocol-references)
+3. [Physical Interface — SP1 Edge Connector](#3-physical-interface--sp1-edge-connector)
+4. [Clock Domains](#4-clock-domains)
+5. [Clock Domain Crossing Strategy](#5-clock-domain-crossing-strategy)
+6. [Module Hierarchy](#6-module-hierarchy)
+7. [Module Specifications](#7-module-specifications)
+   - 7.1 [SPIMode3Slave](#71-spimode3slave)
+   - 7.2 [BBARegisterFile](#72-bbaregisterfile)
+   - 7.3 [SPRAMArbiter](#73-spramarbiter)
+   - 7.4 [RXFrameAssembler](#74-rxframeassembler)
+   - 7.5 [TXFrameDrain](#75-txframedrain)
+   - 7.6 [W5500SPIMaster](#76-w5500spimaster)
+   - 7.7 [EEPROMModel](#77-eeprommodel)
+   - 7.8 [BBATop](#78-bbatop)
+8. [Memory Map](#8-memory-map)
+9. [EXI Transaction Protocol](#9-exi-transaction-protocol)
+10. [BBA Register Reference](#10-bba-register-reference)
+11. [Initialisation Sequence](#11-initialisation-sequence)
+12. [RX Data Path — Detailed Flow](#12-rx-data-path--detailed-flow)
+13. [TX Data Path — Detailed Flow](#13-tx-data-path--detailed-flow)
+14. [SPRAM Layout](#14-spram-layout)
+15. [Critical Timing Constraints](#15-critical-timing-constraints)
+16. [SPRAM Read Prefetch Pipeline](#16-spram-read-prefetch-pipeline)
+17. [Interrupt Handling](#17-interrupt-handling)
+18. [EEPROM / MAC Address](#18-eeprom--mac-address)
+19. [iCE40UP5K Resource Budget](#19-ice40up5k-resource-budget)
+20. [PCB / Connector Notes](#20-pcb--connector-notes)
+21. [Known Hardware Quirks](#21-known-hardware-quirks)
+22. [File Structure](#22-file-structure)
+23. [Simulation Strategy](#23-simulation-strategy)
+24. [Open Issues and Extension Points](#24-open-issues-and-extension-points)
+
+---
+
+## 1. System Overview
+
+The GameCube Broadband Adapter (BBA) is a hardware peripheral that plugs into
+Serial Port 1 (SP1) on the underside of the GameCube. It presents a network
+interface to the GC CPU using a Macronix MX98730EC custom IC. GC software
+(primarily Swiss homebrew) communicates with the BBA through a memory-mapped
+register interface accessed over the EXI serial bus.
+
+This project replaces the MX98730EC with an iCEbreaker FPGA that emulates the
+register interface, and connects to a W5500 ethernet chip (on a Pmod-compatible
+module) for actual network communication.
+
+### High-level data flow
+
+```
+GameCube CPU
+    │  EXI (SPI Mode 3, 32 MHz, Serial Port 1)
+    ▼
+iCEbreaker FPGA
+    ├── exi domain (64 MHz): SPI slave, register file, prefetch pipeline
+    └── sync domain (48 MHz): SPRAM arbiter, RX assembler, TX drain, W5500 driver
+            │  SPI (up to 40 MHz)
+            ▼
+        W5500 Pmod module (100BASE-TX ethernet)
+            │  RJ-45
+            ▼
+        Network
+```
+
+### What this design does NOT implement
+
+- A network stack. The GC CPU runs TCP/IP. The FPGA is a dumb MAC bridge.
+- IP address awareness. The FPGA never parses ethernet frame payloads.
+- The GC's DMA engine quirk (only relevant to GC-side software).
+- Video/audio streaming logic (handled by Swiss on the GC CPU side).
+
+---
+
+## 2. Protocol References
+
+| Source | Content |
+|---|---|
+| YAGCD §2.4.1.4 | SP1 (P6) connector pinout |
+| YAGCD §5.9 | EXI bus register descriptions |
+| YAGCD §10.8 | MX98730EC (BBA chip) register map |
+| Dolphin source `EXI_DeviceEthernet.h` | Register offsets, init sequence, RX/TX flow |
+| Dolphin source `EXI_DeviceEthernet.cpp` | Transaction encoding, interrupt logic |
+| Swiss source `bba.c` | GC-side driver, exact register access patterns |
+| MX98730EC datasheet | Unavailable publicly; YAGCD is the primary reference |
+| W5500 datasheet | SPI interface, register map, socket model |
+| iCE40UP5K datasheet | SPRAM timing, PLL parameters, I/O standards |
+
+**Critical implementation note:** The MX98730EC uses **SPI Mode 3** (CPOL=1,
+CPHA=1). CLK idles HIGH. Data is sampled on the FALLING edge of CLK and set up
+on the RISING edge. This is the opposite of memory cards and the RTC chip, which
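+use SPI Mode 0. Getting this wrong means the GC will never enumerate the device.
+**Verify before implementing:** the conventional definition of Mode 3 (CPOL=1,
+CPHA=1) samples data on the RISING (trailing) edge and changes it on the FALLING
+(leading) edge; falling-edge sampling with CLK idling high is conventionally
+called Mode 2. Confirm the actual edge assignment with a logic-analyzer capture
+of the EXI bus before committing to the edge roles described in this document.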
+
+---
+
+## 3. Physical Interface — SP1 Edge Connector
+
+### Slot characteristics
+
+- Dual-sided PCB edge connector
+- Contacts on both top and bottom faces of the PCB edge
+- Top and bottom contact rows are **staggered** (offset by half a pitch), not
+  mirrored — similar to ISA/PCI card edge geometry
+- PCB must be ordered at **1.2 mm thickness** with **ENIG (gold) finish**
+- Keying notch at top-right corner of housing (when looking into console socket
+  with front of console facing right)
+
+### Connector footprint
+
+Exact pad positions and pitch must be taken from the SP1ETH KiCad project
+(github.com/silverstee1/SP1ETH). Do not attempt to derive dimensions from YAGCD
+alone — the document lists signals but not physical geometry. Cross-reference
+against the ETH2SP1 (LaserBear) open model files as a second source.
+
+Key parameters to verify from those files before PCB layout:
+- Contact pitch (expected: 2.0 mm or 2.54 mm — measure from KiCad file)
+- Stagger offset between top and bottom rows
+- Total contact count per side (expected: 6 per side = 12 total, or 12 per side
+  = 24 total with duplicated power/ground)
+- Insertion depth from board edge to first contact
+- Board width at connector edge
+
+### Signal pinout (YAGCD §2.4.1.4)
+
+Pin numbering: looking into the console socket, front of console to the right,
+pin 1 is on the left. On the adapter PCB (component side up, inserting down),
+pin 1 is also on the left — numbering does not mirror.
+
+| Pin | Signal | Direction | Notes |
+|---|---|---|---|
+| 1 | EXTIN | Adapter → GC | Device detect/sense. Tie to 3.3V via 10 kΩ resistor. Without this the GC does not enumerate the device. |
+| 2 | GND | — | Shield ground |
+| 3 | INT | Adapter → GC | Active-low interrupt to GC CPU. Assert when IR & IMR != 0. |
+| 4 | CLK | GC → Adapter | SPI clock, up to 32 MHz, idles HIGH (Mode 3) |
+| 5 | 12V | — | 12 V supply from GC. **Do not connect to FPGA I/O.** Leave unconnected or route to a test point only. |
+| 6 | DO (MISO) | Adapter → GC | Serial data out: adapter drives, GC samples |
+| 7 | 3.3V | — | 3.3 V supply (~200 mA available combined with pin 8) |
+| 8 | 3.3V | — | 3.3 V supply (parallel with pin 7) |
+| 9 | DI (MOSI) | GC → Adapter | Serial data in: GC drives, adapter samples |
+| 10 | CS | GC → Adapter | Chip select, active low. Delineates each transaction. |
+| 11 | GND | — | Signal ground |
+| 12 | GND | — | Signal ground |
+
+**Power budget:** Pins 7+8 together supply 3.3 V. The iCEbreaker draws ~80 mA
+active, the W5500 ~150 mA peak. Total ~230 mA. The GC's 3.3 V rail on SP1 is
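+rated for the original BBA which drew ~200 mA, so the estimated peak exceeds that
+figure by ~30 mA and there is no headroom; measure actual draw before relying on
+SP1 power alone. Add a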
+100 µF bulk capacitor on the interposer PCB close to the FPGA power pins.
+
+**Voltage levels:** All EXI signals are 3.3 V logic. The iCEbreaker I/O is 3.3 V.
+The W5500 is 3.3 V. No level shifting required anywhere in this design.
+
+---
+
+## 4. Clock Domains
+
+The design uses two clock domains. The iCE40UP5K has one PLL and one internal
+48 MHz oscillator (SB_HFOSC).
+
+### Domain table
+
+| Domain | Frequency | Source | Purpose |
+|---|---|---|---|
+| `exi` | 64 MHz | PLL (12 MHz × 16 / 3) | SPI Mode 3 slave, BBA register file, prefetch pipeline |
+| `sync` | 48 MHz | SB_HFOSC internal oscillator | SPRAM arbiter, RX/TX ethernet engines, W5500 SPI master |
+
+### Rationale
+
+**Why 64 MHz for `exi`?**  
+The EXI bus runs at 32 MHz. The SPI Mode 3 slave needs to detect CLK edges and
+respond on the correct edge. Running the `exi` domain at 2× the bus rate (64 MHz)
+gives two FPGA ticks per EXI CLK period (one per half-period): one tick for the setup phase
+(MOSI→shift register, prepare MISO), one tick for the sample/drive phase. This
+is the minimum oversampling ratio that cleanly implements Mode 3 without
+combinatorial timing risk on the MISO output path.
+
+**Why 48 MHz for `sync`?**  
+The iCE40UP5K's internal 48 MHz oscillator (SB_HFOSC) is available without
+consuming the PLL. This leaves the one PLL free for the 64 MHz `exi` domain. The
+W5500 SPI can run up to 80 MHz but we drive it at 24 MHz (48 MHz ÷ 2 via clock
+enable), which is well within spec and requires no additional PLL output.
+
+### PLL configuration (iCE40UP5K)
+
+```
+Input:  12 MHz crystal (iCEbreaker on-board)
+DIVR:   0   (input divider:   12 MHz / (0+1) = 12 MHz)
+DIVF:   63  (feedback mult:   12 MHz × (63+1) = 768 MHz VCO)
+DIVQ:   3   (output divider:  768 MHz / 2^3   = 96 MHz)
+The settings above yield 96 MHz, not 64 MHz. A naive 64 MHz attempt is invalid:
+DIVR:   0
+DIVF:   15  (12 × 16 = 192 MHz VCO)  -- VCO must be 533–1066 MHz on UP5K
+```
+
+The iCE40UP5K VCO range is 533–1066 MHz. To reach 64 MHz cleanly:
+
+```
+DIVR = 0  → F_pfd = 12 MHz
+DIVF = 63 → F_vco = 12 × (63+1) = 768 MHz  (within range)
+DIVQ = 3  → F_out = 768 / 8 = 96 MHz        (too fast)
+
+Better: target 64 MHz
+DIVF = 53 → F_vco = 12 × 54 = 648 MHz
+DIVQ = 3  → F_out = 648 / 8 = 81 MHz        (still off)
+
+Correct combination:
+DIVR = 0, DIVF = 42, DIVQ = 3
+F_vco = 12 × 43 = 516 MHz  (just below range minimum — not valid)
+
+Use:
+DIVR = 0, DIVF = 63, DIVQ = 3  → 96 MHz, then use clock enable for /1.5
+-- or --
+Accept 96 MHz exi domain (3× bus rate instead of 2×): more margin, same logic
+-- or --
+DIVR = 2, DIVF = 63, DIVQ = 2  → (12/3) × 64 / 4 = 64 MHz exactly
+  F_pfd = 4 MHz, F_vco = 4×64 = 256 MHz — below 533 MHz minimum, invalid
+
+Recommended: use 96 MHz (DIVR=0, DIVF=63, DIVQ=3) for exi domain.
+At 96 MHz there are 3 ticks per 32 MHz EXI clock period (1.5 per half-period).
+Adjust SPIMode3Slave edge detection accordingly (3 ticks per period instead of 2).
+```
+
+**Implementation note:** Verify exact PLL parameters with `icepll` tool:
+```bash
+icepll -i 12 -o 64    # finds closest achievable output
+icepll -i 12 -o 96    # alternative
+```
+The agent implementing this should run `icepll` and use whatever output it
+recommends, then adjust the `SPIMode3Slave` tick counts accordingly.
+
+### Reset strategy
+
+Each domain has its own reset, deasserted synchronously using
+`ResetSynchronizer` from `amaranth.lib.cdc`:
+
+```python
+# In platform create_missing_domain("exi"):
+m.submodules.exi_rst = ResetSynchronizer(
+    arst   = ResetSignal("sync"),
+    domain = "exi",
+)
+```
+
+The `sync` domain reset comes from the iCEbreaker's on-chip power-on reset
+(SB_GB driven by SB_HFOSC, which has built-in POR).
+
+---
+
+## 5. Clock Domain Crossing Strategy
+
+All signals crossing between `exi` and `sync` domains must use one of the
+following CDC primitives from `amaranth.lib.cdc`. Never pass a raw multi-bit
+signal directly between domains — only one bit may change per clock crossing.
+
+### CDC primitive selection guide
+
+| Signal type | Primitive | Latency |
+|---|---|---|
+| Single bit, slow-changing (flags, status) | `FFSynchronizer` | 2 dest clocks |
+| Single-cycle pulse / event | `PulseSynchronizer` | ~3–4 dest clocks |
+| Multi-bit data stream (packet bytes) | `AsyncFIFO` | ~3–4 dest clocks |
+| Reset deassertion | `ResetSynchronizer` | 2 dest clocks |
+| Async external pin (CLK, MOSI, CS) | `FFSynchronizer` | 2 dest clocks |
+
+### CDC inventory for this design
+
+| Signal | From | To | Primitive | Notes |
+|---|---|---|---|---|
+| EXI CLK pin | async | exi | FFSynchronizer | stages=2, reset=1 (CLK idles high) |
+| EXI MOSI pin | async | exi | FFSynchronizer | stages=2 |
+| EXI CS pin | async | exi | FFSynchronizer | stages=2, reset=1 (CS idles high) |
+| SPRAM read request (addr) | exi | sync | AsyncFIFO 16-bit wide, depth=4 | Prefetch pipeline |
+| SPRAM read result (data) | sync | exi | AsyncFIFO 8-bit wide, depth=4 | Prefetch pipeline |
+| TX packet bytes | exi | sync | AsyncFIFO 8-bit wide, depth=64 | GC→ethernet |
+| TX packet start/len | exi | sync | AsyncFIFO 16-bit wide, depth=4 | Frame delimiter |
+| RX packet bytes | sync | exi | AsyncFIFO 8-bit wide, depth=64 | ethernet→GC |
+| RWP update (new value) | sync | exi | AsyncFIFO 8-bit wide, depth=4 | After frame committed |
+| RRP update (new value) | exi | sync | AsyncFIFO 8-bit wide, depth=4 | After GC advances pointer |
+| IR[RI] set (RX ready) | sync | exi | PulseSynchronizer | Triggers RI interrupt |
+| IR[TI] set (TX done) | sync | exi | PulseSynchronizer | Triggers TI interrupt |
+| NCRA reset pulse | exi | sync | PulseSynchronizer | Resets ethernet engine |
+| exi_int_n output | exi | physical pin | Direct (output register) | Active-low to GC |
+
+**Critical rule:** The register file lives entirely in the `exi` domain. The
+`sync` domain never directly reads or writes EXI registers. All interaction
+between the two domains goes through the AsyncFIFOs and PulseSynchronizers
+listed above. This ensures the GC's register reads always respond within the
+`exi` domain without waiting on CDC latency.
+
+---
+
+## 6. Module Hierarchy
+
+```
+BBATop                          (top-level, sets up clock domains)
+├── SPIMode3Slave               (exi domain — bit engine)
+├── BBARegisterFile             (exi domain — register decode + response)
+│   ├── [AsyncFIFO: spram_req]  (exi→sync: read address requests)
+│   ├── [AsyncFIFO: spram_rsp]  (sync→exi: read data responses)
+│   ├── [AsyncFIFO: tx_bytes]   (exi→sync: TX packet data)
+│   ├── [AsyncFIFO: tx_ctrl]    (exi→sync: TX frame length)
+│   ├── [AsyncFIFO: rx_wptr]    (sync→exi: RWP updates)
+│   ├── [AsyncFIFO: rx_rptr]    (exi→sync: RRP updates from GC)
+│   ├── [PulseSynchronizer: rx_irq]   (sync→exi)
+│   ├── [PulseSynchronizer: tx_irq]   (sync→exi)
+│   └── [PulseSynchronizer: ncra_rst] (exi→sync)
+├── SPRAMArbiter                (sync domain — owns all SPRAM)
+├── RXFrameAssembler            (sync domain — ethernet→SPRAM)
+├── TXFrameDrain                (sync domain — SPRAM→ethernet)
+├── W5500SPIMaster              (sync domain — SPI master to W5500)
+└── EEPROMModel                 (exi domain — 93C46 bit-bang model)
+```
+
+---
+
+## 7. Module Specifications
+
+### 7.1 SPIMode3Slave
+
+**Domain:** `exi`  
+**File:** `exi_bba/spi_mode3_slave.py`
+
+Implements a byte-oriented SPI Mode 3 slave. Handles CLK/MOSI/MISO/CS at the
+bit level and presents a clean byte interface to `BBARegisterFile`.
+
+**SPI Mode 3 timing recap:**
+- CLK idles HIGH
+- MOSI is set up by master before the FALLING edge
+- Slave samples MOSI on the FALLING edge of CLK
+- Slave drives MISO on the RISING edge of CLK (ready for master to sample on
+  next falling edge)
+
+**Port list:**
+
+| Port | Width | Dir | Domain | Description |
+|---|---|---|---|---|
+| `spi_clk` | 1 | in | async→exi | Raw SPI clock from GC, synchronized internally |
+| `spi_mosi` | 1 | in | async→exi | Raw MOSI from GC, synchronized internally |
+| `spi_miso` | 1 | out | exi | MISO output to GC |
+| `spi_cs_n` | 1 | in | async→exi | Raw CS from GC (active low), synchronized internally |
+| `rx_byte` | 8 | out | exi | Last complete received byte |
+| `rx_valid` | 1 | out | exi | Pulses 1 cycle when `rx_byte` contains a new byte |
+| `tx_byte` | 8 | in | exi | Byte to transmit; sampled when `tx_load` pulses |
+| `tx_load` | 1 | out | exi | Requests next TX byte from upstream |
+
+**Internal behaviour:**
+
+1. Instantiate FFSynchronizer stages=2 on each of `spi_clk`, `spi_mosi`,
+   `spi_cs_n`. Reset values: `spi_clk`=1, `spi_cs_n`=1.
+2. Register the synchronized signals one further cycle to form edge detectors:
+   `rising_clk = clk_s & ~clk_prev`, `falling_clk = ~clk_s & clk_prev`.
+3. On CS falling edge: load `tx_byte` into internal shift register, pulse
+   `tx_load`, reset `bit_ctr` to 0.
+4. On FALLING CLK edge (sample): shift `mosi_s` into `rx_shift` MSB-first,
+   increment `bit_ctr`. When `bit_ctr == 8`: register `rx_shift` into `rx_byte`,
+   pulse `rx_valid`, reset `bit_ctr` to 0, pulse `tx_load` to request next byte.
+5. On RISING CLK edge (drive): shift `tx_shift` left by 1, drive MSB onto
+   `spi_miso`.
+6. On CS rising edge: drive `spi_miso` high (idle), reset state.
+
+**Note on `tx_load` timing:** `tx_load` pulses at two points — CS assertion
+(loads first byte before any bits are clocked) and after each complete received
+byte (loads the next byte). The upstream (`BBARegisterFile`) must register the
+next TX byte within one `exi` clock of `tx_load` pulsing.
+
+---
+
+### 7.2 BBARegisterFile
+
+**Domain:** `exi` (with AsyncFIFO interfaces to `sync`)  
+**File:** `exi_bba/bba_register_file.py`
+
+Decodes EXI transactions (2-byte header + N data bytes), reads/writes the BBA
+register space, and manages all CDC crossings to the `sync` domain.
+
+#### EXI transaction decoder FSM
+
+States: `HEADER0` → `HEADER1` → `DATA` → (back to `HEADER0`)
+
+**Header format:**
+
+```
+Byte 0:  [7]   = write flag  (1 = write, 0 = read)
+         [6:0] = addr[12:6]  (upper 7 bits of 13-bit address)
+
+Byte 1:  [7:2] = addr[5:0]   (lower 6 bits of 13-bit address)
+         [1:0] = xfer_len    (0=1 byte, 1=2 bytes, 2=3 bytes, 3=4 bytes)
+```
+
+Full address = `{ byte0[6:0], byte1[7:2] }` = 13 bits → range 0x0000–0x1FFF.
+
+**`HEADER0` state:** Wait for `rx_valid`. Latch `rx_byte` as `hdr0`.
+
+**`HEADER1` state:** Wait for `rx_valid`. Decode address and flags. For read
+transactions, immediately issue SPRAM prefetch request if address ≥ 0x100
+(ring buffer region). Load `tx_byte` with the register value for addresses
+< 0x100 (register file region). Transition to `DATA`.
+
+**`DATA` state (write path):** For each `rx_valid`, write `rx_byte` to
+`regs[addr + byte_ctr]` and handle side effects (see register side effects
+table). Increment `byte_ctr`. When `byte_ctr == xfer_len`, go to `HEADER0`.
+
+**`DATA` state (read path):** Drive `tx_byte` from prefetch result (addresses
+≥ 0x100) or directly from `regs[]` (addresses < 0x100). On each `tx_load`,
+advance the read pointer and issue next prefetch. When `byte_ctr == xfer_len`,
+go to `HEADER0`.
+
+**CS deassertion abort:** In any state, if `cs_n` rises, return to `HEADER0`.
+
+#### Register file storage
+
+Registers 0x00–0xFF are implemented as an `Array` of 8-bit `Signal`s (256
+registers). In synthesis this maps to flip-flops and muxes (the iCE40 has no
+distributed RAM). Not SPRAM — SPRAM is reserved for the packet ring buffer.
+
+The register file is entirely in the `exi` domain. No CDC is needed to read
+or write registers 0x00–0xFF.
+
+#### Register side effects
+
+| Register | Write side effect |
+|---|---|
+| NCRA (0x00) | If bit 0 (RESET) written: pulse `ncra_rst` PulseSynchronizer to `sync` domain. Self-clear bit 0 on next cycle. Reset TX/RX pointers in register file. |
+| IR (0x09) | Write-1-to-clear: `IR <= IR & ~written_value` |
+| RRP (0x18–0x19) | After GC writes new RRP value, push value into `rx_rptr` AsyncFIFO (exi→sync) so RX engine knows GC has consumed those pages |
+| TWD (0x34–0x37) | Bytes written here are the TX frame length field (2 bytes little-endian). Latch for TX engine. |
+| TXDATA (0x48) | Each byte written goes into `tx_bytes` AsyncFIFO (exi→sync). When `byte_ctr == xfer_len` on last write chunk, push frame length into `tx_ctrl` AsyncFIFO. |
+
+#### Interrupt register update (from sync domain)
+
+- `rx_irq` PulseSynchronizer arriving from sync: set `IR[1]` (RI bit)
+- `tx_irq` PulseSynchronizer arriving from sync: set `IR[2]` (TI bit), clear
+  `NCRA[3:2]` (ST1:ST0 — transmit start bits)
+
+#### Interrupt output
+
+```
+exi_int_n <= ~|(IR & IMR)   # active-low: assert when any unmasked bit set
+```
+
+Register this one flip-flop in the `exi` domain. The physical pin is a direct
+output — no CDC needed because the GC only reads the interrupt state via polling
+IR over EXI (which is already in the `exi` domain) or via the interrupt line
+which the GC CPU samples asynchronously.
+
+#### NWAYS register
+
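+Always return `0x17`. The intent is to report link up, 100 Mbps, autoneg
+complete. Note that the bit breakdown below labels bit 1 as 100TXH (half
+duplex), so this value does not obviously encode "full duplex" — verify the bit
+assignments against the NWAYS enum in Dolphin's `EXI_DeviceEthernet.h`.
+The GC's BBA driver polls NWAYS after reset to confirm link status before
+enabling RX. Hardcode this value — do not attempt to forward real link status
+from the W5500.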
+
+```python
+# NWAYS = 0x17:
+# bit 4 (LS100)  = 1: 100BASE-TX link up
+# bit 2 (ANCLPT) = 1: autoneg complete
+# bit 1 (100TXH) = 1: 100BASE-TX half (also set in practice)
+# bit 0 (LS10)   = 1: 10BASE-T (also reported)
+```
+
+---
+
+### 7.3 SPRAMArbiter
+
+**Domain:** `sync`  
+**File:** `exi_bba/spram_arbiter.py`
+
+Arbitrates access to the iCE40UP5K's 128 KB SPRAM between two clients:
+
+- **Client A (EXI read):** Issues read requests from the prefetch pipeline
+  (`spram_req` AsyncFIFO). Must service requests fast enough to keep the
+  prefetch pipeline full.
+- **Client B (ETH write):** The `RXFrameAssembler` writes incoming ethernet
+  frames into the ring buffer area.
+
+**Priority:** ETH write wins over EXI read when both request simultaneously.
+This is safe because:
+1. The GC only reads a ring buffer page after RWP has advanced past it (i.e.,
+   the ETH engine has finished writing that page).
+2. Even if an EXI read is delayed by one SPRAM cycle, the prefetch pipeline
+   has enough depth (4 entries) to absorb the stall without the SPI slave
+   running out of data.
+
+**SPRAM interface (iCE40UP5K SB_SPRAM256KA):**
+
+```
+WREN   : write enable
+CHIPSELECT : always 1
+CLOCK  : sync domain clock (48 MHz)
+STANDBY : 0
+SLEEP  : 0
+POWEROFF_N : 1
+ADDRESS[13:0] : byte address divided by 2 (SPRAM is 16-bit wide)
+DATAIN[15:0] : write data (use only [7:0] for byte writes, mask upper byte)
+MASKWREN[3:0] : byte enable (0b0011 for lower byte, 0b1100 for upper byte)
+DATAOUT[15:0] : read data
+```
+
+The SPRAM is 16-bit wide. Byte addressing is done via `MASKWREN`. For an 8-bit
+write to address `A`: set `ADDRESS = A >> 1`, `MASKWREN = (A & 1) ? 0b1100 :
+0b0011`, write data in the appropriate byte of `DATAIN`.
+
+**Read latency:** SPRAM has 1-cycle synchronous read latency. The result of a
+read issued at cycle N is valid at cycle N+1. The arbiter must account for this
+when responding to the prefetch pipeline.
+
+**Port list:**
+
+| Port | Width | Dir | Notes |
+|---|---|---|---|
+| `exi_req_addr` | 16 | in | From spram_req AsyncFIFO (exi→sync) |
+| `exi_req_valid` | 1 | in | FIFO r_rdy |
+| `exi_req_ready` | 1 | out | FIFO r_en (pop when serviced) |
+| `exi_rsp_data` | 8 | out | To spram_rsp AsyncFIFO (sync→exi) |
+| `exi_rsp_valid` | 1 | out | FIFO w_en |
+| `eth_wr_addr` | 16 | in | From RXFrameAssembler |
+| `eth_wr_data` | 8 | in | Byte to write |
+| `eth_wr_valid` | 1 | in | Write request |
+| `eth_wr_ready` | 1 | out | Write accepted this cycle |
+
+---
+
+### 7.4 RXFrameAssembler
+
+**Domain:** `sync`  
+**File:** `exi_bba/rx_frame_assembler.py`
+
+Receives complete ethernet frames from `W5500SPIMaster` and writes them into
+the SPRAM ring buffer in the correct MX98730EC format.
+
+**Ring buffer layout (in SPRAM):**
+
+```
+SPRAM address 0x0100–0x0FFF  (3840 bytes = 15 × 256-byte pages)
+  Page 0x01: first usable RX page
+  Page 0x0F: last usable RX page (RHBP default)
+  Pages wrap: after 0x0F, next is 0x01 (not 0x00, which is reserved)
+```
+
+Each page is 256 bytes. A received frame may span multiple pages.
+
+**Frame descriptor (first 4 bytes of first page):**
+
+```
+Byte 0: LRPS value (Last Received Packet Status — set to 0x00 or actual status)
+Byte 1: 0x00
+Byte 2: frame_length[15:8]  (big-endian, includes descriptor bytes)
+Byte 3: frame_length[7:0]
+Bytes 4+: raw ethernet frame data (DA, SA, EtherType, payload, FCS)
+```
+
+**Flow:**
+
+1. Wait for `W5500SPIMaster` to signal frame available (`rx_sof` pulse).
+2. Read frame bytes from W5500 frame FIFO.
+3. Compute how many 256-byte pages are needed:
+   `pages_needed = ceil((frame_length + 4) / 256)`
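+4. Check that the frame fits in the ring: `free_pages = (RRP - RWP + 14) mod 15`
+   (pages 1–15 wrap modulo 15, not 16; one page stays unused so that
+   `RWP == RRP` always means "empty"). If `pages_needed > free_pages`, drop the
+   frame and increment a drop counter.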
+5. Write 4-byte descriptor at SPRAM address `RWP * 0x100` (RWP is a page
+   number in 0x01–0x0F, so page 0x01 starts at byte address 0x0100).
+6. Write frame bytes sequentially, wrapping pages at 256-byte boundaries.
+   Page wrap: `next_page = (current_page % 15) + 1` (pages 1–15, skip 0).
+7. After last byte written, update `RWP` in the `rx_wptr` AsyncFIFO (sync→exi).
+   The `exi` domain will update the RWP register from this FIFO.
+8. Pulse `rx_irq` PulseSynchronizer to `exi` domain.
+
+**MAC address filter:**
+
+Before writing a frame, check destination MAC against PAR0–PAR5 (broadcast
+FF:FF:FF:FF:FF:FF always accepted). The GC will typically configure PAR0–PAR5
+via EXI after boot, so the `BBARegisterFile` must expose these to the
+`RXFrameAssembler`. Pass them via a dedicated small AsyncFIFO or by reading
+them from a shared register shadow (6 bytes, sync domain copy updated when
+GC writes PAR0–PAR5). Multicast hash table (MAR0–MAR7) filtering is optional
+for initial implementation — accept all frames (promiscuous mode) until the GC
+configures the filter.
+
+---
+
+### 7.5 TXFrameDrain
+
+**Domain:** `sync`  
+**File:** `exi_bba/tx_frame_drain.py`
+
+Drains the TX byte FIFO (fed from the `exi` domain as the GC writes to TXDATA
+register 0x48) and forwards complete frames to `W5500SPIMaster`.
+
+**Flow:**
+
+1. Wait for `tx_ctrl` AsyncFIFO to contain a frame length value. This is pushed
+   by `BBARegisterFile` when the GC has written the complete TX frame (i.e.,
+   NCRA ST1:ST0 transitions to 01 or 10).
+2. Pop `frame_length` from `tx_ctrl`.
+3. Pop exactly `frame_length` bytes from `tx_bytes` AsyncFIFO.
+4. Forward bytes to `W5500SPIMaster` TX interface with SOF/EOF framing.
+5. Wait for `W5500SPIMaster` to signal TX complete.
+6. Pulse `tx_irq` PulseSynchronizer to `exi` domain.
+
+**NCRA ST bits:** The GC writes NCRA with ST1:ST0 = 01 (start transmit from
+buffer 1) or 10 (start transmit from buffer 2). The BBA hardware has two TX
+buffers; this implementation uses a single TX FIFO and ignores the buffer
+selection. When ST1:ST0 goes non-zero, treat it as a TX trigger regardless of
+which bits are set. The `BBARegisterFile` should push the frame length into
+`tx_ctrl` on this transition.
+
+---
+
+### 7.6 W5500SPIMaster
+
+**Domain:** `sync`  
+**File:** `exi_bba/w5500_spi_master.py`
+
+Implements the W5500 SPI master interface. The W5500 uses SPI Mode 0 (CPOL=0,
+CPHA=0), opposite to the BBA EXI interface.
+
+**W5500 SPI frame format:**
+
+```
+Byte 0–1: Address (16-bit, big-endian)
+Byte 2:   Control byte:
+            [7:3] = Block Select (BSB):
+                    00000 = Common Register
+                    00001 = Socket 0 Register
+                    00010 = Socket 0 TX buffer
+                    00011 = Socket 0 RX buffer
+            [2]   = Read/Write (0=read, 1=write)
+            [1:0] = Operation Mode (00=variable, 01=fixed 1B, 10=fixed 2B, 11=fixed 4B)
+Byte 3+:  Data bytes
+```
+
+**W5500 configuration (to be performed once on NCRA reset):**
+
+```
+1. Write MR (Mode Register, 0x0000): 0x80  — software reset
+2. Wait ~1 ms
+3. Write SHAR (Source MAC, 0x0009–0x000E): copy from PAR0–PAR5 register shadow
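+4. Write S0_MR (Socket 0 Mode, BSB=00001 offset 0x0000): 0x04  — MACRAW mode (raw ethernet)
+5. Write S0_CR (Socket 0 Command, BSB=00001 offset 0x0001): 0x01 — OPEN
+6. Write S0_IMR (Socket 0 Interrupt Mask, BSB=00001 offset 0x002C): 0x04 | 0x10  — RECV | SEND_OK
+7. Write SIMR (Socket Interrupt Mask, common register 0x0018): 0x01 — route Socket 0 interrupts to INT_N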
+```
+
+**MACRAW mode:** In MACRAW mode the W5500 Socket 0 sends and receives raw
+ethernet frames including the full MAC header and FCS. This is exactly what
+the MX98730EC presents to the GC. No IP stack runs in the FPGA.
+
+**RX polling:** The W5500 asserts its INT_N pin (active low) when a frame
+arrives. Connect W5500 INT_N to an FPGA input pin and use it to trigger the
+`RXFrameAssembler`. Alternatively poll `S0_IR` (Socket 0 Interrupt Register,
+BSB=00001 offset 0x0002) periodically. The INT_N approach has lower latency
+and is preferred.
+
+**SPI clock rate:** Drive W5500 SPI at 24 MHz (sync clock 48 MHz ÷ 2 using a
+clock enable toggle). The W5500 supports up to 80 MHz so there is ample margin.
+
+**Port list:**
+
+| Port | Width | Dir | Notes |
+|---|---|---|---|
+| `spi_clk` | 1 | out | To W5500 CLK pin (SPI Mode 0, idles LOW) |
+| `spi_mosi` | 1 | out | To W5500 MOSI |
+| `spi_miso` | 1 | in | From W5500 MISO |
+| `spi_cs_n` | 1 | out | To W5500 CS (active low) |
+| `w5500_int_n` | 1 | in | W5500 interrupt (active low) |
+| `tx_data` | 8 | in | Byte to transmit (from TXFrameDrain) |
+| `tx_valid` | 1 | in | TX byte available |
+| `tx_ready` | 1 | out | TX byte consumed |
+| `tx_sof` | 1 | in | Start of frame marker |
+| `tx_eof` | 1 | in | End of frame marker |
+| `rx_data` | 8 | out | Received byte (to RXFrameAssembler) |
+| `rx_valid` | 1 | out | RX byte available |
+| `rx_ready` | 1 | in | RX byte consumed |
+| `rx_sof` | 1 | out | Start of frame |
+| `rx_eof` | 1 | out | End of frame |
+
+---
+
+### 7.7 EEPROMModel
+
+**Domain:** `exi`  
+**File:** `exi_bba/eeprom_model.py`
+
+Models the 93C46-compatible serial EEPROM that stores the BBA's MAC address.
+The GC software bit-bangs the EEPROM interface through register 0x1C
+(EEPROM Interface Register) of the BBA chip.
+
+**Register 0x1C bit fields:**
+
+```
+[3] EECK  — EEPROM clock
+[2] EECS  — EEPROM chip select
+[1] EEDI  — EEPROM data in (GC → EEPROM)
+[0] EEDO  — EEPROM data out (EEPROM → GC) [read-only]
+```
+
+The GC reads EEDO by reading register 0x1C bit 0.
+
+**93C46 protocol summary:**
+
+The 93C46 uses a 3-wire serial protocol (SK=clock, CS=select, DI=data in,
+DO=data out). Commands:
+- READ: start bit (1) + opcode (10) + 6-bit address → 16-bit data out
+- WRITE: start bit (1) + opcode (01) + 6-bit address + 16-bit data
+- EWEN (write enable): start bit (1) + opcode (00) + address (11xxxx)
+
+Each 93C46 word is 16 bits. The MAC address occupies words 0–2 (6 bytes).
+
+**Implementation approach:**
+
+Maintain a small ROM of 64 × 16-bit words in the `exi` domain (as a Const
+array, synthesises to LUTs). Pre-populate words 0–2 with the chosen MAC
+address. Implement a small FSM that watches writes to register 0x1C for the
+93C46 protocol, drives EEDO accordingly.
+
+**Simpler alternative:** Many GC BBA drivers read the EEPROM once at boot and
+then write the MAC to PAR0–PAR5 themselves. Pre-populate PAR0–PAR5 in the
+register file reset state with a valid Nintendo OUI MAC (00:09:BF:xx:xx:xx).
+Skip a full 93C46 implementation for the first version — if Swiss ignores the
+EEPROM read result and uses a hardcoded or user-configurable MAC, this is
+sufficient.
+
+---
+
+### 7.8 BBATop
+
+**Domain:** both  
+**File:** `exi_bba/bba_top.py`
+
+Top-level module. Instantiates all submodules, creates clock domains, connects
+physical pins.
+
+**Clock domain creation:**
+
+```python
+def elaborate(self, platform):
+    m = Module()
+
+    # exi domain: 96 MHz from PLL (3× 32 MHz EXI bus rate)
+    exi_domain = ClockDomain("exi")
+    m.domains += exi_domain
+    pll = platform.get_pll()   # platform-specific PLL primitive
+    m.d.comb += exi_domain.clk.eq(pll.clkout)
+    m.submodules.exi_rst = ResetSynchronizer(
+        arst=ResetSignal("sync"), domain="exi"
+    )
+
+    # sync domain: 48 MHz from SB_HFOSC (platform default)
+    # Created automatically by iCEbreaker platform
+
+    # Instantiate submodules...
+    m.submodules.spi    = spi    = SPIMode3Slave()
+    m.submodules.regfile = regfile = BBARegisterFile()
+    m.submodules.arbiter = arbiter = SPRAMArbiter()
+    m.submodules.rx_asm  = rx_asm  = RXFrameAssembler()
+    m.submodules.tx_drn  = tx_drn  = TXFrameDrain()
+    m.submodules.w5500   = w5500   = W5500SPIMaster()
+    m.submodules.eeprom  = eeprom  = EEPROMModel()
+    # ... wiring ...
+```
+
+**Physical pin connections (iCEbreaker):**
+
+The SP1 EXI signals connect via the interposer PCB to iCEbreaker PMOD pins.
+The W5500 Pmod connects to the second PMOD connector. Exact pin mapping depends
+on the interposer PCB layout — define these in a platform resource file.
+
+```python
+# Example resource definitions (add to iCEbreaker platform file):
+Resource("exi", 0,
+    Subsignal("clk",  Pins("1",  conn=("pmod", 0), dir="i")),
+    Subsignal("mosi", Pins("2",  conn=("pmod", 0), dir="i")),
+    Subsignal("miso", Pins("3",  conn=("pmod", 0), dir="o")),
+    Subsignal("cs_n", Pins("4",  conn=("pmod", 0), dir="i")),
+    Subsignal("int_n",Pins("7",  conn=("pmod", 0), dir="o")),
+    Attrs(IO_STANDARD="SB_LVCMOS"),
+),
+Resource("w5500", 0,
+    Subsignal("clk",  Pins("1",  conn=("pmod", 1), dir="o")),
+    Subsignal("mosi", Pins("2",  conn=("pmod", 1), dir="o")),
+    Subsignal("miso", Pins("3",  conn=("pmod", 1), dir="i")),
+    Subsignal("cs_n", Pins("4",  conn=("pmod", 1), dir="o")),
+    Subsignal("int_n",Pins("7",  conn=("pmod", 1), dir="i")),
+    Subsignal("rst_n",Pins("8",  conn=("pmod", 1), dir="o")),
+    Attrs(IO_STANDARD="SB_LVCMOS"),
+),
+```
+
+---
+
+## 8. Memory Map
+
+The BBA register address space is 13 bits wide (0x0000–0x1FFF).
+
+| Address range | Region | Implemented in | Notes |
+|---|---|---|---|
+| 0x0000–0x0033 | MAC control registers | Register file (exi) | NCRA, NCRB, IMR, IR, pointers |
+| 0x0034–0x0037 | TWD — TX write data | Register file (exi) | TX frame length (2 bytes) |
+| 0x0038–0x0039 | Reserved | — | Ignore |
+| 0x003A | HIPR — Host Interface Protocol | Register file (exi) | Read: 0x01 (BBA present) |
+| 0x003B | NAFR — Network Address Filter | Register file (exi) | |
+| 0x003C | NWBA — Network Write Buffer Addr | Register file (exi) | |
+| 0x003D–0x0047 | Reserved | — | Ignore |
+| 0x0048 | TXDATA — Bulk TX data port | Register file → tx_bytes FIFO | Write path to ethernet |
+| 0x0049–0x00FF | Reserved | — | Ignore |
+| 0x0100–0x0FFF | RX ring buffer | SPRAM (sync) | Read path from ethernet |
+
+---
+
+## 9. EXI Transaction Protocol
+
+All BBA register accesses follow a strict two-phase (header + data) format.
+
+### Header encoding
+
+```
+Byte 0: [7]   write flag     1=write, 0=read
+        [6:0] addr[12:6]     upper 7 bits of address
+
+Byte 1: [7:2] addr[5:0]      lower 6 bits of address
+        [1:0] xfer_len-1     0=1 byte, 1=2 bytes, 2=3 bytes, 3=4 bytes
+```
+
+CS is asserted (low) before byte 0 and remains low through the entire
+transaction including all data bytes. CS deasserts (high) after the last
+data byte.
+
+### Read transaction timing
+
+```
+CS  ─┐                                    ┌─
+      └────────────────────────────────────┘
+CLK   ┌┐┌┐┌┐┌┐┌┐┌┐┌┐┌┐  ┌┐┌┐┌┐┌┐┌┐┌┐┌┐┌┐  ┌┐┌┐...
+      header byte 0      header byte 1      data byte 0...
+MOSI  [addr+flags]        [addr+len]         [don't care]
+MISO  [don't care]        [don't care]       [register data]
+```
+
+The register file must have data ready on MISO from the **very first clock
+edge of the data phase**. For register-file-backed reads (address < 0x100),
+the data is available immediately after header decode. For SPRAM-backed reads
+(address ≥ 0x100), the prefetch pipeline issues the SPRAM read request during
+the header phase so data is ready in time.
+
+### Write transaction timing
+
+Identical header, then MOSI carries the write data. The FPGA samples MOSI on
+each rising CLK edge during the data phase (SPI Mode 3 samples on the rising,
+trailing edge) and writes to the register.
+
+### ID query
+
+On power-on the GC queries the device ID. The query is two 0x00 bytes written,
+then four bytes read. The BBA returns `0x04020200`. Implement this as a special
+case: when address decodes to 0x0000 on a read with no prior NCRA reset, return
+the hardcoded ID.
+
+Alternatively, read the Dolphin source for the exact byte sequence GC software
+uses to detect the BBA and replicate it faithfully.
+
+---
+
+## 10. BBA Register Reference
+
+Key registers the GC driver accesses. Full register map in YAGCD §10.8.
+
+| Addr | Name | R/W | Reset | Description |
+|---|---|---|---|---|
+| 0x00 | NCRA | R/W | 0x00 | Network Control A. [0]=RESET (self-clear), [2:1]=ST (TX start), [3]=SR (start receive), [6]=INTMODE (0=int active low) |
+| 0x01 | NCRB | R/W | 0x00 | Network Control B |
+| 0x04 | LTPS | R | 0x00 | Last TX packet status |
+| 0x05 | LRPS | R | 0x00 | Last RX packet status |
+| 0x08 | IMR | R/W | 0x00 | Interrupt mask. Bits match IR. Interrupt fires when IR & IMR != 0 |
+| 0x09 | IR | R/W | 0x00 | Interrupt register. Write 1 to clear. [7]=RBFI, [4]=TEI, [2]=TI, [1]=RI |
+| 0x0A–0x0B | BP | R/W | — | Boundary page pointer |
+| 0x0C–0x0D | TLBP | R/W | — | TX low boundary page |
+| 0x0E–0x0F | TWP | R/W | 0x00 | TX write page pointer |
+| 0x12–0x13 | TRP | R/W | 0x00 | TX read page pointer |
+| 0x16–0x17 | RWP | R | updates | RX write page pointer. Advances after each frame written |
+| 0x18–0x19 | RRP | R/W | 0x01 | RX read page pointer. GC writes to advance after consuming frames |
+| 0x1A–0x1B | RHBP | R/W | 0x0F | RX high boundary page (last valid page). Default 0x0F |
+| 0x1C | EEPROM | R/W | — | EEPROM bit-bang interface [3:0] = EECK, EECS, EEDI, EEDO |
+| 0x20–0x25 | PAR0–5 | R/W | MAC | MAC address bytes 0–5. GC writes after reading EEPROM |
+| 0x26–0x2D | MAR0–7 | R/W | 0xFF | Multicast hash table. 0xFF = accept all |
+| 0x2E | ANALOG | R/W | — | PHY analog control. GC writes 0xD6 to enable PHY |
+| 0x30 | NWAYC | R/W | — | Autoneg config. GC sets ANE + LTE bits |
+| 0x31 | NWAYS | R | 0x17 | Autoneg status. Hardcode 0x17 = 100M full duplex link up |
+| 0x32 | GCA | R/W | — | GMAC config A. GC sets AUTOPUB bit |
+| 0x33 | GCB | R/W | — | GMAC config B |
+| 0x34–0x37 | TWD | W | — | TX write data (frame length, 2 bytes LE, then ignored) |
+| 0x3A | HIPR | R | 0x01 | Host interface protocol version. Return 0x01 |
+| 0x3B | NAFR | R/W | — | Network address filter |
+| 0x3C | NWBA | R/W | — | Network write buffer address |
+| 0x48 | TXDATA | W | — | Bulk TX data port. GC streams frame bytes here |
+| 0x100+ | RX buf | R | — | RX ring buffer. GC reads frames from here |
+
+---
+
+## 11. Initialisation Sequence
+
+This is the exact sequence Swiss/GC software executes. The register file must
+respond correctly to each step.
+
+```
+1.  Assert CS, write 0x0000 (2 bytes), read 4 bytes
+    → Must return: 0x04 0x02 0x02 0x00  (device ID)
+
+2.  Write 0x01 to NCRA (0x00)       — software reset
+    → RESET bit self-clears next cycle
+    → Pulse ncra_rst to sync domain (resets W5500, clears SPRAM pointers)
+
+3.  Poll NCRA bit 0 until clear      — wait for reset complete
+    → Return 0x00 from NCRA reads after self-clear
+
+4.  Write 6 bytes to PAR0–PAR5 (0x20–0x25)
+    → Latch MAC address; forward to sync domain MAC filter shadow
+
+5.  Write 8 bytes to MAR0–MAR7 (0x26–0x2D)
+    → Typically all 0xFF (promiscuous mode)
+
+6.  Write 0xD6 to ANALOG (0x2E)     — enable PHY
+    → Store in register file; no hardware effect in FPGA
+
+7.  Write NWAYC (0x30): set bits for ANE + LTE
+    → Store; no hardware effect
+
+8.  Write IMR (0x08): typically 0x86 (RBFI | TI | RI)
+    → Enables interrupts; INT line will now assert when frames arrive
+
+9.  Write GCA (0x32): set AUTOPUB bit
+    → Store; AUTOPUB means RWP auto-updates — we always do this anyway
+
+10. Write NCRA (0x00): set SR bit (0x08) — start receive
+    → Enable RX path; the RXFrameAssembler should begin accepting frames
+
+11. Poll NWAYS (0x31) until link up
+    → Return hardcoded 0x17 immediately
+```
+
+---
+
+## 12. RX Data Path — Detailed Flow
+
+```
+W5500 receives frame on wire
+        │
+        ▼
+W5500SPIMaster detects S0_IR[RECV] (via INT_N pin)
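+Reads received size from S0_RX_RSR (Socket 0 RX Received Size, BSB=0b00001 offset 0x0026)
+Reads frame bytes from Socket 0 RX buffer (BSB=0b00011) starting at S0_RX_RD
+  (offset 0x0028); in MACRAW mode each frame is preceded by a 2-byte length header
+Advances S0_RX_RD past the frame and writes RECV (0x40) to S0_CR (offset 0x0001)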
+Pulses rx_sof, streams rx_data bytes, pulses rx_eof
+        │
+        ▼ (sync domain)
+RXFrameAssembler
+  - Checks destination MAC vs PAR shadow
+  - Checks NCRA SR bit is set (RX enabled)
+  - Computes pages_needed
+  - Checks ring buffer not full (RWP+pages != RRP)
+  - Writes descriptor + frame data into SPRAM via SPRAMArbiter
+  - Advances RWP (local register in sync domain)
+  - Pushes new RWP value into rx_wptr AsyncFIFO (sync→exi)
+  - Pulses rx_irq PulseSynchronizer (sync→exi)
+        │
+        ▼ AsyncFIFO / PulseSynchronizer crossing
+        │ (exi domain)
+BBARegisterFile
+  - Pops new RWP from rx_wptr FIFO, updates RWP register
+  - rx_irq pulse arrives: sets IR[1] (RI bit)
+  - IR & IMR now non-zero: asserts exi_int_n (INT low to GC)
+        │
+        ▼ (GC CPU, driven by interrupt or polling)
+GC reads IR register: sees RI=1
+GC reads RWP (0x16): gets updated pointer
+GC reads frame from address RRP × 0x100 (bulk read, up to 1500+ bytes)
+  → BBARegisterFile issues SPRAM read requests via spram_req FIFO (exi→sync)
+  → SPRAMArbiter services reads from SPRAM
+  → Results flow back via spram_rsp FIFO (sync→exi)
+  → Prefetch pipeline keeps data ready for SPI bit engine
+GC writes new RRP (0x18) to advance past consumed pages
+  → BBARegisterFile pushes RRP update into rx_rptr FIFO (exi→sync)
+  → RXFrameAssembler updates its local RRP shadow
+GC writes IR register with RI=1 (write-1-to-clear)
+  → IR[1] clears, INT line deasserts
+```
+
+---
+
+## 13. TX Data Path — Detailed Flow
+
+```
+GC CPU constructs ethernet frame in GC RAM
+        │
+        ▼ (GC CPU → EXI)
+GC writes 2-byte length to TWD register (0x34)
+GC writes frame bytes to TXDATA register (0x48) in chunks
+  → BBARegisterFile: each written byte goes into tx_bytes AsyncFIFO (exi→sync)
+GC writes NCRA with ST1:ST0 = 01 (transmit trigger)
+  → BBARegisterFile pushes frame_length into tx_ctrl AsyncFIFO (exi→sync)
+        │
+        ▼ AsyncFIFO crossing
+        │ (sync domain)
+TXFrameDrain
+  - Pops frame_length from tx_ctrl
+  - Pops frame_length bytes from tx_bytes
+  - Forwards to W5500SPIMaster with SOF/EOF
+        │
+        ▼ (sync domain)
+W5500SPIMaster
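+  - Reads S0_TX_FSR (TX Free Size, read-only, BSB=0b00001 offset 0x0020) until
+    free space >= frame length
+  - Reads S0_TX_WR (TX Write Pointer, BSB=0b00001 offset 0x0024)
+  - Writes frame bytes into Socket 0 TX buffer (BSB=0b00010) starting at S0_TX_WR
+  - Writes S0_TX_WR + frame length back to S0_TX_WR
+  - Writes SEND command (0x20) to S0_CR (BSB=0b00001 offset 0x0001)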
+  - Polls S0_IR until SEND_OK bit set
+  - Clears S0_IR[SEND_OK]
+  - Pulses tx_irq PulseSynchronizer (sync→exi)
+        │
+        ▼ PulseSynchronizer crossing
+        │ (exi domain)
+BBARegisterFile
+  - tx_irq arrives: sets IR[2] (TI bit), clears NCRA ST1:ST0
+  - If IMR[2] set: INT asserts to GC
+        │
+        ▼ (GC CPU)
+GC reads IR, sees TI=1
+GC writes IR with TI=1 to clear
+```
+
+---
+
+## 14. SPRAM Layout
+
+The iCE40UP5K has 4 × 32 KB SPRAM banks (128 KB total). Map them as:
+
+| SPRAM region | Size | Usage |
+|---|---|---|
+| 0x0000–0x00FF | 256 B | Reserved (address 0x00 page not used by ring buffer) |
+| 0x0100–0x0FFF | 3840 B | RX ring buffer (15 × 256-byte pages, pages 0x01–0x0F) |
+| 0x1000–0x17FF | 2048 B | TX frame staging buffer |
+| 0x1800–0x1FFF | 2048 B | Reserved / future use |
+
+The ring buffer uses pages 0x01–0x0F (15 pages × 256 bytes = 3840 bytes). This
+matches the MX98730EC default `RHBP` (RX High Boundary Page) value of 0x0F and
+`RRP` reset value of 0x01.
+
+**SPRAM addressing:** Each iCE40UP5K SB_SPRAM256KA instance is 16K × 16-bit
+(32 KB; 128 KB total across 4 instances). To address the ring buffer region as
+bytes:
+- Byte address 0x0100 maps to SPRAM word address 0x0080 (byte 0x0100 >> 1)
+- The arbiter converts byte addresses to word addresses and uses MASKWREN for
+  byte selection
+
+---
+
+## 15. Critical Timing Constraints
+
+### Must-meet timing in `exi` domain (96 MHz → 10.4 ns period)
+
+| Path | Budget | Notes |
+|---|---|---|
+| FFSynchronizer output → edge detect flip-flop | 1 cycle = 10.4 ns | Trivially met — just a register |
+| Edge detect → shift register update | 1 cycle | Register-to-register, no logic |
+| `rx_valid` → header decode → `spram_req` FIFO write | 2 cycles | Address decode is combinatorial MUX; must close at 96 MHz |
+| `tx_load` → `tx_byte` driven from register file | 1 cycle | `regs[addr]` array lookup — critical path; keep address decode combinatorial depth ≤ 4 LUTs |
+| `tx_load` → `tx_byte` driven from prefetch buffer | 1 cycle | Just a register read — trivial |
+
+### Must-meet timing in `sync` domain (48 MHz → 20.8 ns period)
+
+| Path | Budget | Notes |
+|---|---|---|
+| SPRAM read request → SPRAM address valid | 1 cycle | AsyncFIFO read + mux — easy |
+| SPRAM DATAOUT → result FIFO write | 1 cycle | Register-to-FIFO — easy |
+| W5500 SPI bit engine | N/A | Clock-enable based at 24 MHz effective; no hard timing |
+
+### Cross-domain latency budget for SPRAM prefetch
+
+```
+EXI header phase duration: 16 exi clocks at 96 MHz = 167 ns
+
+SPRAM prefetch round trip:
+  exi → spram_req FIFO write:         1 exi  tick  = 10 ns
+  FIFO cross-domain:                  2 sync ticks  = 42 ns
+  SPRAM read (1 cycle latency):       1 sync tick   = 21 ns
+  Result → spram_rsp FIFO write:      1 sync tick   = 21 ns
+  FIFO cross-domain:                  2 exi  ticks  = 21 ns
+  Result available in prefetch buffer:               = 21 ns
+  Total:                                            ~136 ns
+
+136 ns < 167 ns header window → prefetch completes before first data bit needed ✓
+```
+
+This is the tightest timing consideration in the design. The prefetch must be
+issued during HEADER1 (not after) to make the deadline.
+
+---
+
+## 16. SPRAM Read Prefetch Pipeline
+
+The prefetch pipeline ensures MISO data is always ready before the SPI slave
+needs it for the data phase.
+
+### State machine (in BBARegisterFile, exi domain)
+
+```
+State HEADER1 (decoding second header byte):
+  If is_read AND address >= 0x100:
+    push address into spram_req AsyncFIFO  ← issued NOW, during header decode
+    set prefetch_pending = True
+
+State DATA (read phase):
+  On each tx_load pulse:
+    If prefetch_pending AND spram_rsp FIFO has data:
+      pop byte from spram_rsp FIFO
+      load into tx_byte
+      push (address + byte_ctr + 1) into spram_req for NEXT byte  ← pipelining
+    Elif address < 0x100:
+      tx_byte = regs[address + byte_ctr]  ← direct register file read
+```
+
+### Pipeline depth
+
+The `spram_req` and `spram_rsp` FIFOs each have depth 4. This allows up to 4
+read requests to be in-flight simultaneously, which absorbs any SPRAM arbiter
+stalls (ETH write winning the arbitration) without stalling the SPI data phase.
+
+### SPRAM arbiter stall handling
+
+If the SPRAM arbiter defers an EXI read by 1 cycle (due to ETH write priority),
+the `spram_rsp` FIFO may be momentarily empty when `tx_load` arrives. The
+BBARegisterFile cannot genuinely stall the SPI slave in this case.
+
+The GC is the SPI master and controls CLK, so the slave has no way to pause a
+transfer, either mid-bit or at a byte boundary. If the next byte is not ready
+when `tx_load` arrives, hold MISO at a constant level (0 or 1) for that whole
+byte; the GC will simply clock in a garbage byte. There is no retry mechanism,
+so an underrun must be prevented by design rather than recovered from.
+
+**Practical note:** At 48 MHz sync with 24 MHz effective W5500 access rate, the
+ETH write path can only consume the SPRAM arbiter for ~1 sync cycle per byte
+written. The EXI read path gets the remaining cycles. With 4-deep FIFOs the
+pipeline should almost never stall in practice. Monitor the stall condition in
+simulation.
+
+---
+
+## 17. Interrupt Handling
+
+The `exi_int_n` output (pin 3 of SP1) is active-low. Assert it (drive low)
+when `IR & IMR != 0`.
+
+```python
+# In BBARegisterFile, exi domain:
+ir_masked = Signal(8)
+m.d.comb += ir_masked.eq(regs[BBARegs.IR] & regs[BBARegs.IMR])
+m.d.exi += exi_int_n.eq(~ir_masked.any())
+```
+
+Register the output — do not drive `exi_int_n` combinatorially. A registered
+output prevents glitches from propagating onto the GC board.
+
+**Interrupt sources and IR bit assignments:**
+
+| IR bit | Name | Set by | Cleared by |
+|---|---|---|---|
+| 7 | RBFI | RXFrameAssembler when ring full | GC write-1-to-clear |
+| 4 | TEI | TXFrameDrain on TX error | GC write-1-to-clear |
+| 2 | TI | tx_irq pulse from sync | GC write-1-to-clear |
+| 1 | RI | rx_irq pulse from sync | GC write-1-to-clear |
+
+The GC typically masks in IMR: 0x86 = 0b10000110 (RBFI | TI | RI).
+
+---
+
+## 18. EEPROM / MAC Address
+
+The GC software reads the MAC address from the 93C46 EEPROM during
+initialisation (bit-banging through register 0x1C). It then writes the MAC
+to PAR0–PAR5.
+
+**Recommended approach for initial implementation:**
+
+Skip full 93C46 emulation. Pre-populate PAR0–PAR5 (`regs[0x20]`–`regs[0x25]`)
+in the register file reset state with a valid MAC. Use Nintendo's OUI
+`00:09:BF` for the first 3 bytes, with arbitrary device-specific values for
+the last 3:
+
+```
+MAC: 00:09:BF:00:00:01
+```
+
+Verify against Swiss source whether it validates the MAC read from EEPROM or
+accepts whatever PAR0–PAR5 contains. If it re-reads EEPROM after writing PAR,
+a full 93C46 model is required. If it only uses PAR0–PAR5, pre-populating the
+register file is sufficient.
+
+**MAC address propagation:**
+
+When the GC writes PAR0–PAR5, forward the new MAC to the W5500 SHAR register
+via the `sync` domain. Use a 6-byte AsyncFIFO or a dedicated MAC update pulse.
+The W5500 uses SHAR as its source MAC for all transmitted frames.
+
+---
+
+## 19. iCE40UP5K Resource Budget
+
+| Resource | Available | Estimated use | Margin |
+|---|---|---|---|
+| Logic cells (4-LUT + FF) | 5280 | ~1800 | 66% free |
+| EBR (4 Kbit blocks) | 30 (120 Kbit) | 4 (FIFOs) | 26 free |
+| SPRAM (32 KB banks) | 4 (128 KB) | 1 bank for ring buffer | 3 free |
+| PLL | 1 | 1 (for exi domain) | 0 free |
+| SB_HFOSC | 1 | 1 (sync domain) | 0 free |
+| I/O pins | 39 usable | ~14 (EXI:5 + W5500:6 + misc:3) | 25 free |
+
+**Logic cell breakdown:**
+
+| Module | Estimated cells |
+|---|---|
+| SPIMode3Slave | 90 |
+| BBARegisterFile FSM + decode | 250 |
+| Register file (512 × 8b) | ~200 (distributed RAM) |
+| AsyncFIFO × 8 | 400 |
+| PulseSynchronizer × 4 | 40 |
+| FFSynchronizer × 5 | 30 |
+| SPRAMArbiter | 80 |
+| RXFrameAssembler | 200 |
+| TXFrameDrain | 150 |
+| W5500SPIMaster | 200 |
+| EEPROMModel | 100 |
+| Misc glue | 60 |
+| **Total** | **~1800** |
+
+iCE40UP5K fmax with nextpnr: typically 60–80 MHz for logic of this complexity.
+The `exi` domain at 96 MHz is the tightest. If nextpnr fails to close timing:
+
+1. First option: reduce to 64 MHz `exi` domain (icepll alternative).
+2. Second option: reduce EXI bus speed in Swiss settings to 16 MHz (clock index
+   4 instead of 5), halving the FPGA timing requirement.
+3. Third option: add pipeline registers on the critical address decode path.
+
+---
+
+## 20. PCB / Connector Notes
+
+### Interposer PCB
+
+A simple pass-through interposer PCB connects the GC SP1 slot to the iCEbreaker
+via a ribbon cable or header.
+
+**Required PCB spec:**
+- Thickness: **1.2 mm** (not standard 1.6 mm — critical for fit)
+- Copper finish: **ENIG (gold)** — prevents oxidation on edge contacts
+- Board material: FR4 standard
+
+**Footprint source:** Copy the edge connector footprint from
+`github.com/silverstee1/SP1ETH` KiCad files. Do not design from scratch.
+The staggered dual-row geometry requires exact pad positions that have been
+physically verified. Cross-reference with the ETH2SP1 LaserBear open files.
+
+**Additional interposer components:**
+- 10 kΩ resistor: EXTIN (pin 1) to 3.3V (pin 7) — device detect
+- 100 µF capacitor: 3.3V to GND — bulk decoupling near connector
+- 100 nF capacitor × 2: additional HF decoupling
+- ESD protection diode array: on CLK, MOSI, MISO, CS lines (optional but
+  recommended — the GC motherboard is difficult to repair if damaged)
+
+**Do not connect pin 5 (12V) to anything on the FPGA side.**
+
+### iCEbreaker connection
+
+The interposer PCB exposes EXI signals on a 2.54 mm pitch 8-pin header.
+Connect to iCEbreaker PMOD1 connector using a short ribbon cable. Keep the
+cable as short as possible (< 10 cm) to minimize signal integrity issues at
+32 MHz.
+
+---
+
+## 21. Known Hardware Quirks
+
+### EXI DMA bug
+
+The GC's EXI DMA engine has a bug where data on the MISO line during a DMA
+write is clocked back out with a 1-bit shift. This only affects GC software
+doing DMA writes (rare). Swiss uses IMM (immediate) mode transfers. No FPGA
+workaround needed.
+
+### SPI Mode 3 vs Mode 0
+
+Every other EXI device (memory cards, RTC, IPL) uses SPI Mode 0. The BBA
+is the only device using Mode 3. Do not share the SPI slave implementation
+with other EXI device implementations without parameterising CPOL/CPHA.
+
+### MISO tristate
+
+On real hardware, MISO (DO) is tristated when CS is deasserted. Other EXI
+devices on the same bus would otherwise conflict. On this FPGA implementation,
+tristate MISO whenever CS is deasserted as well: the iCE40UP5K `SB_IO`
+primitive has an output-enable input (request the pin with `dir="oe"` in
+Amaranth and drive `oe` from the synchronised CS). A dedicated CS line
+(SP1 device 2) does not make drive-high safe, because the data lines may still
+be shared with other devices on the same EXI channel (memory cards and the
+RTC) — verify the channel wiring before driving MISO while deselected.
+
+### GC hardware revisions
+
+- DOL-001 (original): SP1 present, BBA compatible
+- DOL-001 Rev B: SP1 physically absent on motherboard but case hole present
+- DOL-101 (later): SP1 present again (but Serial Port 2 absent)
+- Panasonic Q: SP1 present
+
+Swiss supports all revisions with SP1 via the EXI hypervisor driver (required
+from Swiss build 1788 onwards for BBA emulation features).
+
+### EXI clock index
+
+The real BBA uses clock index 5 (32 MHz). Swiss allows configuring a lower
+clock index for compatibility. If 96 MHz fmax is not achievable, instruct users
+to configure Swiss to use clock index 4 (16 MHz EXI), which at the same 3×
+oversampling ratio requires only a 48 MHz `exi` domain and is readily
+achievable.
+
+---
+
+## 22. File Structure
+
+```
+gc_bba_fpga/
+├── exi_bba/
+│   ├── __init__.py
+│   ├── spi_mode3_slave.py       # SPIMode3Slave
+│   ├── bba_register_file.py     # BBARegisterFile + register constants
+│   ├── spram_arbiter.py         # SPRAMArbiter
+│   ├── rx_frame_assembler.py    # RXFrameAssembler
+│   ├── tx_frame_drain.py        # TXFrameDrain
+│   ├── w5500_spi_master.py      # W5500SPIMaster
+│   ├── eeprom_model.py          # EEPROMModel (93C46)
+│   └── bba_top.py               # BBATop + clock domain setup
+├── sim/
+│   ├── sim_spi_slave.py         # SPIMode3Slave unit test
+│   ├── sim_register_file.py     # BBARegisterFile unit test
+│   ├── sim_bba_init.py          # Full init sequence simulation
+│   ├── sim_rx_path.py           # RX data path end-to-end test
+│   ├── sim_tx_path.py           # TX data path end-to-end test
+│   ├── gc_master_model.py       # GC CPU SPI master simulation model
+│   ├── w5500_slave_model.py     # W5500 SPI slave simulation model
+│   └── ethernet_frame_gen.py    # Test frame generator
+├── platform/
+│   ├── icebreaker_bba.py        # iCEbreaker platform with BBA resources
+│   └── interposer_pinmap.py     # SP1 ↔ PMOD pin mapping
+├── pcb/
+│   ├── interposer/              # KiCad project for interposer PCB
+│   └── README.md                # PCB ordering instructions (1.2mm, ENIG)
+├── constraints/
+│   └── timing.py                # nextpnr timing constraints (if needed)
+├── tests/
+│   └── test_bba.py              # pytest suite
+├── build.py                     # Amaranth build script
+└── README.md
+```
+
+---
+
+## 23. Simulation Strategy
+
+Each module should have a standalone simulation before integration. All
+simulations use Amaranth's `Simulator` with two clock domains:
+`sim.add_clock(1/96e6, domain="exi")` and `sim.add_clock(1/48e6, domain="sync")`.
+
+### Unit tests
+
+**SPIMode3Slave:** Drive CLK/MOSI/CS manually from a process in the `exi`
+domain. Verify `rx_byte`/`rx_valid` match sent data. Verify `spi_miso`
+matches pre-loaded `tx_byte`. Test CS abort mid-byte.
+
+**BBARegisterFile:** Use a `GCMasterModel` (SPI Mode 3 master process) to
+perform read/write transactions. Verify register writes are stored. Verify
+register reads return correct values. Verify IR bit setting and clearing.
+Verify NWAYS returns 0x17. Verify ID query returns 0x04020200.
+
+**SPRAMArbiter:** Issue concurrent EXI reads and ETH writes. Verify ETH writes
+win arbitration. Verify EXI reads complete within 3 sync cycles. Verify no
+data corruption.
+
+**RXFrameAssembler:** Feed a known ethernet frame byte-by-byte. Verify SPRAM
+contents match expected descriptor + frame layout. Verify RWP advances by
+correct page count. Verify rx_irq fires.
+
+**TXFrameDrain + W5500SPIMaster:** Issue TX frame from `tx_bytes` FIFO. Use
+`W5500SlaveModel` process to simulate W5500 responses. Verify frame bytes
+arrive at W5500 correctly. Verify tx_irq fires after SEND_OK.
+
+### Integration tests
+
+**sim_bba_init.py:** Full GC init sequence (all 11 steps from Section 11).
+`GCMasterModel` performs every transaction. Verify no stalls, correct responses.
+
+**sim_rx_path.py:** `W5500SlaveModel` delivers a 64-byte test frame.
+`GCMasterModel` polls IR, reads RWP, bulk-reads the frame, advances RRP.
+Verify GC receives identical bytes to what W5500 sent.
+
+**sim_tx_path.py:** `GCMasterModel` writes a 64-byte frame through TXDATA.
+`W5500SlaveModel` captures it. Verify W5500 receives identical bytes.
+
+---
+
+## 24. Open Issues and Extension Points
+
+### Must resolve before first synthesis
+
+- [ ] Exact PLL parameters for iCE40UP5K: run `icepll -i 12 -o 96` and
+      confirm the output is achievable (VCO in 533–1066 MHz range).
+- [ ] SP1 connector footprint: clone SP1ETH repo, extract pad positions, verify
+      stagger geometry and pitch before PCB layout.
+- [ ] W5500 Pmod module pin mapping: confirm which Pmod pins INT_N and RST_N
+      appear on (varies by module vendor).
+- [ ] Swiss version requirement: confirm Swiss build ≥ 1788 for BBA hypervisor
+      support. Earlier builds use a different driver that may have different
+      register access patterns.
+
+### Known limitations
+
+- Single TX buffer (MX98730EC has two). ST1:ST0 = 01 and 10 are treated
+  identically. No known GC title relies on dual TX buffering.
+- No DMA mode support. IMM mode only. Matches real-world Swiss usage.
+- No Serial Port 2 support (different connector, different project scope).
+- 93C46 EEPROM emulation is simplified (hardcoded MAC). A full bit-bang
+  model can be added later if Swiss requires it.
+- RX ring buffer is 15 pages (3840 bytes). The real BBA has 4KB. Frames
+  larger than ~3800 bytes (jumbo frames) will be dropped. Standard 1500-byte
+  MTU frames fit in at most 7 pages — no practical issue.
+
+### Extension points
+
+- **Larger ring buffer:** Use additional SPRAM banks for more RX buffering.
+- **Multiple sockets:** W5500 supports 8 sockets; only socket 0 in MACRAW
+  mode is used here.
+- **Link status passthrough:** Read W5500 PHYCFGR register and forward real
+  link status to NWAYS instead of hardcoding 0x17.
+- **Statistics counters:** LTPS/LRPS (last packet status) are currently 0x00.
+  A more complete implementation would fill these from W5500 socket status.
+- **Serial Port 2 support:** Different physical connector and EXI channel but
+  same FPGA logic; would require a second interposer PCB.
diff --git a/examples/amaranth_cdc.py b/examples/amaranth_cdc.py
new file mode 100644
index 0000000..deb22ce
--- /dev/null
+++ b/examples/amaranth_cdc.py
@@ -0,0 +1,107 @@
+from amaranth import *
+from amaranth.sim import Simulator
+
+
+class SyncFF(Elaboratable):
+    """Width-N multi-flop synchronizer from `src_domain` to `dst_domain`.
+
+    Use when the source is a level signal that may be stable for multiple destination
+    cycles. Not suitable for single-cycle pulses (use TogglePulseSync instead).
+    """
+
+    def __init__(self, width=1, src_domain="src", dst_domain="dst"):
+        self.width = width
+        self.src_domain = src_domain
+        self.dst_domain = dst_domain
+        self.src = Signal(self.width)
+        self.dst = Signal(self.width)
+
+    def elaborate(self, platform):
+        m = Module()
+        reg_src = Signal(self.width)
+        ff0 = Signal(self.width)
+        ff1 = Signal(self.width)
+
+        m.d[self.src_domain] += reg_src.eq(self.src)
+        m.d[self.dst_domain] += ff0.eq(reg_src)
+        m.d[self.dst_domain] += ff1.eq(ff0)
+        m.d.comb += self.dst.eq(ff1)
+
+        return m
+
+
+class TogglePulseSync(Elaboratable):
+    """Reliable pulse transfer from `src_domain` into `dst_domain`.
+
+    - Source toggles `toggle` whenever an event occurs.
+    - Destination synchronizes the toggle and detects edges.
+    Guarantees ordering and no lost pulses for single-bit events.
+    """
+
+    def __init__(self, src_domain="src", dst_domain="dst"):
+        self.src_domain = src_domain
+        self.dst_domain = dst_domain
+        self.src_pulse = Signal()
+        self.dst_pulse = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+        toggle = Signal()
+        sync0 = Signal()
+        sync1 = Signal()
+        prev = Signal()
+        edge = Signal()
+
+        # Source domain: flip the toggle when a pulse arrives
+        m.d[self.src_domain] += If(self.src_pulse, toggle.eq(~toggle))
+
+        # Destination domain: two-flop synchronize the toggle
+        m.d[self.dst_domain] += sync0.eq(toggle)
+        m.d[self.dst_domain] += sync1.eq(sync0)
+
+        # Detect the change in the destination domain
+        m.d[self.dst_domain] += edge.eq(sync1 ^ prev)
+        m.d[self.dst_domain] += prev.eq(sync1)
+        m.d.comb += self.dst_pulse.eq(edge)
+
+        return m
+
+
+def _sim_toggle_pulse():
+    """Simple simulation that drives pulses on the source domain and prints detections on the destination domain."""
+
+    top = Module()
+    t = TogglePulseSync(src_domain="src", dst_domain="dst")
+    top.submodules.t = t
+
+    sim = Simulator(top)
+    # Create two asynchronous clocks (periods chosen arbitrarily for the sim)
+    sim.add_clock(1e-6, domain="src")
+    sim.add_clock(1.5e-6, domain="dst")
+
+    def process():
+        # Wait a little, then generate three source pulses at different phases
+        for _ in range(5):
+            yield
+
+        for i in range(3):
+            yield t.src_pulse.eq(1)
+            yield
+            yield t.src_pulse.eq(0)
+            # let the domains run for a few cycles
+            for _ in range(10):
+                dp = (yield t.dst_pulse)
+                if dp:
+                    print(f"dst detected pulse at sim tick")
+                yield
+
+        # run a bit longer to observe behavior
+        for _ in range(20):
+            yield
+
+    sim.add_sync_process(process, domain="src")
+    sim.run_until(100e-6)
+
+
+# Run the toggle-synchronizer demo when this file is executed as a script.
+if __name__ == "__main__":
+    _sim_toggle_pulse()
diff --git a/examples/async_fifo.py b/examples/async_fifo.py
new file mode 100644
index 0000000..b8686ad
--- /dev/null
+++ b/examples/async_fifo.py
@@ -0,0 +1,182 @@
+from amaranth import *
+from amaranth.sim import Simulator
+
+
+def bin_to_gray(x):
+    return x ^ (x >> 1)
+
+
+def gray_to_bin(g, width):
+    # convert gray to binary iteratively
+    b = 0
+    for i in range(width - 1, -1, -1):
+        if i == width - 1:
+            b |= ((g >> i) & 1) << i
+        else:
+            b |= (((b >> (i + 1)) & 1) ^ ((g >> i) & 1)) << i
+    return b
+
+
+class AsyncFIFO(Elaboratable):
+    """Parameterizable gray-pointer dual-clock FIFO.
+
+    - width: data width in bits
+    - depth: must be a power of two
+    - wdomain: write (source) domain name
+    - rdomain: read (destination) domain name
+    """
+
+    def __init__(self, width=1, depth=16, wdomain="src", rdomain="dst"):
+        assert depth & (depth - 1) == 0
+        self.width = width
+        self.depth = depth
+        self.aw = (depth - 1).bit_length()  # address width
+        self.wdomain = wdomain
+        self.rdomain = rdomain
+
+        # write-side interface
+        self.wdata = Signal(width)
+        self.w_en = Signal()
+        self.w_full = Signal()
+
+        # read-side interface
+        self.rdata = Signal(width)
+        self.r_en = Signal()
+        self.r_valid = Signal()
+        self.r_empty = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        mem = Memory(width=self.width, depth=self.depth)
+        wp = mem.write_port(domain=self.wdomain)
+        rp = mem.read_port(domain=self.rdomain, transparent=False)
+        m.submodules += wp, rp
+
+        # pointers are AW+1 bits (extra MSB for wrap)
+        wbin = Signal(self.aw + 1)
+        wgray = Signal(self.aw + 1)
+        rbin = Signal(self.aw + 1)
+        rgray = Signal(self.aw + 1)
+
+        # synchronized opposing domain gray pointers
+        rgray_sync0 = Signal(self.aw + 1)
+        rgray_sync1 = Signal(self.aw + 1)
+        wgray_sync0 = Signal(self.aw + 1)
+        wgray_sync1 = Signal(self.aw + 1)
+
+        # write domain logic
+        with m.Domain(self.wdomain):
+            waddr = Signal(self.aw)
+            next_wbin = Signal(self.aw + 1)
+            next_wgray = Signal(self.aw + 1)
+
+            # compute next pointer
+            m.d.comb += next_wbin.eq(wbin + self.w_en)
+            m.d.comb += next_wgray.eq(next_wbin ^ (next_wbin >> 1))
+
+            # synchronize rgray into write domain (two flops per bit)
+            m.d.comb += []
+            for i in range(self.aw + 1):
+                m.d[self.wdomain] += rgray_sync0[i].eq(rgray[i])
+                m.d[self.wdomain] += rgray_sync1[i].eq(rgray_sync0[i])
+
+            # full detection: next_wgray equals rgray_sync with top two bits inverted
+            if self.aw >= 1:
+                top = self.aw
+                msb_cmp = Signal()
+                low_eq = Signal()
+                m.d.comb += low_eq.eq(next_wgray[top - 1:0] == rgray_sync1[top - 1:0])
+                m.d.comb += msb_cmp.eq((next_wgray[top] != rgray_sync1[top]) & (next_wgray[top - 1] != rgray_sync1[top - 1]))
+                m.d.comb += self.w_full.eq(low_eq & msb_cmp)
+            else:
+                # depth==2 special case
+                m.d.comb += self.w_full.eq(next_wgray != rgray_sync1)
+
+            # write to memory when enabled & not full
+            with m.If(self.w_en & ~self.w_full):
+                m.d[self.wdomain] += wp.addr.eq(wbin[self.aw - 1:0])
+                m.d[self.wdomain] += wp.data.eq(self.wdata)
+                m.d[self.wdomain] += wp.en.eq(1)
+                m.d[self.wdomain] += wbin.eq(next_wbin)
+                m.d[self.wdomain] += wgray.eq(next_wgray)
+            with m.Else():
+                m.d[self.wdomain] += wp.en.eq(0)
+
+        # read domain logic
+        with m.Domain(self.rdomain):
+            raddr = Signal(self.aw)
+            next_rbin = Signal(self.aw + 1)
+            next_rgray = Signal(self.aw + 1)
+
+            # compute next pointer
+            m.d.comb += next_rbin.eq(rbin + self.r_en)
+            m.d.comb += next_rgray.eq(next_rbin ^ (next_rbin >> 1))
+
+            # synchronize wgray into read domain
+            for i in range(self.aw + 1):
+                m.d[self.rdomain] += wgray_sync0[i].eq(wgray[i])
+                m.d[self.rdomain] += wgray_sync1[i].eq(wgray_sync0[i])
+
+            # empty detection
+            m.d.comb += self.r_empty.eq(rgray == wgray_sync1)
+
+            # read when enabled and not empty
+            with m.If(self.r_en & ~self.r_empty):
+                m.d[self.rdomain] += rp.addr.eq(rbin[self.aw - 1:0])
+                m.d[self.rdomain] += rp.en.eq(1)
+                m.d[self.rdomain] += rbin.eq(next_rbin)
+                m.d[self.rdomain] += rgray.eq(next_rgray)
+                m.d[self.rdomain] += self.r_valid.eq(1)
+                m.d[self.rdomain] += self.rdata.eq(rp.data)
+            with m.Else():
+                m.d[self.rdomain] += rp.en.eq(0)
+                m.d[self.rdomain] += self.r_valid.eq(0)
+
+        return m
+
+
+def _sim_fifo():
+    top = Module()
+    fifo = AsyncFIFO(width=1, depth=16, wdomain="src", rdomain="dst")
+    top.submodules.fifo = fifo
+
+    sim = Simulator(top)
+    sim.add_clock(1e-6, domain="src")
+    sim.add_clock(1.7e-6, domain="dst")
+
+    def writer():
+        # write a sequence of bits (0..31 repeating pattern)
+        for i in range(32):
+            yield fifo.wdata.eq(i & 1)
+            yield fifo.w_en.eq(1)
+            yield
+            yield fifo.w_en.eq(0)
+            # allow some idle cycles
+            for _ in range((i % 3)):
+                yield
+
+    def reader():
+        seen = []
+        for _ in range(200):
+            # try to consume if not empty
+            empty = (yield fifo.r_empty)
+            if not empty:
+                yield fifo.r_en.eq(1)
+                yield
+                yield fifo.r_en.eq(0)
+                if (yield fifo.r_valid):
+                    d = (yield fifo.rdata)
+                    seen.append(d)
+                    print(f"read: {d}")
+            else:
+                yield
+        print(f"total read: {len(seen)}")
+
+    sim.add_sync_process(writer, domain="src")
+    sim.add_sync_process(reader, domain="dst")
+    sim.run()
+
+
+# Run the FIFO demo simulation when this file is executed as a script.
+if __name__ == "__main__":
+    _sim_fifo()
diff --git a/examples/icebreaker_fifo.py b/examples/icebreaker_fifo.py
new file mode 100644
index 0000000..5507400
--- /dev/null
+++ b/examples/icebreaker_fifo.py
@@ -0,0 +1,119 @@
+"""IceBreaker (iCE40 UP5K) vendor-backed async FIFO example.
+
+This module uses Amaranth's `Memory` with separate write/read ports in different
+clock domains. With the icestorm toolchain the memory typically maps to
+`SB_RAM40_4K` block RAMs. The control (full/empty) is implemented with
+gray-pointer logic and two-stage synchronization of opposing pointers.
+
+Notes:
+- This prefers block RAM for storage (small LUT usage, lower power).
+- The write/read ports are in independent domains; backend maps ports to
+  dual-port RAM primitives when available.
+"""
+
+from amaranth import *
+
+
+class Ice40AsyncFIFO(Elaboratable):
+    def __init__(self, depth=256, wdomain="src", rdomain="dst"):
+        assert depth & (depth - 1) == 0, "depth must be power of two"
+        self.depth = depth
+        self.aw = (depth - 1).bit_length()
+        self.wdomain = wdomain
+        self.rdomain = rdomain
+
+        # serial (1-bit) interface
+        self.wdata = Signal()
+        self.w_en = Signal()
+        self.w_full = Signal()
+
+        self.rdata = Signal()
+        self.r_en = Signal()
+        self.r_valid = Signal()
+        self.r_empty = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # single-bit-wide memory mapped to vendor BRAMs by the backend
+        mem = Memory(width=1, depth=self.depth)
+        wp = mem.write_port(domain=self.wdomain)
+        rp = mem.read_port(domain=self.rdomain, transparent=False)
+        m.submodules += wp, rp
+
+        # pointers (aw+1 bits to include wrap bit)
+        wbin = Signal(self.aw + 1)
+        wgray = Signal(self.aw + 1)
+        rbin = Signal(self.aw + 1)
+        rgray = Signal(self.aw + 1)
+
+        # sync registers for opposing pointers (two-stage)
+        rgray_sync0 = Signal(self.aw + 1)
+        rgray_sync1 = Signal(self.aw + 1)
+        wgray_sync0 = Signal(self.aw + 1)
+        wgray_sync1 = Signal(self.aw + 1)
+
+        # write-side
+        with m.Domain(self.wdomain):
+            next_wbin = Signal(self.aw + 1)
+            next_wgray = Signal(self.aw + 1)
+            m.d.comb += next_wbin.eq(wbin + self.w_en)
+            m.d.comb += next_wgray.eq(next_wbin ^ (next_wbin >> 1))
+
+            # sync read pointer into write domain
+            for i in range(self.aw + 1):
+                m.d[self.wdomain] += rgray_sync0[i].eq(rgray[i])
+                m.d[self.wdomain] += rgray_sync1[i].eq(rgray_sync0[i])
+
+            # full detection (standard gray-pointer trick)
+            top = self.aw
+            low_eq = Signal()
+            msb_cmp = Signal()
+            m.d.comb += low_eq.eq(next_wgray[top - 1:0] == rgray_sync1[top - 1:0])
+            m.d.comb += msb_cmp.eq((next_wgray[top] != rgray_sync1[top]) & (next_wgray[top - 1] != rgray_sync1[top - 1]))
+            m.d.comb += self.w_full.eq(low_eq & msb_cmp)
+
+            # perform write
+            with m.If(self.w_en & ~self.w_full):
+                m.d[self.wdomain] += wp.addr.eq(wbin[self.aw - 1:0])
+                m.d[self.wdomain] += wp.data.eq(self.wdata)
+                m.d[self.wdomain] += wp.en.eq(1)
+                m.d[self.wdomain] += wbin.eq(next_wbin)
+                m.d[self.wdomain] += wgray.eq(next_wgray)
+            with m.Else():
+                m.d[self.wdomain] += wp.en.eq(0)
+
+        # read-side
+        with m.Domain(self.rdomain):
+            next_rbin = Signal(self.aw + 1)
+            next_rgray = Signal(self.aw + 1)
+            m.d.comb += next_rbin.eq(rbin + self.r_en)
+            m.d.comb += next_rgray.eq(next_rbin ^ (next_rbin >> 1))
+
+            # sync write pointer into read domain
+            for i in range(self.aw + 1):
+                m.d[self.rdomain] += wgray_sync0[i].eq(wgray[i])
+                m.d[self.rdomain] += wgray_sync1[i].eq(wgray_sync0[i])
+
+            m.d.comb += self.r_empty.eq(rgray == wgray_sync1)
+
+            with m.If(self.r_en & ~self.r_empty):
+                m.d[self.rdomain] += rp.addr.eq(rbin[self.aw - 1:0])
+                m.d[self.rdomain] += rp.en.eq(1)
+                m.d[self.rdomain] += rbin.eq(next_rbin)
+                m.d[self.rdomain] += rgray.eq(next_rgray)
+                m.d[self.rdomain] += self.r_valid.eq(1)
+                m.d[self.rdomain] += self.rdata.eq(rp.data)
+            with m.Else():
+                m.d[self.rdomain] += rp.en.eq(0)
+                m.d[self.rdomain] += self.r_valid.eq(0)
+
+        return m
+
+
+if __name__ == "__main__":
+    # Quick smoke-check: instantiate and print fragment
+    from amaranth.back import verilog
+
+    fifo = Ice40AsyncFIFO(depth=256)
+    print(verilog.convert(fifo, ports=[fifo.wdata, fifo.w_en, fifo.w_full, fifo.rdata, fifo.r_en, fifo.r_valid, fifo.r_empty]))
diff --git a/exi_bba/__init__.py b/exi_bba/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/exi_bba/bba_register_file.py b/exi_bba/bba_register_file.py
new file mode 100644
index 0000000..0bac2b5
--- /dev/null
+++ b/exi_bba/bba_register_file.py
@@ -0,0 +1,617 @@
+"""BBA register file — EXI domain.
+
+Decodes EXI transactions (2-byte header + N data bytes), reads/writes the BBA
+register space, and owns all AsyncFIFO / PulseSynchronizer CDC primitives.
+
+Transaction header format
+--------------------------
+Byte 0  [7]    write_flag
+        [6:0]  addr[12:6]
+Byte 1  [7:2]  addr[5:0]
+        [1:0]  xfer_len−1   (0=1B, 1=2B, 2=3B, 3=4B)
+
+Addresses 0x0000–0x00FF : register file (sparse individual Signals, exi domain).
+Addresses 0x0100–0x1FFF : SPRAM ring buffer (sync domain, prefetch FIFOs).
+"""
+
+from amaranth import *
+from amaranth.lib.cdc import PulseSynchronizer
+from amaranth.lib.fifo import AsyncFIFO
+
+__all__ = ["BBARegisterFile"]
+
+# Register addresses
+_NCRA    = 0x00
+_IMR     = 0x08
+_IR      = 0x09
+_RWP_LO  = 0x16
+_RWP_HI  = 0x17
+_RRP_LO  = 0x18
+_RRP_HI  = 0x19
+_PAR0    = 0x20
+_PAR1    = 0x21
+_PAR2    = 0x22
+_PAR3    = 0x23
+_PAR4    = 0x24
+_PAR5    = 0x25
+_NWAYS   = 0x31
+_HIPR    = 0x3A
+_TWD_LO  = 0x34
+_TWD_HI  = 0x35
+_TXDATA  = 0x48
+
+# Read-only hardcoded values
+_NWAYS_VAL = 0x17
+_HIPR_VAL  = 0x01
+
+# Device ID returned on first 4-byte read of addr 0x0000
+_DEVICE_ID = [0x04, 0x02, 0x02, 0x00]
+
+
+class BBARegisterFile(Elaboratable):
+    """EXI transaction decoder and BBA register file with CDC bridges.
+
+    Sync-domain FIFO/pulse ports are wired by BBATop to the sync-domain modules.
+    """
+
+    def __init__(self):
+        """Declare the module's externally visible ports; no logic is built here.
+
+        The attributes fall into four groups: the EXI byte-stream handshake
+        with ExiCapture, status/configuration outputs (`exi_int_n`, `par`,
+        `ncra_sr`), the far-side ports of the CDC FIFOs that `elaborate()`
+        instantiates, and the pulse-synchronizer ports.
+
+        Signal names are inferred by Amaranth from the attribute each Signal is
+        assigned to, so renaming an attribute also renames the signal in
+        generated output.
+        """
+        # ── EXI byte-stream interface (exi domain, from/to ExiCapture) ────
+        # RX: received bytes (header + write data + read dummies) — FWFT read
+        # side of ExiCapture's rx_fifo.
+        self.rx_data = Signal(8)   # in:  byte at the head of rx_fifo
+        self.rx_rdy  = Signal()    # in:  rx_data is valid
+        self.rx_en   = Signal()    # out: consume rx_data this cycle
+        # TX: response bytes pushed proactively into ExiCapture's tx_fifo.
+        self.tx_data = Signal(8)   # out: response byte
+        self.tx_en   = Signal()    # out: push tx_data this cycle
+        self.tx_rdy  = Signal()    # in:  presumably "tx_fifo can accept a byte" — confirm in ExiCapture
+
+        # High while an EXI transaction is in progress (from ExiCapture).
+        # SPRAM reads stream until this deasserts → supports variable-length
+        # (DMA) bulk reads, not just ≤4-byte immediate transfers.
+        self.cs_active = Signal()
+
+        # ── Interrupt (exi domain) ────────────────────────────────────────
+        # Initialised to 1; presumably active-low (per the `_n` suffix), i.e.
+        # idle/inactive after reset — confirm against the driving logic.
+        self.exi_int_n = Signal(init=1)
+
+        # ── PAR output (for forwarding to W5500 as source MAC) ───────────
+        self.par = Signal(48)   # PAR0-5 packed: PAR0 in low byte par[0:8]
+
+        # NCRA[3] = SR (start receive) bit — gates the RX ring-buffer path.
+        self.ncra_sr = Signal()
+
+        # ── CDC FIFO sync-domain sides (wired by BBATop) ──────────────────
+        # For every FIFO below the naming follows the FIFO port it mirrors:
+        # `*_r_*` attributes expose a FIFO's read side (data/rdy out, en in),
+        # `*_w_*` attributes expose a FIFO's write side (data/en in, rdy out).
+
+        # SPRAM request  exi→sync: sync reads these
+        self.spram_req_r_data = Signal(16)
+        self.spram_req_r_en   = Signal()
+        self.spram_req_r_rdy  = Signal()
+
+        # SPRAM response sync→exi: sync writes these
+        self.spram_rsp_w_data = Signal(8)
+        self.spram_rsp_w_en   = Signal()
+        self.spram_rsp_w_rdy  = Signal()
+
+        # TX bytes  exi→sync: sync reads these
+        self.tx_bytes_r_data  = Signal(8)
+        self.tx_bytes_r_en    = Signal()
+        self.tx_bytes_r_rdy   = Signal()
+
+        # TX ctrl (frame length)  exi→sync: sync reads these
+        self.tx_ctrl_r_data   = Signal(16)
+        self.tx_ctrl_r_en     = Signal()
+        self.tx_ctrl_r_rdy    = Signal()
+
+        # RX write-pointer update  sync→exi: sync writes these
+        self.rx_wptr_w_data   = Signal(8)
+        self.rx_wptr_w_en     = Signal()
+        self.rx_wptr_w_rdy    = Signal()
+
+        # RX read-pointer update  exi→sync: sync reads these
+        self.rx_rptr_r_data   = Signal(8)
+        self.rx_rptr_r_en     = Signal()
+        self.rx_rptr_r_rdy    = Signal()
+
+        # PulseSynchronizer ports (exi↔sync)
+        # `ncra_rst_o` is a synchronizer output; the two `*_irq_i` signals are
+        # synchronizer inputs driven from outside this module.
+        self.ncra_rst_o = Signal()   # exi→sync
+        self.rx_irq_i   = Signal()   # sync→exi
+        self.tx_irq_i   = Signal()   # sync→exi
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ── CDC FIFOs ────────────────────────────────────────────────────
+        spram_req = AsyncFIFO(width=16, depth=4,  w_domain="exi",  r_domain="sync")
+        spram_rsp = AsyncFIFO(width=8,  depth=4,  w_domain="sync", r_domain="exi")
+        tx_bytes  = AsyncFIFO(width=8,  depth=16, w_domain="exi",  r_domain="sync")
+        tx_ctrl   = AsyncFIFO(width=16, depth=4,  w_domain="exi",  r_domain="sync")
+        rx_wptr   = AsyncFIFO(width=8,  depth=4,  w_domain="sync", r_domain="exi")
+        rx_rptr   = AsyncFIFO(width=8,  depth=4,  w_domain="exi",  r_domain="sync")
+
+        m.submodules.spram_req = spram_req
+        m.submodules.spram_rsp = spram_rsp
+        m.submodules.tx_bytes  = tx_bytes
+        m.submodules.tx_ctrl   = tx_ctrl
+        m.submodules.rx_wptr   = rx_wptr
+        m.submodules.rx_rptr   = rx_rptr
+
+        # Expose sync-domain FIFO sides
+        m.d.comb += [
+            self.spram_req_r_data .eq(spram_req.r_data),
+            spram_req.r_en        .eq(self.spram_req_r_en),
+            self.spram_req_r_rdy  .eq(spram_req.r_rdy),
+
+            spram_rsp.w_data      .eq(self.spram_rsp_w_data),
+            spram_rsp.w_en        .eq(self.spram_rsp_w_en),
+            self.spram_rsp_w_rdy  .eq(spram_rsp.w_rdy),
+
+            self.tx_bytes_r_data  .eq(tx_bytes.r_data),
+            tx_bytes.r_en         .eq(self.tx_bytes_r_en),
+            self.tx_bytes_r_rdy   .eq(tx_bytes.r_rdy),
+
+            self.tx_ctrl_r_data   .eq(tx_ctrl.r_data),
+            tx_ctrl.r_en          .eq(self.tx_ctrl_r_en),
+            self.tx_ctrl_r_rdy    .eq(tx_ctrl.r_rdy),
+
+            rx_wptr.w_data        .eq(self.rx_wptr_w_data),
+            rx_wptr.w_en          .eq(self.rx_wptr_w_en),
+            self.rx_wptr_w_rdy    .eq(rx_wptr.w_rdy),
+
+            self.rx_rptr_r_data   .eq(rx_rptr.r_data),
+            rx_rptr.r_en          .eq(self.rx_rptr_r_en),
+            self.rx_rptr_r_rdy    .eq(rx_rptr.r_rdy),
+        ]
+
+        # ── PulseSynchronizers ───────────────────────────────────────────
+        ncra_rst_ps = PulseSynchronizer(i_domain="exi",  o_domain="sync")
+        rx_irq_ps   = PulseSynchronizer(i_domain="sync", o_domain="exi")
+        tx_irq_ps   = PulseSynchronizer(i_domain="sync", o_domain="exi")
+
+        m.submodules.ncra_rst_ps = ncra_rst_ps
+        m.submodules.rx_irq_ps   = rx_irq_ps
+        m.submodules.tx_irq_ps   = tx_irq_ps
+
+        m.d.comb += [
+            self.ncra_rst_o .eq(ncra_rst_ps.o),
+            rx_irq_ps.i     .eq(self.rx_irq_i),
+            tx_irq_ps.i     .eq(self.tx_irq_i),
+        ]
+
+        # ── Register file (sparse individual Signals, exi domain) ────────
+        # Only the registers actually read/written by the GC or sync domain.
+        # Writes to unknown addresses are silently ignored; reads return 0.
+        r_ncra   = Signal(8)
+        r_imr    = Signal(8)
+        r_ir     = Signal(8)
+        r_rwp_lo = Signal(8)
+        r_rrp_lo = Signal(8)
+        # PAR0–5 reset to a valid Nintendo OUI MAC (00:09:BF:00:00:01) so the
+        # device has a sane source MAC even before the GC driver programs its
+        # own.  PAR0 is the first MAC octet.
+        _par_reset = [0x00, 0x09, 0xBF, 0x00, 0x00, 0x01]
+        r_par    = Array([Signal(8, name=f"par{i}", init=_par_reset[i])
+                          for i in range(6)])
+        r_twd_lo = Signal(8)
+        r_twd_hi = Signal(8)
+
+        # PAR packed output: PAR0 in the LOW byte (par[0:8]).  The W5500 master
+        # reads mac_shadow[i] = par[i*8:(i+1)*8], so this puts PAR0 first in the
+        # SHAR write — i.e. PAR0 is the first MAC octet on the wire.
+        m.d.comb += self.par.eq(Cat(
+            r_par[0], r_par[1], r_par[2], r_par[3], r_par[4], r_par[5],
+        ))
+        m.d.comb += self.ncra_sr.eq(r_ncra[3])   # start-receive bit
+
+        # ── Transaction state ────────────────────────────────────────────
+        hdr0         = Signal(8)
+        addr         = Signal(13)
+        is_write     = Signal()
+        xfer_len     = Signal(2)     # 0=1B … 3=4B
+        byte_ctr     = Signal(2)
+        tx_frame_len = Signal(16)
+
+        # True until first NCRA reset write: return device ID on addr=0 reads
+        id_phase = Signal(init=1)
+
+        # Per-byte SPRAM read handshake (register-read path): sp_req marks a
+        # request in flight; drain_ctr counts the read-phase dummy bytes.
+        sp_req    = Signal()
+        drain_ctr = Signal(2)
+
+        # SPRAM streaming-read state (DMA / variable-length reads):
+        #   sp_addr     — next SPRAM byte address to request (auto-increments)
+        #   outstanding — SPRAM requests issued but whose responses are not yet
+        #                 popped (bounds prefetch and is drained at end)
+        sp_addr     = Signal(13)
+        outstanding = Signal(4)
+        SP_LIMIT    = 4              # max prefetch depth in flight
+
+        # Effective address of the current data byte — a REGISTERED running
+        # pointer (set to the base in HEADER1, incremented per byte).  Keeping
+        # it registered keeps the 13-bit adder off the combinational path that
+        # feeds the read-response mux → tx_fifo write data.
+        eff_addr = Signal(13)
+        rd_sel = eff_addr[0:8]
+
+        # ── Combinational read-response value (non-SPRAM) ────────────────
+        reg_rdval = Signal(8)
+        with m.Switch(rd_sel):
+            with m.Case(_NCRA):   m.d.comb += reg_rdval.eq(r_ncra)
+            with m.Case(_IMR):    m.d.comb += reg_rdval.eq(r_imr)
+            with m.Case(_IR):     m.d.comb += reg_rdval.eq(r_ir)
+            with m.Case(_RWP_LO): m.d.comb += reg_rdval.eq(r_rwp_lo)
+            with m.Case(_RRP_LO): m.d.comb += reg_rdval.eq(r_rrp_lo)
+            with m.Case(_PAR0, _PAR1, _PAR2, _PAR3, _PAR4, _PAR5):
+                                  m.d.comb += reg_rdval.eq(r_par[eff_addr[0:3]])
+            with m.Case(_TWD_LO): m.d.comb += reg_rdval.eq(r_twd_lo)
+            with m.Case(_TWD_HI): m.d.comb += reg_rdval.eq(r_twd_hi)
+            with m.Case(_NWAYS):  m.d.comb += reg_rdval.eq(_NWAYS_VAL)
+            with m.Case(_HIPR):   m.d.comb += reg_rdval.eq(_HIPR_VAL)
+            with m.Default():     m.d.comb += reg_rdval.eq(0)
+
+        # Device-ID bytes (addr 0 read while id_phase): 0x04 0x02 0x02 0x00
+        devid = Signal(8)
+        with m.Switch(byte_ctr):
+            with m.Case(0): m.d.comb += devid.eq(0x04)
+            with m.Case(1): m.d.comb += devid.eq(0x02)
+            with m.Case(2): m.d.comb += devid.eq(0x02)
+            with m.Case(3): m.d.comb += devid.eq(0x00)
+
+        rd_val = Signal(8)   # response for the current non-SPRAM read byte
+        with m.If((addr == 0) & id_phase):
+            m.d.comb += rd_val.eq(devid)
+        with m.Else():
+            m.d.comb += rd_val.eq(reg_rdval)
+
+        # ── Default strobes ──────────────────────────────────────────────
+        m.d.exi += [
+            spram_req.w_en  .eq(0),
+            tx_bytes.w_en   .eq(0),
+            tx_ctrl.w_en    .eq(0),
+            rx_rptr.w_en    .eq(0),
+            rx_wptr.r_en    .eq(0),
+            ncra_rst_ps.i   .eq(0),
+        ]
+        m.d.comb += [
+            self.rx_en  .eq(0),
+            self.tx_en  .eq(0),
+            self.tx_data.eq(0),
+            # Combinational so the FIFO advances in the SAME cycle as the pop —
+            # a registered r_en would let `pop` re-fire on the same byte.
+            spram_rsp.r_en.eq(0),
+        ]
+
+        # ── Transaction FSM (proactive push/pull over byte FIFOs) ────────
+        # The SPI bit cadence lives in the capture domain; here we just consume
+        # received bytes and, for reads, push response bytes into tx_fifo during
+        # the EXI clock-idle gap before the GC clocks the data phase.
+        with m.FSM(domain="exi", name="exi_fsm"):
+
+            with m.State("HEADER0"):
+                with m.If(self.rx_rdy):
+                    m.d.comb += self.rx_en.eq(1)
+                    m.d.exi += hdr0.eq(self.rx_data)
+                    m.next = "HEADER1"
+
+            with m.State("HEADER1"):
+                with m.If(self.rx_rdy):
+                    m.d.comb += self.rx_en.eq(1)
+                    new_addr  = Cat(self.rx_data[2:8], hdr0[0:7])  # 13-bit addr
+                    new_len   = self.rx_data[0:2]
+                    new_write = hdr0[7]
+
+                    m.d.exi += addr.eq(new_addr)
+                    m.d.exi += eff_addr.eq(new_addr)   # running pointer init
+                    m.d.exi += xfer_len.eq(new_len)
+                    m.d.exi += is_write.eq(new_write)
+                    m.d.exi += byte_ctr.eq(0)
+                    m.d.exi += sp_req.eq(0)
+                    m.d.exi += drain_ctr.eq(0)
+
+                    with m.If(new_write):
+                        m.next = "WRITE"
+                    with m.Elif(new_addr >= 0x100):
+                        # SPRAM region: stream until CS deasserts (DMA-capable).
+                        m.d.exi += sp_addr.eq(new_addr)
+                        m.d.exi += outstanding.eq(0)
+                        m.next = "SPRAM_STREAM"
+                    with m.Else():
+                        m.next = "REG_READ"
+
+            with m.State("WRITE"):
+                # Consume xfer_len+1 data bytes, writing the register file.
+                with m.If(self.rx_rdy):
+                    m.d.comb += self.rx_en.eq(1)
+                    with m.Switch(rd_sel):
+                        with m.Case(_NCRA):
+                            m.d.exi += r_ncra.eq(self.rx_data)
+                            with m.If(self.rx_data[0]):
+                                m.d.exi += r_ncra[0].eq(0)  # RESET self-clears
+                                m.d.exi += ncra_rst_ps.i.eq(1)
+                                m.d.exi += id_phase.eq(0)
+                            with m.If(self.rx_data[1:3].any()):
+                                with m.If(tx_ctrl.w_rdy):
+                                    m.d.exi += tx_ctrl.w_data.eq(tx_frame_len)
+                                    m.d.exi += tx_ctrl.w_en.eq(1)
+                        with m.Case(_IMR):
+                            m.d.exi += r_imr.eq(self.rx_data)
+                        with m.Case(_IR):
+                            m.d.exi += r_ir.eq(r_ir & ~self.rx_data)  # write-1-clear
+                        with m.Case(_RRP_LO):
+                            m.d.exi += r_rrp_lo.eq(self.rx_data)
+                            with m.If(rx_rptr.w_rdy):
+                                m.d.exi += rx_rptr.w_data.eq(self.rx_data)
+                                m.d.exi += rx_rptr.w_en.eq(1)
+                        with m.Case(_PAR0, _PAR1, _PAR2, _PAR3, _PAR4, _PAR5):
+                            m.d.exi += r_par[eff_addr[0:3]].eq(self.rx_data)
+                        with m.Case(_TWD_LO):
+                            m.d.exi += r_twd_lo.eq(self.rx_data)
+                            m.d.exi += tx_frame_len[0:8].eq(self.rx_data)
+                        with m.Case(_TWD_HI):
+                            m.d.exi += r_twd_hi.eq(self.rx_data)
+                            m.d.exi += tx_frame_len[8:16].eq(self.rx_data)
+                        with m.Case(_TXDATA):
+                            with m.If(tx_bytes.w_rdy):
+                                m.d.exi += tx_bytes.w_data.eq(self.rx_data)
+                                m.d.exi += tx_bytes.w_en.eq(1)
+                        # All other addresses silently ignored
+
+                    with m.If(byte_ctr == xfer_len):
+                        m.next = "HEADER0"
+                    with m.Else():
+                        m.d.exi += byte_ctr.eq(byte_ctr + 1)
+                        m.d.exi += eff_addr.eq(eff_addr + 1)
+
+            with m.State("REG_READ"):
+                # Register / device-ID read (addr < 0x100): value available
+                # immediately, bounded by the header's xfer_len (≤4 bytes).
+                with m.If(self.tx_rdy):
+                    m.d.comb += self.tx_data.eq(rd_val)
+                    m.d.comb += self.tx_en.eq(1)
+                    with m.If(byte_ctr == xfer_len):
+                        m.next = "READ_DRAIN"
+                    with m.Else():
+                        m.d.exi += byte_ctr.eq(byte_ctr + 1)
+                        m.d.exi += eff_addr.eq(eff_addr + 1)
+
+            with m.State("READ_DRAIN"):
+                # Discard the xfer_len+1 dummy bytes the GC clocks while reading.
+                with m.If(self.rx_rdy):
+                    m.d.comb += self.rx_en.eq(1)
+                    with m.If(drain_ctr == xfer_len):
+                        m.next = "HEADER0"
+                    with m.Else():
+                        m.d.exi += drain_ctr.eq(drain_ctr + 1)
+
+            with m.State("SPRAM_STREAM"):
+                # Stream SPRAM bytes until CS deasserts — handles both ≤4-byte
+                # immediate reads and arbitrary-length DMA reads uniformly.
+                # Issue read requests ahead (prefetch, bounded by SP_LIMIT) and
+                # push responses into tx_fifo; the capture domain pops them as
+                # the GC clocks.  Drain rx dummies as they arrive.
+                issue = Signal()
+                pop   = Signal()
+                m.d.comb += issue.eq(self.cs_active & spram_req.w_rdy
+                                     & (outstanding < SP_LIMIT))
+                m.d.comb += pop.eq(spram_rsp.r_rdy & self.tx_rdy)
+
+                with m.If(issue):
+                    m.d.exi += spram_req.w_data.eq(sp_addr)
+                    m.d.exi += spram_req.w_en.eq(1)
+                    m.d.exi += sp_addr.eq(sp_addr + 1)
+                with m.If(pop):
+                    m.d.comb += self.tx_data.eq(spram_rsp.r_data)
+                    m.d.comb += self.tx_en.eq(1)
+                    m.d.comb += spram_rsp.r_en.eq(1)
+                m.d.exi += outstanding.eq(outstanding + issue - pop)
+
+                with m.If(self.rx_rdy):
+                    m.d.comb += self.rx_en.eq(1)          # drain dummy bytes
+
+                with m.If(~self.cs_active):
+                    m.next = "SPRAM_END"
+
+            with m.State("SPRAM_END"):
+                # CS deasserted: drain in-flight SPRAM responses and rx dummies,
+                # then idle.  Leftover prefetch in tx_fifo is flushed by
+                # ExiCapture on the next CS assertion.
+                with m.If(spram_rsp.r_rdy):
+                    m.d.comb += spram_rsp.r_en.eq(1)
+                    m.d.exi += outstanding.eq(outstanding - 1)
+                with m.If(self.rx_rdy):
+                    m.d.comb += self.rx_en.eq(1)
+                with m.If((outstanding == 0) & ~self.rx_rdy & ~spram_rsp.r_rdy):
+                    m.next = "HEADER0"
+
+        # ── Interrupt output ─────────────────────────────────────────────
+        m.d.exi += self.exi_int_n.eq(~(r_ir & r_imr).any())
+
+        # ── Consume RWP updates from sync domain ──────────────────────────
+        with m.If(rx_wptr.r_rdy):
+            m.d.exi += rx_wptr.r_en.eq(1)
+            m.d.exi += r_rwp_lo.eq(rx_wptr.r_data)
+
+        # ── PulseSynchronizer arrivals ────────────────────────────────────
+        with m.If(rx_irq_ps.o):
+            m.d.exi += r_ir[1].eq(1)   # RI bit
+        with m.If(tx_irq_ps.o):
+            m.d.exi += r_ir[2].eq(1)   # TI bit
+            m.d.exi += r_ncra[1:3].eq(0)  # clear ST bits
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys, os
+    # Put the parent of this file's directory at the front of sys.path
+    # (presumably so the enclosing package resolves when this file is run
+    # directly as a script — TODO confirm).
+    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+    from amaranth.sim import Simulator, Period
+
+    # Device under test: the register file on its own.
+    reg = BBARegisterFile()
+
+    # Drive the byte-stream interface directly (the SPI bit cadence and FIFOs
+    # live in ExiCapture; here we model the byte producer/consumer).
+    async def push_rx(ctx, b):
+        """Present one received byte and wait for the register file to pop it."""
+        ctx.set(reg.rx_data, b)
+        ctx.set(reg.rx_rdy, 1)
+        while True:
+            en = ctx.get(reg.rx_en)
+            await ctx.tick("exi")
+            if en:
+                break
+        ctx.set(reg.rx_rdy, 0)
+
+    async def collect_tx(ctx, n):
+        """Collect n response bytes pushed by the register file (bounded)."""
+        out = []
+        for _ in range(3000):
+            if ctx.get(reg.tx_en):
+                out.append(ctx.get(reg.tx_data))
+            if len(out) >= n:
+                break
+            await ctx.tick("exi")
+        return out
+
+    async def exi_read(ctx, addr, length=1):
+        hdr0 = (addr >> 6) & 0x7F
+        hdr1 = ((addr & 0x3F) << 2) | (length - 1)
+        await push_rx(ctx, hdr0)
+        await push_rx(ctx, hdr1)
+        result = await collect_tx(ctx, length)     # READ pushes `length` bytes
+        for _ in range(length):                    # READ_DRAIN dummies
+            await push_rx(ctx, 0x00)
+        return result
+
+    async def exi_write(ctx, addr, data):
+        hdr0 = 0x80 | ((addr >> 6) & 0x7F)
+        hdr1 = ((addr & 0x3F) << 2) | (len(data) - 1)
+        await push_rx(ctx, hdr0)
+        await push_rx(ctx, hdr1)
+        for b in data:
+            await push_rx(ctx, b)
+
+    # SPRAM contents the streaming-read test reads back (byte i = 0xA0+i):
+    # the 64 addresses 0x100..0x13F hold 0xA0..0xDF.
+    spram_mem = {0x100 + i: (0xA0 + i) & 0xFF for i in range(64)}
+
+    async def spram_model(ctx):
+        """Model the SPRAM (sync side): answer spram_req with mem[addr].
+
+        One request at a time, with cleanly-pulsed r_en/w_en so the FIFO pop
+        and the response push stay in lock-step (no double-response races).
+        """
+        state = "POP"
+        held = 0
+        async for vals in ctx.tick("sync").sample(
+                reg.spram_req_r_rdy, reg.spram_req_r_data, reg.spram_rsp_w_rdy):
+            rdy, addr, rsp_rdy = vals[-3:]
+            ctx.set(reg.spram_req_r_en, 0)
+            ctx.set(reg.spram_rsp_w_en, 0)
+            if state == "POP":
+                if rdy:
+                    held = spram_mem.get(addr, 0)
+                    ctx.set(reg.spram_req_r_en, 1)     # consume the request
+                    state = "RESP"
+            else:  # RESP
+                if rsp_rdy:
+                    ctx.set(reg.spram_rsp_w_data, held)
+                    ctx.set(reg.spram_rsp_w_en, 1)     # deliver the response
+                    state = "POP"
+
+    errors = []
+
+    async def testbench(ctx):
+        ctx.set(reg.tx_rdy, 1)        # tx_fifo always has room in this model
+        await ctx.tick("exi").repeat(8)
+
+        # T1: Device ID (addr=0, 4-byte read)
+        result = await exi_read(ctx, 0x0000, length=4)
+        if result != _DEVICE_ID:
+            errors.append(f"T1 device ID: expected {_DEVICE_ID}, got {result}")
+        print(f"T1 device ID: {[f'0x{b:02X}' for b in result]}")
+        await ctx.tick("exi").repeat(4)
+
+        # T2: Write and read back PAR0-PAR3
+        await exi_write(ctx, _PAR0, [0xDE, 0xAD, 0xBE, 0xEF])
+        await ctx.tick("exi").repeat(4)
+        result = await exi_read(ctx, _PAR0, length=4)
+        if result != [0xDE, 0xAD, 0xBE, 0xEF]:
+            errors.append(f"T2 PAR readback: {result}")
+        print(f"T2 PAR0-3: {[f'0x{b:02X}' for b in result]}")
+        await ctx.tick("exi").repeat(4)
+
+        # T3: NWAYS hardcoded 0x17
+        result = await exi_read(ctx, _NWAYS, length=1)
+        if result != [0x17]:
+            errors.append(f"T3 NWAYS: expected 0x17, got {result}")
+        print(f"T3 NWAYS: 0x{result[0]:02X}")
+        await ctx.tick("exi").repeat(4)
+
+        # T4: HIPR hardcoded 0x01
+        result = await exi_read(ctx, _HIPR, length=1)
+        if result != [0x01]:
+            errors.append(f"T4 HIPR: expected 0x01, got {result}")
+        print(f"T4 HIPR: 0x{result[0]:02X}")
+        await ctx.tick("exi").repeat(4)
+
+        # T5: IMR write, rx_irq pulse, INT_N asserts, then IR clear
+        await exi_write(ctx, _IMR, [0x02])   # enable RI (bit 1)
+        await ctx.tick("exi").repeat(4)
+        ctx.set(reg.rx_irq_i, 1)
+        await ctx.tick("sync").repeat(1)
+        ctx.set(reg.rx_irq_i, 0)
+        await ctx.tick("exi").repeat(12)     # wait for PS propagation
+        int_n = ctx.get(reg.exi_int_n)
+        if int_n != 0:
+            errors.append(f"T5 INT_N after RI: expected 0, got {int_n}")
+        print(f"T5 INT_N after RI pulse: {int_n} (want 0)")
+        await exi_write(ctx, _IR, [0x02])    # write-1-to-clear RI
+        await ctx.tick("exi").repeat(4)
+        int_n = ctx.get(reg.exi_int_n)
+        if int_n != 1:
+            errors.append(f"T5 INT_N after clear: expected 1, got {int_n}")
+        print(f"T5 INT_N after IR clear: {int_n} (want 1)")
+
+        # T6: streaming SPRAM read (DMA) — read N>4 bytes from 0x100 by holding
+        # cs_active and clocking past the header's 4-byte length field.
+        N = 12
+        ctx.set(reg.cs_active, 1)
+        await push_rx(ctx, 0x04)         # hdr0 → addr[12:6]; addr 0x100, read
+        await push_rx(ctx, 0x00)         # hdr1 → addr[5:0]=0, len field ignored
+        got = []
+        for _ in range(5000):
+            if ctx.get(reg.tx_en):
+                got.append(ctx.get(reg.tx_data))
+            if len(got) >= N:
+                break
+            await ctx.tick("exi")
+        ctx.set(reg.cs_active, 0)         # end the transaction
+        await ctx.tick("exi").repeat(40)  # let SPRAM_END drain/clean up
+        want = [spram_mem[0x100 + i] for i in range(N)]
+        print(f"T6 DMA read {N}B: {[f'0x{b:02X}' for b in got]}")
+        if got != want:
+            errors.append(f"T6 streaming SPRAM read: got {got}, want {want}")
+
+        # T7: a normal register read still works after the streaming transaction
+        # (FSM cleaned up and returned to HEADER0)
+        result = await exi_read(ctx, _NWAYS, length=1)
+        if result != [0x17]:
+            errors.append(f"T7 NWAYS after DMA: got {result}")
+        print(f"T7 NWAYS after DMA read: 0x{result[0]:02X}")
+
+    sim = Simulator(reg)
+    sim.add_clock(Period(MHz=24), domain="exi")
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+    sim.add_process(spram_model)
+
+    sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/bba_top.py b/exi_bba/bba_top.py
new file mode 100644
index 0000000..f6a6ada
--- /dev/null
+++ b/exi_bba/bba_top.py
@@ -0,0 +1,533 @@
+"""BBATop — top-level elaboratable for the GC BBA FPGA replacement.
+
+Clock domains
+-------------
+capture  : 54 MHz, from 12 MHz crystal via SB_PLL40_PAD (DIVR=0 DIVF=71 DIVQ=4)
+exi/sync : 24 MHz, from the iCE40UP5K internal SB_HFOSC (÷2, CLKHF_DIV=0b01)
+
+Submodule instantiation and signal wiring
+-----------------------------------------
+See CLAUDE.md "Module Breakdown" and "CDC Signal Inventory" for the full list.
+"""
+
+from amaranth import *
+
+from exi_bba.exi_capture        import ExiCapture
+from exi_bba.bba_register_file  import BBARegisterFile
+from exi_bba.spram_arbiter      import SPRAMArbiter
+from exi_bba.rx_frame_assembler import RXFrameAssembler
+from exi_bba.tx_frame_drain     import TXFrameDrain
+from exi_bba.w5500_spi_master   import W5500SPIMaster
+from exi_bba.w5100_parallel_master import W5100ParallelMaster
+from exi_bba.status_panel        import StatusPanel
+
+from amaranth.lib.cdc import FFSynchronizer
+
+__all__ = ["BBATop"]
+
+
+class BBATop(Elaboratable):
+    """Top-level module.  Wires all submodules and defines clock domains.
+
+    External ports (exposed for platform or testbench connection)
+    -------------------------------------------------------------
+    EXI / GC interface (SPI Mode 3)
+        exi_clk / exi_mosi / exi_cs_n   : inputs from GC
+        exi_miso                          : output to GC
+        int_n                             : interrupt output (active low)
+
+    W5500 SPI interface (SPI Mode 0)
+        w5500_clk / w5500_mosi / w5500_cs_n  : outputs to W5500
+        w5500_miso                             : input from W5500
+        w5500_int_n                            : W5500 interrupt (input, active low)
+        w5500_rst_n                            : W5500 hardware reset (output, active low)
+    """
+
+    def __init__(self, eth="w5100", reset_cycles=24000, status_panel=False):
+        # Ethernet back-end: "w5100" (indirect parallel bus, reaches the EXI
+        # ceiling) or "w5500" (SPI, ~12 Mbit/s).  Both expose the identical
+        # tx/rx/init/par interface, so only the physical pins differ.
+        self._eth = eth
+        # MR-reset settle wait passed to the ethernet master (~1 ms on hardware;
+        # the testbench overrides with a small value for fast simulation).
+        self._reset_cycles = reset_cycles
+        # Optional bring-up status panel (drives onboard LEDs/button on the
+        # iCEbreaker — see synth.py).  panel_led bit order matches StatusPanel.
+        self._status_panel = status_panel
+
+        # EXI (GC side)
+        self.exi_clk   = Signal(init=1)
+        self.exi_mosi  = Signal()
+        self.exi_cs_n  = Signal(init=1)
+        self.exi_miso  = Signal()
+        self.int_n     = Signal(init=1)
+
+        if eth == "w5500":
+            # W5500 SPI
+            self.w5500_clk   = Signal()
+            self.w5500_mosi  = Signal()
+            self.w5500_miso  = Signal()
+            self.w5500_cs_n  = Signal(init=1)
+            self.w5500_int_n = Signal(init=1)
+            self.w5500_rst_n = Signal(init=1)
+        else:
+            # W5100 indirect parallel bus.  data_o/data_oe/data_i are the FPGA
+            # side of a bidirectional D[7:0] (wrapped in a tristate SB_IO at the
+            # platform level); a board ties the upper address lines to 0 so only
+            # A[1:0] are wired.
+            self.w5100_addr    = Signal(2)
+            self.w5100_data_o  = Signal(8)
+            self.w5100_data_oe = Signal()
+            self.w5100_data_i  = Signal(8)
+            self.w5100_cs_n    = Signal(init=1)
+            self.w5100_rd_n    = Signal(init=1)
+            self.w5100_wr_n    = Signal(init=1)
+            self.w5100_int_n   = Signal(init=1)
+            self.w5100_rst_n   = Signal(init=1)
+
+        if status_panel:
+            self.panel_led = Signal(5)   # to onboard LEDs (see StatusPanel)
+            self.panel_btn = Signal(3)   # from onboard button(s)
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ── Clock domain generation ───────────────────────────────────────
+        # Three domains, two physical sources (1 PLL + 1 internal HFOSC):
+        #   capture @ 54 MHz (PLL)   — SPI bit engine only; oversamples the
+        #                              27 MHz EXI clock 2× (robust Mode-3).
+        #   exi     @ 24 MHz (HFOSC) — register file / transaction FSM.
+        #   sync    @ 24 MHz (HFOSC) — SPRAM, RX/TX engines, ethernet master.
+        # exi and sync share the HFOSC net (frequency- and phase-matched); the
+        # AsyncFIFOs between them are still valid CDC and keep the module
+        # boundaries clean.  Only the tiny capture front-end needs the fast
+        # clock — which is why 27 MHz-EXI / OG performance is reachable on the
+        # iCE40UP5K even though the register-file logic tops out ~44 MHz.
+        if platform is not None:
+            # capture @ 54 MHz: icepll -i 12 -o 54 → DIVR=0 DIVF=71 DIVQ=4.
+            # 54 MHz = 2× the 27 MHz EXI clock — the minimum oversampling that
+            # cleanly implements SPI Mode 3.  The isolated SPI bit engine closes
+            # ~91 MHz on this device; the byte-FIFO read path brings the
+            # integrated capture domain to ~62 MHz, so 54 closes with margin.
+            m.domains += ClockDomain("capture")
+            platform.lookup(platform.default_clk).attrs["GLOBAL"] = False
+            m.submodules.pll = Instance(
+                "SB_PLL40_PAD",
+                p_FEEDBACK_PATH = "SIMPLE",
+                p_DIVR          = 0,
+                p_DIVF          = 71,
+                p_DIVQ          = 4,
+                p_FILTER_RANGE  = 1,
+                i_PACKAGEPIN    = platform.request("clk12", dir="-").io,
+                i_RESETB        = Const(1, 1),
+                i_BYPASS        = Const(0, 1),
+                o_PLLOUTGLOBAL  = ClockSignal("capture"),
+            )
+
+            # exi & sync @ 24 MHz: one SB_HFOSC (÷2) drives both slow domains.
+            # The bulky register-file / SPRAM / W5500 logic is routing-bound at
+            # ~33–44 MHz on the UP5K; 24 MHz closes with large margin.  The byte
+            # rate (27 MHz EXI ÷ 8 ≈ 3.4 MHz) leaves ~7 slow cycles per byte.
+            m.domains += ClockDomain("exi")
+            m.domains += ClockDomain("sync")
+            m.submodules.hfosc = Instance(
+                "SB_HFOSC",
+                p_CLKHF_DIV = "0b01",    # 48 ÷ 2 → 24 MHz
+                i_CLKHFEN   = Const(1, 1),
+                i_CLKHFPU   = Const(1, 1),
+                o_CLKHF     = ClockSignal("exi"),
+            )
+            m.d.comb += ClockSignal("sync").eq(ClockSignal("exi"))
+        # (simulation: test harness provides capture/exi/sync clocks via add_clock)
+
+        # ── Submodules ────────────────────────────────────────────────────
+        cap   = ExiCapture()        # SPI bit engine (capture) + byte FIFOs
+        reg   = BBARegisterFile()
+        arb   = SPRAMArbiter()
+        asm   = RXFrameAssembler()
+        drain = TXFrameDrain()
+        eth   = (W5500SPIMaster(reset_cycles=self._reset_cycles)
+                 if self._eth == "w5500"
+                 else W5100ParallelMaster(reset_cycles=self._reset_cycles))
+
+        m.submodules.cap   = cap
+        m.submodules.reg   = reg
+        m.submodules.arb   = arb
+        m.submodules.asm   = asm
+        m.submodules.drain = drain
+        m.submodules.eth   = eth
+
+        # ── External pin connections ──────────────────────────────────────
+        m.d.comb += [
+            # EXI inputs (to capture-domain front-end)
+            cap.spi_clk .eq(self.exi_clk),
+            cap.spi_mosi.eq(self.exi_mosi),
+            cap.spi_cs_n.eq(self.exi_cs_n),
+            # EXI outputs
+            self.exi_miso.eq(cap.spi_miso),
+            self.int_n   .eq(reg.exi_int_n),
+        ]
+
+        # Ethernet back-end physical pins
+        if self._eth == "w5500":
+            m.d.comb += [
+                self.w5500_clk .eq(eth.spi_clk),
+                self.w5500_mosi.eq(eth.spi_mosi),
+                self.w5500_cs_n.eq(eth.spi_cs_n),
+                eth.spi_miso   .eq(self.w5500_miso),
+                eth.w5500_int_n.eq(self.w5500_int_n),
+                self.w5500_rst_n.eq(eth.w5500_rst_n),
+            ]
+        else:
+            m.d.comb += [
+                self.w5100_addr   .eq(eth.bus_addr),
+                self.w5100_data_o .eq(eth.bus_data_o),
+                self.w5100_data_oe.eq(eth.bus_data_oe),
+                eth.bus_data_i    .eq(self.w5100_data_i),
+                self.w5100_cs_n   .eq(eth.cs_n),
+                self.w5100_rd_n   .eq(eth.rd_n),
+                self.w5100_wr_n   .eq(eth.wr_n),
+                eth.w5100_int_n   .eq(self.w5100_int_n),
+                self.w5100_rst_n  .eq(eth.w5100_rst_n),
+            ]
+
+        # ── ExiCapture byte stream ↔ BBARegisterFile (exi domain) ────────
+        m.d.comb += [
+            reg.rx_data .eq(cap.rx_data),
+            reg.rx_rdy  .eq(cap.rx_rdy),
+            cap.rx_en   .eq(reg.rx_en),
+
+            cap.tx_data .eq(reg.tx_data),
+            cap.tx_en   .eq(reg.tx_en),
+            reg.tx_rdy  .eq(cap.tx_rdy),
+
+            reg.cs_active.eq(cap.cs_active),   # transaction-active (for DMA reads)
+        ]
+
+        # ── BBARegisterFile ↔ SPRAMArbiter (sync domain FIFO sides) ──────
+        # SPRAM request: reg exi→sync FIFO read side → arb
+        m.d.comb += [
+            arb.exi_req_addr .eq(reg.spram_req_r_data),
+            arb.exi_req_valid.eq(reg.spram_req_r_rdy),
+            reg.spram_req_r_en.eq(arb.exi_req_ready),
+        ]
+        # SPRAM response: arb result → reg sync→exi FIFO write side
+        m.d.comb += [
+            reg.spram_rsp_w_data.eq(arb.exi_rsp_data),
+            reg.spram_rsp_w_en  .eq(arb.exi_rsp_valid),
+            # arb does not need w_rdy feedback (spram_rsp FIFO is deeper than latency)
+        ]
+
+        # ── BBARegisterFile ↔ TXFrameDrain (sync domain FIFO sides) ──────
+        m.d.comb += [
+            drain.tx_bytes_r_data.eq(reg.tx_bytes_r_data),
+            drain.tx_bytes_r_rdy .eq(reg.tx_bytes_r_rdy),
+            reg.tx_bytes_r_en    .eq(drain.tx_bytes_r_en),
+
+            drain.tx_ctrl_r_data.eq(reg.tx_ctrl_r_data),
+            drain.tx_ctrl_r_rdy .eq(reg.tx_ctrl_r_rdy),
+            reg.tx_ctrl_r_en    .eq(drain.tx_ctrl_r_en),
+        ]
+
+        # ── TXFrameDrain ↔ ethernet master (sync domain) ──────────────────
+        m.d.comb += [
+            eth.tx_data .eq(drain.tx_data),
+            eth.tx_valid.eq(drain.tx_valid),
+            drain.tx_ready.eq(eth.tx_ready),
+            eth.tx_sof  .eq(drain.tx_sof),
+            eth.tx_eof  .eq(drain.tx_eof),
+        ]
+
+        # ── ethernet master → RXFrameAssembler (sync domain) ─────────────
+        m.d.comb += [
+            asm.rx_data .eq(eth.rx_data),
+            asm.rx_valid.eq(eth.rx_valid),
+            eth.rx_ready.eq(asm.rx_ready),
+            asm.rx_sof  .eq(eth.rx_sof),
+            asm.rx_eof  .eq(eth.rx_eof),
+        ]
+
+        # ── RXFrameAssembler → SPRAMArbiter (ETH write, sync domain) ─────
+        m.d.comb += [
+            arb.eth_wr_addr .eq(asm.eth_wr_addr),
+            arb.eth_wr_data .eq(asm.eth_wr_data),
+            arb.eth_wr_valid.eq(asm.eth_wr_valid),
+            asm.eth_wr_ready.eq(arb.eth_wr_ready),
+        ]
+
+        # ── RXFrameAssembler → BBARegisterFile (rx_wptr FIFO write side) ─
+        m.d.comb += [
+            reg.rx_wptr_w_data.eq(asm.rx_wptr_w_data),
+            reg.rx_wptr_w_en  .eq(asm.rx_wptr_w_en),
+            asm.rx_wptr_w_rdy .eq(reg.rx_wptr_w_rdy),
+        ]
+
+        # ── Pulse synchronizer connections ────────────────────────────────
+        m.d.comb += [
+            # RX irq: sync → exi (RXFrameAssembler → reg → PS → exi domain)
+            reg.rx_irq_i.eq(asm.rx_irq),
+            # TX irq: sync → exi
+            reg.tx_irq_i.eq(drain.tx_irq),
+            # MAC address (PAR0–5) → SHAR.  exi and sync share the HFOSC net,
+            # and par is quasi-static (sampled by the master at init_req).
+            eth.par.eq(reg.par),
+        ]
+
+        # ── RX enabled gate (NCRA SR / start-receive bit) ─────────────────
+        # The RX ring-buffer path is active only after the GC sets NCRA[3].
+        m.d.comb += asm.rx_enabled.eq(reg.ncra_sr)
+
+        # ── Optional bring-up status panel (sync domain) ──────────────────
+        # init_req = NCRA reset (exi→sync PS), OR'd with the panel's manual
+        # re-init button when the panel is present.
+        if self._status_panel:
+            panel = StatusPanel()
+            m.submodules.panel = panel
+
+            # cs_active lives in the exi domain; bring it to sync for the LED.
+            cs_a_sync = Signal()
+            m.submodules.panel_cs = FFSynchronizer(
+                cap.cs_active, cs_a_sync, o_domain="sync")
+
+            # "ready" = ethernet init complete (latched until the next init).
+            ready = Signal()
+            with m.If(eth.init_done):
+                m.d.sync += ready.eq(1)
+            with m.Elif(reg.ncra_rst_o | panel.reinit):
+                m.d.sync += ready.eq(0)
+
+            m.d.comb += [
+                panel.cs_active.eq(cs_a_sync),
+                panel.rx_pulse .eq(asm.rx_irq),
+                panel.tx_pulse .eq(drain.tx_irq),
+                panel.ready    .eq(ready),
+                panel.btn      .eq(self.panel_btn),
+                self.panel_led .eq(panel.led),
+                eth.init_req   .eq(reg.ncra_rst_o | panel.reinit),
+            ]
+        else:
+            m.d.comb += eth.init_req.eq(reg.ncra_rst_o)
+
+        return m
+
+
+# ── Integration testbench ─────────────────────────────────────────────────
+# Drives real EXI Mode-3 transactions on the GC-facing pins and checks the
+# response — exercising the full chain ExiCapture (capture domain) ↔ byte FIFOs
+# ↔ BBARegisterFile (exi domain) ↔ sync modules, across all three clock domains.
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    dut = BBATop(eth="w5100", reset_cycles=20,    # small reset wait for sim
+                 status_panel=True)               # also exercise the panel wiring
+    errors = []
+
+    HALF = 8        # capture ticks per SPI half-period (well-oversampled)
+
+    async def spi_byte(ctx, mosi_val):
+        """Drive one EXI Mode-3 byte; return the assembled MISO byte."""
+        miso = 0
+        for bit in range(7, -1, -1):
+            ctx.set(dut.exi_mosi, (mosi_val >> bit) & 1)
+            ctx.set(dut.exi_clk, 0)                      # falling: slave samples MOSI
+            await ctx.tick("capture").repeat(HALF)
+            miso = (miso << 1) | ctx.get(dut.exi_miso)
+            ctx.set(dut.exi_clk, 1)                      # rising
+            await ctx.tick("capture").repeat(HALF)
+        return miso
+
+    async def exi_read(ctx, addr, length):
+        """EXI immediate read: 2-byte header, clock-idle gap, then `length` bytes."""
+        hdr0 = (addr >> 6) & 0x7F
+        # The header length field is only 2 bits ([1:0]); mask it so a long
+        # (DMA) read doesn't overflow length-1 into the addr[5:0] bits.  For
+        # SPRAM reads the field is ignored anyway — the stream runs until CS.
+        hdr1 = ((addr & 0x3F) << 2) | ((length - 1) & 0x3)
+        ctx.set(dut.exi_cs_n, 0)
+        ctx.set(dut.exi_clk, 1)
+        await ctx.tick("capture").repeat(HALF)
+        await spi_byte(ctx, hdr0)
+        await spi_byte(ctx, hdr1)
+        # EXI_Imm clock-idle gap: the core decodes the header and prefetches
+        # responses into the tx FIFO before the GC clocks the data phase.
+        await ctx.tick("capture").repeat(HALF * 12)
+        result = [await spi_byte(ctx, 0x00) for _ in range(length)]
+        ctx.set(dut.exi_cs_n, 1)
+        await ctx.tick("capture").repeat(HALF)
+        return result
+
+    async def exi_write(ctx, addr, data):
+        """EXI immediate write: 2-byte header then the data bytes."""
+        hdr0 = 0x80 | ((addr >> 6) & 0x7F)
+        hdr1 = ((addr & 0x3F) << 2) | (len(data) - 1)
+        ctx.set(dut.exi_cs_n, 0)
+        ctx.set(dut.exi_clk, 1)
+        await ctx.tick("capture").repeat(HALF)
+        await spi_byte(ctx, hdr0)
+        await spi_byte(ctx, hdr1)
+        for b in data:
+            await spi_byte(ctx, b)
+        ctx.set(dut.exi_cs_n, 1)
+        await ctx.tick("capture").repeat(HALF)
+
+    # ── W5100 indirect-bus slave model (drives w5100_data_i) ─────────────
+    # Pre-loads a known MACRAW packet in the RX buffer so we can verify the full
+    # ethernet→SPRAM→GC path.  Same protocol as the W5100ParallelMaster bench.
+    RX_FRAME = [0xDE, 0xAD, 0xBE, 0xEF, 0x01, 0x02, 0x03, 0x04]
+    _W_RX_BASE   = 0x6000
+    _W_S0_CR     = 0x0401
+    _W_S0_RX_RSR = 0x0426
+    _W_S0_RX_RD  = 0x0428
+    _W_CR_RECV   = 0x40
+    _A_MR, _A_AR0, _A_AR1, _A_DR = 0b00, 0b01, 0b10, 0b11
+
+    def w5100_preload():
+        plen = len(RX_FRAME) + 2            # MACRAW length includes its header
+        mem = {}
+        for i, b in enumerate([(plen >> 8) & 0xFF, plen & 0xFF] + RX_FRAME):
+            mem[_W_RX_BASE + i] = b
+        mem[_W_S0_RX_RSR], mem[_W_S0_RX_RSR + 1] = (plen >> 8) & 0xFF, plen & 0xFF
+        mem[_W_S0_RX_RD],  mem[_W_S0_RX_RD + 1]  = 0, 0
+        return mem
+
+    w5100_mem = w5100_preload()
+
+    async def w5100_model(ctx):   # behavioural W5100 indirect-bus slave; runs as a sim process
+        idm_ar = 0                # 16-bit indirect address (written via AR0 = high, AR1 = low)
+        mr = 0                    # mode register; bit 1 is the auto-increment (AI) enable
+        prev_rd = prev_wr = 1     # previous-sample strobes (idle high) for rising-edge detection
+        async for vals in ctx.tick("sync").sample(
+                dut.w5100_cs_n, dut.w5100_rd_n, dut.w5100_wr_n,
+                dut.w5100_addr, dut.w5100_data_o):
+            cs, rd, wr, a, do = vals[-5:]   # the five sampled signals are the tail of the tuple
+            ai = (mr >> 1) & 1
+            if cs == 0 and rd == 0:                  # drive read data
+                if a == _A_MR:
+                    val = mr
+                elif a == _A_AR0:
+                    val = (idm_ar >> 8) & 0xFF
+                elif a == _A_AR1:
+                    val = idm_ar & 0xFF
+                else:
+                    val = w5100_mem.get(idm_ar, 0)   # unwritten locations read back as 0
+                ctx.set(dut.w5100_data_i, val)
+            if cs == 0 and prev_wr == 0 and wr == 1:  # latch write on /WR rising
+                if a == _A_MR:
+                    mr = do
+                elif a == _A_AR0:
+                    idm_ar = (idm_ar & 0x00FF) | (do << 8)
+                elif a == _A_AR1:
+                    idm_ar = (idm_ar & 0xFF00) | do
+                else:
+                    w5100_mem[idm_ar] = do
+                    if idm_ar == _W_S0_CR and do == _W_CR_RECV:   # RECV command consumes the packet
+                        w5100_mem[_W_S0_RX_RSR] = 0
+                        w5100_mem[_W_S0_RX_RSR + 1] = 0
+                    if ai:
+                        idm_ar = (idm_ar + 1) & 0xFFFF
+            if cs == 0 and prev_rd == 0 and rd == 1 and a == _A_DR and ai:   # data read finished
+                idm_ar = (idm_ar + 1) & 0xFFFF
+            prev_rd, prev_wr = rd, wr
+
+    async def testbench(ctx):
+        ctx.set(dut.exi_clk, 1)
+        ctx.set(dut.exi_cs_n, 1)
+        ctx.set(dut.panel_btn, 0b111)   # all buttons released (active-low idle)
+        await ctx.tick("capture").repeat(20)
+
+        # T1: device ID — read 4 bytes from addr 0 → 0x04 0x02 0x02 0x00
+        dev = await exi_read(ctx, 0x0000, 4)
+        print(f"T1 device ID: {[f'0x{b:02X}' for b in dev]}")
+        if dev != [0x04, 0x02, 0x02, 0x00]:
+            errors.append(f"T1 device ID: got {dev}")
+        await ctx.tick("capture").repeat(HALF)
+
+        # T2: write PAR0–3, read them back through the full chain
+        await exi_write(ctx, 0x20, [0xDE, 0xAD, 0xBE, 0xEF])
+        await ctx.tick("capture").repeat(HALF * 4)
+        par = await exi_read(ctx, 0x20, 4)
+        print(f"T2 PAR0-3 readback: {[f'0x{b:02X}' for b in par]}")
+        if par != [0xDE, 0xAD, 0xBE, 0xEF]:
+            errors.append(f"T2 PAR readback: got {par}")
+        await ctx.tick("capture").repeat(HALF)
+
+        # T3: NWAYS must read back the hardcoded 0x17 (link-up sentinel)
+        nways = await exi_read(ctx, 0x31, 1)
+        print(f"T3 NWAYS: 0x{nways[0]:02X} (want 0x17)")
+        if nways != [0x17]:
+            errors.append(f"T3 NWAYS: got {nways}")
+        await ctx.tick("capture").repeat(HALF)
+
+        # T4: DMA-style SPRAM read — clock 8 data bytes (past the 4-byte header
+        # limit) within one CS.  Exercises the integrated streaming path:
+        # ExiCapture(cs_active) → register file SPRAM_STREAM → SPRAMArbiter →
+        # real SPRAM → MISO, plus the SPRAM_END cleanup.  SPRAM is uninitialised
+        # here, so we check the stream completes (8 bytes, no underrun/hang)
+        # rather than specific data.
+        dma = await exi_read(ctx, 0x0100, 8)
+        print(f"T4 DMA read (8B from 0x100): {[f'0x{b:02X}' for b in dma]}")
+        if len(dma) != 8:
+            errors.append(f"T4 DMA read length: got {len(dma)}")
+        await ctx.tick("capture").repeat(HALF)
+
+        # T5: a register read after the streaming read confirms the FSM cleaned
+        # up (SPRAM_END → HEADER0) and the device is responsive again.
+        nways2 = await exi_read(ctx, 0x31, 1)
+        print(f"T5 NWAYS after DMA: 0x{nways2[0]:02X} (want 0x17)")
+        if nways2 != [0x17]:
+            errors.append(f"T5 NWAYS after DMA read: got {nways2}")
+        await ctx.tick("capture").repeat(HALF)
+
+        # ── T6: FULL ETHERNET→SPRAM→GC LOOP ──────────────────────────────
+        # A frame arrives from the network (W5100 model) → W5100 master reads it
+        # → RXFrameAssembler writes it to the SPRAM ring → GC reads RWP then
+        # DMA-reads the descriptor+frame back.  Exercises the entire RX path.
+        # The W5100 needs its init sequence (which sets MR.AI / opens socket 0)
+        # before multi-byte bus accesses work — trigger it via NCRA reset, as
+        # the real GC driver does, and let it run before enabling RX.
+        await exi_write(ctx, 0x00, [0x01])    # NCRA reset → init_req pulse
+        await ctx.tick("capture").repeat(2000)   # let W5100 init run
+        await exi_write(ctx, 0x00, [0x08])    # NCRA SR bit → enable RX
+        await ctx.tick("capture").repeat(HALF * 2)
+        ctx.set(dut.w5100_int_n, 0)           # W5100: a packet was received
+        await ctx.tick("capture").repeat(4000)   # let the W5100 RX + SPRAM write run
+        ctx.set(dut.w5100_int_n, 1)
+        await ctx.tick("capture").repeat(HALF * 2)
+
+        rwp = await exi_read(ctx, 0x16, 1)    # RX write pointer (page)
+        total_len = len(RX_FRAME) + 4
+        got = await exi_read(ctx, 0x0100, total_len)   # descriptor + frame
+        want = [0x00, 0x00, (total_len >> 8) & 0xFF, total_len & 0xFF] + RX_FRAME
+        print(f"T6 RWP=0x{rwp[0]:02X} (want 0x02)")
+        print(f"T6 SPRAM[0x100]: {[f'0x{b:02X}' for b in got]}")
+        print(f"T6 expected    : {[f'0x{b:02X}' for b in want]}")
+        if rwp != [0x02]:
+            errors.append(f"T6 RWP: got {rwp}, want [0x02]")
+        if got != want:
+            errors.append(f"T6 RX frame mismatch:\n  got  {got}\n  want {want}")
+
+        # T7: status-panel integration — after all the EXI traffic above, the
+        # EXI-activity LED (panel led[1] = stretched cs_active) must be lit,
+        # proving cap.cs_active → FFSync → StatusPanel → LED is wired end-to-end.
+        leds = ctx.get(dut.panel_led)
+        if not (leds >> 1) & 1:
+            errors.append(f"T7 panel: EXI-activity LED not lit (led=0b{leds:05b})")
+        print(f"T7 panel led=0b{leds:05b} (bit1=EXI activity, expect 1)")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=54), domain="capture")
+    sim.add_clock(Period(MHz=24), domain="exi")
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+    sim.add_process(w5100_model)
+    sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll BBATop integration tests passed.")
diff --git a/exi_bba/eeprom_model.py b/exi_bba/eeprom_model.py
new file mode 100644
index 0000000..6e3cd75
--- /dev/null
+++ b/exi_bba/eeprom_model.py
@@ -0,0 +1,222 @@
+"""EEPROM model — exi domain.
+
+Emulates the MX98730EC's 93C46 serial EEPROM.
+
+93C46 protocol (Microwire, bit-bang)
+-------------------------------------
+CS=1 activates the device.
+Data clocked on rising SK edge, 9-bit header then data:
+  Bit 0: start (always 1)
+  Bit 1: opcode MSB }  READ = 10
+  Bit 2: opcode LSB }
+  Bits 3–8: 6-bit address (MSB first)
+
+After the 9th rising SK the DO line presents the MSB of the 16-bit word.
+Each subsequent rising SK advances one bit (MSB→LSB).
+
+Shift register `shift_in` convention
+--------------------------------------
+`Cat(di_s, shift_in[:-1])` places di_s at bit 0 and shifts existing bits up.
+After N edges:
+  shift_in[N-1] = first bit received (start)
+  shift_in[0]   = last bit received so far
+
+At bit_ctr==8 (after 8 edges, receiving 9th on di_s):
+  shift_in[7] = start          (bit 0)
+  shift_in[6] = opcode MSB     (bit 1)
+  shift_in[5] = opcode LSB     (bit 2)
+  shift_in[4:0] = addr[5:1]    (bits 3–7, MSB first→LSB first in register)
+  di_s          = addr[0]      (bit 8)
+
+  opcode  = Cat(shift_in[5], shift_in[6])   → 0b10 = READ
+  address = Cat(di_s, shift_in[0:5])        → addr[0..5]
+
+EEPROM content (64 × 16-bit words)
+-------------------------------------
+Words 0–2 hold the source MAC address 00:09:BF:AA:BB:CC (Nintendo OUI 00:09:BF).
+The GC BBA driver reads words 0–3 then copies to PAR0–5.
+"""
+
+from amaranth import *
+from amaranth.lib.cdc import FFSynchronizer
+
+__all__ = ["EEPROMModel"]
+
+_EEPROM_WORDS = [
+    0x0009,   # word 0: PAR0=0x00, PAR1=0x09
+    0xBFAA,   # word 1: PAR2=0xBF, PAR3=0xAA
+    0xBBCC,   # word 2: PAR4=0xBB, PAR5=0xCC
+    0x0000,   # word 3: checksum placeholder
+]
+_EEPROM_WORDS += [0x0000] * (64 - len(_EEPROM_WORDS))
+
+_OP_READ = 0b10   # opcode for READ
+
+
+class EEPROMModel(Elaboratable):
+    """93C46 serial EEPROM model in the exi domain (read-only; only READ is decoded).
+
+    Ports
+    -----
+    sk / cs / di : bit-bang inputs (raw async; synchronized internally)
+    do           : serial data output (0 whenever no word is being shifted out)
+    """
+
+    def __init__(self):
+        self.sk = Signal()   # serial clock; bits are taken on its rising edge
+        self.cs = Signal()   # chip select, active high (low resets the state machine)
+        self.di = Signal()   # serial data in
+        self.do = Signal()   # serial data out
+
+    def elaborate(self, platform):
+        m = Module()
+
+        words = Array([Signal(16, init=v, name=f"e{i}") for i, v in enumerate(_EEPROM_WORDS)])
+
+        # ── Input synchronization (async → exi, 2 stages) ────────────────
+        sk_s = Signal()
+        cs_s = Signal()
+        di_s = Signal()
+        m.submodules.sync_sk = FFSynchronizer(self.sk, sk_s, o_domain="exi")
+        m.submodules.sync_cs = FFSynchronizer(self.cs, cs_s, o_domain="exi")
+        m.submodules.sync_di = FFSynchronizer(self.di, di_s, o_domain="exi")
+
+        sk_prev = Signal()
+        m.d.exi += sk_prev.eq(sk_s)
+        rising_sk = Signal()
+        m.d.comb += rising_sk.eq(sk_s & ~sk_prev)   # one-cycle pulse per synchronized SK rising edge
+
+        # ── State ─────────────────────────────────────────────────────────
+        shift_in  = Signal(9)    # header shift register; bits [7:0] are examined at decode
+        bit_ctr   = Signal(4)    # 0..8 during header receive; parks at 9 afterwards
+
+        shift_out = Signal(16)   # data word being shifted out MSB-first
+        out_ctr   = Signal(4)    # 0..15, counts bits shifted out
+        in_read   = Signal()     # 1 while outputting a word
+
+        # DO is combinatorial: MSB of shift_out while in read-out phase
+        m.d.comb += self.do.eq(Mux(in_read, shift_out[15], 0))
+
+        with m.If(~cs_s):
+            m.d.exi += bit_ctr.eq(0)
+            m.d.exi += in_read.eq(0)
+            m.d.exi += out_ctr.eq(0)
+
+        with m.Elif(rising_sk):
+            with m.If(in_read):
+                # Shift out next bit (MSB first: left shift, zero into LSB)
+                m.d.exi += shift_out.eq(Cat(0, shift_out[:-1]))
+                with m.If(out_ctr == 15):
+                    m.d.exi += in_read.eq(0)
+                    m.d.exi += out_ctr.eq(0)
+                with m.Else():
+                    m.d.exi += out_ctr.eq(out_ctr + 1)
+
+            with m.Else():
+                # Shift di_s in at bit 0 (existing bits move up)
+                m.d.exi += shift_in.eq(Cat(di_s, shift_in[:-1]))
+                m.d.exi += bit_ctr.eq(Mux(bit_ctr == 9, 9, bit_ctr + 1))   # saturate: one decode per CS assertion
+
+                with m.If(bit_ctr == 8):
+                    # 9th bit (di_s = addr[0]) arrives.
+                    # shift_in[7] = start, [6]=op_MSB, [5]=op_LSB, [4:0]=addr[5:1]
+                    op   = Cat(shift_in[5], shift_in[6])    # 0b10 for READ
+                    adr  = Cat(di_s, shift_in[0:5])          # addr[0..5]
+                    with m.If(op == _OP_READ):
+                        m.d.exi += shift_out.eq(words[adr])
+                        m.d.exi += in_read.eq(1)
+                        m.d.exi += out_ctr.eq(0)
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    dut = EEPROMModel()
+    errors = []
+
+    HALF = 6   # exi-domain ticks per SK half-period (much longer than sync latency)
+
+    async def eeprom_read(ctx, addr):
+        """93C46 READ at 6-bit address; returns 16-bit word.
+
+        DO is read BEFORE each rising SK edge, since in_read=1 causes
+        shift_out[15] to be valid between edges.  After 16 reads the full
+        16-bit word is assembled MSB-first.
+        """
+        ctx.set(dut.cs, 1)
+        ctx.set(dut.sk, 0)
+        await ctx.tick("exi").repeat(HALF)
+
+        # Transmit 9 bits: start(1) + opcode READ(10) + addr[5:0] MSB-first
+        bits = [1, 1, 0]
+        for a in range(5, -1, -1):
+            bits.append((addr >> a) & 1)
+
+        for bit in bits:
+            ctx.set(dut.di, bit)
+            ctx.set(dut.sk, 1)   # rising edge: DUT latches bit
+            await ctx.tick("exi").repeat(HALF)
+            ctx.set(dut.sk, 0)
+            await ctx.tick("exi").repeat(HALF)
+
+        # After 9th falling SK: in_read=1, shift_out=word[addr], do=MSB.
+        # Read DO before each rising edge (it is valid in the LOW phase).
+        result = 0
+        for _ in range(16):
+            result = (result << 1) | ctx.get(dut.do)   # sample before rising SK
+            ctx.set(dut.sk, 1)
+            await ctx.tick("exi").repeat(HALF)
+            ctx.set(dut.sk, 0)
+            await ctx.tick("exi").repeat(HALF)
+
+        ctx.set(dut.cs, 0)
+        await ctx.tick("exi").repeat(HALF)
+        return result
+
+    async def testbench(ctx):
+        await ctx.tick("exi").repeat(4)
+        ctx.set(dut.cs, 0)
+        ctx.set(dut.sk, 0)
+        ctx.set(dut.di, 0)
+        await ctx.tick("exi").repeat(4)
+
+        w0 = await eeprom_read(ctx, 0)
+        print(f"T1 word 0 = 0x{w0:04X}  (expected 0x0009)")
+        if w0 != 0x0009:
+            errors.append(f"T1: word 0 = 0x{w0:04X}, expected 0x0009")
+
+        w1 = await eeprom_read(ctx, 1)
+        print(f"T2 word 1 = 0x{w1:04X}  (expected 0xBFAA)")
+        if w1 != 0xBFAA:
+            errors.append(f"T2: word 1 = 0x{w1:04X}, expected 0xBFAA")
+
+        w2 = await eeprom_read(ctx, 2)
+        print(f"T3 word 2 = 0x{w2:04X}  (expected 0xBBCC)")
+        if w2 != 0xBBCC:
+            errors.append(f"T3: word 2 = 0x{w2:04X}, expected 0xBBCC")
+
+        # T4: word 3 → 0x0000
+        w3 = await eeprom_read(ctx, 3)
+        print(f"T4 word 3 = 0x{w3:04X}  (expected 0x0000)")
+        if w3 != 0x0000:
+            errors.append(f"T4: word 3 = 0x{w3:04X}, expected 0x0000")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="exi")
+    sim.add_testbench(testbench)
+
+    with sim.write_vcd("EEPROMModel.vcd"):
+        sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/exi_capture.py b/exi_bba/exi_capture.py
new file mode 100644
index 0000000..93148d0
--- /dev/null
+++ b/exi_bba/exi_capture.py
@@ -0,0 +1,269 @@
+"""ExiCapture — fast EXI byte-capture front-end (capture domain, 54 MHz).
+
+Wraps the SPIMode3Slave bit engine and bridges it to the slower `exi` domain
+(24 MHz) through two AsyncFIFOs:
+
+    capture (54 MHz)                         exi (24 MHz)
+    ┌────────────────────┐   rx_fifo  ───►   received bytes (header + data)
+    │  SPIMode3Slave      │   (8-bit, capture→exi)
+    │  (bit engine)       │   tx_fifo  ◄───   response bytes to drive on MISO
+    └────────────────────┘   (8-bit, exi→capture)
+
+Why split: the bit engine must oversample a 27 MHz EXI clock 2×, which needs a
+54 MHz clock — far faster than the register-file logic can close (~44 MHz).
+Only this small, shallow front-end runs fast; everything else stays at 24 MHz.
+
+TX response gating
+------------------
+Every EXI transaction begins with 2 header bytes (write_flag/addr/len) during
+which the GC ignores MISO.  The core cannot have produced a response yet (it
+hasn't even decoded the header), so the wrapper must NOT pop tx_fifo for those
+2 bytes.  A per-transaction counter (`txld_cnt`, reset by frame_start) gates the
+pop: the first two tx_load pulses (the header bytes) pop nothing, and every
+later tx_load pops one byte.  The bit engine's tx_byte is wired directly to the
+tx_fifo head, so the first data byte is served from the head without a pop and
+each later pop advances the head to the next response byte.
+"""
+
+from amaranth import *
+from amaranth.lib.cdc import FFSynchronizer
+from amaranth.lib.fifo import AsyncFIFO
+
+from exi_bba.spi_mode3_slave import SPIMode3Slave
+
+__all__ = ["ExiCapture"]
+
+
+class ExiCapture(Elaboratable):
+    """EXI front-end: SPI bit engine (capture domain) + byte FIFOs to core.
+
+    Physical SPI pins (capture domain)
+    ----------------------------------
+    spi_clk / spi_mosi / spi_cs_n : raw async inputs from the GC
+    spi_miso                       : output to the GC
+
+    Core-facing RX byte stream (core domain, FWFT read side of rx_fifo)
+    ------------------------------------------------------------------
+    rx_data : current received byte
+    rx_rdy  : a received byte is available
+    rx_en   : pop (assert for one core cycle to consume rx_data)
+
+    Core-facing TX byte stream (core domain, write side of tx_fifo)
+    --------------------------------------------------------------
+    tx_data : response byte to enqueue
+    tx_en   : write strobe
+    tx_rdy  : tx_fifo has room
+    """
+
+    def __init__(self, rx_depth=4, tx_depth=2):
+        self._rx_depth = rx_depth
+        self._tx_depth = tx_depth
+
+        # Physical SPI (capture domain, wired to pins by BBATop)
+        self.spi_clk  = Signal(init=1)
+        self.spi_mosi = Signal()
+        self.spi_cs_n = Signal(init=1)
+        self.spi_miso = Signal()
+
+        # Core-facing RX read side
+        self.rx_data = Signal(8)
+        self.rx_rdy  = Signal()
+        self.rx_en   = Signal()
+
+        # Core-facing TX write side
+        self.tx_data = Signal(8)
+        self.tx_en   = Signal()
+        self.tx_rdy  = Signal()
+
+        # Core-facing: high (exi domain) while a transaction is in progress.
+        # The register file uses it to stream variable-length (DMA) reads until
+        # CS deasserts.
+        self.cs_active = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        spi = SPIMode3Slave(domain="capture")
+        m.submodules.spi = spi
+
+        rx_fifo = AsyncFIFO(width=8, depth=self._rx_depth,
+                            w_domain="capture", r_domain="exi")
+        tx_fifo = AsyncFIFO(width=8, depth=self._tx_depth,
+                            w_domain="exi", r_domain="capture")
+        m.submodules.rx_fifo = rx_fifo
+        m.submodules.tx_fifo = tx_fifo
+
+        # cs_active (capture) → exi domain for the register file
+        m.submodules.cs_sync = FFSynchronizer(spi.cs_active, self.cs_active,
+                                              o_domain="exi")
+
+        # ── Physical pins ↔ bit engine ───────────────────────────────────
+        m.d.comb += [
+            spi.spi_clk .eq(self.spi_clk),
+            spi.spi_mosi.eq(self.spi_mosi),
+            spi.spi_cs_n.eq(self.spi_cs_n),
+            self.spi_miso.eq(spi.spi_miso),
+        ]
+
+        # ── RX: every received byte → rx_fifo (capture write side) ───────
+        m.d.comb += [
+            rx_fifo.w_data.eq(spi.rx_byte),
+            rx_fifo.w_en  .eq(spi.rx_valid),
+        ]
+        # Core read side
+        m.d.comb += [
+            self.rx_data .eq(rx_fifo.r_data),
+            self.rx_rdy  .eq(rx_fifo.r_rdy),
+            rx_fifo.r_en .eq(self.rx_en),
+        ]
+
+        # ── TX: core write side ──────────────────────────────────────────
+        m.d.comb += [
+            tx_fifo.w_data.eq(self.tx_data),
+            tx_fifo.w_en  .eq(self.tx_en),
+            self.tx_rdy   .eq(tx_fifo.w_rdy),
+        ]
+
+        # ── TX response gating (capture domain) ──────────────────────────
+        # The bit engine drives MISO LIVE from tx_byte = tx_fifo head, so the
+        # response byte at the head is what gets sent for the current data byte.
+        # `txld_cnt` counts completed bytes within the transaction (tx_load
+        # pulses at each byte completion):
+        #   completion 0,1 → header bytes  (no pop)
+        #   completion ≥2  → a data byte finished → pop to advance the head
+        # The first data byte (data0) is served live from the head without a
+        # pop; the pop after it advances the head to data1's response, etc.
+        txld_cnt = Signal(2)
+
+        m.d.comb += spi.tx_byte.eq(tx_fifo.r_data)
+
+        # Pop depends ONLY on the registered tx_load and txld_cnt — NOT on
+        # frame_start.  (frame_start precedes byte-0's tx_load by a cycle and
+        # has already reset txld_cnt to 0, so byte 0 is never a data byte.)
+        # Keeping cs_fall/frame_start off the pop path shortens the capture-
+        # domain critical path through the FIFO consume pointer.
+        #
+        # `flushing` clears prefetch over-push left in tx_fifo by the previous
+        # transaction: the register file streams response bytes ahead of the GC
+        # clock for DMA reads, so when CS deasserts mid-stream a few unsent
+        # bytes remain.  On CS-fall (frame_start) drain tx_fifo to empty before
+        # the new transaction's data phase, so stale bytes never reach MISO.
+        flushing = Signal()
+        m.d.comb += tx_fifo.r_en.eq(
+            (spi.tx_load & (txld_cnt >= 2)) | (flushing & tx_fifo.r_rdy)
+        )
+        with m.If(spi.frame_start):
+            m.d.capture += flushing.eq(1)
+        with m.Elif(~tx_fifo.r_rdy):
+            m.d.capture += flushing.eq(0)
+
+        with m.If(spi.frame_start):
+            m.d.capture += txld_cnt.eq(0)
+        with m.Elif(spi.tx_load & (txld_cnt < 3)):
+            m.d.capture += txld_cnt.eq(txld_cnt + 1)
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    dut = ExiCapture()
+    errors = []
+
+    # SPI half-period in capture ticks.  At 54 MHz capture / 27 MHz EXI the real
+    # ratio is ~2; use 4 here for a clean, well-oversampled functional check.
+    HALF = 4
+
+    async def spi_byte(ctx, mosi_val):
+        """Clock one SPI Mode 3 byte; return the assembled MISO byte."""
+        miso = 0
+        for bit in range(7, -1, -1):
+            ctx.set(dut.spi_mosi, (mosi_val >> bit) & 1)
+            ctx.set(dut.spi_clk, 0)
+            await ctx.tick("capture").repeat(HALF)
+            miso = (miso << 1) | ctx.get(dut.spi_miso)
+            ctx.set(dut.spi_clk, 1)
+            await ctx.tick("capture").repeat(HALF)
+        return miso
+
+    async def core_drain_rx(ctx, into):
+        """Pop one byte from the core RX side if available."""
+        if ctx.get(dut.rx_rdy):
+            into.append(ctx.get(dut.rx_data))
+            ctx.set(dut.rx_en, 1)
+            await ctx.tick("exi").repeat(1)
+            ctx.set(dut.rx_en, 0)
+            return True
+        return False
+
+    async def push_tx(ctx, b):
+        ctx.set(dut.tx_data, b)
+        ctx.set(dut.tx_en, 1)
+        await ctx.tick("exi").repeat(1)
+        ctx.set(dut.tx_en, 0)
+
+    async def do_txn(ctx, hdr, responses, n_data, rx_seen):
+        """One EXI transaction: clock `hdr` bytes, model the clock-idle gap
+        (drain rx + prefetch `responses` into tx_fifo), then clock `n_data`
+        data bytes; return the MISO data bytes read."""
+        ctx.set(dut.spi_cs_n, 0)
+        ctx.set(dut.spi_clk, 1)
+        await ctx.tick("capture").repeat(HALF)
+        for h in hdr:
+            await spi_byte(ctx, h)
+        for _ in range(20):                       # clock-idle gap
+            await core_drain_rx(ctx, rx_seen)
+            await ctx.tick("exi").repeat(1)
+        for r in responses:
+            await push_tx(ctx, r)
+        await ctx.tick("capture").repeat(2)
+        miso = [await spi_byte(ctx, 0x00) for _ in range(n_data)]
+        ctx.set(dut.spi_cs_n, 1)
+        await ctx.tick("capture").repeat(HALF)
+        for _ in range(20):                       # drain data-phase dummies
+            await core_drain_rx(ctx, rx_seen)
+            await ctx.tick("exi").repeat(1)
+        return miso
+
+    async def testbench(ctx):
+        rx_seen = []
+        await ctx.tick("capture").repeat(2)
+
+        # ── T1: header + 2 data bytes read back ──────────────────────────
+        miso = await do_txn(ctx, [0x12, 0x34], [0xA5, 0x5A], 2, rx_seen)
+        print(f"T1 rx={[hex(b) for b in rx_seen[:2]]}  MISO={[f'0x{b:02X}' for b in miso]}")
+        if rx_seen[:2] != [0x12, 0x34]:
+            errors.append(f"T1 header rx wrong: {rx_seen[:2]}")
+        if miso != [0xA5, 0x5A]:
+            errors.append(f"T1 MISO wrong: {[hex(b) for b in miso]}")
+
+        # ── T2: prefetch over-push must NOT leak into the next transaction ─
+        # Txn A pushes 2 responses but the GC clocks only 1 data byte, leaving
+        # one stale byte in tx_fifo.  Txn B must read its OWN fresh responses,
+        # proving the CS-fall flush cleared the stale prefetch.
+        rx_seen.clear()
+        await do_txn(ctx, [0x12, 0x34], [0xA5, 0x5A], 1, rx_seen)   # leaves 0x5A
+        misoB = await do_txn(ctx, [0x12, 0x34], [0x11, 0x22], 2, rx_seen)
+        print(f"T2 MISO after over-push: {[f'0x{b:02X}' for b in misoB]}  (want 0x11 0x22)")
+        if misoB != [0x11, 0x22]:
+            errors.append(f"T2 flush failed — stale byte leaked: {[hex(b) for b in misoB]}")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=54), domain="capture")
+    sim.add_clock(Period(MHz=24), domain="exi")
+    sim.add_testbench(testbench)
+
+    with sim.write_vcd("ExiCapture.vcd"):
+        sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/rx_frame_assembler.py b/exi_bba/rx_frame_assembler.py
new file mode 100644
index 0000000..e08a5a1
--- /dev/null
+++ b/exi_bba/rx_frame_assembler.py
@@ -0,0 +1,312 @@
+"""RX frame assembler — sync domain (24 MHz).
+
+Receives raw ethernet frames from W5500SPIMaster and writes them into the SPRAM
+ring buffer in MX98730EC format.
+
+Ring buffer layout (SPRAM byte addresses)
+------------------------------------------
+0x0100–0x0FFF  15 pages × 256 bytes = 3840 bytes
+Pages 0x01–0x0F; page 0x00 is reserved.
+Page wrap: after 0x0F → 0x01 (skip 0x00).
+
+Frame descriptor (4 bytes at page start)
+-----------------------------------------------
+Byte 0: LRPS (last received packet status) — 0x00
+Byte 1: 0x00
+Byte 2: total_length[15:8]   (big-endian; includes 4 descriptor bytes)
+Byte 3: total_length[7:0]
+Bytes 4+: raw ethernet frame
+
+Write sequence
+--------------
+1. Issue 4 SPRAM writes of 0x00 (placeholder descriptor).
+2. For each byte received from W5500, issue one SPRAM write.
+3. After EOF: rewrite descriptor bytes 2 and 3 with actual length.
+4. Advance RWP, push to rx_wptr FIFO, pulse rx_irq.
+"""
+
+from amaranth import *
+
+__all__ = ["RXFrameAssembler"]
+
+_RX_PAGE_FIRST = 0x01   # first ring-buffer page (page 0x00 is reserved)
+_RX_PAGE_LAST  = 0x0F   # last ring-buffer page; the page after it is _RX_PAGE_FIRST
+_PAGES_TOTAL   = _RX_PAGE_LAST - _RX_PAGE_FIRST + 1   # 15 pages in the ring
+
+
+class RXFrameAssembler(Elaboratable):
+    """Writes incoming ethernet frames into the SPRAM ring buffer.
+
+    W5500 streaming interface (sync domain)
+    ----------------------------------------
+    rx_data / rx_valid / rx_ready : byte stream
+    rx_sof / rx_eof               : frame delimiters (same cycle as rx_valid)
+
+    SPRAM write interface (to SPRAMArbiter, sync domain)
+    -----------------------------------------------------
+    eth_wr_addr / eth_wr_data / eth_wr_valid / eth_wr_ready
+
+    CDC outputs (wired by BBATop)
+    -----------------------------
+    rx_wptr_w_data / rx_wptr_w_en / rx_wptr_w_rdy
+    rx_irq : 1-cycle pulse → PulseSynchronizer input
+    rx_enabled : controlled by NCRA SR bit (from BBARegisterFile)
+    """
+
+    def __init__(self):
+        # W5500 stream in
+        self.rx_data   = Signal(8)
+        self.rx_valid  = Signal()
+        self.rx_ready  = Signal()
+        self.rx_sof    = Signal()
+        self.rx_eof    = Signal()
+
+        # SPRAM write out
+        self.eth_wr_addr  = Signal(16)
+        self.eth_wr_data  = Signal(8)
+        self.eth_wr_valid = Signal()
+        self.eth_wr_ready = Signal()
+
+        # RWP FIFO write-side (sync→exi)
+        self.rx_wptr_w_data = Signal(8)
+        self.rx_wptr_w_en   = Signal()
+        self.rx_wptr_w_rdy  = Signal()
+
+        # rx_irq pulse (→ PulseSynchronizer)
+        self.rx_irq = Signal()
+
+        # RX gate from NCRA SR bit
+        self.rx_enabled = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ── Ring-buffer state ─────────────────────────────────────────────
+        rwp = Signal(8, init=_RX_PAGE_FIRST)   # current RX write page (1–15)
+
+        # Write address within current frame
+        wr_addr  = Signal(16)
+        # Number of frame data bytes received
+        data_ctr = Signal(12)
+        # Total length = data_ctr + 4
+        total_len = Signal(12)
+
+        # Descriptor base (rwp*256) — saved when frame starts
+        desc_base = Signal(16)
+
+        # Placeholder descriptor byte counter (0..3)
+        desc_ctr = Signal(2)
+
+        # Number of pages consumed by this frame (rounded up)
+        pages_used = Signal(5)
+
+        # Default: no pulses
+        m.d.sync += self.rx_irq.eq(0)
+        m.d.sync += self.rx_wptr_w_en.eq(0)
+
+        # Combinatorial outputs
+        m.d.comb += total_len.eq(data_ctr + 4)
+
+        with m.FSM(domain="sync", name="rx_fsm"):
+
+            with m.State("IDLE"):
+                m.d.comb += self.rx_ready.eq(0)
+                m.d.sync += self.eth_wr_valid.eq(0)
+                with m.If(self.rx_valid & self.rx_sof & self.rx_enabled):
+                    frame_base = Signal(16)
+                    m.d.comb += frame_base.eq(Cat(Const(0, 8), rwp))
+                    m.d.sync += desc_base.eq(frame_base)
+                    m.d.sync += wr_addr.eq(frame_base)
+                    m.d.sync += data_ctr.eq(0)
+                    m.d.sync += desc_ctr.eq(0)
+                    m.next = "WRITE_PLACEHOLDER"
+
+            with m.State("WRITE_PLACEHOLDER"):
+                # Write 4 bytes of 0x00 as placeholder descriptor
+                m.d.sync += self.eth_wr_addr.eq(wr_addr)
+                m.d.sync += self.eth_wr_data.eq(0x00)
+                m.d.sync += self.eth_wr_valid.eq(1)
+                with m.If(self.eth_wr_ready):
+                    m.d.sync += wr_addr.eq(wr_addr + 1)
+                    with m.If(desc_ctr == 3):
+                        m.d.sync += self.eth_wr_valid.eq(0)
+                        m.next = "RECV_AND_WRITE"
+                    with m.Else():
+                        m.d.sync += desc_ctr.eq(desc_ctr + 1)
+
+            with m.State("RECV_AND_WRITE"):
+                # Accept bytes from W5500 and write each to SPRAM immediately
+                m.d.comb += self.rx_ready.eq(~self.eth_wr_valid | self.eth_wr_ready)
+                with m.If(self.rx_valid & (~self.eth_wr_valid | self.eth_wr_ready)):
+                    m.d.sync += self.eth_wr_addr.eq(wr_addr)
+                    m.d.sync += self.eth_wr_data.eq(self.rx_data)
+                    m.d.sync += self.eth_wr_valid.eq(1)
+                    m.d.sync += wr_addr.eq(wr_addr + 1)
+                    m.d.sync += data_ctr.eq(data_ctr + 1)
+                    with m.If(self.rx_eof):
+                        m.next = "WAIT_LAST_WRITE"
+                with m.Elif(self.eth_wr_valid & self.eth_wr_ready):
+                    m.d.sync += self.eth_wr_valid.eq(0)
+
+            with m.State("WAIT_LAST_WRITE"):
+                # Wait for the last data byte write to be accepted
+                with m.If(~self.eth_wr_valid | self.eth_wr_ready):
+                    m.d.sync += self.eth_wr_valid.eq(0)
+                    # Compute pages used: ceil((data_ctr + 4) / 256)
+                    # = (total_len + 255) >> 8 = total_len[11:8] + (total_len[7:0] != 0)
+                    m.d.sync += pages_used.eq(total_len[8:12] + (total_len[:8] != 0))
+                    m.next = "WRITE_LEN_HI"
+
+            with m.State("WRITE_LEN_HI"):
+                # Overwrite descriptor byte 2 with total_len[15:8]
+                m.d.sync += self.eth_wr_addr.eq(desc_base + 2)
+                m.d.sync += self.eth_wr_data.eq(total_len[8:12])
+                m.d.sync += self.eth_wr_valid.eq(1)
+                with m.If(self.eth_wr_ready):
+                    m.d.sync += self.eth_wr_valid.eq(0)
+                    m.next = "WRITE_LEN_LO"
+
+            with m.State("WRITE_LEN_LO"):
+                # Overwrite descriptor byte 3 with total_len[7:0]
+                m.d.sync += self.eth_wr_addr.eq(desc_base + 3)
+                m.d.sync += self.eth_wr_data.eq(total_len[:8])
+                m.d.sync += self.eth_wr_valid.eq(1)
+                with m.If(self.eth_wr_ready):
+                    m.d.sync += self.eth_wr_valid.eq(0)
+                    m.next = "ADVANCE_RWP"
+
+            with m.State("ADVANCE_RWP"):
+                # next_rwp = ((rwp - 1 + pages_used) % 15) + 1
+                next_rwp_raw = Signal(8)
+                m.d.comb += next_rwp_raw.eq(rwp + pages_used)
+                with m.If(next_rwp_raw > _RX_PAGE_LAST):
+                    m.d.sync += rwp.eq(next_rwp_raw - _PAGES_TOTAL)
+                with m.Else():
+                    m.d.sync += rwp.eq(next_rwp_raw)
+                m.next = "PUSH_WPT"
+
+            with m.State("PUSH_WPT"):
+                with m.If(self.rx_wptr_w_rdy):
+                    m.d.sync += self.rx_wptr_w_data.eq(rwp)
+                    m.d.sync += self.rx_wptr_w_en.eq(1)
+                    m.d.sync += self.rx_irq.eq(1)
+                    m.next = "IDLE"
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    dut = RXFrameAssembler()
+    errors = []
+
+    # Track all SPRAM writes issued by the DUT
+    spram_writes = []
+
+    async def testbench(ctx):
+        # Setup: acknowledge all SPRAM writes immediately
+        ctx.set(dut.eth_wr_ready,    1)
+        ctx.set(dut.rx_wptr_w_rdy,  1)
+        ctx.set(dut.rx_enabled,     1)
+        await ctx.tick("sync").repeat(2)
+
+        # ── T1: 10-byte frame → pages_used=1, rwp advances 1→2 ──────────────
+        # Send SOF + first byte
+        frame = [0xAA, 0xBB, 0xCC, 0xDD, 0x08, 0x00, 0x45, 0x00, 0x00, 0x01]
+
+        ctx.set(dut.rx_data,  frame[0])
+        ctx.set(dut.rx_valid, 1)
+        ctx.set(dut.rx_sof,   1)
+        await ctx.tick("sync").repeat(1)
+        ctx.set(dut.rx_sof, 0)
+
+        for i, b in enumerate(frame[1:], start=1):
+            ctx.set(dut.rx_data, b)
+            ctx.set(dut.rx_eof,  1 if i == len(frame) - 1 else 0)
+            await ctx.tick("sync").repeat(1)
+
+        ctx.set(dut.rx_valid, 0)
+        ctx.set(dut.rx_eof,   0)
+
+        # Poll for up to 30 ticks until rx_irq pulses (1-cycle pulse)
+        t1_irq_seen = False
+        t1_wptr_d   = 0
+        for _ in range(30):
+            await ctx.tick("sync").repeat(1)
+            if ctx.get(dut.rx_irq):
+                t1_irq_seen = True
+                t1_wptr_d   = ctx.get(dut.rx_wptr_w_data)
+                break
+
+        print(f"T1 rx_irq_seen={t1_irq_seen}  wptr_data=0x{t1_wptr_d:02X}")
+        if not t1_irq_seen:
+            errors.append("T1: rx_irq never pulsed")
+        if t1_wptr_d != 2:
+            errors.append(f"T1: rwp should be 2 (page 1→2), got {t1_wptr_d}")
+
+        await ctx.tick("sync").repeat(4)
+
+        # ── T2: Send a second frame; verify rwp advances further ────────────
+        frame2 = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]
+        ctx.set(dut.rx_data,  frame2[0])
+        ctx.set(dut.rx_valid, 1)
+        ctx.set(dut.rx_sof,   1)
+        await ctx.tick("sync").repeat(1)
+        ctx.set(dut.rx_sof, 0)
+
+        for i, b in enumerate(frame2[1:], start=1):
+            ctx.set(dut.rx_data, b)
+            ctx.set(dut.rx_eof,  1 if i == len(frame2) - 1 else 0)
+            await ctx.tick("sync").repeat(1)
+
+        ctx.set(dut.rx_valid, 0)
+        ctx.set(dut.rx_eof,   0)
+
+        t2_irq_seen = False
+        t2_wptr_d   = 0
+        for _ in range(30):
+            await ctx.tick("sync").repeat(1)
+            if ctx.get(dut.rx_irq):
+                t2_irq_seen = True
+                t2_wptr_d   = ctx.get(dut.rx_wptr_w_data)
+                break
+
+        print(f"T2 rx_irq_seen={t2_irq_seen}  wptr_data=0x{t2_wptr_d:02X}")
+        if not t2_irq_seen:
+            errors.append("T2: rx_irq never pulsed after second frame")
+        if t2_wptr_d != 3:
+            errors.append(f"T2: rwp should be 3 (page 2→3), got {t2_wptr_d}")
+
+        # ── T3: RX disabled — SOF must be ignored ──────────────────────────
+        ctx.set(dut.rx_enabled, 0)
+        ctx.set(dut.rx_data,  0xDE)
+        ctx.set(dut.rx_valid, 1)
+        ctx.set(dut.rx_sof,   1)
+        await ctx.tick("sync").repeat(4)
+        ctx.set(dut.rx_valid, 0)
+        ctx.set(dut.rx_sof, 0)
+
+        # No SPRAM write should have been issued
+        wr_valid = ctx.get(dut.eth_wr_valid)
+        if wr_valid:
+            errors.append("T3: SPRAM write issued while rx_enabled=0")
+        print(f"T3 rx disabled: eth_wr_valid={wr_valid} (expected 0)")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+
+    with sim.write_vcd("RXFrameAssembler.vcd"):
+        sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/spi_mode3_slave.py b/exi_bba/spi_mode3_slave.py
new file mode 100644
index 0000000..3ae4bfe
--- /dev/null
+++ b/exi_bba/spi_mode3_slave.py
@@ -0,0 +1,274 @@
+"""SPI Mode 3 byte-oriented slave for the EXI bus.
+
+CPOL=1, CPHA=1: CLK idles HIGH.
+Slave samples MOSI on the FALLING CLK edge.
+Slave drives MISO on the RISING CLK edge (master samples on next falling edge).
+All three raw inputs are run through a 2-stage FFSynchronizer before use.
+"""
+
+from amaranth import *
+from amaranth.lib.cdc import FFSynchronizer
+
+
+# ── public re-export for import convenience ─────────────────────────────────
+__all__ = ["SPIMode3Slave"]
+
+
+class SPIMode3Slave(Elaboratable):
+    """Byte-oriented SPI Mode 3 slave.
+
+    Ports
+    -----
+    spi_clk / spi_mosi / spi_cs_n : raw async inputs from GC (synchronized internally)
+    spi_miso                       : output to GC; idles HIGH when CS deasserted
+    rx_byte  : last complete received byte (valid when rx_valid pulses)
+    rx_valid : 1-cycle pulse in exi domain when rx_byte contains a new byte
+    tx_byte  : upstream loads this before or within one exi clock of tx_load pulsing
+    tx_load  : 1-cycle pulse requesting the next TX byte from upstream
+    """
+
+    def __init__(self, domain="capture"):
+        # Clock domain this byte engine runs in.  Split-domain design puts the
+        # bit engine in a fast `capture` domain (54 MHz) so it can oversample
+        # a 27 MHz EXI clock ~3×; the register file lives in a slower domain.
+        self._domain = domain
+
+        self.spi_clk  = Signal(init=1)   # idles HIGH
+        self.spi_mosi = Signal()
+        self.spi_cs_n = Signal(init=1)   # active LOW
+
+        self.spi_miso = Signal()         # combinatorial output
+
+        self.rx_byte  = Signal(8)
+        self.rx_valid = Signal()
+        self.tx_byte  = Signal(8)
+        self.tx_load  = Signal()
+
+        # 1-cycle pulse on CS assertion (transaction start).  The capture
+        # wrapper uses it to reset its per-transaction TX byte counter.
+        self.frame_start = Signal()
+
+        # Level: high while CS is asserted (a transaction is in progress).
+        # Lets downstream logic detect variable-length (DMA) transaction ends.
+        self.cs_active = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+        d = self._domain
+
+        # ── Input synchronization (async → exi, 2 stages) ──────────────────
+        clk_s  = Signal(init=1)
+        mosi_s = Signal()
+        cs_s   = Signal(init=1)
+
+        m.submodules.sync_clk  = FFSynchronizer(self.spi_clk,  clk_s,  o_domain=d, init=1)
+        m.submodules.sync_mosi = FFSynchronizer(self.spi_mosi, mosi_s, o_domain=d)
+        m.submodules.sync_cs   = FFSynchronizer(self.spi_cs_n, cs_s,   o_domain=d, init=1)
+
+        # ── Edge detection ──────────────────────────────────────────────────
+        clk_prev = Signal(init=1)
+        cs_prev  = Signal(init=1)
+        m.d[d] += clk_prev.eq(clk_s)
+        m.d[d] += cs_prev.eq(cs_s)
+
+        falling_clk = Signal()
+        rising_clk  = Signal()
+        cs_fall     = Signal()
+        cs_rise     = Signal()
+        m.d.comb += falling_clk.eq(~clk_s &  clk_prev)
+        m.d.comb += rising_clk .eq( clk_s & ~clk_prev)
+        m.d.comb += cs_fall    .eq(~cs_s  &  cs_prev)
+        m.d.comb += cs_rise    .eq( cs_s  & ~cs_prev)
+        m.d.comb += self.frame_start.eq(cs_fall)
+        m.d.comb += self.cs_active.eq(~cs_s)
+
+        # ── Shift registers ─────────────────────────────────────────────────
+        rx_shift = Signal(8)
+        tx_shift = Signal(8)
+        bit_ctr  = Signal(4)         # counts 0..7; 7 means "8th (last) bit"
+        armed    = Signal(init=1)    # between bytes: drive the LIVE tx_byte MSB
+        rearm    = Signal()          # arm for next byte on the next rising edge
+
+        # MISO: idle HIGH when CS deasserted.  While "armed" — i.e. at the start
+        # of a byte, including the inter-byte / clock-idle gap before the first
+        # falling edge — drive the LIVE tx_byte MSB.  This is what lets a
+        # response that upstream pushes DURING the EXI clock-idle gap reach MISO
+        # in time: there is no clock edge during the gap to latch it, so MISO
+        # must be combinational on tx_byte until the byte actually starts.  Once
+        # shifting (after the first falling edge) drive the latched shift reg.
+        m.d.comb += self.spi_miso.eq(
+            Mux(cs_s, 1, Mux(armed, self.tx_byte[7], tx_shift[7]))
+        )
+
+        # Default: deassert single-cycle pulses every cycle
+        m.d[d] += self.rx_valid.eq(0)
+        m.d[d] += self.tx_load.eq(0)
+
+        with m.If(cs_fall):
+            # Transaction start: first byte drives its MSB live (armed).
+            m.d[d] += bit_ctr.eq(0)
+            m.d[d] += armed.eq(1)
+
+        with m.Elif(cs_rise | cs_s):
+            # CS deasserted / idle: reset state
+            m.d[d] += bit_ctr.eq(0)
+            m.d[d] += armed.eq(1)
+
+        with m.Else():
+            # CS asserted: run bit engine
+            with m.If(falling_clk):
+                # Sample MOSI (MSB first: left-shift, new bit enters at LSB)
+                # Cat(a, b) → a at lower bits; so Cat(mosi, rx[6:0]) = {rx[6:0], mosi}
+                m.d[d] += rx_shift.eq(Cat(mosi_s, rx_shift[:-1]))
+
+                with m.If(armed):
+                    # First falling edge of this byte: master has just sampled
+                    # the MSB (driven live above).  Latch tx_byte so the
+                    # remaining 7 bits shift out of a stable register.
+                    m.d[d] += tx_shift.eq(self.tx_byte)
+                    m.d[d] += armed.eq(0)
+
+                with m.If(bit_ctr == 7):
+                    # 8th falling edge: byte complete.  The master samples the
+                    # LSB on THIS edge, so MISO must still hold tx_shift[7].
+                    # Defer arming to the next rising edge (rearm) so MISO is
+                    # not switched to the next byte's live MSB too early.
+                    m.d[d] += self.rx_byte.eq(Cat(mosi_s, rx_shift[:-1]))
+                    m.d[d] += self.rx_valid.eq(1)
+                    m.d[d] += bit_ctr.eq(0)
+                    m.d[d] += self.tx_load.eq(1)   # advance source to next byte
+                    m.d[d] += rearm.eq(1)          # arm on the next rising edge
+                with m.Else():
+                    m.d[d] += bit_ctr.eq(bit_ctr + 1)
+
+            with m.If(rising_clk):
+                with m.If(rearm):
+                    # Byte boundary: arm for the next byte (live MSB drive).
+                    m.d[d] += armed.eq(1)
+                    m.d[d] += rearm.eq(0)
+                with m.Elif(~armed):
+                    # Shift left: next bit into MSB position
+                    # Cat(0, tx[6:0]) = {tx[6:0], 0} — left shift
+                    m.d[d] += tx_shift.eq(Cat(0, tx_shift[:-1]))
+
+        return m
+
+
+# ── Testbench ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    from amaranth.sim import Simulator, Period
+
+    dut = SPIMode3Slave()
+
+    # 4 capture ticks per SPI half-period → well above the 3-cycle (2 sync + 1 edge) latency.
+    HALF = 4
+
+    async def spi_send_byte(ctx, mosi_val, next_tx_byte=None):
+        """Drive one SPI Mode 3 byte on MOSI; return the MISO byte assembled.
+
+        next_tx_byte: if given, written to tx_byte after the LAST falling edge
+        (before the last rising edge) so it is in place when the slave re-arms.
+        """
+        miso_byte = 0
+        for bit in range(7, -1, -1):                     # MSB first
+            ctx.set(dut.spi_mosi, (mosi_val >> bit) & 1)
+            ctx.set(dut.spi_clk, 0)                      # falling edge
+            await ctx.tick("capture").repeat(HALF)
+            miso_byte = (miso_byte << 1) | ctx.get(dut.spi_miso)
+            # Set next TX byte here — after last fall, before rising edge.
+            # The rising edge is detected 3 cycles after we assert clk=1,
+            # so we have HALF ticks of margin.
+            if bit == 0 and next_tx_byte is not None:
+                ctx.set(dut.tx_byte, next_tx_byte)
+            ctx.set(dut.spi_clk, 1)                      # rising edge
+            await ctx.tick("capture").repeat(HALF)
+        return miso_byte
+
+    errors = []
+
+    async def testbench(ctx):
+        # ── Test 1: Single byte TX/RX ──────────────────────────────────────
+        ctx.set(dut.spi_cs_n, 0)
+        ctx.set(dut.spi_clk,  1)
+        ctx.set(dut.tx_byte, 0xA5)        # pre-load before CS fall is detected
+        await ctx.tick("capture").repeat(HALF)
+
+        miso = await spi_send_byte(ctx, 0x37)
+        await ctx.tick("capture").repeat(2)
+        rx = ctx.get(dut.rx_byte)
+
+        ctx.set(dut.spi_cs_n, 1)
+        await ctx.tick("capture").repeat(HALF)
+
+        if rx != 0x37:
+            errors.append(f"Test1 rx_byte: expected 0x37, got 0x{rx:02X}")
+        if miso != 0xA5:
+            errors.append(f"Test1 miso: expected 0xA5, got 0x{miso:02X}")
+        print(f"Test1 – MOSI→rx_byte: 0x{rx:02X}  MISO←tx_byte: 0x{miso:02X}")
+
+        await ctx.tick("capture").repeat(HALF)
+
+        # ── Test 2: Two-byte transaction; second byte picked up on re-arm ───
+        ctx.set(dut.spi_cs_n, 0)
+        ctx.set(dut.tx_byte, 0xBE)        # first response byte
+        await ctx.tick("capture").repeat(HALF)
+
+        # Pass next_tx_byte=0xEF so it's set after last falling edge of byte 0,
+        # i.e. before the rising edge on which the slave re-arms for byte 1.
+        miso0 = await spi_send_byte(ctx, 0x00, next_tx_byte=0xEF)
+        miso1 = await spi_send_byte(ctx, 0xFF)
+
+        await ctx.tick("capture").repeat(2)
+        rx1 = ctx.get(dut.rx_byte)
+
+        ctx.set(dut.spi_cs_n, 1)
+        await ctx.tick("capture").repeat(HALF)
+
+        if miso0 != 0xBE:
+            errors.append(f"Test2 miso0: expected 0xBE, got 0x{miso0:02X}")
+        if miso1 != 0xEF:
+            errors.append(f"Test2 miso1: expected 0xEF, got 0x{miso1:02X}")
+        if rx1 != 0xFF:
+            errors.append(f"Test2 rx1: expected 0xFF, got 0x{rx1:02X}")
+        print(f"Test2 – byte0 MISO: 0x{miso0:02X}  byte1 MISO: 0x{miso1:02X}  rx1: 0x{rx1:02X}")
+
+        await ctx.tick("capture").repeat(HALF)
+
+        # ── Test 3: MISO idles HIGH when CS deasserted ─────────────────────
+        miso_idle = ctx.get(dut.spi_miso)
+        if miso_idle != 1:
+            errors.append(f"Test3 MISO idle: expected 1, got {miso_idle}")
+        print(f"Test3 – MISO idle (CS=1): {miso_idle}")
+
+        # ── Test 4: All-zeros byte (0x00) TX and RX ────────────────────────
+        ctx.set(dut.spi_cs_n, 0)
+        ctx.set(dut.tx_byte, 0x00)
+        await ctx.tick("capture").repeat(HALF)
+
+        miso = await spi_send_byte(ctx, 0xFF)
+        await ctx.tick("capture").repeat(2)
+        rx = ctx.get(dut.rx_byte)
+        ctx.set(dut.spi_cs_n, 1)
+        await ctx.tick("capture").repeat(HALF)
+
+        if miso != 0x00:
+            errors.append(f"Test4 miso: expected 0x00, got 0x{miso:02X}")
+        if rx != 0xFF:
+            errors.append(f"Test4 rx: expected 0xFF, got 0x{rx:02X}")
+        print(f"Test4 – 0x00 TX / 0xFF RX: MISO=0x{miso:02X}  rx=0x{rx:02X}")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=54), domain="capture")
+    sim.add_testbench(testbench)
+
+    with sim.write_vcd("SPIMode3Slave.vcd"):
+        sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        raise SystemExit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/spram_arbiter.py b/exi_bba/spram_arbiter.py
new file mode 100644
index 0000000..f1ddd92
--- /dev/null
+++ b/exi_bba/spram_arbiter.py
@@ -0,0 +1,276 @@
+"""SPRAM arbiter — sync domain (24 MHz).
+
+Owns one iCE40UP5K SPRAM block (SB_SPRAM256KA, 16 K × 16-bit = 32 KB) and
+arbitrates between two clients:
+
+  Client A (EXI read) : prefetch pipeline; low priority.
+  Client B (ETH write): RXFrameAssembler; high priority.
+
+ETH writes win when both clients are active.  This is safe because the GC only
+reads pages that the ETH engine has already finished writing (ring-buffer
+invariant).
+
+SPRAM addressing
+-----------------
+SB_SPRAM256KA is 16 K × 16-bit (14-bit ADDRESS).  Byte addressing:
+  ADDRESS = byte_addr >> 1
+  MASKWREN[3:0]:
+    0b0011 → write lower byte  (byte_addr even)
+    0b1100 → write upper byte  (byte_addr odd)
+  Read: both bytes returned; pick the right one from DATAOUT based on addr bit 0.
+
+Read latency: 1 synchronous cycle — result of cycle N is valid at N+1.
+
+In simulation (platform is None) a behavioural `Memory` model is used instead of
+the SB_SPRAM256KA Instance so tests run without IceStorm.
+"""
+
+from amaranth import *
+from amaranth.lib.memory import Memory
+
+__all__ = ["SPRAMArbiter"]
+
+_SPRAM_WORDS = 65536   # 64 K 16-bit words = 128 KB
+
+
+class SPRAMArbiter(Elaboratable):
+    """Arbitrated SPRAM controller in the sync domain.
+
+    EXI read interface (from BBARegisterFile spram_req / spram_rsp FIFOs)
+    ----------------------------------------------------------------------
+    exi_req_addr  : 16-bit byte address to read
+    exi_req_valid : FIFO r_rdy — a request is waiting
+    exi_req_ready : FIFO r_en  — pop the request (asserted when serviced)
+    exi_rsp_data  : 8-bit result byte
+    exi_rsp_valid : FIFO w_en  — push result when valid
+
+    ETH write interface (from RXFrameAssembler)
+    -------------------------------------------
+    eth_wr_addr  : 16-bit byte address to write
+    eth_wr_data  : 8-bit byte value
+    eth_wr_valid : write request present
+    eth_wr_ready : write accepted this cycle
+    """
+
+    def __init__(self):
+        # EXI read interface
+        self.exi_req_addr  = Signal(16)
+        self.exi_req_valid = Signal()
+        self.exi_req_ready = Signal()
+        self.exi_rsp_data  = Signal(8)
+        self.exi_rsp_valid = Signal()
+
+        # ETH write interface
+        self.eth_wr_addr   = Signal(16)
+        self.eth_wr_data   = Signal(8)
+        self.eth_wr_valid  = Signal()
+        self.eth_wr_ready  = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ── SPRAM instantiation (hardware vs simulation) ──────────────────
+        spram_addr   = Signal(14)     # word address (byte_addr >> 1)
+        spram_din    = Signal(16)
+        spram_dout   = Signal(16)
+        spram_wren   = Signal()
+        spram_mask   = Signal(4)      # MASKWREN
+
+        if platform is None:
+            # Behavioural model: synchronous read with 1-cycle latency.
+            # Memory is a Component; read/write ports are obtained from it
+            # and wired via its submodule ports (not added as separate submodules).
+            mem = Memory(shape=16, depth=_SPRAM_WORDS, init=[])
+            m.submodules.mem = mem
+            mem_rd = mem.read_port(domain="sync", transparent_for=[])
+            mem_wr = mem.write_port(domain="sync", granularity=8)
+
+            # en[0] = lower byte enable, en[1] = upper byte enable
+            byte0_en = Signal()
+            byte1_en = Signal()
+            m.d.comb += [
+                byte0_en        .eq(spram_wren & (spram_mask[0] | spram_mask[1])),
+                byte1_en        .eq(spram_wren & (spram_mask[2] | spram_mask[3])),
+                mem_rd.addr     .eq(spram_addr),
+                mem_rd.en       .eq(1),
+                spram_dout      .eq(mem_rd.data),
+                mem_wr.addr     .eq(spram_addr),
+                mem_wr.data     .eq(spram_din),
+                mem_wr.en       .eq(Cat(byte0_en, byte1_en)),
+            ]
+        else:
+            # Hardware: instantiate a single SB_SPRAM256KA block (16K×16)
+            m.submodules.spram = Instance(
+                "SB_SPRAM256KA",
+                i_ADDRESS    = spram_addr,
+                i_DATAIN     = spram_din,
+                i_MASKWREN   = spram_mask,
+                i_WREN       = spram_wren,
+                i_CHIPSELECT = Const(1, 1),
+                i_CLOCK      = ClockSignal("sync"),
+                i_STANDBY    = Const(0, 1),
+                i_SLEEP      = Const(0, 1),
+                i_POWEROFF   = Const(1, 1),   # NOTE(review): POWEROFF is active-low on this primitive (1 = powered) — confirm
+                o_DATAOUT    = spram_dout,
+            )
+
+        # ── Arbiter pipeline ─────────────────────────────────────────────
+        # Stage 1: issue SPRAM address and control signals (combinatorial)
+        # Stage 2: capture SPRAM output into rsp_buf (synchronous, 1-cycle)
+
+        read_pending  = Signal()   # a read address was issued last cycle
+        read_was_odd  = Signal()   # byte address bit 0 of the pending read
+        rsp_buf       = Signal(8)  # registered response byte; valid when exi_rsp_valid
+
+        # Combinatorial defaults
+        m.d.comb += [
+            spram_wren        .eq(0),
+            spram_mask        .eq(0),
+            spram_din         .eq(0),
+            spram_addr        .eq(0),
+            self.exi_req_ready.eq(0),
+            self.eth_wr_ready .eq(0),
+            self.exi_rsp_data .eq(rsp_buf),  # always sourced from registered buffer
+        ]
+        # Registered defaults
+        m.d.sync += [
+            self.exi_rsp_valid.eq(0),
+            read_pending      .eq(0),
+        ]
+
+        # ETH write has priority
+        with m.If(self.eth_wr_valid):
+            m.d.comb += [
+                spram_addr       .eq(self.eth_wr_addr[1:]),   # 15-bit slice; top bit is dropped by the 14-bit spram_addr
+                spram_wren       .eq(1),
+                self.eth_wr_ready.eq(1),
+            ]
+            with m.If(self.eth_wr_addr[0]):
+                m.d.comb += [
+                    spram_din [8:16].eq(self.eth_wr_data),
+                    spram_mask      .eq(0b1100),
+                ]
+            with m.Else():
+                m.d.comb += [
+                    spram_din [0:8].eq(self.eth_wr_data),
+                    spram_mask     .eq(0b0011),
+                ]
+
+        # EXI read (lower priority)
+        with m.Elif(self.exi_req_valid):
+            m.d.comb += [
+                spram_addr        .eq(self.exi_req_addr[1:]),   # 15-bit slice; top bit is dropped by the 14-bit spram_addr
+                self.exi_req_ready.eq(1),
+            ]
+            m.d.sync += [
+                read_pending.eq(1),
+                read_was_odd.eq(self.exi_req_addr[0]),
+            ]
+
+        # Capture SPRAM output into registered buffer after 1-cycle latency
+        with m.If(read_pending):
+            with m.If(read_was_odd):
+                m.d.sync += rsp_buf.eq(spram_dout[8:16])
+            with m.Else():
+                m.d.sync += rsp_buf.eq(spram_dout[0:8])
+            m.d.sync += self.exi_rsp_valid.eq(1)   # overrides the registered default of 0 for one cycle
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    dut = SPRAMArbiter()   # platform is None in simulation, so the behavioural Memory model is used
+    errors = []
+
+    async def testbench(ctx):
+        await ctx.tick("sync").repeat(2)
+
+        # T1: ETH write to even byte address 0x0100, then EXI read it back
+        ctx.set(dut.eth_wr_addr,  0x0100)
+        ctx.set(dut.eth_wr_data,  0xAB)
+        ctx.set(dut.eth_wr_valid, 1)
+        await ctx.tick("sync").repeat(1)
+        accepted = ctx.get(dut.eth_wr_ready)   # combinational: high while eth_wr_valid is still asserted
+        if not accepted:
+            errors.append("T1 eth write not accepted")
+        ctx.set(dut.eth_wr_valid, 0)
+        await ctx.tick("sync").repeat(1)
+
+        # Issue EXI read of the same address
+        ctx.set(dut.exi_req_addr,  0x0100)
+        ctx.set(dut.exi_req_valid, 1)
+        await ctx.tick("sync").repeat(1)   # clock A: read issued, read_pending=1
+        ctx.set(dut.exi_req_valid, 0)
+        await ctx.tick("sync").repeat(1)   # clock B: SPRAM output captured, valid=1
+        # Check HERE — exi_rsp_valid is 1 for exactly this one cycle
+
+        rdata = ctx.get(dut.exi_rsp_data)
+        rvalid = ctx.get(dut.exi_rsp_valid)
+        if rdata != 0xAB:
+            errors.append(f"T1 read back: expected 0xAB, got 0x{rdata:02X}")
+        if not rvalid:
+            errors.append("T1 exi_rsp_valid not set")
+        print(f"T1 even addr read-back: data=0x{rdata:02X}  valid={rvalid}")
+
+        await ctx.tick("sync").repeat(2)
+
+        # T2: ETH write to ODD byte address 0x0101, read back
+        ctx.set(dut.eth_wr_addr,  0x0101)
+        ctx.set(dut.eth_wr_data,  0xCD)
+        ctx.set(dut.eth_wr_valid, 1)
+        await ctx.tick("sync").repeat(1)
+        ctx.set(dut.eth_wr_valid, 0)
+        await ctx.tick("sync").repeat(1)
+
+        ctx.set(dut.exi_req_addr,  0x0101)
+        ctx.set(dut.exi_req_valid, 1)
+        await ctx.tick("sync").repeat(1)   # read issued
+        ctx.set(dut.exi_req_valid, 0)
+        await ctx.tick("sync").repeat(1)   # upper byte captured into the response buffer
+
+        rdata = ctx.get(dut.exi_rsp_data)
+        if rdata != 0xCD:
+            errors.append(f"T2 odd addr read-back: expected 0xCD, got 0x{rdata:02X}")
+        print(f"T2 odd addr read-back: data=0x{rdata:02X}")
+
+        await ctx.tick("sync").repeat(2)
+
+        # T3: ETH write wins when both clients active simultaneously
+        # Write 0xEE to 0x0200
+        ctx.set(dut.eth_wr_addr,  0x0200)
+        ctx.set(dut.eth_wr_data,  0xEE)
+        ctx.set(dut.eth_wr_valid, 1)
+        ctx.set(dut.exi_req_addr,  0x0100)   # also wants to read
+        ctx.set(dut.exi_req_valid, 1)
+        await ctx.tick("sync").repeat(1)
+
+        eth_won = ctx.get(dut.eth_wr_ready)            # sampled while both requests are still asserted
+        exi_blocked = not ctx.get(dut.exi_req_ready)
+        ctx.set(dut.eth_wr_valid, 0)
+        ctx.set(dut.exi_req_valid, 0)
+
+        if not eth_won:
+            errors.append("T3 ETH priority: ETH write not accepted")
+        if not exi_blocked:
+            errors.append("T3 ETH priority: EXI read was not blocked")
+        print(f"T3 ETH priority: eth_won={eth_won} exi_blocked={exi_blocked}")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+
+    with sim.write_vcd("SPRAMArbiter.vcd"):
+        sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/status_panel.py b/exi_bba/status_panel.py
new file mode 100644
index 0000000..fdd66fa
--- /dev/null
+++ b/exi_bba/status_panel.py
@@ -0,0 +1,227 @@
+"""StatusPanel — 5-LED / 3-button bring-up panel (sync domain).
+
+A development/diagnostics front panel for the iCEbreaker LED+button PMOD.  It
+turns the device's internal liveness signals into something you can watch on a
+real GameCube during bring-up, and gives three buttons for manual control.
+
+LEDs (logical, active-high; set `led_active_low=True` if the board sinks current)
+    led[0]  heartbeat   — ~1–2 Hz blink: clock alive, bitstream loaded
+    led[1]  exi_active  — stretched `cs_active`: the GC is talking on EXI
+    led[2]  rx_act      — stretched `rx_pulse`: a packet arrived from the net
+    led[3]  tx_act      — stretched `tx_pulse`: a packet went out
+    led[4]  ready       — `ready` level (e.g. ethernet init complete)
+
+Buttons (raw pin level; `btn_active_low=True` for the usual pull-up wiring)
+    btn[0]  eth_rst  — while held, drive `eth_rst_n` low (reset the ethernet chip)
+    btn[1]  reinit   — on press, emit a one-cycle `reinit` pulse (force re-init)
+    btn[2]  freeze   — toggle: latch the rx/tx activity LEDs so a single one-shot
+                       blink sticks until you unfreeze (catch a lone packet)
+
+Single-cycle events (`rx_pulse`/`tx_pulse`) are stretched to ~`stretch_cycles`
+so the eye can see them; `cs_active` is a level that is re-triggered while high.
+Buttons are debounced (`debounce_cycles` stable samples) — same idea as
+`rebbarb/debouncer.py`, inlined here to keep this module self-contained.
+"""
+
+from amaranth import *
+
+__all__ = ["StatusPanel"]
+
+
+class StatusPanel(Elaboratable):
+    """Drives five status LEDs and conditions three push-buttons (sync domain)."""
+    def __init__(self, hb_bit=23, stretch_cycles=1_440_000,
+                 debounce_cycles=240_000, led_active_low=False,
+                 btn_active_low=True):
+        """hb_bit selects the heartbeat counter bit; it toggles every 2**hb_bit
+        cycles (24 MHz / 2**24 ≈ 1.4 Hz at 23).  stretch_cycles ≈ 60 ms at 24 MHz."""
+        self._hb_bit  = hb_bit
+        self._stretch = stretch_cycles
+        self._deb     = debounce_cycles
+        self._led_inv = led_active_low
+        self._btn_inv = btn_active_low
+
+        # Status inputs (sync domain)
+        self.cs_active = Signal()   # level: EXI transaction in progress
+        self.rx_pulse  = Signal()   # 1-cycle: frame received
+        self.tx_pulse  = Signal()   # 1-cycle: frame sent
+        self.ready     = Signal()   # level: ethernet ready
+
+        self.btn = Signal(3)        # raw button inputs (from pins), polarity per btn_active_low
+
+        # Outputs
+        self.led       = Signal(5)
+        self.eth_rst_n = Signal(init=1)   # btn0 held → 0
+        self.reinit    = Signal()         # btn1 press → 1-cycle pulse
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ── Heartbeat ────────────────────────────────────────────────────
+        hb = Signal(self._hb_bit + 1)
+        m.d.sync += hb.eq(hb + 1)
+        heartbeat = hb[self._hb_bit]
+
+        # ── Button conditioning (normalise polarity → debounce) ──────────
+        braw = Signal(3)
+        m.d.comb += braw.eq(self.btn ^ C(0b111 if self._btn_inv else 0, 3))
+        # NOTE(review): no 2-FF synchroniser here — confirm the caller provides one.
+        bdeb = Signal(3)
+        for i in range(3):
+            cnt = Signal(range(self._deb + 1), name=f"deb_cnt{i}")
+            with m.If(braw[i] == bdeb[i]):
+                m.d.sync += cnt.eq(0)              # pin agrees with state: restart count
+            with m.Else():
+                m.d.sync += cnt.eq(cnt + 1)        # pin differs: count the mismatch run
+                with m.If(cnt == self._deb - 1):
+                    m.d.sync += [bdeb[i].eq(braw[i]), cnt.eq(0)]
+
+        # btn0: hold → ethernet reset asserted (active-low output)
+        m.d.comb += self.eth_rst_n.eq(~bdeb[0])
+
+        # btn1: rising edge → reinit pulse
+        b1_prev = Signal()
+        m.d.sync += b1_prev.eq(bdeb[1])
+        m.d.comb += self.reinit.eq(bdeb[1] & ~b1_prev)
+
+        # btn2: rising edge toggles freeze
+        b2_prev = Signal()
+        freeze  = Signal()
+        m.d.sync += b2_prev.eq(bdeb[2])
+        with m.If(bdeb[2] & ~b2_prev):
+            m.d.sync += freeze.eq(~freeze)
+
+        # ── Activity stretchers (rx/tx), sticky while frozen ─────────────
+        def stretch(pulse, name):
+            cnt    = Signal(range(self._stretch + 1), name=f"{name}_cnt")
+            sticky = Signal(name=f"{name}_sticky")
+            with m.If(pulse):
+                m.d.sync += cnt.eq(self._stretch)
+                with m.If(freeze):
+                    m.d.sync += sticky.eq(1)       # latch a one-shot when frozen
+            with m.Elif(cnt != 0):
+                m.d.sync += cnt.eq(cnt - 1)
+            with m.If(~freeze):
+                m.d.sync += sticky.eq(0)           # clear sticky when unfrozen
+            return (cnt != 0) | sticky
+
+        rx_led = stretch(self.rx_pulse, "rx")
+        tx_led = stretch(self.tx_pulse, "tx")
+
+        # ── cs_active: level → stretched so brief transactions are visible ─
+        cs_cnt = Signal(range(self._stretch + 1))
+        with m.If(self.cs_active):
+            m.d.sync += cs_cnt.eq(self._stretch)
+        with m.Elif(cs_cnt != 0):
+            m.d.sync += cs_cnt.eq(cs_cnt - 1)
+        cs_led = cs_cnt != 0
+
+        leds = Cat(heartbeat, cs_led, rx_led, tx_led, self.ready)   # led[0]..led[4]
+        m.d.comb += self.led.eq(leds ^ C(0b11111 if self._led_inv else 0, 5))
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    # Tiny parameters so the timed behaviours are observable in a short sim.
+    dut = StatusPanel(hb_bit=3, stretch_cycles=8, debounce_cycles=3)
+    errors = []
+    # settle(): let n ticks of the sync domain elapse (default: one).
+    async def settle(ctx, n=1):
+        await ctx.tick("sync").repeat(n)
+
+    async def testbench(ctx):
+        ctx.set(dut.btn, 0b111)        # active-low idle (no press)
+        await settle(ctx, 4)
+
+        # T1: heartbeat toggles (bit 3 of the counter flips every 8 cycles)
+        h0 = ctx.get(dut.led) & 1
+        await settle(ctx, 8)
+        h1 = ctx.get(dut.led) & 1
+        if h0 == h1:
+            errors.append("T1 heartbeat did not toggle over 8 cycles")
+        print(f"T1 heartbeat toggled: {h0} -> {h1}")
+
+        # T2: rx pulse lights led[2] and it stretches, then clears
+        ctx.set(dut.rx_pulse, 1)
+        await settle(ctx, 1)
+        ctx.set(dut.rx_pulse, 0)
+        await settle(ctx, 1)
+        on = (ctx.get(dut.led) >> 2) & 1
+        if not on:
+            errors.append("T2 rx LED not lit after pulse")
+        await settle(ctx, 12)          # 12 > stretch_cycles (8): counter has run out
+        off = (ctx.get(dut.led) >> 2) & 1
+        if off:
+            errors.append("T2 rx LED did not clear after stretch")
+        print(f"T2 rx LED: on={on} then off={not off}")
+
+        # T3: ready level drives led[4]
+        ctx.set(dut.ready, 1)
+        await settle(ctx, 1)
+        if not ((ctx.get(dut.led) >> 4) & 1):
+            errors.append("T3 ready LED not lit")
+        ctx.set(dut.ready, 0)
+        print("T3 ready LED follows level")
+
+        # T4: btn0 held (active-low → drive 0) asserts eth_rst_n low after debounce
+        ctx.set(dut.btn, 0b110)        # btn0 pressed
+        await settle(ctx, 6)           # 6 > debounce_cycles (3)
+        if ctx.get(dut.eth_rst_n) != 0:
+            errors.append("T4 eth_rst_n not asserted while btn0 held")
+        ctx.set(dut.btn, 0b111)        # release
+        await settle(ctx, 6)
+        if ctx.get(dut.eth_rst_n) != 1:
+            errors.append("T4 eth_rst_n not released")
+        print("T4 btn0 → eth_rst_n hold/release ok")
+
+        # T5: btn1 press emits exactly one reinit pulse
+        pulses = 0
+        ctx.set(dut.btn, 0b101)        # btn1 pressed
+        for _ in range(10):
+            await settle(ctx, 1)
+            pulses += (ctx.get(dut.reinit) & 1)   # sample reinit after every tick
+        ctx.set(dut.btn, 0b111)
+        await settle(ctx, 6)
+        if pulses != 1:
+            errors.append(f"T5 reinit pulses: got {pulses}, want 1")
+        print(f"T5 btn1 → reinit pulses={pulses}")
+
+        # T6: freeze (btn2) makes a single rx pulse stick
+        ctx.set(dut.btn, 0b011)        # btn2 press → toggle freeze on
+        await settle(ctx, 6)
+        ctx.set(dut.btn, 0b111)
+        await settle(ctx, 2)
+        ctx.set(dut.rx_pulse, 1)       # one-shot while frozen
+        await settle(ctx, 1)
+        ctx.set(dut.rx_pulse, 0)
+        await settle(ctx, 20)          # well past stretch
+        stuck = (ctx.get(dut.led) >> 2) & 1
+        if not stuck:
+            errors.append("T6 frozen rx LED did not stick")
+        ctx.set(dut.btn, 0b011)        # toggle freeze off
+        await settle(ctx, 6)
+        ctx.set(dut.btn, 0b111)
+        await settle(ctx, 2)
+        cleared = ((ctx.get(dut.led) >> 2) & 1) == 0
+        if not cleared:
+            errors.append("T6 rx LED did not clear after unfreeze")
+        print(f"T6 freeze: stuck={stuck} cleared_after_unfreeze={cleared}")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+    sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/synth.py b/exi_bba/synth.py
new file mode 100644
index 0000000..8dbb171
--- /dev/null
+++ b/exi_bba/synth.py
@@ -0,0 +1,197 @@
+"""Synthesis script for BBATop → iCEbreaker (iCE40UP5K SG48).
+
+Run from workspace root:
+    python -m exi_bba.synth              # synthesize only
+    python -m exi_bba.synth --flash      # synthesize and flash
+
+This file re-declares IceBreakerPlatform inline so that importing
+rebbarb/rebbarb.py (which has a module-level platform.build() call) is avoided.
+"""
+
+import os
+import subprocess
+import sys
+
+from amaranth import *
+from amaranth.build import *
+from amaranth.vendor import LatticeICE40Platform
+
+from exi_bba.bba_top import BBATop
+
+
+# ── Platform definition ───────────────────────────────────────────────────
+# Pin assignments use the iCEbreaker PMOD connectors as placeholders.
+# Replace with actual SP1-interposer pin numbers once PCB is finalised.
+#
+# PMOD1A (J2): pins 4 2 47 45 / 3 48 46 44  (top/bottom)
+# PMOD1B (J3): pins 43 38 34 31 / 42 36 32 28
+# PMOD2  (J4): pins 27 25 21 19 / 26 23 20 18
+#
+# EXI   : CLK=4  MOSI=2  MISO=47  CS_N=45  INT_N=3   (PMOD1A)
+# W5100 : indirect parallel bus — 15 pins across PMOD1B + PMOD2.
+#   ADDR[1:0]=43 38   DATA[7:0]=34 31 42 36 32 28 27 25
+#   CS_N=21  RD_N=19  WR_N=26  INT_N=23  RST_N=20      (pin 18 free)
+# Board: tie the W5100's upper address lines A[14:2] to 0 (only A[1:0] wired);
+# DATA[7:0] is bidirectional (SB_IO tristate, single shared output-enable).
+
+class IceBreakerPlatform(LatticeICE40Platform):
+    """iCEbreaker (iCE40UP5K-SG48) platform carrying the EXI, W5100 and panel pins."""
+    device      = "iCE40UP5K"
+    package     = "SG48"
+    default_clk = "clk12"
+    # NOTE(review): clk12 is 12 MHz; other exi_bba modules quote 24 MHz — confirm PLL.
+    resources = [
+        Resource("clk12", 0,
+                 Pins("35", dir="i"),
+                 Clock(12e6),
+                 Attrs(GLOBAL=True, IO_STANDARD="SB_LVCMOS")),
+
+        # EXI interface (GC side, SPI Mode 3) — PMOD1A FPGA pins
+        Resource("exi", 0,
+                 Subsignal("clk",   Pins("4",  dir="i")),
+                 Subsignal("mosi",  Pins("2",  dir="i")),
+                 Subsignal("miso",  Pins("47", dir="o")),
+                 Subsignal("cs_n",  Pins("45", dir="i")),
+                 Subsignal("int_n", Pins("3",  dir="o")),
+                 Attrs(IO_STANDARD="SB_LVCMOS")),
+
+        # W5100 indirect parallel bus — PMOD1B + PMOD2 FPGA pins
+        Resource("w5100", 0,
+                 Subsignal("addr",  Pins("43 38", dir="o")),
+                 Subsignal("data",  Pins("34 31 42 36 32 28 27 25", dir="io")),
+                 Subsignal("cs_n",  Pins("21", dir="o")),
+                 Subsignal("rd_n",  Pins("19", dir="o")),
+                 Subsignal("wr_n",  Pins("26", dir="o")),
+                 Subsignal("int_n", Pins("23", dir="i")),
+                 Subsignal("rst_n", Pins("20", dir="o")),
+                 Attrs(IO_STANDARD="SB_LVCMOS")),
+
+        # Bring-up status panel → iCEbreaker ONBOARD parts (dedicated pins, off the
+        # PMODs, so they coexist with EXI + W5100).  LEDR/LEDG: active-low discrete
+        # LEDs; BTN_N: user button.  The onboard RGB LED (pins 39/40/41) needs an
+        # SB_RGBA_DRV on raw pads (board-specific) — future add-on for rx/tx/ready.
+        Resource("ledr", 0, Pins("11", dir="o"), Attrs(IO_STANDARD="SB_LVCMOS")),
+        Resource("ledg", 0, Pins("37", dir="o"), Attrs(IO_STANDARD="SB_LVCMOS")),
+        Resource("btn",  0, Pins("10", dir="i"), Attrs(IO_STANDARD="SB_LVCMOS")),
+    ]
+
+    connectors = []
+
+    def toolchain_program(self, products, name):
+        """Flash {name}.bin with iceprog ($ICEPROG overrides the executable)."""
+        iceprog = os.environ.get("ICEPROG", "iceprog")
+        with products.extract(f"{name}.bin") as bitstream_filename:
+            subprocess.check_call([iceprog, bitstream_filename])
+
+
+# ── BBATop with platform resource wiring ─────────────────────────────────
+
+class BBATopSynth(BBATop):
+    """BBATop with platform pin connections added in elaborate()."""
+
+    def elaborate(self, platform):
+        m = super().elaborate(platform)
+
+        if platform is not None:
+            exi   = platform.request("exi",   0)
+            w5100 = platform.request("w5100", 0)
+
+            m.d.comb += [
+                self.exi_clk  .eq(exi.clk.i),
+                self.exi_mosi .eq(exi.mosi.i),
+                self.exi_cs_n .eq(exi.cs_n.i),
+                exi.miso.o    .eq(self.exi_miso),
+                exi.int_n.o   .eq(self.int_n),
+
+                # W5100 parallel bus (DATA[7:0] bidirectional via SB_IO)
+                w5100.addr.o     .eq(self.w5100_addr),
+                w5100.data.o     .eq(self.w5100_data_o),
+                w5100.data.oe    .eq(self.w5100_data_oe),
+                self.w5100_data_i.eq(w5100.data.i),
+                w5100.cs_n.o     .eq(self.w5100_cs_n),
+                w5100.rd_n.o     .eq(self.w5100_rd_n),
+                w5100.wr_n.o     .eq(self.w5100_wr_n),
+                self.w5100_int_n .eq(w5100.int_n.i),
+                w5100.rst_n.o    .eq(self.w5100_rst_n),
+            ]
+
+            # ── Bring-up status panel → onboard LEDs / button ──────────────
+            # Two discrete LEDs answer the #1 bring-up question on a real GC:
+            #   LEDG = heartbeat (clock alive)   LEDR = EXI activity (GC talking)
+            # The one onboard button → panel btn[1] (manual re-init).
+            if self._status_panel:
+                ledr = platform.request("ledr", 0)
+                ledg = platform.request("ledg", 0)
+                btn  = platform.request("btn",  0)
+                led  = self.panel_led
+
+                m.d.comb += [
+                    ledg.o.eq(~led[0]),     # heartbeat    (active-low LED)
+                    ledr.o.eq(~led[1]),     # EXI activity (active-low LED)
+                    # btn[0]/[2] held released (active-low idle = 1)
+                    self.panel_btn.eq(Cat(C(1, 1), btn.i, C(1, 1))),
+                ]
+
+        return m
+
+
+# ── Entry point ───────────────────────────────────────────────────────────
+#
+# Seed sweep: nextpnr placement is stochastic.  With ~22% LC utilisation
+# routing dominates timing, so different seeds can vary fmax by ±20%.
+# Pass --seeds N to try seeds 1..N (default 1, i.e. seed 1 only).
+# The build directory is reused across seeds, so build/top.bin holds the last
+# seed built; with --flash the best seed is rebuilt (and flashed) at the end.
+
+if __name__ == "__main__":
+    do_flash  = "--flash"  in sys.argv
+    n_seeds   = next((int(sys.argv[i+1]) for i, a in enumerate(sys.argv)
+                      if a == "--seeds"), 1)
+
+    platform = IceBreakerPlatform()
+    print(f"Synthesizing BBATop for {platform.device}-{platform.package}  "
+          f"(do_program={do_flash}, seeds=1..{n_seeds})")
+
+    best_seed = 1
+    best_fmax = 0.0
+    for seed in range(1, n_seeds + 1):
+        print(f"\n{'='*60}")
+        print(f"  Seed {seed}/{n_seeds}")
+        print(f"{'='*60}")
+        opts = (f"--opt-timing --seed {seed} --timing-allow-fail")
+        try:
+            platform.build(BBATopSynth(status_panel=True), do_program=False,
+                           verbose=True, nextpnr_opts=opts)
+        except Exception as exc:
+            # nextpnr exits non-zero even with --timing-allow-fail on some
+            # versions; treat as non-fatal timing failure.
+            print(f"  [seed {seed}] build exception (timing?): {exc}")
+
+        # Parse fmax from nextpnr log in build/top.tim (if present)
+        import glob, re
+        tim_files = glob.glob("build/top.tim") + glob.glob("build/*.tim")
+        fmax_exi = 0.0
+        for tf in tim_files:
+            try:
+                with open(tf) as f:
+                    for line in f:
+                        m_ = re.search(
+                            r"Max frequency.*exi.*?:\s*([\d.]+)\s*MHz", line)
+                        if m_:
+                            fmax_exi = float(m_.group(1))
+            except OSError:
+                pass
+        print(f"  [seed {seed}] exi fmax extracted: {fmax_exi:.1f} MHz")
+        if fmax_exi > best_fmax:
+            best_fmax = fmax_exi
+            best_seed = seed
+
+    print(f"\nBest seed: {best_seed}  exi fmax: {best_fmax:.1f} MHz")
+
+    if do_flash:
+        print(f"\nFlashing with seed {best_seed}...")
+        opts = f"--opt-timing --seed {best_seed} --timing-allow-fail"
+        platform.build(BBATopSynth(status_panel=True), do_program=True,
+                       verbose=True, nextpnr_opts=opts)
+
+    print("Done.")
diff --git a/exi_bba/tx_frame_drain.py b/exi_bba/tx_frame_drain.py
new file mode 100644
index 0000000..5e6bc9e
--- /dev/null
+++ b/exi_bba/tx_frame_drain.py
@@ -0,0 +1,253 @@
+"""TX frame drain — sync domain (24 MHz).
+
+Drains the tx_bytes AsyncFIFO (written by BBARegisterFile in the exi domain),
+forwards each byte to W5500SPIMaster with SOF/EOF framing, then pulses tx_irq
+to notify the GC that the transmit is complete.
+
+Flow
+----
+1. Wait for tx_ctrl FIFO to have a length word (signals a complete frame queued).
+2. Pop the length from tx_ctrl FIFO.
+3. Assert tx_sof on first byte, tx_eof on last byte, consuming tx_bytes FIFO.
+4. When W5500SPIMaster accepts the final byte: pulse tx_irq.
+
+The tx_bytes AsyncFIFO (exi→sync, 8-bit, depth=16) and tx_ctrl FIFO (exi→sync,
+16-bit, depth=4) are instantiated in BBARegisterFile and their sync-domain read
+sides are exposed as ports wired here by BBATop.
+"""
+
+from amaranth import *
+
+__all__ = ["TXFrameDrain"]
+
+
+class TXFrameDrain(Elaboratable):
+    """Drains BBA TX FIFOs and forwards frames to W5500SPIMaster.
+
+    TX FIFO read interfaces (async FIFOs, sync-domain read side)
+    ---------------------------------------------------------------
+    tx_bytes_r_data  / tx_bytes_r_en / tx_bytes_r_rdy  : byte stream
+    tx_ctrl_r_data   / tx_ctrl_r_en  / tx_ctrl_r_rdy   : 16-bit frame length
+
+    W5500 streaming output (sync domain, to W5500SPIMaster)
+    -------------------------------------------------------
+    tx_data / tx_valid / tx_ready / tx_sof / tx_eof
+
+    CDC output (sync→exi, via PulseSynchronizer in BBATop)
+    -------------------------------------------------------
+    tx_irq : 1-cycle pulse when frame transmission is handed off to W5500SPIMaster
+    """
+
+    def __init__(self):
+        # tx_bytes FIFO read side
+        self.tx_bytes_r_data = Signal(8)
+        self.tx_bytes_r_en   = Signal()
+        self.tx_bytes_r_rdy  = Signal()
+
+        # tx_ctrl FIFO read side (frame length)
+        self.tx_ctrl_r_data  = Signal(16)
+        self.tx_ctrl_r_en    = Signal()
+        self.tx_ctrl_r_rdy   = Signal()
+
+        # W5500 streaming TX interface
+        self.tx_data  = Signal(8)
+        self.tx_valid = Signal()
+        self.tx_ready = Signal()
+        self.tx_sof   = Signal()
+        self.tx_eof   = Signal()
+
+        # TX done pulse → PulseSynchronizer
+        self.tx_irq = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        frame_len    = Signal(16)   # bytes still to LOAD from FIFO (incl. held one)
+        is_first     = Signal()     # next byte loaded is the first (SOF)
+        load_pending = Signal()     # 1-bit "more bytes to load" flag (replaces
+                                    # a 16-bit frame_len!=0 compare in the
+                                    # combinational FIFO read-enable path)
+
+        # ── Registered holding stage presented to W5500 ──────────────────
+        # All W5500-facing outputs are driven from these registers.  This
+        # breaks the long combinational path that previously ran from the
+        # tx_bytes FIFO read pointer, out through W5500 (tx_ready) and the
+        # is_first/eof logic, and back into the FIFO pointer increment — the
+        # sync-domain critical path.  The FIFO read-enable now depends only on
+        # the registered hold_valid and the FIFO's own r_rdy.
+        hold_data  = Signal(8)
+        hold_valid = Signal()
+        hold_sof   = Signal()
+        hold_eof   = Signal()
+
+        m.d.sync += self.tx_irq.eq(0)   # default
+
+        m.d.comb += [
+            self.tx_data .eq(hold_data),
+            self.tx_valid.eq(hold_valid),
+            self.tx_sof  .eq(hold_sof),
+            self.tx_eof  .eq(hold_eof),
+        ]
+
+        # W5500 took the currently-held byte this cycle
+        hold_consumed = Signal()
+        m.d.comb += hold_consumed.eq(hold_valid & self.tx_ready)
+
+        # FIFO read-enable defaults (combinational, no W5500 dependency)
+        m.d.comb += self.tx_bytes_r_en.eq(0)
+        m.d.comb += self.tx_ctrl_r_en .eq(0)
+
+        with m.FSM(domain="sync", name="tx_fsm"):
+
+            with m.State("IDLE"):
+                # Wait for a complete frame length in tx_ctrl FIFO
+                with m.If(self.tx_ctrl_r_rdy):
+                    m.d.comb += self.tx_ctrl_r_en.eq(1)
+                    m.d.sync += frame_len.eq(self.tx_ctrl_r_data)
+                    m.d.sync += is_first.eq(1)
+                    # A frame with length 0 has nothing to load.
+                    m.d.sync += load_pending.eq(self.tx_ctrl_r_data != 0)
+                    m.next = "DRAIN"
+
+            with m.State("DRAIN"):
+                # Load the next byte into the holding register only when it is
+                # empty.  Costs one idle sync cycle per byte, negligible
+                # against the W5500 SPI rate (~16 sync cycles/byte), and keeps
+                # tx_ready off the FIFO read-enable path entirely.
+                #
+                # The gate uses the registered 1-bit load_pending instead of a
+                # 16-bit (frame_len != 0) reduction, so the combinational path
+                # consume_r_gry → r_rdy → do_load → tx_bytes_r_en stays shallow.
+                do_load = Signal()
+                m.d.comb += do_load.eq(
+                    ~hold_valid & self.tx_bytes_r_rdy & load_pending
+                )
+                m.d.comb += self.tx_bytes_r_en.eq(do_load)
+
+                with m.If(hold_consumed):
+                    m.d.sync += hold_valid.eq(0)
+                    with m.If(hold_eof):
+                        m.d.sync += self.tx_irq.eq(1)
+                        m.next = "IDLE"
+
+                with m.If(do_load):
+                    m.d.sync += hold_data .eq(self.tx_bytes_r_data)
+                    m.d.sync += hold_valid.eq(1)
+                    m.d.sync += hold_sof  .eq(is_first)
+                    m.d.sync += hold_eof  .eq(frame_len == 1)
+                    m.d.sync += is_first  .eq(0)
+                    m.d.sync += frame_len .eq(frame_len - 1)
+                    # Last byte just loaded → stop further loads (registered).
+                    with m.If(frame_len == 1):
+                        m.d.sync += load_pending.eq(0)
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    dut = TXFrameDrain()
+    errors = []
+
+    async def _send_frame(ctx, frame):
+        """Drive one frame through the TXFrameDrain DUT.
+
+        Returns (received_bytes, seen_sof, seen_eof, saw_irq).
+
+        Key timing: tx_sof/tx_eof are combinatorial outputs that depend on
+        registered signals (is_first, frame_len) BEFORE they update.  We read
+        them BEFORE each tick to capture the correct values, then advance the
+        FIFO AFTER the tick.
+        """
+        ctx.set(dut.tx_ctrl_r_data,  len(frame))
+        ctx.set(dut.tx_ctrl_r_rdy,   1)
+        ctx.set(dut.tx_bytes_r_data,  frame[0])
+        ctx.set(dut.tx_bytes_r_rdy,   1)
+
+        # Tick 0: IDLE pops ctrl word (comb), FSM→DRAIN, frame_len registered
+        await ctx.tick("sync").repeat(1)
+        # Deassert ctrl FIFO so FSM doesn't re-pop when it returns to IDLE
+        ctx.set(dut.tx_ctrl_r_rdy, 0)
+
+        received = []
+        seen_sof = False
+        seen_eof = False
+        saw_irq  = False
+
+        for _ in range(len(frame) + 10):
+            # Read comb signals BEFORE the tick (is_first and frame_len still
+            # reflect pre-tick registered values, so sof/eof are correct)
+            if ctx.get(dut.tx_valid):
+                d   = ctx.get(dut.tx_data)
+                sof = ctx.get(dut.tx_sof)
+                eof = ctx.get(dut.tx_eof)
+                received.append(d)
+                seen_sof = seen_sof or sof
+                seen_eof = seen_eof or eof
+
+            await ctx.tick("sync").repeat(1)
+
+            if ctx.get(dut.tx_irq):
+                saw_irq = True
+                break
+
+            # Advance FIFO AFTER the tick: present next byte for next tick
+            if len(received) < len(frame):
+                ctx.set(dut.tx_bytes_r_data, frame[len(received)])
+            elif len(received) == len(frame):
+                ctx.set(dut.tx_bytes_r_rdy, 0)
+
+        return received, seen_sof, seen_eof, saw_irq
+
+    async def testbench(ctx):
+        await ctx.tick("sync").repeat(2)
+        ctx.set(dut.tx_ready, 1)
+
+        # ── T1: 4-byte frame ─────────────────────────────────────────────────
+        frame = [0xDE, 0xAD, 0xBE, 0xEF]
+        received, seen_sof, seen_eof, saw_irq = await _send_frame(ctx, frame)
+
+        print(f"T1 received={[hex(b) for b in received]}  sof={seen_sof} eof={seen_eof}  tx_irq={saw_irq}")
+
+        if received != frame:
+            errors.append(f"T1 bytes mismatch: got {received}, want {frame}")
+        if not seen_sof:
+            errors.append("T1: SOF never seen")
+        if not seen_eof:
+            errors.append("T1: EOF never seen")
+        if not saw_irq:
+            errors.append("T1: tx_irq never pulsed")
+
+        await ctx.tick("sync").repeat(4)
+
+        # ── T2: Single-byte frame — SOF and EOF on same byte ─────────────────
+        frame2 = [0x42]
+        received2, s2_sof, s2_eof, s2_irq = await _send_frame(ctx, frame2)
+
+        print(f"T2 byte=0x{received2[0] if received2 else 0:02X}  sof={s2_sof} eof={s2_eof}  tx_irq={s2_irq}")
+
+        if received2 != frame2:
+            errors.append(f"T2: bytes wrong, got {received2}")
+        if not (s2_sof and s2_eof):
+            errors.append("T2: SOF+EOF both must be set for 1-byte frame")
+        if not s2_irq:
+            errors.append("T2: tx_irq not seen for 1-byte frame")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+
+    with sim.write_vcd("TXFrameDrain.vcd"):
+        sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/w5100_parallel_master.py b/exi_bba/w5100_parallel_master.py
new file mode 100644
index 0000000..ca354bf
--- /dev/null
+++ b/exi_bba/w5100_parallel_master.py
@@ -0,0 +1,840 @@
+"""W5100 parallel-bus master — sync domain.
+
+A drop-in alternative to `W5500SPIMaster` that talks to a WIZnet **W5100** over
+its **indirect parallel bus** instead of SPI.  The external streaming interface
+(init_req/init_done/par, tx_*, rx_*) is identical, so BBATop wiring is unchanged;
+only the physical pins differ (a parallel bus instead of 4 SPI wires).
+
+Why parallel
+------------
+SPI serialises 8 bits per byte, so on this UP5K (whose W5500-operating logic
+closes only ~40 MHz) the SPI byte rate caps at ~12 Mbit/s.  A parallel bus moves
+a whole byte per access, so the same ~24 MHz sync logic clears the 27 Mbit/s EXI
+ceiling — the real hard limit — with margin.  See CLAUDE.md.
+
+W5100 indirect bus interface (IDM)
+----------------------------------
+Only two address lines A[1:0] are wired (the upper address lines are tied to 0
+on the board, so a power-up *direct*-mode access at A=00 still lands on MR):
+
+    A[1:0]   register
+    00       MR       (Mode Register — also reachable directly at power-up)
+    01       IDM_AR0  (indirect address, high byte)
+    10       IDM_AR1  (indirect address, low byte)
+    11       IDM_DR   (indirect data — accesses mem[IDM_AR]; auto-increments
+                       IDM_AR when MR.AI is set)
+
+So a register/buffer access is: write IDM_AR0/AR1 with the 16-bit address, then
+read/write IDM_DR.  With MR.AI=1 a multi-byte block is one address-set followed
+by a burst of IDM_DR accesses (the chip auto-increments) — used for SHAR and for
+streaming frame data.
+
+A bus cycle drives A + (for writes) D with /CS and /RD or /WR asserted for
+`strobe_cycles` sync clocks (≥ the W5100's ~80 ns access time at 24 MHz).
+
+Phase status
+------------
+Phase 1: bus access engine + transaction engine + init sequence, verified
+against a W5100 bus model.  Phases 2–3: TX/RX MACRAW (with socket-buffer ring
+wraparound); both paths are implemented in the main control FSM below.
+"""
+
+from amaranth import *
+
+__all__ = ["W5100ParallelMaster"]
+
+# ── W5100 register map (indirect 16-bit address space) ──────────────────────
+# Common registers.
+_MR   = 0x0000            # Mode register
+_SHAR0 = 0x0009           # Source MAC address, first of 6 bytes
+_IR   = 0x0015            # Interrupt register
+_IMR  = 0x0016            # Interrupt mask
+_RMSR = 0x001A            # RX memory size (2 bits per socket)
+_TMSR = 0x001B            # TX memory size
+
+# Socket 0 registers, written as 0x0400 + offset.
+_S0_MR     = 0x0400 + 0x00   # mode
+_S0_CR     = 0x0400 + 0x01   # command
+_S0_IR     = 0x0400 + 0x02   # interrupt
+_S0_SR     = 0x0400 + 0x03   # status
+_S0_TX_FSR = 0x0400 + 0x20   # TX free size (2 bytes)
+_S0_TX_RD  = 0x0400 + 0x22   # TX read pointer
+_S0_TX_WR  = 0x0400 + 0x24   # TX write pointer
+_S0_RX_RSR = 0x0400 + 0x26   # RX received size (2 bytes)
+_S0_RX_RD  = 0x0400 + 0x28   # RX read pointer
+
+# Socket 0 buffer windows (default 2 KB each) and their ring masks.
+_TX_BASE    = 0x4000
+_RX_BASE    = 0x6000
+_S0_TX_MASK = 2 * 1024 - 1
+_S0_RX_MASK = 2 * 1024 - 1
+
+# MR bit flags.
+_MR_RST = 1 << 7          # software reset
+_MR_AI  = 1 << 1          # address auto-increment (indirect mode)
+_MR_IND = 1 << 0          # indirect bus interface mode
+
+# Socket mode / command values.
+_S0_MR_MACRAW = 0x04
+_CR_OPEN = 0x01
+_CR_SEND = 0x20
+_CR_RECV = 0x40
+
+# Indirect-mode address selects, i.e. the value driven on A[1:0].
+_A_MR  = 0                # MR
+_A_AR0 = 1                # IDM_AR high byte
+_A_AR1 = 2                # IDM_AR low byte
+_A_DR  = 3                # IDM_DR (data)
+
+
+class W5100ParallelMaster(Elaboratable):
+    """W5100 master over the indirect parallel bus, sync clock domain.
+
+    Physical bus pins
+    -----------------
+    bus_addr   : A[1:0] output
+    bus_data_o : D[7:0] output value (drive when bus_data_oe=1)
+    bus_data_oe: data-bus output enable (1=FPGA drives D, 0=W5100 drives D)
+    bus_data_i : D[7:0] input value (sampled during reads)
+    cs_n / rd_n / wr_n : bus control (active low)
+    w5100_int_n : W5100 INT_N input (active low)
+    w5100_rst_n : W5100 hardware reset (active low); declared here but never
+                  driven by this module, so it stays at its init value of 1
+
+    Init / TX / RX interfaces are identical to W5500SPIMaster.
+
+    Control / streaming behaviour
+    -----------------------------
+    init_req  : sampled while the main FSM is idle; latches `par` and runs the
+                init sequence (MR reset, MR mode, SHAR, RMSR/TMSR, socket 0
+                MACRAW + OPEN, IMR).
+    init_done : pulses high for one sync cycle when the init sequence ends.
+    tx_*      : a frame starts when tx_valid & tx_sof are seen while idle.
+                Each byte (and tx_eof) is captured when its bus write is
+                issued; tx_ready pulses for one cycle when that write
+                completes.
+    rx_*      : rx_valid is held together with rx_data until rx_ready is
+                seen; rx_sof / rx_eof flag the first / last byte of a frame.
+
+    While idle the priority is init_req, then INT_N, then a TX frame start.
+
+    Parameters
+    ----------
+    strobe_cycles : /RD or /WR strobe width, in sync cycles.
+    reset_cycles  : sync cycles to wait after the MR software reset.
+    """
+
+    def __init__(self, strobe_cycles=3, reset_cycles=24000):
+        # /RD//WR strobe width in sync cycles (≥ W5100 access time).
+        self._strobe = strobe_cycles
+        # MR-reset settle wait; testbench overrides with a small value.
+        self._reset_cycles = reset_cycles
+
+        # Physical parallel bus
+        self.bus_addr    = Signal(2)
+        self.bus_data_o  = Signal(8)
+        self.bus_data_oe = Signal()
+        self.bus_data_i  = Signal(8)
+        self.cs_n        = Signal(init=1)
+        self.rd_n        = Signal(init=1)
+        self.wr_n        = Signal(init=1)
+        self.w5100_int_n = Signal(init=1)
+        self.w5100_rst_n = Signal(init=1)
+
+        # Init control
+        self.init_req  = Signal()
+        self.init_done = Signal()
+        self.par       = Signal(48)   # MAC address (PAR0..5 packed)
+
+        # TX stream
+        self.tx_data   = Signal(8)
+        self.tx_valid  = Signal()
+        self.tx_ready  = Signal()
+        self.tx_sof    = Signal()
+        self.tx_eof    = Signal()
+
+        # RX stream
+        self.rx_data   = Signal(8)
+        self.rx_valid  = Signal()
+        self.rx_ready  = Signal()
+        self.rx_sof    = Signal()
+        self.rx_eof    = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+        STROBE = self._strobe
+
+        # S0_IR bits are cleared by writing 1 to them.  RECV is acknowledged
+        # after a frame has been drained; the remaining socket-0 causes are
+        # acknowledged separately (see RX_RSR_CHK) so that none of them can
+        # hold INT_N low indefinitely.
+        S0_IR_RECV  = 0x04
+        S0_IR_OTHER = 0x1F & ~S0_IR_RECV   # SEND_OK | TIMEOUT | DISCON | CON
+
+        # ── Bus access engine: one indirect-bus read or write cycle ──────────
+        bus_go    = Signal()
+        bus_rw    = Signal()        # 1 = write, 0 = read
+        bus_a     = Signal(2)
+        bus_wdata = Signal(8)
+        bus_rdata = Signal(8)
+        bus_done  = Signal()
+        bus_ctr   = Signal(range(STROBE + 2))
+        rw_r      = Signal()        # direction of the cycle in flight
+
+        # registered physical outputs
+        a_o  = Signal(2)
+        d_o  = Signal(8)
+        d_oe = Signal()
+        cs_r = Signal(init=1)
+        rd_r = Signal(init=1)
+        wr_r = Signal(init=1)
+        m.d.comb += [
+            self.bus_addr   .eq(a_o),
+            self.bus_data_o .eq(d_o),
+            self.bus_data_oe.eq(d_oe),
+            self.cs_n       .eq(cs_r),
+            self.rd_n       .eq(rd_r),
+            self.wr_n       .eq(wr_r),
+        ]
+
+        # bus_done is a one-cycle pulse: defaults to 0, set only in FINISH.
+        m.d.sync += bus_done.eq(0)
+        with m.FSM(domain="sync", name="bus_fsm"):
+            with m.State("IDLE"):
+                # Bus released by default; a request overrides these below.
+                m.d.sync += [cs_r.eq(1), rd_r.eq(1), wr_r.eq(1), d_oe.eq(0)]
+                with m.If(bus_go):
+                    # Latch the request so the requester only has to hold it
+                    # for this single cycle.
+                    m.d.sync += [a_o.eq(bus_a), rw_r.eq(bus_rw),
+                                 cs_r.eq(0), bus_ctr.eq(0)]
+                    with m.If(bus_rw):
+                        m.d.sync += [d_o.eq(bus_wdata), d_oe.eq(1), wr_r.eq(0)]
+                    with m.Else():
+                        m.d.sync += rd_r.eq(0)
+                    m.next = "STROBE"
+            with m.State("STROBE"):
+                # Hold the strobe for STROBE cycles, then release it.
+                m.d.sync += bus_ctr.eq(bus_ctr + 1)
+                with m.If(bus_ctr == STROBE - 1):
+                    with m.If(~rw_r):
+                        m.d.sync += bus_rdata.eq(self.bus_data_i)   # sample read
+                    m.d.sync += [rd_r.eq(1), wr_r.eq(1)]
+                    m.next = "FINISH"
+            with m.State("FINISH"):
+                # /CS and the data drive stay asserted for this one extra
+                # cycle after the strobe has been released.
+                m.d.sync += [cs_r.eq(1), d_oe.eq(0), bus_done.eq(1)]
+                m.next = "IDLE"
+
+        # ── Transaction engine: address-set + payload over the bus engine ────
+        WBUF = 8
+        xfer_start  = Signal()
+        xfer_direct = Signal()      # 1 = single A=00 access (MR), addr ignored
+        xfer_addr   = Signal(16)
+        xfer_rw     = Signal()      # payload direction: 1=write, 0=read
+        xfer_len    = Signal(range(WBUF + 1))
+        xfer_stream = Signal()      # stream-write payload from s_*
+        xfer_sread  = Signal()      # stream-read payload to r_*
+        xfer_rcount = Signal(16)
+        xfer_done   = Signal()
+
+        wbuf = Array([Signal(8, name=f"wbuf{i}") for i in range(WBUF)])
+        rbuf = Array([Signal(8, name=f"rbuf{i}") for i in range(WBUF)])
+        s_count  = Signal(16)       # bytes streamed-written (advances pointers)
+        xfer_idx = Signal(range(WBUF + 1))
+        s_last_r = Signal()         # s_last captured with the byte in flight
+        r_idx    = Signal(16)       # bytes streamed-read so far
+
+        # Streaming payload interfaces.
+        s_data, s_valid, s_last, s_consume = Signal(8), Signal(), Signal(), Signal()
+        r_data, r_valid, r_first, r_last, r_ready = (
+            Signal(8), Signal(), Signal(), Signal(), Signal())
+        # TX stream source = external tx interface (Phase 2).
+        m.d.comb += [s_data.eq(self.tx_data), s_valid.eq(self.tx_valid),
+                     s_last.eq(self.tx_eof), self.tx_ready.eq(s_consume)]
+        # RX stream sink = external rx interface (Phase 3).
+        m.d.comb += [self.rx_data.eq(r_data), self.rx_valid.eq(r_valid),
+                     self.rx_sof.eq(r_first), self.rx_eof.eq(r_last),
+                     r_ready.eq(self.rx_ready)]
+
+        # Socket-buffer ring wraparound.  Unlike the W5500, the W5100's IDM
+        # address does NOT auto-wrap at the socket-buffer boundary — it just
+        # increments linearly into the next region.  So when a streamed access
+        # reaches `xfer_wend`, the engine re-sets IDM_AR back to `xfer_wbase`.
+        xfer_wrap  = Signal()
+        xfer_wbase = Signal(16)
+        xfer_wend  = Signal(16)
+        cur_addr   = Signal(16)     # local mirror of the chip's IDM_AR
+
+        # Defaults; the helpers / states below override them when active.
+        m.d.comb += [bus_go.eq(0), bus_rw.eq(0), bus_a.eq(0), bus_wdata.eq(0)]
+        m.d.comb += [s_consume.eq(0), r_valid.eq(0), r_data.eq(0),
+                     r_first.eq(0), r_last.eq(0)]
+        m.d.sync += xfer_done.eq(0)
+
+        def bus_write(a, data):
+            """Request one bus write of `data` at address select `a`."""
+            m.d.comb += [bus_go.eq(1), bus_rw.eq(1), bus_a.eq(a), bus_wdata.eq(data)]
+
+        def bus_read(a):
+            """Request one bus read at address select `a`."""
+            m.d.comb += [bus_go.eq(1), bus_rw.eq(0), bus_a.eq(a)]
+
+        def wrap_states(prefix, resume):
+            """Emit the 4 states that rewrite IDM_AR to `xfer_wbase`.
+
+            States are named `<prefix>_WRAP_HI[_W]` / `<prefix>_WRAP_LO[_W]`;
+            once both address bytes are written the FSM continues at `resume`.
+            """
+            with m.State(prefix + "_WRAP_HI"):
+                bus_write(_A_AR0, xfer_wbase[8:16])
+                m.next = prefix + "_WRAP_HI_W"
+            with m.State(prefix + "_WRAP_HI_W"):
+                with m.If(bus_done):
+                    m.next = prefix + "_WRAP_LO"
+            with m.State(prefix + "_WRAP_LO"):
+                bus_write(_A_AR1, xfer_wbase[0:8])
+                m.next = prefix + "_WRAP_LO_W"
+            with m.State(prefix + "_WRAP_LO_W"):
+                with m.If(bus_done):
+                    m.d.sync += cur_addr.eq(xfer_wbase)
+                    m.next = resume
+
+        with m.FSM(domain="sync", name="xfer_fsm"):
+            with m.State("IDLE"):
+                with m.If(xfer_start):
+                    m.d.sync += [xfer_idx.eq(0), s_count.eq(0), r_idx.eq(0),
+                                 cur_addr.eq(xfer_addr)]
+                    with m.If(xfer_direct):
+                        m.next = "DIRECT"
+                    with m.Else():
+                        m.next = "AR_HI"
+
+            # Direct MR write (A=00)
+            with m.State("DIRECT"):
+                bus_write(_A_MR, wbuf[0])
+                m.next = "DIRECT_W"
+            with m.State("DIRECT_W"):
+                with m.If(bus_done):
+                    m.next = "FINISH"
+
+            # Set indirect address IDM_AR (high then low)
+            with m.State("AR_HI"):
+                bus_write(_A_AR0, xfer_addr[8:16])
+                m.next = "AR_HI_W"
+            with m.State("AR_HI_W"):
+                with m.If(bus_done):
+                    m.next = "AR_LO"
+            with m.State("AR_LO"):
+                bus_write(_A_AR1, xfer_addr[0:8])
+                m.next = "AR_LO_W"
+            with m.State("AR_LO_W"):
+                with m.If(bus_done):
+                    # Payload kind: the stream flags win over xfer_rw.
+                    with m.If(xfer_stream):
+                        m.next = "SW_LOAD"
+                    with m.Elif(xfer_sread):
+                        m.next = "SR_LOAD"
+                    with m.Elif(xfer_rw):
+                        m.next = "WB_ISSUE"
+                    with m.Else():
+                        m.next = "RB_ISSUE"
+
+            # Fixed-length write from wbuf (IDM_DR burst, auto-increment)
+            with m.State("WB_ISSUE"):
+                bus_write(_A_DR, wbuf[xfer_idx])
+                m.next = "WB_WAIT"
+            with m.State("WB_WAIT"):
+                with m.If(bus_done):
+                    m.d.sync += xfer_idx.eq(xfer_idx + 1)
+                    with m.If(xfer_idx + 1 == xfer_len):
+                        m.next = "FINISH"
+                    with m.Else():
+                        m.next = "WB_ISSUE"
+
+            # Fixed-length read into rbuf (with ring wrap, for the length header)
+            with m.State("RB_ISSUE"):
+                with m.If(xfer_wrap & (cur_addr == xfer_wend)):
+                    m.next = "RB_WRAP_HI"
+                with m.Else():
+                    bus_read(_A_DR)
+                    m.next = "RB_WAIT"
+            with m.State("RB_WAIT"):
+                with m.If(bus_done):
+                    m.d.sync += rbuf[xfer_idx].eq(bus_rdata)
+                    m.d.sync += [xfer_idx.eq(xfer_idx + 1), cur_addr.eq(cur_addr + 1)]
+                    with m.If(xfer_idx + 1 == xfer_len):
+                        m.next = "FINISH"
+                    with m.Else():
+                        m.next = "RB_ISSUE"
+            wrap_states("RB", "RB_ISSUE")
+
+            # Stream-write payload from s_* until s_last (with ring wrap)
+            with m.State("SW_LOAD"):
+                with m.If(xfer_wrap & (cur_addr == xfer_wend)):
+                    m.next = "SW_WRAP_HI"
+                with m.Elif(s_valid):
+                    # Remember whether this byte is the last one; the source
+                    # is only told it was consumed after the write completes.
+                    bus_write(_A_DR, s_data)
+                    m.d.sync += s_last_r.eq(s_last)
+                    m.next = "SW_WAIT"
+            with m.State("SW_WAIT"):
+                with m.If(bus_done):
+                    m.d.comb += s_consume.eq(1)
+                    m.d.sync += [s_count.eq(s_count + 1), cur_addr.eq(cur_addr + 1)]
+                    with m.If(s_last_r):
+                        m.next = "FINISH"
+                    with m.Else():
+                        m.next = "SW_LOAD"
+            wrap_states("SW", "SW_LOAD")
+
+            # Stream-read payload to r_* for rcount bytes (with ring wrap)
+            with m.State("SR_LOAD"):
+                with m.If(r_idx == xfer_rcount):
+                    m.next = "FINISH"
+                with m.Elif(xfer_wrap & (cur_addr == xfer_wend)):
+                    m.next = "SR_WRAP_HI"
+                with m.Else():
+                    bus_read(_A_DR)
+                    m.next = "SR_WAIT"
+            with m.State("SR_WAIT"):
+                with m.If(bus_done):
+                    m.next = "SR_PUSH"
+            with m.State("SR_PUSH"):
+                # Present the byte until the sink accepts it.
+                m.d.comb += [r_data.eq(bus_rdata), r_valid.eq(1),
+                             r_first.eq(r_idx == 0),
+                             r_last.eq(r_idx + 1 == xfer_rcount)]
+                with m.If(r_ready):
+                    m.d.sync += [r_idx.eq(r_idx + 1), cur_addr.eq(cur_addr + 1)]
+                    m.next = "SR_LOAD"
+            wrap_states("SR", "SR_LOAD")
+
+            with m.State("FINISH"):
+                m.d.sync += xfer_done.eq(1)
+                m.next = "IDLE"
+
+        # ── Control regs ─────────────────────────────────────────────────────
+        mac_shadow = Array([Signal(8, name=f"mac{i}") for i in range(6)])
+        wait_ctr   = Signal(range(self._reset_cycles + 2))
+        tx_wr      = Signal(16)
+        rx_rsr     = Signal(16)
+        rx_rd      = Signal(16)
+        pkt_len    = Signal(16)
+
+        def write_reg(name, addr, payload, nxt, direct=False):
+            """Emit a 2-state block that writes `payload` (a list) to `addr`.
+
+            `payload` entries may be constants or 8-bit values; they are
+            copied into wbuf[0..].  State `name` launches the transfer and
+            `name + "_W"` waits for it, then continues at `nxt`.  With
+            `direct=True` the single byte goes straight to MR at A=00.
+            """
+            with m.State(name):
+                m.d.sync += [xfer_addr.eq(addr), xfer_rw.eq(1),
+                             xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0),
+                             xfer_direct.eq(1 if direct else 0),
+                             xfer_len.eq(len(payload))]
+                for i, b in enumerate(payload):
+                    m.d.sync += wbuf[i].eq(b)
+                m.d.sync += xfer_start.eq(1)
+                m.next = name + "_W"
+            with m.State(name + "_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.next = nxt
+
+        def read_be16(name, wait_name, addr, dest, nxt, ring=None):
+            """Emit a 2-state block that reads a big-endian 16-bit value.
+
+            State `name` launches a 2-byte read at `addr`; `wait_name` waits
+            for it, stores the result in `dest` and continues at `nxt`.
+            `ring=(base, end)` enables ring wrap for the read: when the
+            address reaches `end` it is reset to `base`.
+            """
+            with m.State(name):
+                m.d.sync += [xfer_addr.eq(addr), xfer_rw.eq(0),
+                             xfer_stream.eq(0), xfer_sread.eq(0),
+                             xfer_direct.eq(0), xfer_len.eq(2),
+                             xfer_wrap.eq(0 if ring is None else 1)]
+                if ring is not None:
+                    ring_base, ring_end = ring
+                    m.d.sync += [xfer_wbase.eq(ring_base), xfer_wend.eq(ring_end)]
+                m.d.sync += xfer_start.eq(1)
+                m.next = wait_name
+            with m.State(wait_name):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += dest.eq(Cat(rbuf[1], rbuf[0]))   # big-endian
+                    m.next = nxt
+
+        # ── Main control FSM: init, TX MACRAW, RX MACRAW ─────────────────────
+        with m.FSM(domain="sync", name="main_fsm"):
+            with m.State("IDLE"):
+                m.d.sync += self.init_done.eq(0)
+                with m.If(self.init_req):
+                    for i in range(6):
+                        m.d.sync += mac_shadow[i].eq(self.par[i*8:(i+1)*8])
+                    m.next = "MR_RST"
+                with m.Elif(~self.w5100_int_n):
+                    m.next = "RX_CHECK"
+                with m.Elif(self.tx_valid & self.tx_sof):
+                    m.next = "TX_START"
+
+            # MR = 0x80 software reset (direct A=00), then settle.
+            write_reg("MR_RST", _MR, [_MR_RST], "MR_WAIT", direct=True)
+            with m.State("MR_WAIT"):
+                with m.If(wait_ctr == self._reset_cycles):
+                    m.d.sync += wait_ctr.eq(0)
+                    m.next = "MR_MODE"
+                with m.Else():
+                    m.d.sync += wait_ctr.eq(wait_ctr + 1)
+
+            # MR = indirect + auto-increment (direct A=00).
+            write_reg("MR_MODE", _MR, [_MR_IND | _MR_AI], "SHAR", direct=True)
+
+            # SHAR = source MAC (6-byte auto-increment burst).
+            write_reg("SHAR", _SHAR0, [mac_shadow[i] for i in range(6)], "MEMSZ")
+
+            # RMSR/TMSR = 0x55 (2 KB per socket — default; socket 0 used).
+            write_reg("MEMSZ", _RMSR, [0x55, 0x55], "S0_MODE")  # RMSR then TMSR
+            # Socket 0: MACRAW mode, OPEN, enable interrupt.
+            write_reg("S0_MODE", _S0_MR, [_S0_MR_MACRAW], "S0_OPEN")
+            write_reg("S0_OPEN", _S0_CR, [_CR_OPEN], "S0_IMR")
+            write_reg("S0_IMR",  _IMR,  [0x01], "INIT_DONE")   # enable S0 IRQ
+
+            with m.State("INIT_DONE"):
+                m.d.sync += self.init_done.eq(1)
+                m.next = "IDLE"
+
+            # ── TX MACRAW ────────────────────────────────────────────────────
+            # read S0_TX_WR → stream frame into the TX buffer at that offset
+            # (ring-wrapping at the 2 KB boundary) → advance S0_TX_WR → SEND.
+            read_be16("TX_START", "TX_RDPTR_W", _S0_TX_WR, tx_wr, "TX_DATA")
+
+            with m.State("TX_DATA"):            # stream frame → TX buffer
+                m.d.sync += [xfer_addr.eq(_TX_BASE + (tx_wr & _S0_TX_MASK)),
+                             xfer_rw.eq(1), xfer_stream.eq(1), xfer_sread.eq(0),
+                             xfer_direct.eq(0), xfer_wrap.eq(1),
+                             xfer_wbase.eq(_TX_BASE),
+                             xfer_wend.eq(_TX_BASE + _S0_TX_MASK + 1)]
+                m.d.sync += xfer_start.eq(1)
+                m.next = "TX_DATA_W"
+            with m.State("TX_DATA_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += [xfer_stream.eq(0), xfer_wrap.eq(0),
+                                 tx_wr.eq(tx_wr + s_count)]   # advanced pointer
+                    m.next = "TX_UPDPTR"
+
+            # Write the advanced pointer back to S0_TX_WR (big-endian).
+            write_reg("TX_UPDPTR", _S0_TX_WR,
+                      [tx_wr[8:16], tx_wr[0:8]], "TX_SEND")
+
+            # S0_CR = SEND
+            write_reg("TX_SEND", _S0_CR, [_CR_SEND], "IDLE")
+
+            # ── RX MACRAW ────────────────────────────────────────────────────
+            # On W5100 INT: read RX_RSR; if non-zero read RX_RD, read the 2-byte
+            # MACRAW length, stream (length−2) frame bytes out (ring-wrapping),
+            # advance RX_RD by the length, issue RECV, clear the RECV interrupt.
+            read_be16("RX_CHECK", "RX_RSR_W", _S0_RX_RSR, rx_rsr, "RX_RSR_CHK")
+            with m.State("RX_RSR_CHK"):
+                with m.If(rx_rsr == 0):
+                    # INT_N was low but nothing was received, so some other
+                    # socket-0 cause (e.g. SEND_OK after TX_SEND) raised it.
+                    # Acknowledge those bits; otherwise INT_N stays low, IDLE
+                    # keeps branching to RX_CHECK and TX is starved forever.
+                    # RECV is deliberately left alone so a frame that arrives
+                    # after the RSR read is not lost.
+                    m.next = "RX_IR_ACK"
+                with m.Else():
+                    m.next = "RX_RDPTR"
+            write_reg("RX_IR_ACK", _S0_IR, [S0_IR_OTHER], "IDLE")
+
+            read_be16("RX_RDPTR", "RX_RDPTR_W", _S0_RX_RD, rx_rd, "RX_LEN")
+
+            # 2-byte MACRAW length header at the read pointer (may wrap).
+            read_be16("RX_LEN", "RX_LEN_W",
+                      _RX_BASE + (rx_rd & _S0_RX_MASK), pkt_len, "RX_FRAME",
+                      ring=(_RX_BASE, _RX_BASE + _S0_RX_MASK + 1))
+
+            with m.State("RX_FRAME"):           # stream (pkt_len−2) frame bytes
+                # NOTE(review): pkt_len < 2 makes pkt_len - 2 wrap around in
+                # the 16-bit xfer_rcount; no guard exists for a corrupt header.
+                m.d.sync += [xfer_addr.eq(_RX_BASE + ((rx_rd + 2) & _S0_RX_MASK)),
+                             xfer_rw.eq(0), xfer_stream.eq(0), xfer_sread.eq(1),
+                             xfer_direct.eq(0), xfer_rcount.eq(pkt_len - 2),
+                             xfer_wrap.eq(1), xfer_wbase.eq(_RX_BASE),
+                             xfer_wend.eq(_RX_BASE + _S0_RX_MASK + 1)]
+                m.d.sync += xfer_start.eq(1)
+                m.next = "RX_FRAME_W"
+            with m.State("RX_FRAME_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += [xfer_sread.eq(0), xfer_wrap.eq(0)]
+                    m.next = "RX_UPDRD"
+
+            # S0_RX_RD += pkt_len, written back big-endian.
+            write_reg("RX_UPDRD", _S0_RX_RD,
+                      [(rx_rd + pkt_len)[8:16], (rx_rd + pkt_len)[0:8]],
+                      "RX_RECV")
+
+            # S0_CR = RECV, then clear the RECV interrupt bit (S0_IR[2]).
+            write_reg("RX_RECV",   _S0_CR, [_CR_RECV],   "RX_CLR_IR")
+            write_reg("RX_CLR_IR", _S0_IR, [S0_IR_RECV], "IDLE")
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    # Device under test; reset_cycles=10 keeps the post-reset wait short.
+    dut = W5100ParallelMaster(strobe_cycles=3, reset_cycles=10)
+    errors = []
+
+    # MAC byte 0 sits in the least-significant byte of PAR.
+    MAC = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]
+    PAR = int.from_bytes(bytes(MAC), "little")
+
+    # Writes the model should capture during init, in order.  Indirect
+    # writes appear as (address, value); direct MR writes as ('MR', value).
+    EXPECTED = (
+        [("MR", _MR_RST), ("MR", _MR_IND | _MR_AI)]
+        + [(_SHAR0 + offset, octet) for offset, octet in enumerate(MAC)]
+        + [(_RMSR + offset, 0x55) for offset in range(2)]
+        + [(_S0_MR, _S0_MR_MACRAW), (_S0_CR, _CR_OPEN), (_IMR, 0x01)]
+    )
+
+    writes = []        # captured (addr-or-'MR', value) — IDM_DR + MR writes
+    model_mem = {}     # W5100 memory image (registers + TX/RX buffers)
+
+    async def w5100_model(ctx):
+        """Behavioural W5100 indirect-bus slave for the simulation.
+
+        Keeps local copies of MR and IDM_AR, appends every MR and IDM_DR
+        write to `writes`, mirrors IDM_DR writes into `model_mem`, and
+        answers reads by driving `dut.bus_data_i`.  A write is latched on the
+        sampled /WR rising edge while /CS is low; read data is driven while
+        /RD is low.
+        """
+        ar = 0                  # IDM_AR: 16-bit indirect address
+        mode = 0                # MR shadow
+        last_rd = last_wr = 1   # strobe levels at the previous sample
+
+        pins = ctx.tick("sync").sample(
+            dut.cs_n, dut.rd_n, dut.wr_n,
+            dut.bus_addr, dut.bus_data_o, dut.bus_data_oe)
+        async for sample in pins:
+            cs_n, rd_n, wr_n, sel, wdata, _oe = sample[-6:]
+            auto_inc = (mode >> 1) & 1      # MR.AI as of the start of this sample
+            selected = cs_n == 0
+
+            # Answer a read for as long as /RD is asserted.
+            if selected and rd_n == 0:
+                if sel == _A_MR:
+                    rdata = mode
+                elif sel == _A_AR0:
+                    rdata = (ar >> 8) & 0xFF
+                elif sel == _A_AR1:
+                    rdata = ar & 0xFF
+                else:
+                    rdata = model_mem.get(ar, 0)
+                ctx.set(dut.bus_data_i, rdata)
+
+            # /WR rising edge: commit the write.
+            if selected and last_wr == 0 and wr_n == 1:
+                if sel == _A_MR:
+                    mode = wdata
+                    writes.append(("MR", wdata))
+                elif sel == _A_AR0:
+                    ar = (wdata << 8) | (ar & 0x00FF)
+                elif sel == _A_AR1:
+                    ar = (ar & 0xFF00) | wdata
+                else:
+                    # IDM_DR write lands at the current indirect address.
+                    model_mem[ar] = wdata
+                    writes.append((ar, wdata))
+                    # A RECV command zeroes the received-size register pair.
+                    if (ar, wdata) == (_S0_CR, _CR_RECV):
+                        model_mem[_S0_RX_RSR] = 0
+                        model_mem[_S0_RX_RSR + 1] = 0
+                    if auto_inc:
+                        ar = (ar + 1) & 0xFFFF
+
+            # /RD rising edge on IDM_DR: auto-increment after a data read.
+            if selected and last_rd == 0 and rd_n == 1 and sel == _A_DR and auto_inc:
+                ar = (ar + 1) & 0xFFFF
+
+            last_rd, last_wr = rd_n, wr_n
+
+    async def testbench(ctx):
+        ctx.set(dut.par, PAR)
+        await ctx.tick("sync").repeat(2)
+
+        # T1: trigger init, wait for init_done.
+        ctx.set(dut.init_req, 1)
+        await ctx.tick("sync").repeat(1)
+        ctx.set(dut.init_req, 0)
+
+        done = False
+        for _ in range(4000):
+            await ctx.tick("sync").repeat(1)
+            if ctx.get(dut.init_done):
+                done = True
+                break
+        if not done:
+            errors.append("init_done never asserted")
+
+        print(f"T1 init captured {len(writes)} writes")
+        if writes != EXPECTED:
+            errors.append("init write sequence mismatch")
+            for i in range(max(len(writes), len(EXPECTED))):
+                g = writes[i] if i < len(writes) else None
+                e = EXPECTED[i] if i < len(EXPECTED) else None
+                mark = "" if g == e else "  <-- MISMATCH"
+                gs = f"({g[0]:#06x},{g[1]:#04x})" if g and isinstance(g[0], int) else str(g)
+                es = f"({e[0]:#06x},{e[1]:#04x})" if e and isinstance(e[0], int) else str(e)
+                print(f"  [{i:2}] got {gs:20} exp {es:20}{mark}")
+        else:
+            print("T1 init sequence matches expected (MR, SHAR, mem sizes, "
+                  "S0 MACRAW/OPEN, IMR)")
+
+        # ── helper: stream one TX frame through the external tx interface ─────
+        async def feed_frame(ctx, frame):
+            """Stream `frame` into the DUT's tx interface, one byte at a time.
+
+            Each byte is presented with tx_valid (tx_sof on the first byte,
+            tx_eof on the last) until tx_ready is observed, for at most 400
+            clocks.  On a timeout an error is recorded and the tx strobes are
+            released before returning, so a stuck byte cannot leave a phantom
+            frame start asserted for later tests.  After the last byte the
+            helper waits up to 200 clocks for the model to hold a SEND command
+            in S0_CR.
+            """
+            def release_tx():
+                # Deassert every tx strobe driven by this helper.
+                ctx.set(dut.tx_valid, 0)
+                ctx.set(dut.tx_sof, 0)
+                ctx.set(dut.tx_eof, 0)
+
+            for i, b in enumerate(frame):
+                ctx.set(dut.tx_data, b)
+                ctx.set(dut.tx_valid, 1)
+                ctx.set(dut.tx_sof, 1 if i == 0 else 0)
+                ctx.set(dut.tx_eof, 1 if i == len(frame) - 1 else 0)
+                for _ in range(400):
+                    await ctx.tick("sync").repeat(1)
+                    if ctx.get(dut.tx_ready):
+                        break
+                else:
+                    # Never consumed: report, release the interface, give up.
+                    errors.append(f"feed_frame: byte {i} never consumed")
+                    release_tx()
+                    return
+            release_tx()
+            # let TX_UPDPTR + SEND complete
+            for _ in range(200):
+                await ctx.tick("sync").repeat(1)
+                if model_mem.get(_S0_CR) == _CR_SEND:
+                    break
+
+        # ── T2: TX MACRAW frame (TX_WR=0, no wrap) ───────────────────────────
+        FRAME = [0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x10, 0x20]
+        await feed_frame(ctx, FRAME)
+
+        buf = [model_mem.get(_TX_BASE + i, None) for i in range(len(FRAME))]
+        if buf != FRAME:
+            errors.append(f"T2 TX buffer mismatch: {buf} != {FRAME}")
+        tx_wr_hi = model_mem.get(_S0_TX_WR, 0)
+        tx_wr_lo = model_mem.get(_S0_TX_WR + 1, 0)
+        adv = (tx_wr_hi << 8) | tx_wr_lo
+        if adv != len(FRAME):
+            errors.append(f"T2 S0_TX_WR advance: got {adv}, want {len(FRAME)}")
+        if model_mem.get(_S0_CR) != _CR_SEND:
+            errors.append("T2 SEND command not issued")
+        print(f"T2 TX: buffer={['0x%02X' % b for b in buf]}  "
+              f"TX_WR={adv}  SEND={model_mem.get(_S0_CR)==_CR_SEND}")
+
+        # ── T3: TX MACRAW with ring wraparound (TX_WR near 2 KB boundary) ─────
+        # Pre-load S0_TX_WR = 0x07FE so a 6-byte frame straddles the boundary:
+        # offsets 0x7FE,0x7FF then wraps to 0x000,0x001,0x002,0x003.
+        model_mem[_S0_TX_WR]     = 0x07
+        model_mem[_S0_TX_WR + 1] = 0xFE
+        model_mem[_S0_CR]        = 0x00     # clear so we can detect the new SEND
+        WFRAME = [0x41, 0x42, 0x43, 0x44, 0x45, 0x46]
+        await feed_frame(ctx, WFRAME)
+
+        # expected physical layout
+        exp = {
+            _TX_BASE + 0x7FE: WFRAME[0],
+            _TX_BASE + 0x7FF: WFRAME[1],
+            _TX_BASE + 0x000: WFRAME[2],
+            _TX_BASE + 0x001: WFRAME[3],
+            _TX_BASE + 0x002: WFRAME[4],
+            _TX_BASE + 0x003: WFRAME[5],
+        }
+        for addr, want in exp.items():
+            got = model_mem.get(addr)
+            if got != want:
+                errors.append(f"T3 wrap byte @0x{addr:04X}: got {got}, want 0x{want:02X}")
+        adv2 = (model_mem.get(_S0_TX_WR, 0) << 8) | model_mem.get(_S0_TX_WR + 1, 0)
+        want_wr = (0x07FE + len(WFRAME)) & 0xFFFF
+        if adv2 != want_wr:
+            errors.append(f"T3 wrap S0_TX_WR: got 0x{adv2:04X}, want 0x{want_wr:04X}")
+        ok = all(model_mem.get(a) == v for a, v in exp.items())
+        print(f"T3 TX wrap: bytes_placed_ok={ok}  TX_WR=0x{adv2:04X} (want 0x{want_wr:04X})")
+
+        # ── helper: drive an RX event and collect the streamed-out frame ─────
+        def load_rx(rx_rd_off, frame):
+            """Place a MACRAW packet [len_hi,len_lo,frame...] in the RX buffer at
+            offset rx_rd_off (ring), set RX_RSR/RX_RD, return the 16-bit length."""
+            plen = len(frame) + 2
+            payload = [(plen >> 8) & 0xFF, plen & 0xFF] + list(frame)
+            for i, b in enumerate(payload):
+                off = (rx_rd_off + i) & _S0_RX_MASK
+                model_mem[_RX_BASE + off] = b
+            model_mem[_S0_RX_RSR]     = (plen >> 8) & 0xFF
+            model_mem[_S0_RX_RSR + 1] = plen & 0xFF
+            model_mem[_S0_RX_RD]      = (rx_rd_off >> 8) & 0xFF
+            model_mem[_S0_RX_RD + 1]  = rx_rd_off & 0xFF
+            return plen
+
+        async def do_rx(ctx, rx_rd_off, frame):   # returns (bytes streamed out by the DUT, packet length)
+            plen = load_rx(rx_rd_off, frame)     # stage the packet in the model memory first
+            ctx.set(dut.rx_ready, 1)             # sink is always ready: no back-pressure here
+            collected = []
+            ctx.set(dut.w5100_int_n, 0)          # assert RX interrupt
+            for _ in range(1500):                # bounded wait; a timeout is silent, callers compare the frame
+                await ctx.tick("sync").repeat(1)
+                if ctx.get(dut.rx_valid) and ctx.get(dut.rx_ready):
+                    collected.append(ctx.get(dut.rx_data))   # one byte per valid/ready handshake
+                if model_mem.get(_S0_CR) == _CR_RECV:
+                    break                        # S0_CR in the model memory now holds RECV
+            ctx.set(dut.w5100_int_n, 1)          # deassert; let it finish + idle
+            for _ in range(300):                 # fixed settle time before the next test
+                await ctx.tick("sync").repeat(1)
+            ctx.set(dut.rx_ready, 0)
+            return collected, plen
+
+        # ── T4: RX MACRAW frame (RX_RD=0, no wrap) ───────────────────────────
+        model_mem[_S0_CR] = 0x00
+        RX_FRAME = [0xDE, 0xAD, 0xBE, 0xEF, 0x01, 0x02, 0x03]
+        got, plen = await do_rx(ctx, 0x0000, RX_FRAME)
+        if got != RX_FRAME:
+            errors.append(f"T4 RX frame mismatch: {['0x%02X'%b for b in got]} != "
+                          f"{['0x%02X'%b for b in RX_FRAME]}")
+        new_rd = (model_mem.get(_S0_RX_RD, 0) << 8) | model_mem.get(_S0_RX_RD + 1, 0)
+        if new_rd != plen:
+            errors.append(f"T4 RX_RD advance: got 0x{new_rd:04X}, want 0x{plen:04X}")
+        print(f"T4 RX: frame={['0x%02X'%b for b in got]}  RX_RD=0x{new_rd:04X}  "
+              f"RECV={model_mem.get(_S0_CR)==_CR_RECV}")
+
+        # ── T5: RX MACRAW with ring wraparound (RX_RD near 2 KB boundary) ─────
+        model_mem[_S0_CR] = 0x00
+        RX_FRAME2 = [0x51, 0x52, 0x53, 0x54, 0x55]
+        # rx_rd = 0x07FD: [len_hi@7FD][len_lo@7FE][f0@7FF][f1@000][f2@001]...
+        got2, plen2 = await do_rx(ctx, 0x07FD, RX_FRAME2)
+        if got2 != RX_FRAME2:
+            errors.append(f"T5 RX wrap frame mismatch: {['0x%02X'%b for b in got2]} != "
+                          f"{['0x%02X'%b for b in RX_FRAME2]}")
+        new_rd2 = (model_mem.get(_S0_RX_RD, 0) << 8) | model_mem.get(_S0_RX_RD + 1, 0)
+        want_rd2 = (0x07FD + plen2) & 0xFFFF
+        if new_rd2 != want_rd2:
+            errors.append(f"T5 RX wrap RX_RD: got 0x{new_rd2:04X}, want 0x{want_rd2:04X}")
+        print(f"T5 RX wrap: frame={['0x%02X'%b for b in got2]}  "
+              f"RX_RD=0x{new_rd2:04X} (want 0x{want_rd2:04X})")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+    sim.add_process(w5100_model)
+
+    sim.run()
+
+    if errors:
+        print("\nFAILURES:")
+        for e in errors:
+            print(" ", e)
+        sys.exit(1)
+    else:
+        print("\nAll tests passed.")
diff --git a/exi_bba/w5500_spi_master.py b/exi_bba/w5500_spi_master.py
new file mode 100644
index 0000000..092e158
--- /dev/null
+++ b/exi_bba/w5500_spi_master.py
@@ -0,0 +1,760 @@
+"""W5500 SPI master — sync domain (24 MHz).
+
+SPI Mode 0 (CPOL=0, CPHA=0): CLK idles LOW, data captured on rising edge.
+SCK = 12 MHz: the sync domain is 24 MHz and the bit engine toggles SCK via a
+clock-enable (sync ÷ 2).
+
+W5500 frame format
+------------------
+Byte 0–1  Address (16-bit big-endian)
+Byte 2    Control:  [7:3]=BSB  [2]=R/W  [1:0]=OM
+Byte 3+   Data
+
+BSB values used here:
+  0b00000  Common registers
+  0b00001  Socket 0 registers
+  0b00010  Socket 0 TX buffer
+  0b00011  Socket 0 RX buffer
+
+After NCRA reset the driver issues the W5500 init sequence (MR reset, SHAR,
+S0_MR MACRAW, S0_CR OPEN, S0_IMR).
+
+The module provides:
+  - A streaming TX interface (tx_data/tx_valid/tx_ready + sof/eof framing)
+  - A streaming RX interface (rx_data/rx_valid/rx_ready + sof/eof)
+  - init_req / init_done  for the NCRA-triggered init sequence
+  - MAC source address shadow input (par[0..5]) for SHAR programming
+"""
+
+from amaranth import *
+
+__all__ = ["W5500SPIMaster"]
+
+# W5500 register addresses.  The 16-bit address is the OFFSET WITHIN A BLOCK;
+# the block is selected by the BSB field of the control byte (see _CTRL_*),
+# NOT by the address.  So socket-0 registers use small offsets with BSB=1.
+_W5500_MR      = 0x0000   # Mode register            (common block)
+_W5500_SHAR    = 0x0009   # Source MAC, 6 bytes       (common block)
+_W5500_S0_MR   = 0x0000   # Socket 0 Mode             (socket-0 block)
+_W5500_S0_CR   = 0x0001   # Socket 0 Command          (OPEN/SEND/RECV written here)
+_W5500_S0_IR   = 0x0002   # Socket 0 Interrupt        (driver writes 0x04 after each RX)
+_W5500_S0_RXBUF_SIZE = 0x001E  # Socket 0 RX buffer size (not referenced by W5500SPIMaster)
+_W5500_S0_TXBUF_SIZE = 0x001F  # Socket 0 TX buffer size (not referenced by W5500SPIMaster)
+_W5500_S0_TX_FSR = 0x0020  # Socket 0 TX Free Size (2 bytes; not referenced by W5500SPIMaster)
+_W5500_S0_TX_WR  = 0x0024  # Socket 0 TX Write Pointer (2 bytes, big-endian)
+_W5500_S0_RX_RSR = 0x0026  # Socket 0 RX Received Size (2 bytes)
+_W5500_S0_RX_RD  = 0x0028  # Socket 0 RX Read Pointer (2 bytes, big-endian)
+_W5500_S0_IMR  = 0x002C   # Socket 0 Interrupt Mask
+
+# Control byte = (BSB << 3) | (RWB << 2) | OM.
+# RWB: 1=write 0=read.  OM=00 → Variable Data Mode (CS frames the length).
+# BSB: 0=common, 1=socket0 reg, 2=socket0 TX buffer, 3=socket0 RX buffer.
+_CTRL_WR_COMMON = (0 << 3) | (1 << 2)   # 0x04
+_CTRL_WR_S0REG  = (1 << 3) | (1 << 2)   # 0x0C
+_CTRL_RD_S0REG  = (1 << 3) | (0 << 2)   # 0x08
+_CTRL_WR_S0TX   = (2 << 3) | (1 << 2)   # 0x14
+_CTRL_RD_S0RX   = (3 << 3) | (0 << 2)   # 0x18
+
+
+class W5500SPIMaster(Elaboratable):
+    """W5500 SPI master in the sync clock domain.
+
+    Physical SPI pins
+    -----------------
+    spi_clk / spi_mosi / spi_miso / spi_cs_n : to W5500
+    w5500_int_n                               : W5500 INT_N input (active low)
+    w5500_rst_n                               : W5500 hardware reset (active low)
+
+    Init interface (from BBARegisterFile / BBATop)
+    ----------------------------------------------
+    init_req  : pulse to trigger the W5500 init sequence
+    init_done : pulse when init sequence completes
+    par       : 6-byte MAC address (sampled at init_req)
+
+    TX streaming interface (from TXFrameDrain, sync domain)
+    -------------------------------------------------------
+    tx_data / tx_valid / tx_ready : byte stream
+    tx_sof / tx_eof               : frame delimiters on the same cycle as tx_valid
+
+    RX streaming interface (to RXFrameAssembler, sync domain)
+    ----------------------------------------------------------
+    rx_data / rx_valid / rx_ready : byte stream
+    rx_sof / rx_eof               : frame delimiters
+    """
+
+    def __init__(self, clk_div=1, reset_cycles=24000):
+        # MR-reset settle wait (in sync cycles).  ~1 ms; the testbench
+        # overrides with a small value for fast simulation.
+        self._reset_cycles = reset_cycles
+
+        # SPI SCK = sync_clock / (2 * clk_div).  clk_div=1 → full rate (SCK =
+        # sync/2): at the 24 MHz slow domain that is 12 MHz SCK (~12 Mbit/s),
+        # which comfortably exceeds real-world GC BBA TCP throughput.  The W5500
+        # tolerates up to 80 MHz SCK, so the divider exists only as a safety
+        # knob for board-level signal-integrity issues, not a functional need.
+        self._clk_div = clk_div
+
+        # Physical SPI
+        self.spi_clk   = Signal()
+        self.spi_mosi  = Signal()
+        self.spi_miso  = Signal()
+        self.spi_cs_n  = Signal(init=1)
+        self.w5500_int_n = Signal(init=1)
+        self.w5500_rst_n = Signal(init=1)
+
+        # Init control
+        self.init_req  = Signal()
+        self.init_done = Signal()
+        self.par       = Signal(48)   # MAC address (PAR0..5 packed)
+
+        # TX stream
+        self.tx_data   = Signal(8)
+        self.tx_valid  = Signal()
+        self.tx_ready  = Signal()
+        self.tx_sof    = Signal()
+        self.tx_eof    = Signal()
+
+        # RX stream
+        self.rx_data   = Signal(8)
+        self.rx_valid  = Signal()
+        self.rx_ready  = Signal()
+        self.rx_sof    = Signal()
+        self.rx_eof    = Signal()
+
+    def elaborate(self, platform):
+        m = Module()
+
+        # ── SPI clock enable ─────────────────────────────────────────────
+        # clk_en high every `clk_div` sync cycles.  The bit engine toggles SCK
+        # on each enabled cycle, so SCK = sync / (2 * clk_div).
+        clk_en = Signal()
+        if self._clk_div <= 1:
+            m.d.comb += clk_en.eq(1)              # full rate: SCK = sync/2
+        else:
+            div_ctr = Signal(range(self._clk_div))
+            with m.If(div_ctr == self._clk_div - 1):
+                m.d.sync += div_ctr.eq(0)
+            with m.Else():
+                m.d.sync += div_ctr.eq(div_ctr + 1)
+            m.d.comb += clk_en.eq(div_ctr == self._clk_div - 1)
+
+        # ── SPI pin registers (Mode 0: SCK idles LOW) ────────────────────
+        sck_r     = Signal()
+        cs_r      = Signal(init=1)
+        shift_out = Signal(8)
+        shift_in  = Signal(8)
+        m.d.comb += self.spi_clk .eq(sck_r)
+        m.d.comb += self.spi_cs_n.eq(cs_r)
+        m.d.comb += self.spi_mosi.eq(shift_out[7])   # MSB first; valid pre-rising
+
+        # ── Byte-transfer engine (Mode 0) ────────────────────────────────
+        # On byte_start, shift out byte_tx MSB-first (8 SCK cycles) and capture
+        # MISO into byte_rx; pulse byte_done.  CS is owned by the xfer engine.
+        byte_start = Signal()
+        byte_tx    = Signal(8)
+        byte_rx    = Signal(8)
+        byte_done  = Signal()
+        bit_ctr    = Signal(4)
+
+        m.d.sync += byte_done.eq(0)
+        with m.FSM(domain="sync", name="byte_fsm"):
+            with m.State("IDLE"):
+                m.d.sync += sck_r.eq(0)
+                with m.If(byte_start):
+                    m.d.sync += shift_out.eq(byte_tx)
+                    m.d.sync += bit_ctr.eq(0)
+                    m.next = "RUN"
+            with m.State("RUN"):
+                with m.If(clk_en):
+                    with m.If(~sck_r):
+                        # rising edge: slave samples MOSI, master samples MISO
+                        m.d.sync += sck_r.eq(1)
+                        m.d.sync += shift_in.eq(Cat(self.spi_miso, shift_in[:-1]))
+                    with m.Else():
+                        # falling edge: advance / finish
+                        m.d.sync += sck_r.eq(0)
+                        with m.If(bit_ctr == 7):
+                            m.d.sync += byte_rx.eq(shift_in)
+                            m.d.sync += byte_done.eq(1)
+                            m.next = "IDLE"
+                        with m.Else():
+                            m.d.sync += shift_out.eq(Cat(0, shift_out[:-1]))
+                            m.d.sync += bit_ctr.eq(bit_ctr + 1)
+
+        # ── Generic register transaction engine (Variable Data Mode) ─────
+        # One CS-low frame: 3 header bytes (addr_hi, addr_lo, ctrl) then
+        # xfer_len payload bytes.  Writes source payload from wbuf; reads
+        # capture MISO into rbuf.
+        WBUF = 8
+        xfer_start = Signal()
+        xfer_addr  = Signal(16)
+        xfer_ctrl  = Signal(8)
+        xfer_len   = Signal(range(WBUF + 1))
+        xfer_done  = Signal()
+        wbuf = Array([Signal(8, name=f"wbuf{i}") for i in range(WBUF)])
+        rbuf = Array([Signal(8, name=f"rbuf{i}") for i in range(WBUF)])
+        xfer_idx = Signal(range(WBUF + 3))
+
+        # Stream-write mode: after the 3-byte header, payload bytes are pulled
+        # from (s_data, s_valid, s_last) instead of wbuf, until s_last.  Used to
+        # forward a frame straight into the W5500 TX buffer.  s_consume pulses
+        # as each streamed byte is accepted; s_count tracks the byte count.
+        xfer_stream = Signal()
+        s_data    = Signal(8)
+        s_valid   = Signal()
+        s_last    = Signal()
+        s_consume = Signal()
+        s_count   = Signal(16)
+        s_last_r  = Signal()      # latched s_last for the in-flight byte
+
+        # Stream-read mode: after the header, read `xfer_rcount` payload bytes
+        # (sending 0x00 dummies) and push each out via (r_data, r_valid,
+        # r_first, r_last) with r_ready back-pressure.  Used to pull a frame
+        # out of the W5500 RX buffer into RXFrameAssembler.
+        xfer_sread = Signal()
+        xfer_rcount = Signal(16)
+        r_data    = Signal(8)
+        r_valid   = Signal()
+        r_first   = Signal()
+        r_last    = Signal()
+        r_ready   = Signal()
+        r_idx     = Signal(16)
+
+        x_byte = Signal(8)
+        with m.If(xfer_idx == 0):
+            m.d.comb += x_byte.eq(xfer_addr[8:16])
+        with m.Elif(xfer_idx == 1):
+            m.d.comb += x_byte.eq(xfer_addr[0:8])
+        with m.Elif(xfer_idx == 2):
+            m.d.comb += x_byte.eq(xfer_ctrl)
+        with m.Else():
+            m.d.comb += x_byte.eq(wbuf[xfer_idx - 3])
+
+        m.d.comb += byte_start.eq(0)
+        m.d.comb += byte_tx.eq(0)
+        m.d.comb += s_consume.eq(0)
+        m.d.comb += r_valid.eq(0)
+        m.d.comb += r_data.eq(0)
+        m.d.comb += r_first.eq(0)
+        m.d.comb += r_last.eq(0)
+
+        m.d.sync += xfer_done.eq(0)
+        with m.FSM(domain="sync", name="xfer_fsm"):
+            with m.State("IDLE"):
+                with m.If(xfer_start):
+                    m.d.sync += cs_r.eq(0)          # assert CS for the frame
+                    m.d.sync += xfer_idx.eq(0)
+                    m.d.sync += s_count.eq(0)
+                    m.d.sync += r_idx.eq(0)
+                    m.next = "LOAD"
+            with m.State("LOAD"):
+                m.d.comb += byte_tx.eq(x_byte)
+                m.d.comb += byte_start.eq(1)
+                m.next = "WAIT"
+            with m.State("WAIT"):
+                with m.If(byte_done):
+                    with m.If(xfer_idx >= 3):
+                        m.d.sync += rbuf[xfer_idx - 3].eq(byte_rx)
+                    with m.If((xfer_idx == 2) & xfer_stream):
+                        m.next = "SLOAD"            # stream the payload (write)
+                    with m.Elif((xfer_idx == 2) & xfer_sread):
+                        m.next = "RLOAD"            # stream the payload (read)
+                    with m.Elif(~xfer_stream & ~xfer_sread
+                                & (xfer_idx == (xfer_len + 2))):
+                        m.next = "FINISH"           # 3 header + len − 1
+                    with m.Else():
+                        m.d.sync += xfer_idx.eq(xfer_idx + 1)
+                        m.next = "LOAD"
+
+            # ── Streamed-payload sub-loop (TX buffer write) ──────────────
+            with m.State("SLOAD"):
+                with m.If(s_valid):
+                    m.d.comb += byte_tx.eq(s_data)
+                    m.d.comb += byte_start.eq(1)
+                    m.d.sync += s_last_r.eq(s_last)
+                    m.next = "SWAIT"
+            with m.State("SWAIT"):
+                with m.If(byte_done):
+                    m.d.comb += s_consume.eq(1)     # accept this frame byte
+                    m.d.sync += s_count.eq(s_count + 1)
+                    with m.If(s_last_r):
+                        m.next = "FINISH"
+                    with m.Else():
+                        m.next = "SLOAD"
+
+            # ── Streamed-payload sub-loop (RX buffer read) ───────────────
+            with m.State("RLOAD"):
+                with m.If(r_idx == xfer_rcount):
+                    m.next = "FINISH"
+                with m.Else():
+                    m.d.comb += byte_tx.eq(0)       # dummy MOSI during read
+                    m.d.comb += byte_start.eq(1)
+                    m.next = "RWAIT"
+            with m.State("RWAIT"):
+                with m.If(byte_done):
+                    m.next = "RPUSH"
+            with m.State("RPUSH"):
+                m.d.comb += r_data .eq(byte_rx)
+                m.d.comb += r_valid.eq(1)
+                m.d.comb += r_first.eq(r_idx == 0)
+                m.d.comb += r_last .eq(r_idx == (xfer_rcount - 1))
+                with m.If(r_ready):
+                    m.d.sync += r_idx.eq(r_idx + 1)
+                    m.next = "RLOAD"
+
+            with m.State("FINISH"):
+                m.d.sync += cs_r.eq(1)             # deassert CS
+                m.d.sync += xfer_done.eq(1)
+                m.next = "IDLE"
+
+        # Saved MAC for SHAR programming; current W5500 TX write pointer.
+        mac_shadow = Array([Signal(8, name=f"mac{i}") for i in range(6)])
+        wait_ctr   = Signal(range(self._reset_cycles + 2))
+        tx_wr      = Signal(16)
+        rx_rsr     = Signal(16)   # RX received size
+        rx_rd      = Signal(16)   # RX read pointer
+        pkt_len    = Signal(16)   # MACRAW packet length (incl. 2-byte header)
+
+        # Frame stream from TXFrameDrain feeds the xfer engine's stream port.
+        # tx_ready pulses (= s_consume) as each frame byte is taken into the
+        # TX-buffer write transaction.
+        m.d.comb += [
+            s_data .eq(self.tx_data),
+            s_valid.eq(self.tx_valid),
+            s_last .eq(self.tx_eof),
+            self.tx_ready.eq(s_consume),
+        ]
+        # RX buffer read stream → RXFrameAssembler.
+        m.d.comb += [
+            self.rx_data .eq(r_data),
+            self.rx_valid.eq(r_valid),
+            self.rx_sof  .eq(r_first),
+            self.rx_eof  .eq(r_last),
+            r_ready      .eq(self.rx_ready),
+        ]
+
+        # Helper: a setup state that programs one register-write transaction
+        # then waits for it to complete and jumps to `nxt`.
+        def write_reg(name, addr, ctrl, payload, nxt):
+            with m.State(name):
+                m.d.sync += xfer_addr.eq(addr)
+                m.d.sync += xfer_ctrl.eq(ctrl)
+                m.d.sync += xfer_len.eq(len(payload))
+                m.d.sync += xfer_stream.eq(0)
+                m.d.sync += xfer_sread.eq(0)
+                for i, b in enumerate(payload):
+                    m.d.sync += wbuf[i].eq(b)
+                m.d.sync += xfer_start.eq(1)
+                m.next = name + "_W"
+            with m.State(name + "_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.next = nxt
+
+        # ── Main control FSM ─────────────────────────────────────────────
+        with m.FSM(domain="sync", name="main_fsm"):
+
+            with m.State("IDLE"):
+                m.d.sync += self.init_done.eq(0)
+                with m.If(self.init_req):
+                    for i in range(6):
+                        m.d.sync += mac_shadow[i].eq(self.par[i*8:(i+1)*8])
+                    m.next = "MR_RST"
+                with m.Elif(~self.w5500_int_n):
+                    m.next = "RX_CHECK"
+                with m.Elif(self.tx_valid & self.tx_sof):
+                    m.next = "TX_START"
+
+            # Step 1: MR = 0x80 (software reset), then settle ~1 ms.
+            write_reg("MR_RST", _W5500_MR, _CTRL_WR_COMMON, [0x80], "MR_WAIT")
+            with m.State("MR_WAIT"):
+                with m.If(wait_ctr == self._reset_cycles):
+                    m.d.sync += wait_ctr.eq(0)
+                    m.next = "SHAR"
+                with m.Else():
+                    m.d.sync += wait_ctr.eq(wait_ctr + 1)
+
+            # Step 2: SHAR = source MAC (6 bytes from PAR0–5).
+            with m.State("SHAR"):
+                m.d.sync += xfer_addr.eq(_W5500_SHAR)
+                m.d.sync += xfer_ctrl.eq(_CTRL_WR_COMMON)
+                m.d.sync += xfer_len.eq(6)
+                for i in range(6):
+                    m.d.sync += wbuf[i].eq(mac_shadow[i])
+                m.d.sync += xfer_start.eq(1)
+                m.next = "SHAR_W"
+            with m.State("SHAR_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.next = "S0_MR"
+
+            # Step 3–5: S0_MR=MACRAW, S0_CR=OPEN, S0_IMR=RECV|SEND_OK.
+            write_reg("S0_MR",  _W5500_S0_MR,  _CTRL_WR_S0REG, [0x04], "S0_CR")
+            write_reg("S0_CR",  _W5500_S0_CR,  _CTRL_WR_S0REG, [0x01], "S0_IMR")
+            write_reg("S0_IMR", _W5500_S0_IMR, _CTRL_WR_S0REG, [0x05], "INIT_DONE")
+
+            with m.State("INIT_DONE"):
+                m.d.sync += self.init_done.eq(1)
+                m.next = "IDLE"
+
+            # ── TX path (MACRAW) ─────────────────────────────────────────
+            # 1) read S0_TX_WR, 2) stream the frame into the TX buffer at that
+            # offset, 3) advance S0_TX_WR by the byte count, 4) issue SEND.
+            with m.State("TX_START"):
+                m.d.sync += xfer_addr.eq(_W5500_S0_TX_WR)
+                m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0REG)
+                m.d.sync += xfer_len.eq(2)
+                m.d.sync += xfer_stream.eq(0)
+                m.d.sync += wbuf[0].eq(0)         # read → send 0x00 dummies
+                m.d.sync += wbuf[1].eq(0)
+                m.d.sync += xfer_start.eq(1)
+                m.next = "TX_RDPTR_W"
+            with m.State("TX_RDPTR_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += tx_wr.eq(Cat(rbuf[1], rbuf[0]))   # big-endian
+                    m.next = "TX_DATA"
+
+            with m.State("TX_DATA"):
+                m.d.sync += xfer_addr.eq(tx_wr)
+                m.d.sync += xfer_ctrl.eq(_CTRL_WR_S0TX)   # socket-0 TX buffer
+                m.d.sync += xfer_stream.eq(1)
+                m.d.sync += xfer_start.eq(1)
+                m.next = "TX_DATA_W"
+            with m.State("TX_DATA_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += xfer_stream.eq(0)
+                    m.d.sync += tx_wr.eq(tx_wr + s_count)     # advanced pointer
+                    m.next = "TX_UPDPTR"
+
+            with m.State("TX_UPDPTR"):
+                m.d.sync += xfer_addr.eq(_W5500_S0_TX_WR)
+                m.d.sync += xfer_ctrl.eq(_CTRL_WR_S0REG)
+                m.d.sync += xfer_len.eq(2)
+                m.d.sync += xfer_stream.eq(0)
+                m.d.sync += wbuf[0].eq(tx_wr[8:16])   # hi (already advanced)
+                m.d.sync += wbuf[1].eq(tx_wr[0:8])    # lo
+                m.d.sync += xfer_start.eq(1)
+                m.next = "TX_UPDPTR_W"
+            with m.State("TX_UPDPTR_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.next = "TX_SEND"
+
+            # S0_CR = SEND (0x20)
+            write_reg("TX_SEND", _W5500_S0_CR, _CTRL_WR_S0REG, [0x20], "IDLE")
+
+            # ── RX path (MACRAW) ─────────────────────────────────────────
+            # Triggered by W5500 INT (w5500_int_n low): read RX_RSR, read
+            # RX_RD, read the 2-byte MACRAW length, stream the frame out,
+            # advance RX_RD, issue RECV.
+            with m.State("RX_CHECK"):           # read S0_RX_RSR
+                m.d.sync += xfer_addr.eq(_W5500_S0_RX_RSR)
+                m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0REG)
+                m.d.sync += xfer_len.eq(2)
+                m.d.sync += xfer_stream.eq(0)
+                m.d.sync += xfer_sread.eq(0)
+                m.d.sync += wbuf[0].eq(0)
+                m.d.sync += wbuf[1].eq(0)
+                m.d.sync += xfer_start.eq(1)
+                m.next = "RX_RSR_W"
+            with m.State("RX_RSR_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += rx_rsr.eq(Cat(rbuf[1], rbuf[0]))
+                    m.next = "RX_RSR_CHK"
+            with m.State("RX_RSR_CHK"):
+                with m.If(rx_rsr == 0):
+                    m.next = "IDLE"             # nothing received
+                with m.Else():
+                    m.next = "RX_RDPTR"
+
+            with m.State("RX_RDPTR"):           # read S0_RX_RD
+                m.d.sync += xfer_addr.eq(_W5500_S0_RX_RD)
+                m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0REG)
+                m.d.sync += xfer_len.eq(2)
+                m.d.sync += xfer_start.eq(1)
+                m.next = "RX_RDPTR_W"
+            with m.State("RX_RDPTR_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += rx_rd.eq(Cat(rbuf[1], rbuf[0]))
+                    m.next = "RX_LEN"
+
+            with m.State("RX_LEN"):             # read 2-byte MACRAW length
+                m.d.sync += xfer_addr.eq(rx_rd)
+                m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0RX)
+                m.d.sync += xfer_len.eq(2)
+                m.d.sync += xfer_start.eq(1)
+                m.next = "RX_LEN_W"
+            with m.State("RX_LEN_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += pkt_len.eq(Cat(rbuf[1], rbuf[0]))
+                    m.next = "RX_FRAME"
+
+            with m.State("RX_FRAME"):           # stream pkt_len−2 frame bytes
+                m.d.sync += xfer_addr.eq(rx_rd + 2)
+                m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0RX)
+                m.d.sync += xfer_sread.eq(1)
+                m.d.sync += xfer_rcount.eq(pkt_len - 2)
+                m.d.sync += xfer_start.eq(1)
+                m.next = "RX_FRAME_W"
+            with m.State("RX_FRAME_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.d.sync += xfer_sread.eq(0)
+                    m.next = "RX_UPDRD"
+
+            with m.State("RX_UPDRD"):           # S0_RX_RD += pkt_len
+                m.d.sync += xfer_addr.eq(_W5500_S0_RX_RD)
+                m.d.sync += xfer_ctrl.eq(_CTRL_WR_S0REG)
+                m.d.sync += xfer_len.eq(2)
+                m.d.sync += xfer_stream.eq(0)
+                m.d.sync += xfer_sread.eq(0)
+                m.d.sync += wbuf[0].eq((rx_rd + pkt_len)[8:16])
+                m.d.sync += wbuf[1].eq((rx_rd + pkt_len)[0:8])
+                m.d.sync += xfer_start.eq(1)
+                m.next = "RX_UPDRD_W"
+            with m.State("RX_UPDRD_W"):
+                m.d.sync += xfer_start.eq(0)
+                with m.If(xfer_done):
+                    m.next = "RX_RECV"
+
+            # S0_CR = RECV (0x40), then clear the RECV interrupt so INT_N
+            # deasserts (write 1 to Sn_IR[2]); otherwise the FSM would re-enter
+            # RX_CHECK forever on a real W5500.
+            write_reg("RX_RECV",   _W5500_S0_CR, _CTRL_WR_S0REG, [0x40], "RX_CLR_IR")
+            write_reg("RX_CLR_IR", _W5500_S0_IR, _CTRL_WR_S0REG, [0x04], "IDLE")
+
+        return m
+
+
+# ── Testbench ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    import sys
+    from amaranth.sim import Simulator, Period
+
+    # Short reset wait so the init sequence runs quickly in simulation.
+    dut = W5500SPIMaster(reset_cycles=10)
+    errors = []
+
+    # MAC for SHAR: par[i*8:(i+1)*8] = mac byte i → mac = 11 22 33 44 55 66
+    MAC = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]
+    PAR = sum(b << (8 * i) for i, b in enumerate(MAC))
+
+    # Expected W5500 init transactions: [addr_hi, addr_lo, ctrl, *payload].
+    # ctrl 0x04 = common-block write (VDM); 0x0C = socket-0-reg write (VDM).
+    EXPECTED = [
+        [0x00, 0x00, 0x04, 0x80],                 # MR = 0x80 (reset)
+        [0x00, 0x09, 0x04, *MAC],                 # SHAR = MAC
+        [0x00, 0x00, 0x0C, 0x04],                 # S0_MR = MACRAW
+        [0x00, 0x01, 0x0C, 0x01],                 # S0_CR = OPEN
+        [0x00, 0x2C, 0x0C, 0x05],                 # S0_IMR = RECV|SEND_OK
+    ]
+
+    txns = []   # transactions captured by the W5500 slave model
+
+    # RX frame the W5500 will hand back, and the MACRAW length it reports.
+    RX_FRAME = [0xDE, 0xAD, 0xBE, 0xEF, 0x01, 0x02]
+    RX_PKT_LEN = len(RX_FRAME) + 2          # MACRAW length includes the header
+
+    def build_response(bsb, addr):
+        """Bytes the W5500 drives on MISO for a read of (bsb, addr)."""
+        if bsb == 1 and addr == _W5500_S0_RX_RSR:
+            return [(RX_PKT_LEN >> 8) & 0xFF, RX_PKT_LEN & 0xFF]
+        if bsb == 1 and addr == _W5500_S0_RX_RD:
+            return [0x00, 0x00]             # RX read pointer = 0
+        if bsb == 3 and addr == 0x0000:
+            return [(RX_PKT_LEN >> 8) & 0xFF, RX_PKT_LEN & 0xFF]   # length
+        if bsb == 3 and addr == 0x0002:
+            return list(RX_FRAME)           # frame payload
+        return [0x00] * 64
+
+    async def w5500_model(ctx):
+        """W5500 SPI slave model: records every CS-framed MOSI transaction in
+        `txns` and, for reads, drives MISO with canned data from build_response.
+        Mode 0: MOSI sampled on rising SCK, MISO shifted out MSB-first.
+        """
+        prev_cs, prev_sck = 1, 0        # previous-tick pin levels, for edge detection
+        rx_byte = rx_bits = nbytes = 0  # MOSI shift register, bits in it, bytes seen this frame
+        hdr = [0, 0, 0]                 # addr_hi, addr_lo, ctrl
+        is_read = False
+        resp, ridx = [], 0              # canned read data and index of the next byte to load
+        msr = msr_bits = 0              # MISO shift register and bits left in it
+        cur_txn = []
+        async for vals in ctx.tick("sync").sample(
+                dut.spi_cs_n, dut.spi_clk, dut.spi_mosi):
+            cs, sck, mosi = vals[-3:]   # the three sampled pins are the trailing entries
+            rising = (prev_sck == 0 and sck == 1)
+
+            if prev_cs == 1 and cs == 0:          # CS falling: start frame
+                cur_txn = []
+                rx_byte = rx_bits = nbytes = 0
+                is_read = False
+                resp, ridx, msr, msr_bits = [], 0, 0, 0
+
+            if cs == 0 and rising:
+                # MISO bit just sampled by the master → advance shift register
+                if is_read and nbytes >= 3:       # only after the 3-byte header
+                    msr = (msr << 1) & 0xFF
+                    msr_bits -= 1
+                    if msr_bits == 0:             # byte exhausted: load the next one (0 past the end)
+                        msr = resp[ridx] if ridx < len(resp) else 0
+                        ridx += 1
+                        msr_bits = 8
+                # sample MOSI (MSB arrives first)
+                rx_byte = ((rx_byte << 1) | mosi) & 0xFF
+                rx_bits += 1
+                if rx_bits == 8:
+                    cur_txn.append(rx_byte)
+                    if nbytes < 3:
+                        hdr[nbytes] = rx_byte
+                        if nbytes == 2:           # header complete → decode
+                            ctrl = hdr[2]
+                            is_read = (ctrl & 0x04) == 0   # RWB bit clear means read
+                            bsb = ctrl >> 3
+                            addr = (hdr[0] << 8) | hdr[1]
+                            if is_read:
+                                resp = build_response(bsb, addr)
+                                msr, ridx, msr_bits = resp[0], 1, 8   # preload first data byte
+                    nbytes += 1
+                    rx_byte = rx_bits = 0
+
+            if prev_cs == 0 and cs == 1:          # CS rising: end frame
+                txns.append(list(cur_txn))
+
+            ctx.set(dut.spi_miso, (msr >> 7) & 1)   # present the current MSB on MISO
+            prev_cs, prev_sck = cs, sck
+
+    rx_collected = []
+
+    async def rx_collector(ctx):
+        """Record each rx_data value sampled while rx_valid and rx_ready are both high."""
+        handshake = ctx.tick("sync").sample(dut.rx_valid, dut.rx_ready, dut.rx_data)
+        async for *_, strobe, accept, byte in handshake:
+            if strobe and accept:
+                rx_collected.append(byte)
+
+    async def testbench(ctx):   # drives T1–T5 in order; every failure is appended to errors
+        ctx.set(dut.par, PAR)
+        await ctx.tick("sync").repeat(4)   # run 4 cycles before checking the idle levels
+
+        # T1: SPI idle — CLK low (Mode 0), CS high
+        if ctx.get(dut.spi_clk) != 0:
+            errors.append("T1 CLK idle != 0")
+        if ctx.get(dut.spi_cs_n) != 1:
+            errors.append("T1 CS idle != 1")
+        print(f"T1 idle: CLK={ctx.get(dut.spi_clk)} CS={ctx.get(dut.spi_cs_n)}")
+
+        # T2: run the init sequence
+        ctx.set(dut.init_req, 1)
+        await ctx.tick("sync").repeat(1)   # init_req is held high for exactly one cycle
+        ctx.set(dut.init_req, 0)
+
+        for _ in range(4000):              # bounded wait: at most 4000 cycles for init_done
+            await ctx.tick("sync").repeat(1)
+            if ctx.get(dut.init_done):
+                break
+        if not ctx.get(dut.init_done):
+            errors.append("T2 init_done never asserted")
+        await ctx.tick("sync").repeat(4)   # a few extra cycles before inspecting txns
+        print(f"T2 init_done: {ctx.get(dut.init_done)}")
+
+        # T3: verify the captured init transaction sequence
+        print(f"T3 captured {len(txns)} init transactions:")
+        for t in txns:
+            print("   ", [f"0x{b:02X}" for b in t])
+        if txns != EXPECTED:               # exact match: count, order and every byte
+            errors.append(f"T3 init sequence mismatch:\n  got {txns}\n  want {EXPECTED}")
+
+        # ── T4: TX a frame (MACRAW) ──────────────────────────────────────
+        txns.clear()                       # drop init traffic so only TX transactions are compared
+        FRAME = [0xAA, 0xBB, 0xCC, 0xDD]
+        # The W5500 model returns zeros for reads it has no canned data for, so S0_TX_WR reads back 0x0000.
+        TX_EXPECTED = [
+            [0x00, 0x24, 0x08, 0x00, 0x00],          # read S0_TX_WR (dummies)
+            [0x00, 0x00, 0x14, *FRAME],              # write TX buffer @ 0x0000
+            [0x00, 0x24, 0x0C, 0x00, len(FRAME)],    # S0_TX_WR += len
+            [0x00, 0x01, 0x0C, 0x20],                # S0_CR = SEND
+        ]
+
+        async def send_frame(frame):       # push frame bytes one at a time on the tx_* stream
+            for i, b in enumerate(frame):
+                ctx.set(dut.tx_data, b)
+                ctx.set(dut.tx_valid, 1)
+                ctx.set(dut.tx_sof, 1 if i == 0 else 0)                # flag the first byte
+                ctx.set(dut.tx_eof, 1 if i == len(frame) - 1 else 0)   # flag the last byte
+                for _ in range(2000):      # NOTE(review): a timeout here falls through silently
+                    if ctx.get(dut.tx_ready):
+                        break
+                    await ctx.tick("sync").repeat(1)
+                await ctx.tick("sync").repeat(1)   # complete the consume
+            ctx.set(dut.tx_valid, 0)       # return the stream inputs to idle
+            ctx.set(dut.tx_sof, 0)
+            ctx.set(dut.tx_eof, 0)
+
+        await send_frame(FRAME)
+        # let the pointer-update + SEND transactions finish
+        for _ in range(2000):              # stop early once enough transactions were captured
+            await ctx.tick("sync").repeat(1)
+            if len(txns) >= len(TX_EXPECTED):
+                break
+        await ctx.tick("sync").repeat(4)
+
+        print(f"T4 captured {len(txns)} TX transactions:")
+        for t in txns:
+            print("   ", [f"0x{b:02X}" for b in t])
+        if txns != TX_EXPECTED:
+            errors.append(f"T4 TX sequence mismatch:\n  got {txns}\n  want {TX_EXPECTED}")
+
+        # ── T5: RX a frame (MACRAW) ──────────────────────────────────────
+        # The model returns RSR=pkt_len, RD=0, MACRAW length=pkt_len, then the
+        # frame.  Expected transactions (read dummies are 0x00):
+        RX_EXPECTED = [
+            [0x00, 0x26, 0x08, 0x00, 0x00],                 # read S0_RX_RSR
+            [0x00, 0x28, 0x08, 0x00, 0x00],                 # read S0_RX_RD
+            [0x00, 0x00, 0x18, 0x00, 0x00],                 # read MACRAW length
+            [0x00, 0x02, 0x18, *([0x00] * len(RX_FRAME))],  # read frame
+            [0x00, 0x28, 0x0C, 0x00, RX_PKT_LEN],           # S0_RX_RD += pkt_len
+            [0x00, 0x01, 0x0C, 0x40],                       # S0_CR = RECV
+            [0x00, 0x02, 0x0C, 0x04],                       # S0_IR clear RECV
+        ]
+        txns.clear()                      # drop TX traffic so only RX transactions are compared
+        ctx.set(dut.rx_ready, 1)          # accept every byte offered on the rx_* stream
+        ctx.set(dut.w5500_int_n, 0)       # signal a received packet
+        for _ in range(4000):             # bounded wait for the full RX transaction sequence
+            await ctx.tick("sync").repeat(1)
+            if len(txns) >= len(RX_EXPECTED):
+                break
+        ctx.set(dut.w5500_int_n, 1)       # release the interrupt line
+        await ctx.tick("sync").repeat(8)
+
+        print(f"T5 captured {len(txns)} RX transactions:")
+        for t in txns:
+            print("   ", [f"0x{b:02X}" for b in t])
+        print(f"T5 rx frame: {[f'0x{b:02X}' for b in rx_collected]}  "
+              f"(want {[f'0x{b:02X}' for b in RX_FRAME]})")
+        if txns != RX_EXPECTED:
+            errors.append(f"T5 RX sequence mismatch:\n  got {txns}\n  want {RX_EXPECTED}")
+        if rx_collected != RX_FRAME:      # bytes recorded by rx_collector must equal the canned frame
+            errors.append(f"T5 RX frame mismatch: got {rx_collected}, want {RX_FRAME}")
+
+    sim = Simulator(dut)
+    sim.add_clock(Period(MHz=24), domain="sync")
+    sim.add_testbench(testbench)
+    for background in (w5500_model, rx_collector):   # SPI slave model, then RX byte recorder
+        sim.add_process(background)
+    # Run the simulation while writing a VCD trace next to the working directory.
+    with sim.write_vcd("W5500SPIMaster.vcd"):
+        sim.run()
+
+    if not errors:
+        print("\nAll tests passed.")
+    else:
+        print("\nFAILURES:")
+        for failure in errors:
+            print(" ", failure)
+        sys.exit(1)   # non-zero exit status signals the failure to the caller
diff --git a/requirements.txt b/requirements.txt
index 2318e38..54e5867 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 amaranth @ git+https://github.com/amaranth-lang/amaranth@main
-amaranth-boards @ git+https://github.com/amaranth-lang/amaranth-boards.git@7e24efe2f6e95afddd0c1b56f1a9423c48caa472
-amaranth-yosys==0.50.0.0.post115
-importlib_resources==6.5.2
+amaranth-boards @ git+https://github.com/amaranth-lang/amaranth-boards.git@8bc91db6f68c5c36f30926bf56836739c138986f
+amaranth-yosys==0.50.0.0.post124
+importlib_resources==7.1.0
 Jinja2==3.1.6
 jschon==0.11.1
-MarkupSafe==3.0.2
+MarkupSafe==3.0.3
 pyvcd==0.4.1
 rfc3986==2.0.0
-wasmtime==36.0.0
+wasmtime==45.0.0