diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000..2f97a41
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,20 @@
+# Development image: Python 3.12 plus the open-source iCE40 FPGA toolchain.
+FROM python:3.12-slim-bookworm
+
+# git, the iCE40 flow (yosys, nextpnr-ice40, fpga-icestorm) and Node.js/npm.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    yosys \
+    nextpnr-ice40 \
+    fpga-icestorm \
+    nodejs npm \
+    && rm -rf /var/lib/apt/lists/*
+
+# Claude Code CLI, installed globally through npm.
+RUN npm install -g @anthropic-ai/claude-code
+
+WORKDIR /workspace
+
+# Python dependencies; copied on their own so this layer is cached until requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
diff --git a/.devcontainer/attach-icebreaker.ps1 b/.devcontainer/attach-icebreaker.ps1
new file mode 100644
index 0000000..722a84d
--- /dev/null
+++ b/.devcontainer/attach-icebreaker.ps1
@@ -0,0 +1,50 @@
+#Requires -RunAsAdministrator
+# Attaches the IceBreaker FPGA (FTDI FT2232H, VID 0403) to WSL2 via usbipd-win.
+# Run this on the Windows host before opening the devcontainer.
+
+$ErrorActionPreference = 'Stop'
+
+if (-not (Get-Command usbipd -ErrorAction SilentlyContinue)) {
+    Write-Error "usbipd not found. Install it from: https://github.com/dorssel/usbipd-win/releases"
+    exit 1
+}
+
+# A device row of `usbipd list` looks like "<BUSID>  <VID>:<PID>  <description>  <state>".
+# Anchor the match on the VID column so a stray "0403" elsewhere on a line (a PID,
+# a description, a persisted-device GUID) is not mistaken for an FTDI device.
+$ftdiRow = '^\s*(\S+)\s+0403:[0-9A-Fa-f]{4}\b'
+$devices = @(usbipd list | Where-Object { $_ -match $ftdiRow })
+
+if ($devices.Count -eq 0) {
+    Write-Error "No FTDI device (VID 0403) found. Is the IceBreaker plugged in?"
+    exit 1
+}
+
+if ($devices.Count -gt 1) {
+    Write-Host "Multiple FTDI devices found:"
+    $devices | ForEach-Object { Write-Host "  $_" }
+    Write-Error "Ambiguous. Unplug other FTDI devices or run 'usbipd attach --wsl --busid <BUSID>' manually."
+    exit 1
+}
+
+# Extract BUSID (first column of the row, e.g. "3-1")
+$busid = [regex]::Match($devices[0], $ftdiRow).Groups[1].Value
+
+# usbipd-win 4.x only attaches devices that have been shared, so bind first
+# (binding an already-shared device is expected to be a harmless no-op).
+Write-Host "Sharing IceBreaker at bus ID $busid..."
+usbipd bind --busid $busid
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "usbipd bind failed (exit code $LASTEXITCODE)."
+    exit 1
+}
+
+Write-Host "Attaching IceBreaker at bus ID $busid to WSL2..."
+usbipd attach --wsl --busid $busid
+# Native commands do not raise PowerShell errors on failure; check the exit
+# code so a failed attach is not reported as success.
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "usbipd attach failed (exit code $LASTEXITCODE)."
+    exit 1
+}
+Write-Host "Done. You can now open the devcontainer and use iceprog."
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..f719d95 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +{ + "name": "Amaranth HDL - IceBreaker", + "build": { + "dockerfile": "Dockerfile", + "context": ".." + }, + // USB flashing (iceprog) requires the IceBreaker to be forwarded to WSL2 first. + // On Windows: install usbipd-win (https://github.com/dorssel/usbipd-win/releases), + // then run (as Administrator) before opening this devcontainer: + // .devcontainer/attach-icebreaker.ps1 + "runArgs": ["--privileged"], + "workspaceFolder": "/workspace", + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached", + "mounts": [ + "source=${localEnv:USERPROFILE}/.claude,target=/root/.claude,type=bind,consistency=cached" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-python.pylance", + "anthropic.claude-code" + ], + "settings": { + "python.defaultInterpreterPath": "/usr/local/bin/python" + } + } + } +} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..33bf949 --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +# Generated FPGA build artifacts (regenerate with: python -m exi_bba.synth) +build/ + +# Simulation waveforms (regenerate by running the testbenches) +*.vcd + +# Python +__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ + +# Editor / OS cruft +.DS_Store +*.swp diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..3862be6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,493 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project: GC BBA FPGA Replacement + +Replace the GameCube Broadband Adapter (DOL-015 / MX98730EC) with an iCEbreaker +FPGA (Lattice iCE40UP5K) written in Amaranth HDL. 
The FPGA emulates the BBA +register interface over the GameCube EXI bus and bridges to a WIZnet ethernet +chip for real 100BASE-TX ethernet — default **W5100** (indirect parallel bus, +reaches the EXI throughput ceiling) or **W5500** (SPI Pmod, simpler wiring but +~12 Mbit/s). GC software (Swiss homebrew) sees an identical BBA. See "W5100 vs +W5500 ethernet back-end". + +--- + +## Development Environment + +**Preferred:** Use the devcontainer (`.devcontainer/`) which includes Python 3.12, +`nextpnr-ice40`, and `fpga-icestorm` pre-installed. + +**Windows host + WSL2 devcontainer — USB flashing setup:** +1. Install `usbipd-win` (https://github.com/dorssel/usbipd-win/releases) +2. Run `.devcontainer/attach-icebreaker.ps1` as Administrator before opening the devcontainer +3. The devcontainer runs `--privileged` to pass through the USB device + +**Local venv (outside devcontainer):** +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +``` + +Yosys is bundled in `amaranth-yosys`; `nextpnr-ice40` and `iceprog` must be +installed separately (via apt on Linux, or via the devcontainer). + +--- + +## Commands + +**Build and flash the iCEbreaker (must run from workspace root):** +```bash +python rebbarb/rebbarb.py +``` +Runs synthesis (yosys), place-and-route (nextpnr-ice40), and flashes via `iceprog`. +Set `ICEPROG=/path/to/iceprog` env var to override the binary location. +Note: `rebbarb/rebbarb.py` builds a 36 MHz LED blink demo. The BBA +implementation (`exi_bba/`) uses a split-domain clock: `capture` @ 54 MHz (PLL) +for the SPI bit engine, `exi`/`sync` @ 24 MHz (HFOSC) for everything else. +Synthesize/flash the real design with `python -m exi_bba.synth [--flash]`. 
+ +**Run a simulation:** +```bash +# New-API testbench style (preferred for new code): +python rebbarb/toggle_button.py # writes ToggleButton.vcd +python rebbarb/pulse_button.py # writes PulseButton.vcd + +# Old-API process style (reference only, do not replicate in new code): +python examples/amaranth_cdc.py # CDC primitives demo +python examples/async_fifo.py # AsyncFIFO behaviour +python examples/icebreaker_fifo.py # iCEbreaker-specific FIFO (Verilog dump) +``` +Open VCD output with `gtkwave`. Simulations are the primary testing mechanism — +there is no separate test runner. + +**Verify PLL parameters:** +```bash +icepll -i 12 -o 54 # confirms DIVR=0 DIVF=71 DIVQ=4 → 54 MHz (capture domain) +``` +(`exi`/`sync` come from the internal SB_HFOSC ÷2 = 24 MHz — no PLL.) + +--- + +## Current Implementation State + +The `exi_bba/` module tree is **fully implemented** with simulation testbenches. +All modules elaborate without errors and pass their unit tests. The full design +**synthesizes, places, routes, and meets timing** on the iCE40UP5K +(`python -m exi_bba.synth`): `capture` closes ~70 MHz (target 54) and `exi`/ +`sync` close ~36 MHz (target 24) — both PASS. + +### `exi_bba/` module status + +| Module | File | Tests pass | +|---|---|---| +| `BBATop` | `exi_bba/bba_top.py` | ✅ EXI integration + full W5100→SPRAM→GC RX loop; synth PASS | +| `ExiCapture` | `exi_bba/exi_capture.py` | ✅ rx/tx byte-stream + over-push/flush | +| `SPIMode3Slave` | `exi_bba/spi_mode3_slave.py` | ✅ 4 tests (live-drive TX) | +| `BBARegisterFile` | `exi_bba/bba_register_file.py` | ✅ 7 tests (proactive push + DMA stream) | +| `SPRAMArbiter` | `exi_bba/spram_arbiter.py` | ✅ 3 tests | +| `RXFrameAssembler` | `exi_bba/rx_frame_assembler.py` | ✅ 3 tests | +| `TXFrameDrain` | `exi_bba/tx_frame_drain.py` | ✅ 2 tests | +| `W5100ParallelMaster` | `exi_bba/w5100_parallel_master.py` | ✅ 5 tests (init/TX/RX vs bus model, incl. 
ring wrap) — **default eth back-end** | +| `W5500SPIMaster` | `exi_bba/w5500_spi_master.py` | ✅ init/TX/RX vs SPI-slave model (alt back-end) | +| `StatusPanel` | `exi_bba/status_panel.py` | ✅ 6 tests (heartbeat, stretched activity LEDs, debounced buttons, freeze) | +| `EEPROMModel` | `exi_bba/eeprom_model.py` | ✅ 4 tests | + +**Bring-up status panel (optional):** `BBATop(status_panel=True)` adds a +`StatusPanel` driving onboard iCEbreaker LEDs + button (dedicated pins, so it +coexists with EXI + W5100). `synth.py` enables it: **LEDG=heartbeat**, +**LEDR=EXI activity** (the GC is talking), **BTN_N=manual re-init**. The full +EXI + W5100 + panel build synthesizes and meets timing (slow ~35≥24, capture +~64≥54, 44% LC). Panel LEDs 3–5 (rx/tx/ready) exist in the module but aren't +mapped on the iCEbreaker (only 2 discrete LEDs); the onboard RGB or a custom +PCB can expose them. + +**Ethernet back-end is selectable:** `BBATop(eth="w5100")` (default — indirect +parallel bus, reaches the ~27 Mbit/s EXI ceiling) or `BBATop(eth="w5500")` (SPI, +~12 Mbit/s). Both masters expose the identical tx/rx/init/par streaming +interface; only the physical pins differ. See "W5100 vs W5500" below. + +### Run all module testbenches (from workspace root) +```bash +python -m exi_bba.spi_mode3_slave +python -m exi_bba.exi_capture +python -m exi_bba.bba_register_file +python -m exi_bba.spram_arbiter +python -m exi_bba.rx_frame_assembler +python -m exi_bba.tx_frame_drain +python -m exi_bba.w5100_parallel_master # 5 tests: init, TX(+wrap), RX(+wrap) +python -m exi_bba.w5500_spi_master +python -m exi_bba.status_panel # 6 tests: heartbeat/activity/buttons +python -m exi_bba.eeprom_model +python -m exi_bba.bba_top # end-to-end EXI integration test (W5100 RX loop) +``` + +### Pending work +- **Synthesis/timing**: ✅ done — `python -m exi_bba.synth` synthesizes, P&Rs, + and meets timing on both clock domains (capture ~68≥54, slow ~40≥24). 
+- **W5500 init/TX/RX**: ✅ done — `W5500SPIMaster` has a real Mode-0 byte engine, + a generic register-transaction engine (header + wbuf/stream payload), the full + init sequence (MR reset, SHAR, S0_MR MACRAW, S0_CR OPEN, S0_IMR), MACRAW TX + (read TX_WR → stream frame to TX buffer → advance TX_WR → SEND) and MACRAW RX + (RSR → RD → 2-byte length → stream frame out → advance RD → RECV). All verified + on the wire by a responding W5500 SPI-slave model in the testbench. +- **PAR0–5 → W5500 SHAR**: ✅ done — `reg.par` wired to `w5500.par` in `BBATop` + (PAR0 packed in the low byte so it is the first SHAR octet). +- **NCRA SR bit**: ✅ done — `BBARegisterFile.ncra_sr` (= NCRA[3]) gates + `asm.rx_enabled` in `BBATop` (was hard-wired to 1). +- **W5500 SPI throughput**: SCK = sync÷2 = 12 MHz (~12 Mbit/s) — exceeds + real-world GC BBA TCP throughput (~6–10 Mbit/s) but is below the 27 Mbit/s raw + EXI ceiling. Pushing past 12 Mbit/s was investigated and found NOT achievable + on this UP5K (the W5500-operating logic is distributed ~40 MHz, not just the + bit-bang) — see the "Full-rate W5500 SPI" item below. + `W5500SPIMaster(clk_div=N)` divides SCK further if signal integrity needs it. +- **EXI DMA bulk reads**: ✅ done — SPRAM-region reads (addr ≥ 0x100) now STREAM + until CS deasserts instead of stopping at the header's 2-bit length, so they + serve both ≤4-byte immediate reads (Swiss) AND arbitrary-length DMA reads + (other GC software, and a future Swiss path for loading ROMs from a network + file store). Implementation: + - `SPIMode3Slave.cs_active` (synchronised CS level) → `ExiCapture` crosses it + to the exi domain (FFSynchronizer) → `BBARegisterFile.cs_active`. + - `BBARegisterFile` SPRAM_STREAM state: auto-increments the SPRAM address, + prefetches up to SP_LIMIT=4 reads in flight, pushes responses to tx_fifo; + SPRAM_END drains the in-flight pipeline + rx dummies on CS-rise. 
+ - `ExiCapture` flushes tx_fifo on CS-fall to clear prefetch over-push so a + truncated DMA read can't leak stale bytes into the next transaction. + Tested: register-file streaming read (SPRAM model, 12 bytes), ExiCapture + over-push/flush, AND the full BBATop loop — a W5500 model delivers a frame → + W5500 master RX → RXFrameAssembler writes the SPRAM ring → GC reads RWP then + DMA-reads the descriptor+frame back (verified byte-for-byte). + Note: a DMA read header must keep length-1 within the 2-bit field; the GC + driver sets it ≤3 and clocks the real length via CS (the design streams + until CS regardless). (EXI DMA *writes* are not implemented; the GC's + DMA-write engine has a 1-bit-shift bug and Swiss avoids them — see + design-doc §"EXI DMA bug".) +- **S0_IR interrupt clear after RX**: ✅ done — `W5500SPIMaster` RX_CLR_IR state + writes Sn_IR[2]=1 after RECV so `INT_N` deasserts (else the FSM would re-enter + RX_CHECK forever on real hardware). +- **Full-rate W5500 SPI (27 Mbit/s) — INVESTIGATED, NOT achievable on UP5K**: + the W5500 SCK is sync÷2 = 12 MHz. Raising it needs the SPI engine on a ≥54 MHz + clock, but a standalone synth of `W5500SPIMaster` in the capture domain closes + only **40 MHz** — and the slack histogram shows the failure is *distributed* + (~140 endpoints fail 54, incl. the `wbuf`/header mux feeding the shift + register), NOT a single cuttable path. So the bottleneck is the **logic that + operates the SPI device** (transaction FSM, byte sourcing), not the bit-bang. + Consequences: + - The "split the bit engine to capture + per-byte CDC handshake" idea nets + only ~14 Mbit/s — the CDC round-trip ≈ the SPI byte time — not worth it. + - A capture-domain "streaming executor" would still contain that distributed + ~40 MHz logic, so it wouldn't close 54 either. + - Hardware `SB_SPI` wouldn't help (it only offloads the bit-bang, which was + never the bottleneck) and is unsimulatable. 
+ - There is no usable clock between 24 (HFOSC) and 54 (the one PLL, needed at + 54 for the EXI front-end); PLL÷2 = 27 → SCK 13.5 MHz, a ~12% gain, not + worth the fabric divider. + Net: 12 Mbit/s is the practical W5500 ceiling on this part. It exceeds + real-world GC BBA TCP throughput and is fine for chunked ROM streaming. + Reaching 27 Mbit/s would need a faster FPGA or a much shallower W5500-operating + redesign (uncertain) — **OR a parallel-bus ethernet chip (see W5100 below)**, + which is the implemented solution for the ROM-streaming throughput target. + +## W5100 vs W5500 ethernet back-end + +The throughput insight: SPI serialises 8 bits/byte, so the W5500 byte rate is +(operating-logic clock)/16 — and that logic caps ~40 MHz on this UP5K → ~12 +Mbit/s. A **parallel** bus moves a whole byte per access, so the *same* ~24 MHz +`sync` logic clears the 27 Mbit/s EXI ceiling (the real hard limit — the GC EXI +bus tops out there). So `W5100ParallelMaster` is the throughput path and is now +the `BBATop` default. + +- **Interface:** W5100 **indirect parallel bus** (IDM). Only A[1:0] are wired + (board ties A[14:2]=0 so a power-up direct access at A=00 still hits MR): + `00`=MR, `01`=IDM_AR0(hi), `10`=IDM_AR1(lo), `11`=IDM_DR. A register/buffer + access = write IDM_AR (the 16-bit address) then read/write IDM_DR. With MR.AI + set, IDM_DR auto-increments → a multi-byte block is one address-set + a burst. +- **Bus engine:** drives A + D with `/CS` and `/RD`|`/WR` asserted for + `strobe_cycles` (default 3 ≈ 125 ns at 24 MHz, ≥ the W5100's ~80 ns access). + DATA[7:0] is bidirectional → an SB_IO tristate (`bus_data_o`/`oe`/`i`). +- **Pins (15):** A[1:0]=2, D[7:0]=8, /CS,/RD,/WR=3, /INT=1, /RST=1. With EXI (5) + + clk (1) = **21 of ~34 usable SG48 I/O** — comfortable. See `synth.py`. 
+- **MR.AI requires init first:** unlike the W5500 (each SPI transaction is + self-framed), the W5100's multi-byte accesses depend on MR.AI, so the init + sequence (triggered by the GC's NCRA reset) MUST run before any TX/RX. The + BBATop test issues NCRA-reset before its RX loop for this reason; on hardware + the GC driver already does. (`BBATop(reset_cycles=N)` shrinks the MR settle + wait for sim.) +- **Ring wraparound is in fabric:** the W5100 does NOT auto-wrap the IDM address + at the socket-buffer boundary (the W5500 did), so the streamer re-sets IDM_AR + to the buffer base when the running address reaches the 2 KB boundary. Handled + in the SW/SR/RB paths (`xfer_wrap`/`xfer_wbase`/`xfer_wend`/`cur_addr`); both + TX and RX wrap cases are tested. +- **Register map differs from the W5500:** common regs at 0x0000 (MR, SHAR 0x09, + IMR 0x16, RMSR/TMSR 0x1A/0x1B), socket 0 at 0x0400 (S0_MR/CR/IR, TX_WR 0x424, + RX_RSR 0x426, RX_RD 0x428), TX buffer 0x4000, RX buffer 0x6000. MACRAW mode. +- **Status:** init/TX/RX (with wrap) verified vs a bus model; BBATop full + W5100→SPRAM→GC RX loop passes byte-for-byte; synth PASS (slow ~32≥24, capture + ~56≥54, 42% LC). Register addresses/MR bits are from the datasheet (from + memory) — **confirm at hardware bring-up**. + +### `rebbarb/` — LED blink demo (unchanged) +- `rebbarb.py` — blinks LEDs via a PLL (36 MHz), demonstrates `IceBreakerPlatform` +- `debouncer.py` — `Debouncer(cycles)` — synchronous debounce, configurable hold +- `toggle_button.py` — `ToggleButton` — edge-to-toggle state machine (wraps Debouncer) +- `pulse_button.py` — `PulseButton` — single-cycle pulse on rising edge (wraps Debouncer) + +These components are reusable building blocks. The `Debouncer` and button wrappers +will be needed for any physical input in `exi_bba/`. + +**Import note:** `rebbarb/` files use bare imports (`from debouncer import Debouncer`). 
+Run them as `python rebbarb/<name>.py` from the workspace root so Python adds +`rebbarb/` to `sys.path` automatically. + +**Simulation at module level:** `toggle_button.py` and `pulse_button.py` run +their simulations unconditionally (no `__main__` guard) — importing either file +triggers a VCD write. New modules should guard simulation code with +`if __name__ == "__main__":`. + +`examples/amaranth_cdc.py` contains handwritten `SyncFF` and `TogglePulseSync` +reference implementations — use `amaranth.lib.cdc` primitives (`FFSynchronizer`, +`PulseSynchronizer`) in production code instead. + +`hardware/sp1_test_plug/` — KiCad project for a physical SP1 edge-connector test +plug (schematic, PCB, custom GameCube symbol library). Used to verify pad geometry +before ordering the interposer PCB; not part of the FPGA build. + +--- + +## Amaranth Simulator API + +Two API generations are present in this repo: + +| API | Where used | Status | +|---|---|---| +| `sim.add_testbench(async_fn)` + `await ctx.tick()` + `Period(MHz=n)` | `rebbarb/*.py` | **Use this for new code** | +| `sim.add_sync_process(gen_fn)` + `sim.run_until(t)` | `examples/` | Old — reference only | + +New modules should use the testbench API (`add_testbench`, `sim.write_vcd(ctx)` +context manager). The old process API still works but is not idiomatic in current +Amaranth. + +**Critical testbench timing rule:** `ctx.get(signal)` reads signal values AFTER +the clock edge (post-update registered values). Combinatorial signals that depend +on registered signals that were updated by the SAME tick will already reflect the +new registered values. For example: if `tx_sof = tx_bytes_r_rdy & is_first` and +`is_first` is cleared synchronously on the first byte, then reading `tx_sof` after +the first byte's tick always returns 0 — read BEFORE the tick instead. + +**`ctx.set()` takes effect immediately** (combinatorial, not registered). Use it +AFTER `await ctx.tick()` to prepare inputs for the NEXT tick. 
+ +The full design specification lives in `docs/gc_bba_fpga_design.md`. + +--- + +## Key Architecture Decisions + +- **No network stack in the FPGA.** The GC CPU runs TCP/IP. The FPGA is a dumb + MAC bridge. +- **Split-domain clocking — 3 domains, 2 sources (1 PLL + 1 HFOSC):** + - `capture` — 54 MHz (PLL, DIVR=0 DIVF=71 DIVQ=4). Hosts ONLY the SPI Mode 3 + bit engine inside `ExiCapture`. 54 MHz = 2× the **real 27 MHz** EXI clock — + the minimum oversampling for clean Mode 3. The isolated bit engine closes + ~91 MHz; integrated with the byte-FIFO read path the capture domain closes + ~62 MHz, so 54 passes with margin. + - `exi` — 24 MHz (HFOSC ÷2). BBA register file / transaction FSM. + - `sync` — 24 MHz (same HFOSC net as `exi`). SPRAM arbiter, RX/TX engines, + W5500 SPI master. + - **Why split:** only the tiny SPI bit engine needs a fast clock to sample + 27 MHz EXI. The bulky register-file/SPRAM/W5500 logic is routing-bound at + ~33–44 MHz on the UP5K and only needs the byte rate (27 MHz ÷ 8 ≈ 3.4 MHz). + `ExiCapture` bridges capture↔exi with rx/tx byte AsyncFIFOs. + - **EXI clock reality:** the GC EXI clock tops out at ~27 MHz. libogc's + `EXI_SPEED32MHZ` is a nominal name — the real rate is 27 MHz. The old + "96 MHz = 3× 32 MHz EXI" target was doubly wrong and unreachable on UP5K + (which caps ~44 MHz for non-trivial logic). + - **TX/MISO across the split:** the register file PROACTIVELY pushes read + responses into the tx byte FIFO during the EXI clock-idle gap (the GC pauses + the clock between an EXI_Imm header-write and the data-read). The bit engine + drives MISO live from the FIFO head; see `ExiCapture` / `SPIMode3Slave`. +- **All CDC via `amaranth.lib.cdc`.** Never pass raw multi-bit signals across + domains. Use `FFSynchronizer` for slow single bits, `PulseSynchronizer` for + events, `AsyncFIFO` for data streams, `ResetSynchronizer` for resets. 
+- **Register file lives entirely in `exi` domain.** The `sync` domain only + communicates through AsyncFIFOs and PulseSynchronizers — never direct register + reads/writes. + +--- + +## Critical Protocol Notes + +### EXI / SPI Mode 3 +- CLK idles **HIGH** (CPOL=1, CPHA=1). +- MOSI sampled on **falling** CLK edge. MISO driven on **rising** CLK edge. +- Getting this wrong means the GC never enumerates the device. +- CS is active **low**, delineates each transaction. + +### EXI Transaction Header (2 bytes before data) +``` +Byte 0: [7]=write_flag [6:0]=addr[12:6] +Byte 1: [7:2]=addr[5:0] [1:0]=xfer_len-1 (0=1B … 3=4B) +``` +Full address = 13 bits → 0x0000–0x1FFF. + +### Device ID Query +On power-on the GC writes `0x0000` (2 bytes) then reads 4 bytes. +Must return: `0x04 0x02 0x02 0x00`. + +--- + +## Memory Map (abridged) + +| Range | Region | +|---|---| +| 0x0000–0x0033 | MAC control registers (register file, exi domain) | +| 0x0048 | TXDATA — bulk TX data port (→ `tx_bytes` AsyncFIFO) | +| 0x0100–0x0FFF | RX ring buffer in SPRAM (15 × 256-byte pages, pages 1–15) | +| 0x0100–0x1FFF | any read ≥ 0x0100 streams from SPRAM (DMA path); the ring proper is pages 1–15 above | + +--- + +## Key Registers + +| Addr | Name | Notes | +|---|---|---| +| 0x00 | NCRA | [0]=RESET self-clears; pulses `ncra_rst` to sync domain | +| 0x08 | IMR | Interrupt mask | +| 0x09 | IR | Write-1-to-clear. [1]=RI, [2]=TI. 
INT_N asserts when IR & IMR ≠ 0 | +| 0x16–17 | RWP | RX write pointer — updated by sync domain via `rx_wptr` FIFO | +| 0x18–19 | RRP | RX read pointer — GC writes after consuming frames | +| 0x20–25 | PAR0–5 | MAC address; also forwarded to W5500 as SHAR | +| 0x31 | NWAYS | Hardcode **0x17** (100M full-duplex link up, autoneg complete) | +| 0x3A | HIPR | Hardcode **0x01** (BBA present) | +| 0x48 | TXDATA | GC streams TX frame bytes here | + +--- + +## Module Breakdown + +| Module | Domain | File | +|---|---|---| +| `BBATop` | all | `exi_bba/bba_top.py` | +| `ExiCapture` | capture (+exi FIFOs) | `exi_bba/exi_capture.py` | +| `SPIMode3Slave` | capture (param `domain`) | `exi_bba/spi_mode3_slave.py` | +| `BBARegisterFile` | exi (+FIFO to sync) | `exi_bba/bba_register_file.py` | +| `SPRAMArbiter` | sync | `exi_bba/spram_arbiter.py` | +| `RXFrameAssembler` | sync | `exi_bba/rx_frame_assembler.py` | +| `TXFrameDrain` | sync | `exi_bba/tx_frame_drain.py` | +| `W5100ParallelMaster` | sync | `exi_bba/w5100_parallel_master.py` (default eth) | +| `W5500SPIMaster` | sync | `exi_bba/w5500_spi_master.py` (alt eth) | +| `EEPROMModel` | exi | `exi_bba/eeprom_model.py` | + +`ExiCapture` wraps `SPIMode3Slave` (in the fast `capture` domain) plus the +capture↔exi rx/tx byte AsyncFIFOs. `BBARegisterFile` consumes the rx byte +stream and proactively pushes read responses into the tx byte FIFO — it no +longer sees the per-bit SPI cadence (that lives entirely in `capture`). 
+ +--- + +## CDC Signal Inventory + +| Signal | Direction | Primitive | +|---|---|---| +| EXI CLK / MOSI / CS pins | async → capture | `FFSynchronizer` (stages=2) | +| RX byte stream (capture→core) | capture → exi | `AsyncFIFO` 8-bit, depth=4 | +| TX byte stream (core→capture) | exi → capture | `AsyncFIFO` 8-bit, depth=2 | +| cs_active (transaction in progress) | capture → exi | `FFSynchronizer` (DMA read length) | +| SPRAM read request (addr) | exi → sync | `AsyncFIFO` 16-bit, depth=4 | +| SPRAM read result (data) | sync → exi | `AsyncFIFO` 8-bit, depth=4 | +| TX packet bytes | exi → sync | `AsyncFIFO` 8-bit, depth=16 | +| TX frame length | exi → sync | `AsyncFIFO` 16-bit, depth=4 | +| RX frame bytes | sync → SPRAM | `RXFrameAssembler` → `SPRAMArbiter` (not a byte FIFO; the GC reads frames back out of SPRAM via the SPRAM read req/rsp FIFOs) | +| RWP update | sync → exi | `AsyncFIFO` 8-bit, depth=4 | +| RRP update | exi → sync | `AsyncFIFO` 8-bit, depth=4 | +| RX ready (IR[RI]) | sync → exi | `PulseSynchronizer` | +| TX done (IR[TI]) | sync → exi | `PulseSynchronizer` | +| NCRA reset pulse | exi → sync | `PulseSynchronizer` | + +--- + +## W5500 Configuration (on NCRA reset) + +The W5500 selects the register **block** via the BSB field of the control byte, +NOT via the address — so register addresses below are **block offsets**, not flat +0x4000-style addresses (see `_W5500_*` and `_CTRL_*` in `w5500_spi_master.py`). +``` +1. Write MR = 0x80 (common block, offset 0x0000) software reset +2. Wait ~1 ms +3. Write SHAR = MAC (common block, offset 0x0009, 6 bytes from PAR0–5) +4. Write S0_MR = 0x04 (socket-0 reg block, offset 0x0000) MACRAW +5. Write S0_CR = 0x01 (socket-0 reg block, offset 0x0001) OPEN +6. Write S0_IMR = 0x05 (socket-0 reg block, offset 0x002C) RECV | SEND_OK +``` + +W5500 SPI is **Mode 0** (CPOL=0 CPHA=0); SCK = **12 MHz** (the 24 MHz `sync` +domain ÷ 2 via a toggle clock-enable). Connect W5500 `INT_N` to an FPGA input +for low-latency RX detection. 
(The W5500 is the alternate back-end; the W5100 +parallel master is the default — see "W5100 vs W5500".) + +--- + +## Physical Interface (SP1 Edge Connector) + +- PCB must be **1.2 mm thick, ENIG finish**. +- Staggered (not mirrored) top/bottom contact rows — same geometry as PCI/ISA. +- Derive exact pad geometry from **SP1ETH KiCad project** (silverstee1/SP1ETH), + cross-referenced with ETH2SP1 (LaserBear). Do not rely on YAGCD alone. +- Add **100 µF bulk cap** on the interposer near FPGA power pins (3.3 V budget + is tight: iCEbreaker ~80 mA + W5500 ~150 mA ≈ 230 mA). +- **Pin 5 is 12 V — do not connect to FPGA I/O.** Test point or leave open. +- `EXTIN` (pin 1): tie to 3.3 V via 10 kΩ — required for GC device enumeration. +- All signal levels are 3.3 V. No level shifting needed. + +--- + +## SPRAM Notes + +- iCE40UP5K has 128 KB SPRAM (SB_SPRAM256KA, 16-bit wide). +- **1-cycle synchronous read latency** — result of read at cycle N is valid at N+1. +- Byte writes via `MASKWREN`: lower byte = `0b0011`, upper byte = `0b1100`. +- Address to SPRAM = byte_address >> 1. +- ETH writes take priority over EXI reads in the arbiter (safe by ring-buffer + invariant: GC only reads pages the ETH engine has already finished). + +--- + +## GC Initialisation Sequence (Swiss/BBA driver) + +``` +1. Write 0x0000 × 2, read 4 B → must get 0x04020200 (device ID) +2. Write NCRA = 0x01 (reset, self-clears; resets W5500 + SPRAM ptrs) +3. Poll NCRA bit 0 until 0 (wait reset complete) +4. Write PAR0–5 (MAC address) +5. Write MAR0–7 = 0xFF (promiscuous multicast) +6. Write ANALOG = 0xD6 (enable PHY — no FPGA effect, just store) +7. Write NWAYC (autoneg config — store only) +8. Write IMR = 0x86 (enable RBFI | TI | RI interrupts) +9. Write GCA (AUTOPUB bit) +10. Write NCRA SR bit = 0x08 (start receive) +11. 
Poll NWAYS until link up → return hardcoded 0x17 immediately +``` + +--- + +## Implementation Notes & Gotchas + +- **`NWAYS` must return `0x17` always.** GC polls it to confirm 100 Mbps link + before enabling RX. Do not attempt to reflect real W5500 link status. +- **`EEPROMModel` can be stubbed initially.** Many GC BBA drivers write their own + MAC to PAR0–5 rather than using the EEPROM. Pre-populate PAR0–5 reset state + with a valid Nintendo OUI MAC (`00:09:BF:xx:xx:xx`). +- **`tx_load` timing in `SPIMode3Slave`:** pulses at CS assertion (first byte) + and after each complete received byte. Upstream must register next TX byte + within one `exi` clock. +- **PLL target 54 MHz**: verify with `icepll -i 12 -o 54` (DIVR=0 DIVF=71 DIVQ=4) + before coding PLL parameters; the capture-domain bit engine oversamples the + 27 MHz EXI clock 2×. +- **TX buffer selection (NCRA ST bits):** Ignore buffer select (ST1 vs ST0). + Treat any non-zero ST as a TX trigger. +- **If nextpnr fails capture-domain timing at 54 MHz:** the isolated bit engine + closes ~91 MHz, so 54 has margin; if a seed fails, sweep seeds + (`synth.py --seeds N`) or instruct users to configure Swiss to a lower EXI + clock index. diff --git a/PulseButton.vcd b/PulseButton.vcd deleted file mode 100644 index 42710e4..0000000 --- a/PulseButton.vcd +++ /dev/null @@ -1,195 +0,0 @@ -$comment Generated by Amaranth $end -$date 2025-09-20 22:27:02.816595 $end -$timescale 1 fs $end -$scope module bench $end -$scope module top $end -$var wire 1 ! clk $end -$var wire 1 " rst $end -$var wire 1 # i $end -$var wire 1 $ i$3 $end -$var wire 14 % counter $end -$var wire 1 & o $end -$var wire 1 ' o$6 $end -$var wire 1 ( last_seen $end -$scope module U$0 $end -$var wire 1 ! 
clk $end -$var wire 1 " rst $end -$var wire 1 # i $end -$var wire 1 ' o $end -$var wire 1 ) prevInValid $end -$var wire 14 * count $end -$var wire 1 + state $end -$var wire 1 , prevIn $end -$upscope $end -$upscope $end -$upscope $end -$enddefinitions $end -#0 -$dumpvars -0! -0" -0# -0$ -b0 % -0& -0' -0( -0) -b10011100010000 * -0+ -0, -$end -#500000000 -1! -1) -b0 * -#1000000000 -0! -#1500000000 -1! -1$ -1# -#2000000000 -0! -#2500000000 -1! -1+ -1, -b10011100010000 * -1' -#3000000000 -0! -#3500000000 -1! -1& -1( -#4000000000 -0! -#4500000000 -1! -0& -b10011100010000 % -#5000000000 -0! -#5500000000 -1! -b10011100001111 % -#6000000000 -0! -#6500000000 -1! -b10011100001110 % -0$ -0# -#7000000000 -0! -#7500000000 -1! -0, -b10011100001111 * -b10011100001101 % -#8000000000 -0! -#8500000000 -1! -b10011100001110 * -b10011100001100 % -#9000000000 -0! -#9500000000 -1! -b10011100001101 * -b10011100001011 % -#10000000000 -0! -#10500000000 -1! -b10011100001100 * -b10011100001010 % -#11000000000 -0! -#11500000000 -1! -b10011100001011 * -b10011100001001 % -1$ -1# -#12000000000 -0! -#12500000000 -1! -1, -b10011100010000 * -b10011100001000 % -#13000000000 -0! -#13500000000 -1! -b10011100000111 % -#14000000000 -0! -#14500000000 -1! -b10011100000110 % -#15000000000 -0! -#15500000000 -1! -b10011100000101 % -#16000000000 -0! -#16500000000 -1! -b10011100000100 % -0$ -0# -#17000000000 -0! -#17500000000 -1! -0, -b10011100001111 * -b10011100000011 % -#18000000000 -0! -#18500000000 -1! -b10011100001110 * -b10011100000010 % -#19000000000 -0! -#19500000000 -1! -b10011100001101 * -b10011100000001 % -#20000000000 -0! -#20500000000 -1! -b10011100001100 * -b10011100000000 % -#21000000000 -0! -#21500000000 -1! -b10011100001011 * -b10011011111111 % -#22000000000 -0! -#22500000000 -1! -b10011100001010 * -b10011011111110 % -#23000000000 -0! -#23500000000 -1! -b10011100001001 * -b10011011111101 % -#24000000000 -0! -#24500000000 -1! 
-b10011100001000 * -b10011011111100 % -#25000000000 diff --git a/ToggleButton.vcd b/ToggleButton.vcd deleted file mode 100644 index da8a43c..0000000 --- a/ToggleButton.vcd +++ /dev/null @@ -1,171 +0,0 @@ -$comment Generated by Amaranth $end -$date 2025-09-20 22:27:02.809849 $end -$timescale 1 fs $end -$scope module bench $end -$scope module top $end -$var wire 1 ! clk $end -$var wire 1 " rst $end -$var wire 1 # i $end -$var wire 1 $ i$3 $end -$var wire 1 % o $end -$var wire 1 & last_seen $end -$var wire 1 ' o$6 $end -$scope module U$0 $end -$var wire 1 ! clk $end -$var wire 1 " rst $end -$var wire 1 # i $end -$var wire 1 % o $end -$var wire 1 ( prevInValid $end -$var wire 14 ) count $end -$var wire 1 * state $end -$var wire 1 + prevIn $end -$upscope $end -$upscope $end -$upscope $end -$enddefinitions $end -#0 -$dumpvars -0! -0" -0# -0$ -0% -0& -0' -0( -b10011100010000 ) -0* -0+ -$end -#500000000 -1! -b0 ) -1( -#1000000000 -0! -#1500000000 -1! -1$ -1# -#2000000000 -0! -#2500000000 -1! -b10011100010000 ) -1* -1+ -1% -#3000000000 -0! -#3500000000 -1! -1& -1' -#4000000000 -0! -#4500000000 -1! -#5000000000 -0! -#5500000000 -1! -#6000000000 -0! -#6500000000 -1! -0$ -0# -#7000000000 -0! -#7500000000 -1! -b10011100001111 ) -0+ -#8000000000 -0! -#8500000000 -1! -b10011100001110 ) -#9000000000 -0! -#9500000000 -1! -b10011100001101 ) -#10000000000 -0! -#10500000000 -1! -b10011100001100 ) -#11000000000 -0! -#11500000000 -1! -b10011100001011 ) -1$ -1# -#12000000000 -0! -#12500000000 -1! -b10011100010000 ) -1+ -#13000000000 -0! -#13500000000 -1! -#14000000000 -0! -#14500000000 -1! -#15000000000 -0! -#15500000000 -1! -#16000000000 -0! -#16500000000 -1! -0$ -0# -#17000000000 -0! -#17500000000 -1! -b10011100001111 ) -0+ -#18000000000 -0! -#18500000000 -1! -b10011100001110 ) -#19000000000 -0! -#19500000000 -1! -b10011100001101 ) -#20000000000 -0! -#20500000000 -1! -b10011100001100 ) -#21000000000 -0! -#21500000000 -1! -b10011100001011 ) -#22000000000 -0! -#22500000000 -1! 
-b10011100001010 ) -#23000000000 -0! -#23500000000 -1! -b10011100001001 ) -#24000000000 -0! -#24500000000 -1! -b10011100001000 ) -#25000000000 diff --git a/docs/.obsidian/.obsidian/app.json b/docs/.obsidian/.obsidian/app.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/docs/.obsidian/.obsidian/app.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/docs/.obsidian/.obsidian/appearance.json b/docs/.obsidian/.obsidian/appearance.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/docs/.obsidian/.obsidian/appearance.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/docs/.obsidian/.obsidian/core-plugins.json b/docs/.obsidian/.obsidian/core-plugins.json new file mode 100644 index 0000000..0faa60d --- /dev/null +++ b/docs/.obsidian/.obsidian/core-plugins.json @@ -0,0 +1,33 @@ +{ + "file-explorer": true, + "global-search": true, + "switcher": true, + "graph": true, + "backlink": true, + "canvas": true, + "outgoing-link": true, + "tag-pane": true, + "footnotes": false, + "properties": false, + "page-preview": true, + "daily-notes": true, + "templates": true, + "note-composer": true, + "command-palette": true, + "slash-command": false, + "editor-status": true, + "bookmarks": true, + "markdown-importer": false, + "zk-prefixer": false, + "random-note": false, + "outline": true, + "word-count": true, + "slides": false, + "audio-recorder": false, + "workspaces": false, + "file-recovery": true, + "publish": false, + "sync": true, + "bases": true, + "webviewer": false +} \ No newline at end of file diff --git a/docs/.obsidian/.obsidian/workspace.json b/docs/.obsidian/.obsidian/workspace.json new file mode 100644 index 0000000..41369cc --- /dev/null +++ b/docs/.obsidian/.obsidian/workspace.json @@ -0,0 +1,167 @@ +{ + "main": { + "id": "6eef6b982305e97c", + "type": "split", + "children": [ + { + "id": "ef28aa54abb02b7c", + "type": "tabs", + "children": [ + { + "id": "dd2aafdfa4873c3e", + "type": "leaf", + "state": { + 
"type": "empty", + "state": {}, + "icon": "lucide-file", + "title": "New tab" + } + } + ] + } + ], + "direction": "vertical" + }, + "left": { + "id": "7dcb0dd958c47669", + "type": "split", + "children": [ + { + "id": "5addbd6c8b989a49", + "type": "tabs", + "children": [ + { + "id": "10f89da0d72538c0", + "type": "leaf", + "state": { + "type": "file-explorer", + "state": { + "sortOrder": "alphabetical", + "autoReveal": false + }, + "icon": "lucide-folder-closed", + "title": "Files" + } + }, + { + "id": "476834a62536c756", + "type": "leaf", + "state": { + "type": "search", + "state": { + "query": "", + "matchingCase": false, + "explainSearch": false, + "collapseAll": false, + "extraContext": false, + "sortOrder": "alphabetical" + }, + "icon": "lucide-search", + "title": "Search" + } + }, + { + "id": "ce54c42efc557a72", + "type": "leaf", + "state": { + "type": "bookmarks", + "state": {}, + "icon": "lucide-bookmark", + "title": "Bookmarks" + } + } + ] + } + ], + "direction": "horizontal", + "width": 300 + }, + "right": { + "id": "87b1d8f1ca08108d", + "type": "split", + "children": [ + { + "id": "69cbc257ba71f388", + "type": "tabs", + "children": [ + { + "id": "739632e6a61f8d8e", + "type": "leaf", + "state": { + "type": "backlink", + "state": { + "collapseAll": false, + "extraContext": false, + "sortOrder": "alphabetical", + "showSearch": false, + "searchQuery": "", + "backlinkCollapsed": false, + "unlinkedCollapsed": true + }, + "icon": "links-coming-in", + "title": "Backlinks" + } + }, + { + "id": "e20c6e67aeb6eacb", + "type": "leaf", + "state": { + "type": "outgoing-link", + "state": { + "linksCollapsed": false, + "unlinkedCollapsed": true + }, + "icon": "links-going-out", + "title": "Outgoing links" + } + }, + { + "id": "858ad7c8f3ac4d90", + "type": "leaf", + "state": { + "type": "tag", + "state": { + "sortOrder": "frequency", + "useHierarchy": true, + "showSearch": false, + "searchQuery": "" + }, + "icon": "lucide-tags", + "title": "Tags" + } + }, + { + "id": 
"661ea018f1aa1171", + "type": "leaf", + "state": { + "type": "outline", + "state": { + "followCursor": false, + "showSearch": false, + "searchQuery": "" + }, + "icon": "lucide-list", + "title": "Outline" + } + } + ] + } + ], + "direction": "horizontal", + "width": 300, + "collapsed": true + }, + "left-ribbon": { + "hiddenItems": { + "switcher:Open quick switcher": false, + "graph:Open graph view": false, + "canvas:Create new canvas": false, + "daily-notes:Open today's daily note": false, + "templates:Insert template": false, + "command-palette:Open command palette": false, + "bases:Create new base": false + } + }, + "active": "dd2aafdfa4873c3e", + "lastOpenFiles": [] +} \ No newline at end of file diff --git a/docs/ReBbarb.md b/docs/ReBbarb.md deleted file mode 100644 index c0b6f9f..0000000 --- a/docs/ReBbarb.md +++ /dev/null @@ -1,24 +0,0 @@ -This project attempts to emulate the Gamecube BroadBand Adapter in an FPGA. The following things need to happen. - - - [x] [[Amaranth-Hdl project setup]] - - [x] Setup venv - - [x] Install packages - - [x] Flash Blinky on icebreaker - - [ ] Figuring out how to deal with [[external clocks]]. - - [x] How to get a clock greater than 12Mhz needed to interface with 32Mhz EXI - - [x] PLL configured to 48Mhz - - [ ] ~~48Mhz oscillator onboard? ~~ - - [ ] Check if Clock Domain Crossing is possible. - - [ ] Oversampeling approach was tedious but worked - - [ ] Interfacing with [[GameCube]] - - [ ] Figuring pinout of SP1. - - [ ] Unofficial gamecube docs? - - [ ] Make sure connecting [[SP1]] to IceBreaker is safe. - - [ ] Can we power the FPGA with the SP1? - - [ ] How much voltage do we get from SP1. - - [ ] How much current can we source? - - [ ] Figuring out basic [[EXI protocol]] - - [ ] What is the structure of the messages? - - [ ] How to know how long the message is - - [ ] Integrity checks? - - [ ] How fast do we need to respond to a message. 
\ No newline at end of file diff --git a/docs/gc_bba_fpga_design.md b/docs/gc_bba_fpga_design.md new file mode 100644 index 0000000..8661345 --- /dev/null +++ b/docs/gc_bba_fpga_design.md @@ -0,0 +1,1443 @@ +# GameCube BBA FPGA Replacement — Design Document + +**Target hardware:** iCEbreaker (Lattice iCE40UP5K) +**Target language:** Amaranth HDL (Python) +**Toolchain:** Yosys + nextpnr-ice40 + IceStorm +**Purpose:** Replace the Nintendo GameCube Broadband Adapter (DOL-015) with an +FPGA-based implementation, exposing a W5500 100BASE-TX ethernet chip to the GC +over the EXI (Expansion Interface) serial bus, enabling game ISO streaming via +Swiss homebrew. + +--- + +## Table of Contents + +1. [System Overview](#1-system-overview) +2. [Protocol References](#2-protocol-references) +3. [Physical Interface — SP1 Edge Connector](#3-physical-interface--sp1-edge-connector) +4. [Clock Domains](#4-clock-domains) +5. [Clock Domain Crossing Strategy](#5-clock-domain-crossing-strategy) +6. [Module Hierarchy](#6-module-hierarchy) +7. [Module Specifications](#7-module-specifications) + - 7.1 [SPIMode3Slave](#71-spimode3slave) + - 7.2 [BBARegisterFile](#72-bbaregisterfile) + - 7.3 [SPRAMArbiter](#73-spramarbiter) + - 7.4 [RXFrameAssembler](#74-rxframeassembler) + - 7.5 [TXFrameDrain](#75-txframedrain) + - 7.6 [W5500SPIMaster](#76-w5500spimaster) + - 7.7 [EEPROMModel](#77-eeprommodel) + - 7.8 [BBATop](#78-bbatop) +8. [Memory Map](#8-memory-map) +9. [EXI Transaction Protocol](#9-exi-transaction-protocol) +10. [BBA Register Reference](#10-bba-register-reference) +11. [Initialisation Sequence](#11-initialisation-sequence) +12. [RX Data Path — Detailed Flow](#12-rx-data-path--detailed-flow) +13. [TX Data Path — Detailed Flow](#13-tx-data-path--detailed-flow) +14. [SPRAM Layout](#14-spram-layout) +15. [Critical Timing Constraints](#15-critical-timing-constraints) +16. [SPRAM Read Prefetch Pipeline](#16-spram-read-prefetch-pipeline) +17. [Interrupt Handling](#17-interrupt-handling) +18. 
[EEPROM / MAC Address](#18-eeprom--mac-address) +19. [iCE40UP5K Resource Budget](#19-ice40up5k-resource-budget) +20. [PCB / Connector Notes](#20-pcb--connector-notes) +21. [Known Hardware Quirks](#21-known-hardware-quirks) +22. [File Structure](#22-file-structure) +23. [Simulation Strategy](#23-simulation-strategy) +24. [Open Issues and Extension Points](#24-open-issues-and-extension-points) + +--- + +## 1. System Overview + +The GameCube Broadband Adapter (BBA) is a hardware peripheral that plugs into +Serial Port 1 (SP1) on the underside of the GameCube. It presents a network +interface to the GC CPU using a Macronix MX98730EC custom IC. GC software +(primarily Swiss homebrew) communicates with the BBA through a memory-mapped +register interface accessed over the EXI serial bus. + +This project replaces the MX98730EC with an iCEbreaker FPGA that emulates the +register interface, and connects to a W5500 ethernet chip (on a Pmod-compatible +module) for actual network communication. + +### High-level data flow + +``` +GameCube CPU + │ EXI (SPI Mode 3, 32 MHz, Serial Port 1) + ▼ +iCEbreaker FPGA + ├── exi domain (64 MHz): SPI slave, register file, prefetch pipeline + └── sync domain (48 MHz): SPRAM arbiter, RX assembler, TX drain, W5500 driver + │ SPI (up to 40 MHz) + ▼ + W5500 Pmod module (100BASE-TX ethernet) + │ RJ-45 + ▼ + Network +``` + +### What this design does NOT implement + +- A network stack. The GC CPU runs TCP/IP. The FPGA is a dumb MAC bridge. +- IP address awareness. The FPGA never parses ethernet frame payloads. +- The GC's DMA engine quirk (only relevant to GC-side software). +- Video/audio streaming logic (handled by Swiss on the GC CPU side). + +--- + +## 2. 
Protocol References + +| Source | Content | +|---|---| +| YAGCD §2.4.1.4 | SP1 (P6) connector pinout | +| YAGCD §5.9 | EXI bus register descriptions | +| YAGCD §10.8 | MX98730EC (BBA chip) register map | +| Dolphin source `EXI_DeviceEthernet.h` | Register offsets, init sequence, RX/TX flow | +| Dolphin source `EXI_DeviceEthernet.cpp` | Transaction encoding, interrupt logic | +| Swiss source `bba.c` | GC-side driver, exact register access patterns | +| MX98730EC datasheet | Unavailable publicly; YAGCD is the primary reference | +| W5500 datasheet | SPI interface, register map, socket model | +| iCE40UP5K datasheet | SPRAM timing, PLL parameters, I/O standards | + +**Critical implementation note:** The MX98730EC uses **SPI Mode 3** (CPOL=1, +CPHA=1). CLK idles HIGH. Data is sampled on the FALLING edge of CLK and set up +on the RISING edge. This is the opposite of memory cards and the RTC chip, which +use SPI Mode 0. Getting this wrong means the GC will never enumerate the device. + +--- + +## 3. Physical Interface — SP1 Edge Connector + +### Slot characteristics + +- Dual-sided PCB edge connector +- Contacts on both top and bottom faces of the PCB edge +- Top and bottom contact rows are **staggered** (offset by half a pitch), not + mirrored — similar to ISA/PCI card edge geometry +- PCB must be ordered at **1.2 mm thickness** with **ENIG (gold) finish** +- Keying notch at top-right corner of housing (when looking into console socket + with front of console facing right) + +### Connector footprint + +Exact pad positions and pitch must be taken from the SP1ETH KiCad project +(github.com/silverstee1/SP1ETH). Do not attempt to derive dimensions from YAGCD +alone — the document lists signals but not physical geometry. Cross-reference +against the ETH2SP1 (LaserBear) open model files as a second source. 
+ +Key parameters to verify from those files before PCB layout: +- Contact pitch (expected: 2.0 mm or 2.54 mm — measure from KiCad file) +- Stagger offset between top and bottom rows +- Total contact count per side (expected: 6 per side = 12 total, or 12 per side + = 24 total with duplicated power/ground) +- Insertion depth from board edge to first contact +- Board width at connector edge + +### Signal pinout (YAGCD §2.4.1.4) + +Pin numbering: looking into the console socket, front of console to the right, +pin 1 is on the left. On the adapter PCB (component side up, inserting down), +pin 1 is also on the left — numbering does not mirror. + +| Pin | Signal | Direction | Notes | +|---|---|---|---| +| 1 | EXTIN | Adapter → GC | Device detect/sense. Tie to 3.3V via 10 kΩ resistor. Without this the GC does not enumerate the device. | +| 2 | GND | — | Shield ground | +| 3 | INT | Adapter → GC | Active-low interrupt to GC CPU. Assert when IR & IMR != 0. | +| 4 | CLK | GC → Adapter | SPI clock, up to 32 MHz, idles HIGH (Mode 3) | +| 5 | 12V | — | 12 V supply from GC. **Do not connect to FPGA I/O.** Leave unconnected or route to a test point only. | +| 6 | DO (MISO) | Adapter → GC | Serial data out: adapter drives, GC samples | +| 7 | 3.3V | — | 3.3 V supply (~200 mA available combined with pin 8) | +| 8 | 3.3V | — | 3.3 V supply (parallel with pin 7) | +| 9 | DI (MOSI) | GC → Adapter | Serial data in: GC drives, adapter samples | +| 10 | CS | GC → Adapter | Chip select, active low. Delineates each transaction. | +| 11 | GND | — | Signal ground | +| 12 | GND | — | Signal ground | + +**Power budget:** Pins 7+8 together supply 3.3 V. The iCEbreaker draws ~80 mA +active, the W5500 ~150 mA peak. Total ~230 mA. The GC's 3.3 V rail on SP1 is +rated for the original BBA which also drew ~200 mA, so headroom is tight. Add a +100 µF bulk capacitor on the interposer PCB close to the FPGA power pins. + +**Voltage levels:** All EXI signals are 3.3 V logic. The iCEbreaker I/O is 3.3 V. 
+The W5500 is 3.3 V. No level shifting required anywhere in this design. + +--- + +## 4. Clock Domains + +The design uses two clock domains. The iCE40UP5K has one PLL and one internal +48 MHz oscillator (SB_HFOSC). + +### Domain table + +| Domain | Frequency | Source | Purpose | +|---|---|---|---| +| `exi` | 64 MHz | PLL (12 MHz × 16 / 3) | SPI Mode 3 slave, BBA register file, prefetch pipeline | +| `sync` | 48 MHz | SB_HFOSC internal oscillator | SPRAM arbiter, RX/TX ethernet engines, W5500 SPI master | + +### Rationale + +**Why 64 MHz for `exi`?** +The EXI bus runs at 32 MHz. The SPI Mode 3 slave needs to detect CLK edges and +respond on the correct edge. Running the `exi` domain at 2× the bus rate (64 MHz) +gives two FPGA ticks per EXI CLK period (one per half-period): one tick for the setup phase +(MOSI→shift register, prepare MISO), one tick for the sample/drive phase. This +is the minimum oversampling ratio that cleanly implements Mode 3 without +combinatorial timing risk on the MISO output path. + +**Why 48 MHz for `sync`?** +The iCE40UP5K's internal 48 MHz oscillator (SB_HFOSC) is available without +consuming the PLL. This leaves the one PLL free for the 64 MHz `exi` domain. The +W5500 SPI can run up to 80 MHz but we drive it at 24 MHz (48 MHz ÷ 2 via clock +enable), which is well within spec and requires no additional PLL output. + +### PLL configuration (iCE40UP5K) + +``` +Input: 12 MHz crystal (iCEbreaker on-board) +DIVR: 0 (input divider: 12 MHz / (0+1) = 12 MHz) +DIVF: 63 (feedback mult: 12 MHz × (63+1) = 768 MHz VCO) +DIVQ: 3 (output divider: 768 MHz / 2^3 = 96 MHz) +That yields 96 MHz, not 64 MHz. A direct 64 MHz attempt: +DIVR: 0 +DIVF: 15 (12 × 16 = 192 MHz VCO) -- invalid: VCO must be 533–1066 MHz on UP5K +``` + +The iCE40UP5K VCO range is 533–1066 MHz.
Candidate settings when targeting 64 MHz: + +``` +DIVR = 0 → F_pfd = 12 MHz +DIVF = 63 → F_vco = 12 × (63+1) = 768 MHz (within range) +DIVQ = 3 → F_out = 768 / 8 = 96 MHz (too fast) + +Better: target 64 MHz +DIVF = 53 → F_vco = 12 × 54 = 648 MHz +DIVQ = 3 → F_out = 648 / 8 = 81 MHz (still off) + +Closest to 64 MHz with DIVQ = 3: +DIVR = 0, DIVF = 42, DIVQ = 3 +F_vco = 12 × 43 = 516 MHz (just below range minimum — not valid) + +Options: +DIVR = 0, DIVF = 63, DIVQ = 3 → 96 MHz, then use clock enable for /1.5 +-- or -- +Accept 96 MHz exi domain (3× bus rate instead of 2×): more margin, same logic +-- or -- +DIVR = 2, DIVF = 63, DIVQ = 2 → (12/3) × 64 / 4 = 64 MHz exactly + F_pfd = 4 MHz, F_vco = 4×64 = 256 MHz — below 533 MHz minimum, invalid + +Recommended: use 96 MHz (DIVR=0, DIVF=63, DIVQ=3) for exi domain. +At 96 MHz there are 3 ticks per 32 MHz EXI clock period (1.5 per half-period). +Adjust SPIMode3Slave edge detection accordingly (3 ticks per bit instead of 2). +``` + +**Implementation note:** Verify exact PLL parameters with `icepll` tool: +```bash +icepll -i 12 -o 64 # finds closest achievable output +icepll -i 12 -o 96 # alternative +``` +The agent implementing this should run `icepll` and use whatever output it +recommends, then adjust the `SPIMode3Slave` tick counts accordingly. + +### Reset strategy + +Each domain has its own reset, deasserted synchronously using +`ResetSynchronizer` from `amaranth.lib.cdc`: + +```python +# In platform create_missing_domain("exi"): +m.submodules.exi_rst = ResetSynchronizer( + arst = ResetSignal("sync"), + domain = "exi", +) +``` + +The `sync` domain reset comes from the iCEbreaker's on-chip power-on reset +(SB_GB driven by SB_HFOSC, which has built-in POR). + +--- + +## 5. Clock Domain Crossing Strategy + +All signals crossing between `exi` and `sync` domains must use one of the +following CDC primitives from `amaranth.lib.cdc`. Never pass a raw multi-bit +signal directly between domains — only one bit may change per clock crossing.
+ +### CDC primitive selection guide + +| Signal type | Primitive | Latency | +|---|---|---| +| Single bit, slow-changing (flags, status) | `FFSynchronizer` | 2 dest clocks | +| Single-cycle pulse / event | `PulseSynchronizer` | ~3–4 dest clocks | +| Multi-bit data stream (packet bytes) | `AsyncFIFO` | ~3–4 dest clocks | +| Reset deassertion | `ResetSynchronizer` | 2 dest clocks | +| Async external pin (CLK, MOSI, CS) | `FFSynchronizer` | 2 dest clocks | + +### CDC inventory for this design + +| Signal | From | To | Primitive | Notes | +|---|---|---|---|---| +| EXI CLK pin | async | exi | FFSynchronizer | stages=2, reset=1 (CLK idles high) | +| EXI MOSI pin | async | exi | FFSynchronizer | stages=2 | +| EXI CS pin | async | exi | FFSynchronizer | stages=2, reset=1 (CS idles high) | +| SPRAM read request (addr) | exi | sync | AsyncFIFO 16-bit wide, depth=4 | Prefetch pipeline | +| SPRAM read result (data) | sync | exi | AsyncFIFO 8-bit wide, depth=4 | Prefetch pipeline | +| TX packet bytes | exi | sync | AsyncFIFO 8-bit wide, depth=64 | GC→ethernet | +| TX packet start/len | exi | sync | AsyncFIFO 16-bit wide, depth=4 | Frame delimiter | +| RX packet bytes | sync | exi | AsyncFIFO 8-bit wide, depth=64 | ethernet→GC | +| RWP update (new value) | sync | exi | AsyncFIFO 8-bit wide, depth=4 | After frame committed | +| RRP update (new value) | exi | sync | AsyncFIFO 8-bit wide, depth=4 | After GC advances pointer | +| IR[RI] set (RX ready) | sync | exi | PulseSynchronizer | Triggers RI interrupt | +| IR[TI] set (TX done) | sync | exi | PulseSynchronizer | Triggers TI interrupt | +| NCRA reset pulse | exi | sync | PulseSynchronizer | Resets ethernet engine | +| exi_int_n output | exi | physical pin | Direct (output register) | Active-low to GC | + +**Critical rule:** The register file lives entirely in the `exi` domain. The +`sync` domain never directly reads or writes EXI registers. 
All interaction +between the two domains goes through the AsyncFIFOs and PulseSynchronizers +listed above. This ensures the GC's register reads always respond within the +`exi` domain without waiting on CDC latency. + +--- + +## 6. Module Hierarchy + +``` +BBATop (top-level, sets up clock domains) +├── SPIMode3Slave (exi domain — bit engine) +├── BBARegisterFile (exi domain — register decode + response) +│ ├── [AsyncFIFO: spram_req] (exi→sync: read address requests) +│ ├── [AsyncFIFO: spram_rsp] (sync→exi: read data responses) +│ ├── [AsyncFIFO: tx_bytes] (exi→sync: TX packet data) +│ ├── [AsyncFIFO: tx_ctrl] (exi→sync: TX frame length) +│ ├── [AsyncFIFO: rx_wptr] (sync→exi: RWP updates) +│ ├── [AsyncFIFO: rx_rptr] (exi→sync: RRP updates from GC) +│ ├── [PulseSynchronizer: rx_irq] (sync→exi) +│ ├── [PulseSynchronizer: tx_irq] (sync→exi) +│ └── [PulseSynchronizer: ncra_rst] (exi→sync) +├── SPRAMArbiter (sync domain — owns all SPRAM) +├── RXFrameAssembler (sync domain — ethernet→SPRAM) +├── TXFrameDrain (sync domain — SPRAM→ethernet) +├── W5500SPIMaster (sync domain — SPI master to W5500) +└── EEPROMModel (exi domain — 93C46 bit-bang model) +``` + +--- + +## 7. Module Specifications + +### 7.1 SPIMode3Slave + +**Domain:** `exi` +**File:** `exi_bba/spi_mode3_slave.py` + +Implements a byte-oriented SPI Mode 3 slave. Handles CLK/MOSI/MISO/CS at the +bit level and presents a clean byte interface to `BBARegisterFile`. 
+ +**SPI Mode 3 timing recap:** +- CLK idles HIGH +- MOSI is set up by master before the FALLING edge +- Slave samples MOSI on the FALLING edge of CLK +- Slave drives MISO on the RISING edge of CLK (ready for master to sample on + next falling edge) + +**Port list:** + +| Port | Width | Dir | Domain | Description | +|---|---|---|---|---| +| `spi_clk` | 1 | in | async→exi | Raw SPI clock from GC, synchronized internally | +| `spi_mosi` | 1 | in | async→exi | Raw MOSI from GC, synchronized internally | +| `spi_miso` | 1 | out | exi | MISO output to GC | +| `spi_cs_n` | 1 | in | async→exi | Raw CS from GC (active low), synchronized internally | +| `rx_byte` | 8 | out | exi | Last complete received byte | +| `rx_valid` | 1 | out | exi | Pulses 1 cycle when `rx_byte` contains a new byte | +| `tx_byte` | 8 | in | exi | Byte to transmit; sampled when `tx_load` pulses | +| `tx_load` | 1 | out | exi | Requests next TX byte from upstream | + +**Internal behaviour:** + +1. Instantiate FFSynchronizer stages=2 on each of `spi_clk`, `spi_mosi`, + `spi_cs_n`. Reset values: `spi_clk`=1, `spi_cs_n`=1. +2. Register the synchronized signals one further cycle to form edge detectors: + `rising_clk = clk_s & ~clk_prev`, `falling_clk = ~clk_s & clk_prev`. +3. On CS falling edge: load `tx_byte` into internal shift register, pulse + `tx_load`, reset `bit_ctr` to 0. +4. On FALLING CLK edge (sample): shift `mosi_s` into `rx_shift` MSB-first, + increment `bit_ctr`. When `bit_ctr == 8`: register `rx_shift` into `rx_byte`, + pulse `rx_valid`, reset `bit_ctr` to 0, pulse `tx_load` to request next byte. +5. On RISING CLK edge (drive): shift `tx_shift` left by 1, drive MSB onto + `spi_miso`. +6. On CS rising edge: drive `spi_miso` high (idle), reset state. + +**Note on `tx_load` timing:** `tx_load` pulses at two points — CS assertion +(loads first byte before any bits are clocked) and after each complete received +byte (loads the next byte). 
The upstream (`BBARegisterFile`) must register the +next TX byte within one `exi` clock of `tx_load` pulsing. + +--- + +### 7.2 BBARegisterFile + +**Domain:** `exi` (with AsyncFIFO interfaces to `sync`) +**File:** `exi_bba/bba_register_file.py` + +Decodes EXI transactions (2-byte header + N data bytes), reads/writes the BBA +register space, and manages all CDC crossings to the `sync` domain. + +#### EXI transaction decoder FSM + +States: `HEADER0` → `HEADER1` → `DATA` → (back to `HEADER0`) + +**Header format:** + +``` +Byte 0: [7] = write flag (1 = write, 0 = read) + [6:0] = addr[12:6] (upper 7 bits of 13-bit address) + +Byte 1: [7:2] = addr[5:0] (lower 6 bits of 13-bit address) + [1:0] = xfer_len (0=1 byte, 1=2 bytes, 2=3 bytes, 3=4 bytes) +``` + +Full address = `{ byte0[6:0], byte1[7:2] }` = 13 bits → range 0x0000–0x1FFF. + +**`HEADER0` state:** Wait for `rx_valid`. Latch `rx_byte` as `hdr0`. + +**`HEADER1` state:** Wait for `rx_valid`. Decode address and flags. For read +transactions, immediately issue SPRAM prefetch request if address ≥ 0x100 +(ring buffer region). Load `tx_byte` with the register value for addresses +< 0x100 (register file region). Transition to `DATA`. + +**`DATA` state (write path):** For each `rx_valid`, write `rx_byte` to +`regs[addr + byte_ctr]` and handle side effects (see register side effects +table). Increment `byte_ctr`. When `byte_ctr == xfer_len`, go to `HEADER0`. + +**`DATA` state (read path):** Drive `tx_byte` from prefetch result (addresses +≥ 0x100) or directly from `regs[]` (addresses < 0x100). On each `tx_load`, +advance the read pointer and issue next prefetch. When `byte_ctr == xfer_len`, +go to `HEADER0`. + +**CS deassertion abort:** In any state, if `cs_n` rises, return to `HEADER0`. + +#### Register file storage + +Registers 0x00–0xFF are implemented as an `Array` of 8-bit `Signal`s (256 +registers). In synthesis this maps to flip-flops and LUT multiplexers on iCE40 (which has no distributed RAM). Not SPRAM — +SPRAM is reserved for the packet ring buffer.
+ +The register file is entirely in the `exi` domain. No CDC is needed to read +or write registers 0x00–0xFF. + +#### Register side effects + +| Register | Write side effect | +|---|---| +| NCRA (0x00) | If bit 0 (RESET) written: pulse `ncra_rst` PulseSynchronizer to `sync` domain. Self-clear bit 0 on next cycle. Reset TX/RX pointers in register file. | +| IR (0x09) | Write-1-to-clear: `IR <= IR & ~written_value` | +| RRP (0x18–0x19) | After GC writes new RRP value, push value into `rx_rptr` AsyncFIFO (exi→sync) so RX engine knows GC has consumed those pages | +| TWD (0x34–0x37) | Bytes written here are the TX frame length field (2 bytes little-endian). Latch for TX engine. | +| TXDATA (0x48) | Each byte written goes into `tx_bytes` AsyncFIFO (exi→sync). When `byte_ctr == xfer_len` on last write chunk, push frame length into `tx_ctrl` AsyncFIFO. | + +#### Interrupt register update (from sync domain) + +- `rx_irq` PulseSynchronizer arriving from sync: set `IR[1]` (RI bit) +- `tx_irq` PulseSynchronizer arriving from sync: set `IR[2]` (TI bit), clear + `NCRA[3:2]` (ST1:ST0 — transmit start bits) + +#### Interrupt output + +``` +exi_int_n <= ~|(IR & IMR) # active-low: assert when any unmasked bit set +``` + +Register this one flip-flop in the `exi` domain. The physical pin is a direct +output — no CDC needed because the GC only reads the interrupt state via polling +IR over EXI (which is already in the `exi` domain) or via the interrupt line +which the GC CPU samples asynchronously. + +#### NWAYS register + +Always return `0x17` (link up, 100 Mbps, full duplex, autoneg complete). +The GC's BBA driver polls NWAYS after reset to confirm link status before +enabling RX. Hardcode this value — do not attempt to forward real link status +from the W5500. 
+ +```python +# NWAYS = 0x17: +# bit 4 (LS100) = 1: 100BASE-TX link up +# bit 2 (ANCLPT) = 1: autoneg complete +# bit 1 (100TXH) = 1: 100BASE-TX half (also set in practice) +# bit 0 (LS10) = 1: 10BASE-T (also reported) +``` + +--- + +### 7.3 SPRAMArbiter + +**Domain:** `sync` +**File:** `exi_bba/spram_arbiter.py` + +Arbitrates access to the iCE40UP5K's 128 KB SPRAM between two clients: + +- **Client A (EXI read):** Issues read requests from the prefetch pipeline + (`spram_req` AsyncFIFO). Must service requests fast enough to keep the + prefetch pipeline full. +- **Client B (ETH write):** The `RXFrameAssembler` writes incoming ethernet + frames into the ring buffer area. + +**Priority:** ETH write wins over EXI read when both request simultaneously. +This is safe because: +1. The GC only reads a ring buffer page after RWP has advanced past it (i.e., + the ETH engine has finished writing that page). +2. Even if an EXI read is delayed by one SPRAM cycle, the prefetch pipeline + has enough depth (4 entries) to absorb the stall without the SPI slave + running out of data. + +**SPRAM interface (iCE40UP5K SB_SPRAM256KA):** + +``` +WREN : write enable +CHIPSELECT : always 1 +CLOCK : sync domain clock (48 MHz) +STANDBY : 0 +SLEEP : 0 +POWEROFF_N : 1 +ADDRESS[13:0] : byte address divided by 2 (SPRAM is 16-bit wide) +DATAIN[15:0] : write data (use only [7:0] for byte writes, mask upper byte) +MASKWREN[3:0] : byte enable (0b0011 for lower byte, 0b1100 for upper byte) +DATAOUT[15:0] : read data +``` + +The SPRAM is 16-bit wide. Byte addressing is done via `MASKWREN`. For an 8-bit +write to address `A`: set `ADDRESS = A >> 1`, `MASKWREN = (A & 1) ? 0b1100 : +0b0011`, write data in the appropriate byte of `DATAIN`. + +**Read latency:** SPRAM has 1-cycle synchronous read latency. The result of a +read issued at cycle N is valid at cycle N+1. The arbiter must account for this +when responding to the prefetch pipeline. 
+ +**Port list:** + +| Port | Width | Dir | Notes | +|---|---|---|---| +| `exi_req_addr` | 16 | in | From spram_req AsyncFIFO (exi→sync) | +| `exi_req_valid` | 1 | in | FIFO r_rdy | +| `exi_req_ready` | 1 | out | FIFO r_en (pop when serviced) | +| `exi_rsp_data` | 8 | out | To spram_rsp AsyncFIFO (sync→exi) | +| `exi_rsp_valid` | 1 | out | FIFO w_en | +| `eth_wr_addr` | 16 | in | From RXFrameAssembler | +| `eth_wr_data` | 8 | in | Byte to write | +| `eth_wr_valid` | 1 | in | Write request | +| `eth_wr_ready` | 1 | out | Write accepted this cycle | + +--- + +### 7.4 RXFrameAssembler + +**Domain:** `sync` +**File:** `exi_bba/rx_frame_assembler.py` + +Receives complete ethernet frames from `W5500SPIMaster` and writes them into +the SPRAM ring buffer in the correct MX98730EC format. + +**Ring buffer layout (in SPRAM):** + +``` +SPRAM address 0x0100–0x0FFF (3840 bytes = 15 × 256-byte pages) + Page 0x01: first usable RX page + Page 0x0F: last usable RX page (RHBP default) + Pages wrap: after 0x0F, next is 0x01 (not 0x00, which is reserved) +``` + +Each page is 256 bytes. A received frame may span multiple pages. + +**Frame descriptor (first 4 bytes of first page):** + +``` +Byte 0: LRPS value (Last Received Packet Status — set to 0x00 or actual status) +Byte 1: 0x00 +Byte 2: frame_length[15:8] (big-endian, includes descriptor bytes) +Byte 3: frame_length[7:0] +Bytes 4+: raw ethernet frame data (DA, SA, EtherType, payload, FCS) +``` + +**Flow:** + +1. Wait for `W5500SPIMaster` to signal frame available (`rx_sof` pulse). +2. Read frame bytes from W5500 frame FIFO. +3. Compute how many 256-byte pages are needed: + `pages_needed = ceil((frame_length + 4) / 256)` +4. Check that advancing RWP by `pages_needed` pages (using the same 0x0F → 0x01 wrap as step 6) does not reach or pass RRP (ring not full). If full, + drop the frame and increment a drop counter. +5. Write 4-byte descriptor at SPRAM address `RWP * 0x100` (page 0x01 starts at 0x0100). +6. Write frame bytes sequentially, wrapping pages at 256-byte boundaries.
+ Page wrap: `next_page = (current_page % 15) + 1` (pages 1–15, skip 0). +7. After last byte written, update `RWP` in the `rx_wptr` AsyncFIFO (sync→exi). + The `exi` domain will update the RWP register from this FIFO. +8. Pulse `rx_irq` PulseSynchronizer to `exi` domain. + +**MAC address filter:** + +Before writing a frame, check destination MAC against PAR0–PAR5 (broadcast +FF:FF:FF:FF:FF:FF always accepted). The GC will typically configure PAR0–PAR5 +via EXI after boot, so the `BBARegisterFile` must expose these to the +`RXFrameAssembler`. Pass them via a dedicated small AsyncFIFO or by reading +them from a shared register shadow (6 bytes, sync domain copy updated when +GC writes PAR0–PAR5). Multicast hash table (MAR0–MAR7) filtering is optional +for initial implementation — accept all frames (promiscuous mode) until the GC +configures the filter. + +--- + +### 7.5 TXFrameDrain + +**Domain:** `sync` +**File:** `exi_bba/tx_frame_drain.py` + +Drains the TX byte FIFO (fed from the `exi` domain as the GC writes to TXDATA +register 0x48) and forwards complete frames to `W5500SPIMaster`. + +**Flow:** + +1. Wait for `tx_ctrl` AsyncFIFO to contain a frame length value. This is pushed + by `BBARegisterFile` when the GC has written the complete TX frame (i.e., + NCRA ST1:ST0 transitions to 01 or 10). +2. Pop `frame_length` from `tx_ctrl`. +3. Pop exactly `frame_length` bytes from `tx_bytes` AsyncFIFO. +4. Forward bytes to `W5500SPIMaster` TX interface with SOF/EOF framing. +5. Wait for `W5500SPIMaster` to signal TX complete. +6. Pulse `tx_irq` PulseSynchronizer to `exi` domain. + +**NCRA ST bits:** The GC writes NCRA with ST1:ST0 = 01 (start transmit from +buffer 1) or 10 (start transmit from buffer 2). The BBA hardware has two TX +buffers; this implementation uses a single TX FIFO and ignores the buffer +selection. When ST1:ST0 goes non-zero, treat it as a TX trigger regardless of +which bits are set. 
The `BBARegisterFile` should push the frame length into +`tx_ctrl` on this transition. + +--- + +### 7.6 W5500SPIMaster + +**Domain:** `sync` +**File:** `exi_bba/w5500_spi_master.py` + +Implements the W5500 SPI master interface. The W5500 uses SPI Mode 0 (CPOL=0, +CPHA=0), opposite to the BBA EXI interface. + +**W5500 SPI frame format:** + +``` +Byte 0–1: Address (16-bit, big-endian) +Byte 2: Control byte: + [7:3] = Block Select (BSB): + 00000 = Common Register + 00001 = Socket 0 Register + 00010 = Socket 0 TX buffer + 00011 = Socket 0 RX buffer + [2] = Read/Write (0=read, 1=write) + [1:0] = Operation Mode (00=variable, 01=fixed 1B, 10=fixed 2B, 11=fixed 4B) +Byte 3+: Data bytes +``` + +**W5500 configuration (to be performed once on NCRA reset):** + +``` +1. Write MR (Mode Register, 0x0000): 0x80 — software reset +2. Wait ~1 ms +3. Write SHAR (Source MAC, 0x0009–0x000E): copy from PAR0–PAR5 register shadow +4. Write S0_MR (Socket 0 Mode, 0x4000): 0x04 — MACRAW mode (raw ethernet) +5. Write S0_CR (Socket 0 Command, 0x4001): 0x01 — OPEN +6. Write S0_IMR (Socket 0 Interrupt Mask, 0x402C): 0x04 | 0x10 — RECV | SEND_OK +``` + +**MACRAW mode:** In MACRAW mode the W5500 Socket 0 sends and receives raw +ethernet frames including the full MAC header and FCS. This is exactly what +the MX98730EC presents to the GC. No IP stack runs in the FPGA. + +**RX polling:** The W5500 asserts its INT_N pin (active low) when a frame +arrives. Connect W5500 INT_N to an FPGA input pin and use it to trigger the +`RXFrameAssembler`. Alternatively poll `S0_IR` (Socket 0 Interrupt Register, +0x4002) periodically. The INT_N approach has lower latency and is preferred. + +**SPI clock rate:** Drive W5500 SPI at 24 MHz (sync clock 48 MHz ÷ 2 using a +clock enable toggle). The W5500 supports up to 80 MHz so there is ample margin.
+ +**Port list:** + +| Port | Width | Dir | Notes | +|---|---|---|---| +| `spi_clk` | 1 | out | To W5500 CLK pin (SPI Mode 0, idles LOW) | +| `spi_mosi` | 1 | out | To W5500 MOSI | +| `spi_miso` | 1 | in | From W5500 MISO | +| `spi_cs_n` | 1 | out | To W5500 CS (active low) | +| `w5500_int_n` | 1 | in | W5500 interrupt (active low) | +| `tx_data` | 8 | in | Byte to transmit (from TXFrameDrain) | +| `tx_valid` | 1 | in | TX byte available | +| `tx_ready` | 1 | out | TX byte consumed | +| `tx_sof` | 1 | in | Start of frame marker | +| `tx_eof` | 1 | in | End of frame marker | +| `rx_data` | 8 | out | Received byte (to RXFrameAssembler) | +| `rx_valid` | 1 | out | RX byte available | +| `rx_ready` | 1 | in | RX byte consumed | +| `rx_sof` | 1 | out | Start of frame | +| `rx_eof` | 1 | out | End of frame | + +--- + +### 7.7 EEPROMModel + +**Domain:** `exi` +**File:** `exi_bba/eeprom_model.py` + +Models the 93C46-compatible serial EEPROM that stores the BBA's MAC address. +The GC software bit-bangs the EEPROM interface through register 0x1C +(EEPROM Interface Register) of the BBA chip. + +**Register 0x1C bit fields:** + +``` +[3] EECK — EEPROM clock +[2] EECS — EEPROM chip select +[1] EEDI — EEPROM data in (GC → EEPROM) +[0] EEDO — EEPROM data out (EEPROM → GC) [read-only] +``` + +The GC reads EEDO by reading register 0x1C bit 0. + +**93C46 protocol summary:** + +The 93C46 uses a 3-wire serial protocol (SK=clock, CS=select, DI=data in, +DO=data out). Commands: +- READ: start bit (1) + opcode (10) + 6-bit address → 16-bit data out +- WRITE: start bit (1) + opcode (01) + 6-bit address + 16-bit data +- EWEN (write enable): start bit (1) + opcode (00) + address (11xxxx) + +Each 93C46 word is 16 bits. The MAC address occupies words 0–2 (6 bytes). + +**Implementation approach:** + +Maintain a small ROM of 64 × 16-bit words in the `exi` domain (as a Const +array, synthesises to LUTs). Pre-populate words 0–2 with the chosen MAC +address. 
Implement a small FSM that watches writes to register 0x1C for the +93C46 protocol, drives EEDO accordingly. + +**Simpler alternative:** Many GC BBA drivers read the EEPROM once at boot and +then write the MAC to PAR0–PAR5 themselves. Pre-populate PAR0–PAR5 in the +register file reset state with a valid Nintendo OUI MAC (00:09:BF:xx:xx:xx). +Skip a full 93C46 implementation for the first version — if Swiss ignores the +EEPROM read result and uses a hardcoded or user-configurable MAC, this is +sufficient. + +--- + +### 7.8 BBATop + +**Domain:** both +**File:** `exi_bba/bba_top.py` + +Top-level module. Instantiates all submodules, creates clock domains, connects +physical pins. + +**Clock domain creation:** + +```python +def elaborate(self, platform): + m = Module() + + # exi domain: 96 MHz from PLL (3× 32 MHz EXI bus rate) + exi_domain = ClockDomain("exi") + m.domains += exi_domain + pll = platform.get_pll() # platform-specific PLL primitive + m.d.comb += exi_domain.clk.eq(pll.clkout) + m.submodules.exi_rst = ResetSynchronizer( + arst=ResetSignal("sync"), domain="exi" + ) + + # sync domain: 48 MHz from SB_HFOSC (platform default) + # Created automatically by iCEbreaker platform + + # Instantiate submodules... + m.submodules.spi = spi = SPIMode3Slave() + m.submodules.regfile = regfile = BBARegisterFile() + m.submodules.arbiter = arbiter = SPRAMArbiter() + m.submodules.rx_asm = rx_asm = RXFrameAssembler() + m.submodules.tx_drn = tx_drn = TXFrameDrain() + m.submodules.w5500 = w5500 = W5500SPIMaster() + m.submodules.eeprom = eeprom = EEPROMModel() + # ... wiring ... +``` + +**Physical pin connections (iCEbreaker):** + +The SP1 EXI signals connect via the interposer PCB to iCEbreaker PMOD pins. +The W5500 Pmod connects to the second PMOD connector. Exact pin mapping depends +on the interposer PCB layout — define these in a platform resource file. 
+ +```python +# Example resource definitions (add to iCEbreaker platform file): +Resource("exi", 0, + Subsignal("clk", Pins("1", conn=("pmod", 0), dir="i")), + Subsignal("mosi", Pins("2", conn=("pmod", 0), dir="i")), + Subsignal("miso", Pins("3", conn=("pmod", 0), dir="o")), + Subsignal("cs_n", Pins("4", conn=("pmod", 0), dir="i")), + Subsignal("int_n",Pins("7", conn=("pmod", 0), dir="o")), + Attrs(IO_STANDARD="SB_LVCMOS"), +), +Resource("w5500", 0, + Subsignal("clk", Pins("1", conn=("pmod", 1), dir="o")), + Subsignal("mosi", Pins("2", conn=("pmod", 1), dir="o")), + Subsignal("miso", Pins("3", conn=("pmod", 1), dir="i")), + Subsignal("cs_n", Pins("4", conn=("pmod", 1), dir="o")), + Subsignal("int_n",Pins("7", conn=("pmod", 1), dir="i")), + Subsignal("rst_n",Pins("8", conn=("pmod", 1), dir="o")), + Attrs(IO_STANDARD="SB_LVCMOS"), +), +``` + +--- + +## 8. Memory Map + +The BBA register address space is 13 bits wide (0x0000–0x1FFF). + +| Address range | Region | Implemented in | Notes | +|---|---|---|---| +| 0x0000–0x0033 | MAC control registers | Register file (exi) | NCRA, NCRB, IMR, IR, pointers | +| 0x0034–0x0037 | TWD — TX write data | Register file (exi) | TX frame length (2 bytes) | +| 0x0038–0x0039 | Reserved | — | Ignore | +| 0x003A | HIPR — Host Interface Protocol | Register file (exi) | Read: 0x01 (BBA present) | +| 0x003B | NAFR — Network Address Filter | Register file (exi) | | +| 0x003C | NWBA — Network Write Buffer Addr | Register file (exi) | | +| 0x003D–0x0047 | Reserved | — | Ignore | +| 0x0048 | TXDATA — Bulk TX data port | Register file → tx_bytes FIFO | Write path to ethernet | +| 0x0049–0x00FF | Reserved | — | Ignore | +| 0x0100–0x0FFF | RX ring buffer | SPRAM (sync) | Read path from ethernet | + +--- + +## 9. EXI Transaction Protocol + +All BBA register accesses follow a strict two-phase (header + data) format. 
+ +### Header encoding + +``` +Byte 0: [7] write flag 1=write, 0=read + [6:0] addr[12:6] upper 7 bits of address + +Byte 1: [7:2] addr[5:0] lower 6 bits of address + [1:0] xfer_len-1 0=1 byte, 1=2 bytes, 2=3 bytes, 3=4 bytes +``` + +CS is asserted (low) before byte 0 and remains low through the entire +transaction including all data bytes. CS deasserts (high) after the last +data byte. + +### Read transaction timing + +``` +CS ─┐ ┌─ + └────────────────────────────────────┘ +CLK ┌┐┌┐┌┐┌┐┌┐┌┐┌┐┌┐ ┌┐┌┐┌┐┌┐┌┐┌┐┌┐┌┐ ┌┐┌┐... + header byte 0 header byte 1 data byte 0... +MOSI [addr+flags] [addr+len] [don't care] +MISO [don't care] [don't care] [register data] +``` + +The register file must have data ready on MISO from the **very first clock +edge of the data phase**. For register-file-backed reads (address < 0x100), +the data is available immediately after header decode. For SPRAM-backed reads +(address ≥ 0x100), the prefetch pipeline issues the SPRAM read request during +the header phase so data is ready in time. + +### Write transaction timing + +Identical header, then MOSI carries the write data. The FPGA samples MOSI on +each falling CLK edge during the data phase and writes to the register. + +### ID query + +On power-on the GC queries the device ID. The query is two 0x00 bytes written, +then four bytes read. The BBA returns `0x04020200`. Implement this as a special +case: when address decodes to 0x0000 on a read with no prior NCRA reset, return +the hardcoded ID. + +Alternatively, read the Dolphin source for the exact byte sequence GC software +uses to detect the BBA and replicate it faithfully. + +--- + +## 10. BBA Register Reference + +Key registers the GC driver accesses. Full register map in YAGCD §10.8. + +| Addr | Name | R/W | Reset | Description | +|---|---|---|---|---| +| 0x00 | NCRA | R/W | 0x00 | Network Control A. 
[0]=RESET (self-clear), [2:1]=ST (TX start), [3]=SR (start receive), [6]=INTMODE (0=int active low) | +| 0x01 | NCRB | R/W | 0x00 | Network Control B | +| 0x04 | LTPS | R | 0x00 | Last TX packet status | +| 0x05 | LRPS | R | 0x00 | Last RX packet status | +| 0x08 | IMR | R/W | 0x00 | Interrupt mask. Bits match IR. Interrupt fires when IR & IMR != 0 | +| 0x09 | IR | R/W | 0x00 | Interrupt register. Write 1 to clear. [7]=RBFI, [4]=TEI, [2]=TI, [1]=RI | +| 0x0A–0x0B | BP | R/W | — | Boundary page pointer | +| 0x0C–0x0D | TLBP | R/W | — | TX low boundary page | +| 0x0E–0x0F | TWP | R/W | 0x00 | TX write page pointer | +| 0x12–0x13 | TRP | R/W | 0x00 | TX read page pointer | +| 0x16–0x17 | RWP | R | updates | RX write page pointer. Advances after each frame written | +| 0x18–0x19 | RRP | R/W | 0x01 | RX read page pointer. GC writes to advance after consuming frames | +| 0x1A–0x1B | RHBP | R/W | 0x0F | RX high boundary page (last valid page). Default 0x0F | +| 0x1C | EEPROM | R/W | — | EEPROM bit-bang interface [3:0] = EECK, EECS, EEDI, EEDO | +| 0x20–0x25 | PAR0–5 | R/W | MAC | MAC address bytes 0–5. GC writes after reading EEPROM | +| 0x26–0x2D | MAR0–7 | R/W | 0xFF | Multicast hash table. 0xFF = accept all | +| 0x2E | ANALOG | R/W | — | PHY analog control. GC writes 0xD6 to enable PHY | +| 0x30 | NWAYC | R/W | — | Autoneg config. GC sets ANE + LTE bits | +| 0x31 | NWAYS | R | 0x17 | Autoneg status. Hardcode 0x17 = 100M full duplex link up | +| 0x32 | GCA | R/W | — | GMAC config A. GC sets AUTOPUB bit | +| 0x33 | GCB | R/W | — | GMAC config B | +| 0x34–0x37 | TWD | W | — | TX write data (frame length, 2 bytes LE, then ignored) | +| 0x3A | HIPR | R | 0x01 | Host interface protocol version. Return 0x01 | +| 0x3B | NAFR | R/W | — | Network address filter | +| 0x3C | NWBA | R/W | — | Network write buffer address | +| 0x48 | TXDATA | W | — | Bulk TX data port. GC streams frame bytes here | +| 0x100+ | RX buf | R | — | RX ring buffer. 
GC reads frames from here | + +--- + +## 11. Initialisation Sequence + +This is the exact sequence Swiss/GC software executes. The register file must +respond correctly to each step. + +``` +1. Assert CS, write 0x0000 (2 bytes), read 4 bytes + → Must return: 0x04 0x02 0x02 0x00 (device ID) + +2. Write 0x01 to NCRA (0x00) — software reset + → RESET bit self-clears next cycle + → Pulse ncra_rst to sync domain (resets W5500, clears SPRAM pointers) + +3. Poll NCRA bit 0 until clear — wait for reset complete + → Return 0x00 from NCRA reads after self-clear + +4. Write 6 bytes to PAR0–PAR5 (0x20–0x25) + → Latch MAC address; forward to sync domain MAC filter shadow + +5. Write 8 bytes to MAR0–MAR7 (0x26–0x2D) + → Typically all 0xFF (promiscuous mode) + +6. Write 0xD6 to ANALOG (0x2E) — enable PHY + → Store in register file; no hardware effect in FPGA + +7. Write NWAYC (0x30): set bits for ANE + LTE + → Store; no hardware effect + +8. Write IMR (0x08): typically 0x86 (RBFI | TI | RI) + → Enables interrupts; INT line will now assert when frames arrive + +9. Write GCA (0x32): set AUTOPUB bit + → Store; AUTOPUB means RWP auto-updates — we always do this anyway + +10. Write NCRA (0x00): set SR bit (0x08) — start receive + → Enable RX path; the RXFrameAssembler should begin accepting frames + +11. Poll NWAYS (0x31) until link up + → Return hardcoded 0x17 immediately +``` + +--- + +## 12. 
RX Data Path — Detailed Flow + +``` +W5500 receives frame on wire + │ + ▼ +W5500SPIMaster detects S0_IR[RECV] (via INT_N pin) +Reads frame length from S0_RX_RSR (Socket 0 RX Received Size, 0x4026) +Reads frame bytes from Socket 0 RX buffer (BSB=0b00011) +Pulses rx_sof, streams rx_data bytes, pulses rx_eof + │ + ▼ (sync domain) +RXFrameAssembler + - Checks destination MAC vs PAR shadow + - Checks NCRA SR bit is set (RX enabled) + - Computes pages_needed + - Checks ring buffer not full (RWP+pages != RRP) + - Writes descriptor + frame data into SPRAM via SPRAMArbiter + - Advances RWP (local register in sync domain) + - Pushes new RWP value into rx_wptr AsyncFIFO (sync→exi) + - Pulses rx_irq PulseSynchronizer (sync→exi) + │ + ▼ AsyncFIFO / PulseSynchronizer crossing + │ (exi domain) +BBARegisterFile + - Pops new RWP from rx_wptr FIFO, updates RWP register + - rx_irq pulse arrives: sets IR[1] (RI bit) + - IR & IMR now non-zero: asserts exi_int_n (INT low to GC) + │ + ▼ (GC CPU, driven by interrupt or polling) +GC reads IR register: sees RI=1 +GC reads RWP (0x16): gets updated pointer +GC reads frame from byte address RRP × 0x100 (bulk read, up to 1500+ bytes) + → BBARegisterFile issues SPRAM read requests via spram_req FIFO (exi→sync) + → SPRAMArbiter services reads from SPRAM + → Results flow back via spram_rsp FIFO (sync→exi) + → Prefetch pipeline keeps data ready for SPI bit engine +GC writes new RRP (0x18) to advance past consumed pages + → BBARegisterFile pushes RRP update into rx_rptr FIFO (exi→sync) + → RXFrameAssembler updates its local RRP shadow +GC writes IR register with RI=1 (write-1-to-clear) + → IR[1] clears, INT line deasserts +``` + +--- + +## 13.
TX Data Path — Detailed Flow + +``` +GC CPU constructs ethernet frame in GC RAM + │ + ▼ (GC CPU → EXI) +GC writes 2-byte length to TWD register (0x34) +GC writes frame bytes to TXDATA register (0x48) in chunks + → BBARegisterFile: each written byte goes into tx_bytes AsyncFIFO (exi→sync) +GC writes NCRA with ST1:ST0 = 01 (transmit trigger) + → BBARegisterFile pushes frame_length into tx_ctrl AsyncFIFO (exi→sync) + │ + ▼ AsyncFIFO crossing + │ (sync domain) +TXFrameDrain + - Pops frame_length from tx_ctrl + - Pops frame_length bytes from tx_bytes + - Forwards to W5500SPIMaster with SOF/EOF + │ + ▼ (sync domain) +W5500SPIMaster + - Writes frame length to S0_TX_FSR (TX Free Size Register, 0x4020) + - Writes frame bytes into Socket 0 TX buffer (BSB=0b00010) + - Writes SEND command to S0_CR (0x4001 = 0x20) + - Polls S0_IR until SEND_OK bit set + - Clears S0_IR[SEND_OK] + - Pulses tx_irq PulseSynchronizer (sync→exi) + │ + ▼ PulseSynchronizer crossing + │ (exi domain) +BBARegisterFile + - tx_irq arrives: sets IR[2] (TI bit), clears NCRA ST1:ST0 + - If IMR[2] set: INT asserts to GC + │ + ▼ (GC CPU) +GC reads IR, sees TI=1 +GC writes IR with TI=1 to clear +``` + +--- + +## 14. SPRAM Layout + +The iCE40UP5K has 4 × 32 KB SPRAM banks (128 KB total). Map them as: + +| SPRAM region | Size | Usage | +|---|---|---| +| 0x0000–0x00FF | 256 B | Reserved (address 0x00 page not used by ring buffer) | +| 0x0100–0x0FFF | 3840 B | RX ring buffer (15 × 256-byte pages, pages 0x01–0x0F) | +| 0x1000–0x17FF | 2048 B | TX frame staging buffer | +| 0x1800–0x1FFF | 2048 B | Reserved / future use | + +The ring buffer uses pages 0x01–0x0F (15 pages × 256 bytes = 3840 bytes). This +matches the MX98730EC default `RHBP` (RX High Boundary Page) value of 0x0F and +`RRP` reset value of 0x01. + +**SPRAM addressing:** iCE40UP5K SB_SPRAM256KA instances are 16K × 16-bit +(32 KB each, 128 KB total across 4 instances).
To address the ring buffer region as bytes: +- Byte address 0x0100 maps to SPRAM word address 0x0080 (byte 0x0100 >> 1) +- The arbiter converts byte addresses to word addresses and uses MASKWREN for + byte selection + +--- + +## 15. Critical Timing Constraints + +### Must-meet timing in `exi` domain (96 MHz → 10.4 ns period) + +| Path | Budget | Notes | +|---|---|---| +| FFSynchronizer output → edge detect flip-flop | 1 cycle = 10.4 ns | Trivially met — just a register | +| Edge detect → shift register update | 1 cycle | Register-to-register, no logic | +| `rx_valid` → header decode → `spram_req` FIFO write | 2 cycles | Address decode is combinatorial MUX; must close at 96 MHz | +| `tx_load` → `tx_byte` driven from register file | 1 cycle | `regs[addr]` array lookup — critical path; keep address decode combinatorial depth ≤ 4 LUTs | +| `tx_load` → `tx_byte` driven from prefetch buffer | 1 cycle | Just a register read — trivial | + +### Must-meet timing in `sync` domain (48 MHz → 20.8 ns period) + +| Path | Budget | Notes | +|---|---|---| +| SPRAM read request → SPRAM address valid | 1 cycle | AsyncFIFO read + mux — easy | +| SPRAM DATAOUT → result FIFO write | 1 cycle | Register-to-FIFO — easy | +| W5500 SPI bit engine | N/A | Clock-enable based at 24 MHz effective; no hard timing | + +### Cross-domain latency budget for SPRAM prefetch + +``` +EXI header phase duration: 16 exi clocks at 96 MHz = 167 ns + +SPRAM prefetch round trip: + exi → spram_req FIFO write: 1 exi tick = 10 ns + FIFO cross-domain: 2 sync ticks = 42 ns + SPRAM read (1 cycle latency): 1 sync tick = 21 ns + Result → spram_rsp FIFO write: 1 sync tick = 21 ns + FIFO cross-domain: 2 exi ticks = 21 ns + Result available in prefetch buffer: = 21 ns + Total: ~136 ns + +136 ns < 167 ns header window → prefetch completes before first data bit needed ✓ +``` + +This is the tightest timing consideration in the design. The prefetch must be +issued during HEADER1 (not after) to make the deadline. 
+ +--- + +## 16. SPRAM Read Prefetch Pipeline + +The prefetch pipeline ensures MISO data is always ready before the SPI slave +needs it for the data phase. + +### State machine (in BBARegisterFile, exi domain) + +``` +State HEADER1 (decoding second header byte): + If is_read AND address >= 0x100: + push address into spram_req AsyncFIFO ← issued NOW, during header decode + set prefetch_pending = True + +State DATA (read phase): + On each tx_load pulse: + If prefetch_pending AND spram_rsp FIFO has data: + pop byte from spram_rsp FIFO + load into tx_byte + push (address + byte_ctr + 1) into spram_req for NEXT byte ← pipelining + Elif address < 0x100: + tx_byte = regs[address + byte_ctr] ← direct register file read +``` + +### Pipeline depth + +The `spram_req` and `spram_rsp` FIFOs each have depth 4. This allows up to 4 +read requests to be in-flight simultaneously, which absorbs any SPRAM arbiter +stalls (ETH write winning the arbitration) without stalling the SPI data phase. + +### SPRAM arbiter stall handling + +If the SPRAM arbiter defers an EXI read by 1 cycle (due to ETH write priority), +the `spram_rsp` FIFO will be momentarily empty when `tx_load` arrives. The +BBARegisterFile must stall the SPI slave in this case. + +However: the SPI slave cannot be stalled mid-bit. The stall mechanism must +work at byte boundaries only — i.e., after a complete byte has been transmitted, +hold MISO at 0 (or 1) and do not toggle until the next byte is ready. Since the +GC is the SPI master and controls CLK, it will simply clock in garbage on the +retry byte. + +**Practical note:** At 48 MHz sync with 24 MHz effective W5500 access rate, the +ETH write path can only consume the SPRAM arbiter for ~1 sync cycle per byte +written. The EXI read path gets the remaining cycles. With 4-deep FIFOs the +pipeline should almost never stall in practice. Monitor the stall condition in +simulation. + +--- + +## 17. Interrupt Handling + +The `exi_int_n` output (pin 3 of SP1) is active-low. 
Assert it (drive low) +when `IR & IMR != 0`. + +```python +# In BBARegisterFile, exi domain: +ir_masked = Signal(8) +m.d.comb += ir_masked.eq(regs[BBARegs.IR] & regs[BBARegs.IMR]) +m.d.exi += exi_int_n.eq(~ir_masked.any()) +``` + +Register the output — do not drive `exi_int_n` combinatorially. A registered +output prevents glitches from propagating onto the GC board. + +**Interrupt sources and IR bit assignments:** + +| IR bit | Name | Set by | Cleared by | +|---|---|---|---| +| 7 | RBFI | RXFrameAssembler when ring full | GC write-1-to-clear | +| 4 | TEI | TXFrameDrain on TX error | GC write-1-to-clear | +| 2 | TI | tx_irq pulse from sync | GC write-1-to-clear | +| 1 | RI | rx_irq pulse from sync | GC write-1-to-clear | + +The GC typically masks in IMR: 0x86 = 0b10000110 (RBFI | TI | RI). + +--- + +## 18. EEPROM / MAC Address + +The GC software reads the MAC address from the 93C46 EEPROM during +initialisation (bit-banging through register 0x1C). It then writes the MAC +to PAR0–PAR5. + +**Recommended approach for initial implementation:** + +Skip full 93C46 emulation. Pre-populate `regs[0x1C]` with a pattern that makes +the EEPROM read return a valid MAC. Use Nintendo's OUI `00:09:BF` for the first +3 bytes, with locally administered bits for the last 3: + +``` +MAC: 00:09:BF:00:00:01 +``` + +Verify against Swiss source whether it validates the MAC read from EEPROM or +accepts whatever PAR0–PAR5 contains. If it re-reads EEPROM after writing PAR, +a full 93C46 model is required. If it only uses PAR0–PAR5, pre-populating the +register file is sufficient. + +**MAC address propagation:** + +When the GC writes PAR0–PAR5, forward the new MAC to the W5500 SHAR register +via the `sync` domain. Use a 6-byte AsyncFIFO or a dedicated MAC update pulse. +The W5500 uses SHAR as its source MAC for all transmitted frames. + +--- + +## 19. 
iCE40UP5K Resource Budget + +| Resource | Available | Estimated use | Margin | +|---|---|---|---| +| Logic cells (4-LUT + FF) | 5280 | ~1800 | 66% free | +| EBR (4 Kbit blocks) | 30 (120 Kbit) | 4 (FIFOs) | 26 free | +| SPRAM (32 KB banks) | 4 (128 KB) | 1 bank for ring buffer | 3 free | +| PLL | 1 | 1 (for exi domain) | 0 free | +| SB_HFOSC | 1 | 1 (sync domain) | 0 free | +| I/O pins | 39 usable | ~14 (EXI:5 + W5500:6 + misc:3) | 25 free | + +**Logic cell breakdown:** + +| Module | Estimated cells | +|---|---| +| SPIMode3Slave | 90 | +| BBARegisterFile FSM + decode | 250 | +| Register file (512 × 8b) | ~200 (distributed RAM) | +| AsyncFIFO × 8 | 400 | +| PulseSynchronizer × 4 | 40 | +| FFSynchronizer × 5 | 30 | +| SPRAMArbiter | 80 | +| RXFrameAssembler | 200 | +| TXFrameDrain | 150 | +| W5500SPIMaster | 200 | +| EEPROMModel | 100 | +| Misc glue | 60 | +| **Total** | **~1800** | + +iCE40UP5K fmax with nextpnr: typically 60–80 MHz for logic of this complexity. +The `exi` domain at 96 MHz is the tightest. If nextpnr fails to close timing: + +1. First option: reduce to 64 MHz `exi` domain (icepll alternative). +2. Second option: reduce EXI bus speed in Swiss settings to 16 MHz (clock index + 4 instead of 5), halving the FPGA timing requirement. +3. Third option: add pipeline registers on the critical address decode path. + +--- + +## 20. PCB / Connector Notes + +### Interposer PCB + +A simple pass-through interposer PCB connects the GC SP1 slot to the iCEbreaker +via a ribbon cable or header. + +**Required PCB spec:** +- Thickness: **1.2 mm** (not standard 1.6 mm — critical for fit) +- Copper finish: **ENIG (gold)** — prevents oxidation on edge contacts +- Board material: FR4 standard + +**Footprint source:** Copy the edge connector footprint from +`github.com/silverstee1/SP1ETH` KiCad files. Do not design from scratch. +The staggered dual-row geometry requires exact pad positions that have been +physically verified. 
Cross-reference with the ETH2SP1 LaserBear open files. + +**Additional interposer components:** +- 10 kΩ resistor: EXTIN (pin 1) to 3.3V (pin 7) — device detect +- 100 µF capacitor: 3.3V to GND — bulk decoupling near connector +- 100 nF capacitor × 2: additional HF decoupling +- ESD protection diode array: on CLK, MOSI, MISO, CS lines (optional but + recommended — the GC motherboard is difficult to repair if damaged) + +**Do not connect pin 5 (12V) to anything on the FPGA side.** + +### iCEbreaker connection + +The interposer PCB exposes EXI signals on a 2.54 mm pitch 8-pin header. +Connect to iCEbreaker PMOD1 connector using a short ribbon cable. Keep the +cable as short as possible (< 10 cm) to minimize signal integrity issues at +32 MHz. + +--- + +## 21. Known Hardware Quirks + +### EXI DMA bug + +The GC's EXI DMA engine has a bug where data on the MISO line during a DMA +write is clocked back out with a 1-bit shift. This only affects GC software +doing DMA writes (rare). Swiss uses IMM (immediate) mode transfers. No FPGA +workaround needed. + +### SPI Mode 3 vs Mode 0 + +Every other EXI device (memory cards, RTC, IPL) uses SPI Mode 0. The BBA +is the only device using Mode 3. Do not share the SPI slave implementation +with other EXI device implementations without parameterising CPOL/CPHA. + +### MISO tristate + +On real hardware, MISO (DO) is tristated when CS is deasserted. Other EXI +devices on the same bus would otherwise conflict. On this FPGA implementation, +drive MISO high (not tristated) when CS is deasserted. The iCE40UP5K does +not easily support pin tristate from user logic — drive high is safe because +the BBA occupies a dedicated CS line (SP1 device 2) separate from memory cards +and the RTC. 
+ +### GC hardware revisions + +- DOL-001 (original): SP1 present, BBA compatible +- DOL-001 Rev B: SP1 physically absent on motherboard but case hole present +- DOL-101 (later): SP1 present again (but Serial Port 2 absent) +- Panasonic Q: SP1 present + +Swiss supports all revisions with SP1 via the EXI hypervisor driver (required +from Swiss build 1788 onwards for BBA emulation features). + +### EXI clock index + +The real BBA uses clock index 5 (32 MHz). Swiss allows configuring a lower +clock index for compatibility. If 96 MHz fmax is not achievable, instruct users +to configure Swiss to use clock index 4 (16 MHz EXI), which requires only a +48 MHz `exi` domain (3× the EXI clock) and is trivially achievable. + +--- + +## 22. File Structure + +``` +gc_bba_fpga/ +├── exi_bba/ +│ ├── __init__.py +│ ├── spi_mode3_slave.py # SPIMode3Slave +│ ├── bba_register_file.py # BBARegisterFile + register constants +│ ├── spram_arbiter.py # SPRAMArbiter +│ ├── rx_frame_assembler.py # RXFrameAssembler +│ ├── tx_frame_drain.py # TXFrameDrain +│ ├── w5500_spi_master.py # W5500SPIMaster +│ ├── eeprom_model.py # EEPROMModel (93C46) +│ └── bba_top.py # BBATop + clock domain setup +├── sim/ +│ ├── sim_spi_slave.py # SPIMode3Slave unit test +│ ├── sim_register_file.py # BBARegisterFile unit test +│ ├── sim_bba_init.py # Full init sequence simulation +│ ├── sim_rx_path.py # RX data path end-to-end test +│ ├── sim_tx_path.py # TX data path end-to-end test +│ ├── gc_master_model.py # GC CPU SPI master simulation model +│ ├── w5500_slave_model.py # W5500 SPI slave simulation model +│ └── ethernet_frame_gen.py # Test frame generator +├── platform/ +│ ├── icebreaker_bba.py # iCEbreaker platform with BBA resources +│ └── interposer_pinmap.py # SP1 ↔ PMOD pin mapping +├── pcb/ +│ ├── interposer/ # KiCad project for interposer PCB +│ └── README.md # PCB ordering instructions (1.2mm, ENIG) +├── constraints/ +│ └── timing.py # nextpnr timing constraints (if needed) +├── tests/ +│ └── test_bba.py # pytest suite +├── 
build.py # Amaranth build script +└── README.md +``` + +--- + +## 23. Simulation Strategy + +Each module should have a standalone simulation before integration. All +simulations use Amaranth's `Simulator` with two clock domains: +`sim.add_clock(1/96e6, domain="exi")` and `sim.add_clock(1/48e6, domain="sync")`. + +### Unit tests + +**SPIMode3Slave:** Drive CLK/MOSI/CS manually from a process in the `exi` +domain. Verify `rx_byte`/`rx_valid` match sent data. Verify `spi_miso` +matches pre-loaded `tx_byte`. Test CS abort mid-byte. + +**BBARegisterFile:** Use a `GCMasterModel` (SPI Mode 3 master process) to +perform read/write transactions. Verify register writes are stored. Verify +register reads return correct values. Verify IR bit setting and clearing. +Verify NWAYS returns 0x17. Verify ID query returns 0x04020200. + +**SPRAMArbiter:** Issue concurrent EXI reads and ETH writes. Verify ETH writes +win arbitration. Verify EXI reads complete within 3 sync cycles. Verify no +data corruption. + +**RXFrameAssembler:** Feed a known ethernet frame byte-by-byte. Verify SPRAM +contents match expected descriptor + frame layout. Verify RWP advances by +correct page count. Verify rx_irq fires. + +**TXFrameDrain + W5500SPIMaster:** Issue TX frame from `tx_bytes` FIFO. Use +`W5500SlaveModel` process to simulate W5500 responses. Verify frame bytes +arrive at W5500 correctly. Verify tx_irq fires after SEND_OK. + +### Integration test + +**sim_bba_init.py:** Full GC init sequence (all 11 steps from Section 11). +`GCMasterModel` performs every transaction. Verify no stalls, correct responses. + +**sim_rx_path.py:** `W5500SlaveModel` delivers a 64-byte test frame. +`GCMasterModel` polls IR, reads RWP, bulk-reads the frame, advances RRP. +Verify GC receives identical bytes to what W5500 sent. + +**sim_tx_path.py:** `GCMasterModel` writes a 64-byte frame through TXDATA. +`W5500SlaveModel` captures it. Verify W5500 receives identical bytes. + +--- + +## 24. 
Open Issues and Extension Points + +### Must resolve before first synthesis + +- [ ] Exact PLL parameters for iCE40UP5K: run `icepll -i 12 -o 96` and + confirm the output is achievable (VCO in 533–1066 MHz range). +- [ ] SP1 connector footprint: clone SP1ETH repo, extract pad positions, verify + stagger geometry and pitch before PCB layout. +- [ ] W5500 Pmod module pin mapping: confirm which Pmod pins INT_N and RST_N + appear on (varies by module vendor). +- [ ] Swiss version requirement: confirm Swiss build ≥ 1788 for BBA hypervisor + support. Earlier builds use a different driver that may have different + register access patterns. + +### Known limitations + +- Single TX buffer (MX98730EC has two). ST1:ST0 = 01 and 10 are treated + identically. No known GC title relies on dual TX buffering. +- No DMA mode support. IMM mode only. Matches real-world Swiss usage. +- No Serial Port 2 support (different connector, different project scope). +- 93C46 EEPROM emulation is simplified (hardcoded MAC). A full bit-bang + model can be added later if Swiss requires it. +- RX ring buffer is 15 pages (3840 bytes). The real BBA has 4KB. Frames + larger than ~3800 bytes (jumbo frames) will be dropped. Standard 1500-byte + MTU frames fit in at most 7 pages — no practical issue. + +### Extension points + +- **Larger ring buffer:** Use additional SPRAM banks for more RX buffering. +- **Multiple sockets:** W5500 supports 8 sockets; only socket 0 in MACRAW + mode is used here. +- **Link status passthrough:** Read W5500 PHYCFGR register and forward real + link status to NWAYS instead of hardcoding 0x17. +- **Statistics counters:** LTPS/LRPS (last packet status) are currently 0x00. + A more complete implementation would fill these from W5500 socket status. +- **Serial Port 2 support:** Different physical connector and EXI channel but + same FPGA logic; would require a second interposer PCB. 
class SyncFF(Elaboratable):
    """Width-N multi-flop synchronizer from `src_domain` to `dst_domain`.

    The input is registered once in the source domain (so the crossing path
    starts at a flop, not at combinational logic) and then passed through two
    flops in the destination domain.

    Use when the source is a level signal that stays stable for multiple
    destination cycles. Not suitable for single-cycle pulses (use
    TogglePulseSync instead).

    For ``width > 1`` the bits are synchronized independently, so the output
    may briefly show a mix of old and new bits unless at most one bit changes
    at a time (e.g. a gray-coded counter).

    Ports:
        src: input, sampled in `src_domain`.
        dst: output, valid in `dst_domain`.
    """

    def __init__(self, width=1, src_domain="src", dst_domain="dst"):
        self.width = width
        self.src_domain = src_domain
        self.dst_domain = dst_domain
        self.src = Signal(self.width)
        self.dst = Signal(self.width)

    def elaborate(self, platform):
        m = Module()
        reg_src = Signal(self.width)
        ff0 = Signal(self.width)
        ff1 = Signal(self.width)

        # Launch flop in the source domain, then the two-flop synchronizer.
        m.d[self.src_domain] += reg_src.eq(self.src)
        m.d[self.dst_domain] += ff0.eq(reg_src)
        m.d[self.dst_domain] += ff1.eq(ff0)
        m.d.comb += self.dst.eq(ff1)

        return m


class TogglePulseSync(Elaboratable):
    """Pulse transfer from `src_domain` into `dst_domain` via a toggle flop.

    - Source flips `toggle` on every cycle in which `src_pulse` is high.
    - Destination synchronizes the toggle with two flops and emits a
      one-cycle `dst_pulse` for each change it observes.

    Every source pulse produces exactly one destination pulse provided
    consecutive source pulses are spaced further apart than the destination
    needs to observe a toggle change (roughly two destination clock periods).
    Two toggles that land inside one destination sampling window cancel out
    and both events are lost.

    Ports:
        src_pulse: input, one-cycle strobe in `src_domain`.
        dst_pulse: output, one-cycle strobe in `dst_domain`.
    """

    def __init__(self, src_domain="src", dst_domain="dst"):
        self.src_domain = src_domain
        self.dst_domain = dst_domain
        self.src_pulse = Signal()
        self.dst_pulse = Signal()

    def elaborate(self, platform):
        m = Module()
        toggle = Signal()
        sync0 = Signal()
        sync1 = Signal()
        prev = Signal()
        edge = Signal()

        # Source domain: flip the toggle when a pulse arrives.
        # (Amaranth conditionals are `with m.If(...)` blocks; the Migen-style
        # `If(cond, stmt)` expression does not exist.)
        with m.If(self.src_pulse):
            m.d[self.src_domain] += toggle.eq(~toggle)

        # Destination domain: two-flop synchronize the toggle
        m.d[self.dst_domain] += sync0.eq(toggle)
        m.d[self.dst_domain] += sync1.eq(sync0)

        # Detect the change in the destination domain. `edge` is registered,
        # so dst_pulse is glitch-free and lags the sync1 change by one cycle.
        m.d[self.dst_domain] += edge.eq(sync1 ^ prev)
        m.d[self.dst_domain] += prev.eq(sync1)
        m.d.comb += self.dst_pulse.eq(edge)

        return m


def _sim_toggle_pulse():
    """Drive three pulses in the source domain and report destination detections.

    The stimulus runs on the `src` clock and the monitor on the `dst` clock, so
    one-cycle destination pulses cannot be missed by sampling them from the
    wrong domain.
    """
    from amaranth.sim import Period

    top = Module()
    t = TogglePulseSync(src_domain="src", dst_domain="dst")
    top.submodules.t = t

    sim = Simulator(top)
    # Two asynchronous clocks (periods chosen arbitrarily for the sim)
    sim.add_clock(Period(ns=1000), domain="src")
    sim.add_clock(Period(ns=1500), domain="dst")

    detected = []

    async def drive(ctx):
        # Wait a little, then generate three well-separated source pulses
        await ctx.tick("src").repeat(5)
        for _ in range(3):
            ctx.set(t.src_pulse, 1)
            await ctx.tick("src")
            ctx.set(t.src_pulse, 0)
            # let the domains run for a few cycles
            await ctx.tick("src").repeat(10)
        # run a bit longer so the last pulse has time to cross
        await ctx.tick("src").repeat(20)

    async def monitor(ctx):
        async for _clk, _rst, pulse in ctx.tick("dst").sample(t.dst_pulse):
            if pulse:
                detected.append(1)
                print(f"dst detected pulse #{len(detected)}")

    sim.add_testbench(drive)
    # Background: the monitor never returns; the run ends when `drive` does.
    sim.add_testbench(monitor, background=True)
    sim.run()
    print(f"total detected: {len(detected)} (want 3)")


if __name__ == "__main__":
    _sim_toggle_pulse()
def bin_to_gray(x):
    """Convert a binary value to its reflected gray code (``x ^ (x >> 1)``).

    Works on plain Python ints and on Amaranth values alike.
    """
    return x ^ (x >> 1)


def gray_to_bin(g, width):
    """Convert a `width`-bit gray-coded value back to binary.

    Walks from the MSB down: each binary bit is the gray bit XORed with the
    binary bit above it (the MSB has nothing above it, so it is copied).
    """
    b = 0
    for i in range(width - 1, -1, -1):
        above = (b >> (i + 1)) & 1  # 0 for the MSB: no bits set above it yet
        b |= (above ^ ((g >> i) & 1)) << i
    return b


class AsyncFIFO(Elaboratable):
    """Parameterizable gray-pointer dual-clock FIFO.

    - width: data width in bits
    - depth: must be a power of two, at least 2
    - wdomain: write (source) domain name
    - rdomain: read (destination) domain name

    Write side (`wdomain`): a word is accepted on every cycle where `w_en` is
    high and `w_full` is low. `w_full` reflects the current occupancy only (it
    does not depend on `w_en`), and all `depth` slots are usable.

    Read side (`rdomain`): a read is accepted on every cycle where `r_en` is
    high and `r_empty` is low. On the following cycle `r_valid` is high for one
    cycle and `rdata` carries the word; `rdata` then holds until the next
    accepted read.
    """

    def __init__(self, width=1, depth=16, wdomain="src", rdomain="dst"):
        assert depth >= 2 and depth & (depth - 1) == 0, \
            "depth must be a power of two >= 2"
        self.width = width
        self.depth = depth
        self.aw = (depth - 1).bit_length()  # address width
        self.wdomain = wdomain
        self.rdomain = rdomain

        # write-side interface
        self.wdata = Signal(width)
        self.w_en = Signal()
        self.w_full = Signal()

        # read-side interface
        self.rdata = Signal(width)
        self.r_en = Signal()
        self.r_valid = Signal()
        self.r_empty = Signal()

    def elaborate(self, platform):
        # Local import: keeps this block self-contained without touching the
        # file's import header.
        from amaranth.lib import memory

        m = Module()
        aw = self.aw
        ptr_w = aw + 1  # pointers carry one extra MSB to tell full from empty

        m.submodules.mem = mem = memory.Memory(
            shape=self.width, depth=self.depth, init=[])
        wp = mem.write_port(domain=self.wdomain)
        rp = mem.read_port(domain=self.rdomain)

        wbin = Signal(ptr_w)
        wgray = Signal(ptr_w)
        rbin = Signal(ptr_w)
        rgray = Signal(ptr_w)

        # Two-flop synchronizers for the opposing domain's gray pointer. Gray
        # coding guarantees only one bit changes per increment, so a sample
        # taken mid-transition is either the old or the new pointer.
        rgray_sync0 = Signal(ptr_w)
        rgray_sync1 = Signal(ptr_w)
        wgray_sync0 = Signal(ptr_w)
        wgray_sync1 = Signal(ptr_w)
        m.d[self.wdomain] += [
            rgray_sync0.eq(rgray),
            rgray_sync1.eq(rgray_sync0),
        ]
        m.d[self.rdomain] += [
            wgray_sync0.eq(wgray),
            wgray_sync1.eq(wgray_sync0),
        ]

        # ── write domain ────────────────────────────────────────────────
        next_wbin = Signal(ptr_w)
        do_write = Signal()
        m.d.comb += [
            # Full: write pointer equals the read pointer with its top two
            # gray bits inverted (i.e. exactly `depth` entries ahead).
            self.w_full.eq(
                wgray == Cat(rgray_sync1[:aw - 1], ~rgray_sync1[aw - 1:])),
            do_write.eq(self.w_en & ~self.w_full),
            next_wbin.eq(wbin + 1),
            # Drive the port combinationally so the word is stored on the
            # same edge that advances the pointer.
            wp.addr.eq(wbin[:aw]),
            wp.data.eq(self.wdata),
            wp.en.eq(do_write),
        ]
        with m.If(do_write):
            m.d[self.wdomain] += [
                wbin.eq(next_wbin),
                wgray.eq(bin_to_gray(next_wbin)),
            ]

        # ── read domain ─────────────────────────────────────────────────
        next_rbin = Signal(ptr_w)
        do_read = Signal()
        m.d.comb += [
            self.r_empty.eq(rgray == wgray_sync1),
            do_read.eq(self.r_en & ~self.r_empty),
            next_rbin.eq(rbin + 1),
            rp.addr.eq(rbin[:aw]),
            rp.en.eq(do_read),
            # The memory's own output register holds the word; no extra
            # capture flop (which would latch the previous word).
            self.rdata.eq(rp.data),
        ]
        m.d[self.rdomain] += self.r_valid.eq(do_read)
        with m.If(do_read):
            m.d[self.rdomain] += [
                rbin.eq(next_rbin),
                rgray.eq(bin_to_gray(next_rbin)),
            ]

        return m


def _sim_fifo():
    """Push 32 alternating bits through the FIFO across two unrelated clocks
    and check that they come out complete and in order."""
    from amaranth.sim import Period

    top = Module()
    fifo = AsyncFIFO(width=1, depth=16, wdomain="src", rdomain="dst")
    top.submodules.fifo = fifo

    sim = Simulator(top)
    sim.add_clock(Period(ns=1000), domain="src")
    sim.add_clock(Period(ns=1700), domain="dst")

    sent = [i & 1 for i in range(32)]
    seen = []

    async def writer(ctx):
        for i, bit in enumerate(sent):
            ctx.set(fifo.wdata, bit)
            ctx.set(fifo.w_en, 1)
            # Hold the word until an edge at which the FIFO was not full.
            while True:
                full = ctx.get(fifo.w_full)
                await ctx.tick("src")
                if not full:
                    break
            ctx.set(fifo.w_en, 0)
            # allow some idle cycles
            if i % 3:
                await ctx.tick("src").repeat(i % 3)

    async def reader(ctx):
        for _ in range(400):  # bounded so a broken FIFO cannot hang the sim
            if len(seen) == len(sent):
                break
            if ctx.get(fifo.r_empty):
                await ctx.tick("dst")
                continue
            ctx.set(fifo.r_en, 1)
            await ctx.tick("dst")  # read accepted on this edge
            ctx.set(fifo.r_en, 0)
            if ctx.get(fifo.r_valid):
                d = ctx.get(fifo.rdata)
                seen.append(d)
                print(f"read: {d}")
        print(f"total read: {len(seen)}")

    sim.add_testbench(writer)
    sim.add_testbench(reader)
    sim.run()

    if seen != sent:
        raise SystemExit(f"FIFO data mismatch: sent {sent}, read {seen}")


if __name__ == "__main__":
    _sim_fifo()
class Ice40AsyncFIFO(Elaboratable):
    """1-bit-wide dual-clock FIFO with gray-coded pointers.

    Storage is an Amaranth memory with a write port in `wdomain` and a read
    port in `rdomain`.

    - depth: number of entries; must be a power of two, at least 2
    - wdomain / rdomain: write / read clock domain names

    Write side: a bit is accepted on every cycle where `w_en` is high and
    `w_full` is low. Read side: a read is accepted on every cycle where `r_en`
    is high and `r_empty` is low; on the following cycle `r_valid` pulses and
    `rdata` carries the bit (held until the next accepted read).
    """

    def __init__(self, depth=256, wdomain="src", rdomain="dst"):
        assert depth >= 2 and depth & (depth - 1) == 0, "depth must be power of two"
        self.depth = depth
        self.aw = (depth - 1).bit_length()
        self.wdomain = wdomain
        self.rdomain = rdomain

        # serial (1-bit) interface
        self.wdata = Signal()
        self.w_en = Signal()
        self.w_full = Signal()

        self.rdata = Signal()
        self.r_en = Signal()
        self.r_valid = Signal()
        self.r_empty = Signal()

    def elaborate(self, platform):
        # Local import: keeps this block self-contained without touching the
        # file's import header.
        from amaranth.lib import memory

        m = Module()
        aw = self.aw
        ptr_w = aw + 1  # one extra MSB distinguishes full from empty

        # single-bit-wide memory; the backend decides the RAM primitive
        m.submodules.mem = mem = memory.Memory(shape=1, depth=self.depth, init=[])
        wp = mem.write_port(domain=self.wdomain)
        rp = mem.read_port(domain=self.rdomain)

        # binary pointers address the memory; gray copies cross domains
        wbin = Signal(ptr_w)
        wgray = Signal(ptr_w)
        rbin = Signal(ptr_w)
        rgray = Signal(ptr_w)

        # two-stage synchronizers for the opposing pointers
        rgray_sync0 = Signal(ptr_w)
        rgray_sync1 = Signal(ptr_w)
        wgray_sync0 = Signal(ptr_w)
        wgray_sync1 = Signal(ptr_w)
        m.d[self.wdomain] += [
            rgray_sync0.eq(rgray),
            rgray_sync1.eq(rgray_sync0),
        ]
        m.d[self.rdomain] += [
            wgray_sync0.eq(wgray),
            wgray_sync1.eq(wgray_sync0),
        ]

        # ── write side ──────────────────────────────────────────────────
        next_wbin = Signal(ptr_w)
        do_write = Signal()
        m.d.comb += [
            # Full when the write pointer equals the synchronized read
            # pointer with its top two gray bits inverted. Uses the current
            # pointer, so w_full does not depend on w_en.
            self.w_full.eq(
                wgray == Cat(rgray_sync1[:aw - 1], ~rgray_sync1[aw - 1:])),
            do_write.eq(self.w_en & ~self.w_full),
            next_wbin.eq(wbin + 1),
            # Combinational port drive: the bit is stored on the same edge
            # that advances the pointer.
            wp.addr.eq(wbin[:aw]),
            wp.data.eq(self.wdata),
            wp.en.eq(do_write),
        ]
        with m.If(do_write):
            m.d[self.wdomain] += [
                wbin.eq(next_wbin),
                wgray.eq(next_wbin ^ (next_wbin >> 1)),
            ]

        # ── read side ───────────────────────────────────────────────────
        next_rbin = Signal(ptr_w)
        do_read = Signal()
        m.d.comb += [
            self.r_empty.eq(rgray == wgray_sync1),
            do_read.eq(self.r_en & ~self.r_empty),
            next_rbin.eq(rbin + 1),
            rp.addr.eq(rbin[:aw]),
            rp.en.eq(do_read),
            # The read port's output register holds the bit until the next
            # accepted read; no extra capture flop is needed.
            self.rdata.eq(rp.data),
        ]
        m.d[self.rdomain] += self.r_valid.eq(do_read)
        with m.If(do_read):
            m.d[self.rdomain] += [
                rbin.eq(next_rbin),
                rgray.eq(next_rbin ^ (next_rbin >> 1)),
            ]

        return m


if __name__ == "__main__":
    # Quick smoke-check: elaborate and print the generated Verilog
    from amaranth.back import verilog

    fifo = Ice40AsyncFIFO(depth=256)
    print(verilog.convert(fifo, ports=[fifo.wdata, fifo.w_en, fifo.w_full, fifo.rdata, fifo.r_en, fifo.r_valid, fifo.r_empty]))
class BBARegisterFile(Elaboratable):
    """EXI transaction decoder and BBA register file with CDC bridges.

    Consumes a byte stream (2-byte header followed by data or dummy bytes),
    reads/writes a small set of registers held as individual Signals in the
    `exi` domain, and instantiates every AsyncFIFO / PulseSynchronizer that
    crosses between the `exi` and `sync` domains.

    Sync-domain FIFO/pulse ports are wired by BBATop to the sync-domain modules.
    """

    def __init__(self):
        """Declare all ports. No constructor parameters."""
        # ── EXI byte-stream interface (exi domain, from/to ExiCapture) ────
        # RX: received bytes (header + write data + read dummies) — FWFT read
        # side of ExiCapture's rx_fifo.
        self.rx_data = Signal(8)
        self.rx_rdy = Signal()
        self.rx_en = Signal()
        # TX: response bytes pushed proactively into ExiCapture's tx_fifo.
        self.tx_data = Signal(8)
        self.tx_en = Signal()
        self.tx_rdy = Signal()

        # High while an EXI transaction is in progress (from ExiCapture).
        # SPRAM reads stream until this deasserts → supports variable-length
        # (DMA) bulk reads, not just ≤4-byte immediate transfers.
        self.cs_active = Signal()

        # ── Interrupt (exi domain) ────────────────────────────────────────
        # Active low; registered from (IR & IMR) in elaborate().
        self.exi_int_n = Signal(init=1)

        # ── PAR output (for forwarding to W5500 as source MAC) ───────────
        self.par = Signal(48)  # PAR0-5 packed: PAR0 in low byte par[0:8]

        # NCRA[3] = SR (start receive) bit — gates the RX ring-buffer path.
        self.ncra_sr = Signal()

        # ── CDC FIFO sync-domain sides (wired by BBATop) ──────────────────
        # SPRAM request exi→sync: sync reads these
        self.spram_req_r_data = Signal(16)
        self.spram_req_r_en = Signal()
        self.spram_req_r_rdy = Signal()

        # SPRAM response sync→exi: sync writes these
        self.spram_rsp_w_data = Signal(8)
        self.spram_rsp_w_en = Signal()
        self.spram_rsp_w_rdy = Signal()

        # TX bytes exi→sync: sync reads these
        self.tx_bytes_r_data = Signal(8)
        self.tx_bytes_r_en = Signal()
        self.tx_bytes_r_rdy = Signal()

        # TX ctrl (frame length) exi→sync: sync reads these
        self.tx_ctrl_r_data = Signal(16)
        self.tx_ctrl_r_en = Signal()
        self.tx_ctrl_r_rdy = Signal()

        # RX write-pointer update sync→exi: sync writes these
        self.rx_wptr_w_data = Signal(8)
        self.rx_wptr_w_en = Signal()
        self.rx_wptr_w_rdy = Signal()

        # RX read-pointer update exi→sync: sync reads these
        self.rx_rptr_r_data = Signal(8)
        self.rx_rptr_r_en = Signal()
        self.rx_rptr_r_rdy = Signal()

        # PulseSynchronizer ports (exi↔sync)
        self.ncra_rst_o = Signal()  # exi→sync
        self.rx_irq_i = Signal()  # sync→exi
        self.tx_irq_i = Signal()  # sync→exi

    def elaborate(self, platform):
        """Build the CDC primitives, register file and transaction FSM.

        Statement order matters here: within the `exi` domain a later
        assignment to the same signal overrides an earlier one, which is how
        the default strobes are overridden by the FSM and how IRQ arrivals
        (at the end) take priority over a simultaneous write-1-to-clear.
        """
        m = Module()

        # ── CDC FIFOs ────────────────────────────────────────────────────
        spram_req = AsyncFIFO(width=16, depth=4, w_domain="exi", r_domain="sync")
        spram_rsp = AsyncFIFO(width=8, depth=4, w_domain="sync", r_domain="exi")
        tx_bytes = AsyncFIFO(width=8, depth=16, w_domain="exi", r_domain="sync")
        tx_ctrl = AsyncFIFO(width=16, depth=4, w_domain="exi", r_domain="sync")
        rx_wptr = AsyncFIFO(width=8, depth=4, w_domain="sync", r_domain="exi")
        rx_rptr = AsyncFIFO(width=8, depth=4, w_domain="exi", r_domain="sync")

        m.submodules.spram_req = spram_req
        m.submodules.spram_rsp = spram_rsp
        m.submodules.tx_bytes = tx_bytes
        m.submodules.tx_ctrl = tx_ctrl
        m.submodules.rx_wptr = rx_wptr
        m.submodules.rx_rptr = rx_rptr

        # Expose sync-domain FIFO sides
        m.d.comb += [
            self.spram_req_r_data .eq(spram_req.r_data),
            spram_req.r_en .eq(self.spram_req_r_en),
            self.spram_req_r_rdy .eq(spram_req.r_rdy),

            spram_rsp.w_data .eq(self.spram_rsp_w_data),
            spram_rsp.w_en .eq(self.spram_rsp_w_en),
            self.spram_rsp_w_rdy .eq(spram_rsp.w_rdy),

            self.tx_bytes_r_data .eq(tx_bytes.r_data),
            tx_bytes.r_en .eq(self.tx_bytes_r_en),
            self.tx_bytes_r_rdy .eq(tx_bytes.r_rdy),

            self.tx_ctrl_r_data .eq(tx_ctrl.r_data),
            tx_ctrl.r_en .eq(self.tx_ctrl_r_en),
            self.tx_ctrl_r_rdy .eq(tx_ctrl.r_rdy),

            rx_wptr.w_data .eq(self.rx_wptr_w_data),
            rx_wptr.w_en .eq(self.rx_wptr_w_en),
            self.rx_wptr_w_rdy .eq(rx_wptr.w_rdy),

            self.rx_rptr_r_data .eq(rx_rptr.r_data),
            rx_rptr.r_en .eq(self.rx_rptr_r_en),
            self.rx_rptr_r_rdy .eq(rx_rptr.r_rdy),
        ]

        # ── PulseSynchronizers ───────────────────────────────────────────
        ncra_rst_ps = PulseSynchronizer(i_domain="exi", o_domain="sync")
        rx_irq_ps = PulseSynchronizer(i_domain="sync", o_domain="exi")
        tx_irq_ps = PulseSynchronizer(i_domain="sync", o_domain="exi")

        m.submodules.ncra_rst_ps = ncra_rst_ps
        m.submodules.rx_irq_ps = rx_irq_ps
        m.submodules.tx_irq_ps = tx_irq_ps

        m.d.comb += [
            self.ncra_rst_o .eq(ncra_rst_ps.o),
            rx_irq_ps.i .eq(self.rx_irq_i),
            tx_irq_ps.i .eq(self.tx_irq_i),
        ]

        # ── Register file (sparse individual Signals, exi domain) ────────
        # Only the registers actually read/written by the GC or sync domain.
        # Writes to unknown addresses are silently ignored; reads return 0.
        r_ncra = Signal(8)
        r_imr = Signal(8)
        r_ir = Signal(8)
        r_rwp_lo = Signal(8)
        r_rrp_lo = Signal(8)
        # PAR0–5 reset to a valid Nintendo OUI MAC (00:09:BF:00:00:01) so the
        # device has a sane source MAC even before the GC driver programs its
        # own. PAR0 is the first MAC octet.
        _par_reset = [0x00, 0x09, 0xBF, 0x00, 0x00, 0x01]
        r_par = Array([Signal(8, name=f"par{i}", init=_par_reset[i])
                       for i in range(6)])
        r_twd_lo = Signal(8)
        r_twd_hi = Signal(8)

        # PAR packed output: PAR0 in the LOW byte (par[0:8]). The W5500 master
        # reads mac_shadow[i] = par[i*8:(i+1)*8], so this puts PAR0 first in the
        # SHAR write — i.e. PAR0 is the first MAC octet on the wire.
        m.d.comb += self.par.eq(Cat(
            r_par[0], r_par[1], r_par[2], r_par[3], r_par[4], r_par[5],
        ))
        m.d.comb += self.ncra_sr.eq(r_ncra[3])  # start-receive bit

        # ── Transaction state ────────────────────────────────────────────
        hdr0 = Signal(8)
        addr = Signal(13)
        is_write = Signal()
        xfer_len = Signal(2)  # 0=1B … 3=4B
        byte_ctr = Signal(2)
        tx_frame_len = Signal(16)

        # True until first NCRA reset write: return device ID on addr=0 reads
        id_phase = Signal(init=1)

        # Per-byte SPRAM read handshake (register-read path): sp_req marks a
        # request in flight; drain_ctr counts the read-phase dummy bytes.
        # NOTE(review): sp_req is only ever cleared (in HEADER1) in this
        # method; it is never set or read — looks vestigial, confirm.
        sp_req = Signal()
        drain_ctr = Signal(2)

        # SPRAM streaming-read state (DMA / variable-length reads):
        #   sp_addr     — next SPRAM byte address to request (auto-increments)
        #   outstanding — SPRAM requests issued but whose responses are not yet
        #                 popped (bounds prefetch and is drained at end)
        sp_addr = Signal(13)
        outstanding = Signal(4)
        SP_LIMIT = 4  # max prefetch depth in flight

        # Effective address of the current data byte — a REGISTERED running
        # pointer (set to the base in HEADER1, incremented per byte). Keeping
        # it registered keeps the 13-bit adder off the combinational path that
        # feeds the read-response mux → tx_fifo write data.
        eff_addr = Signal(13)
        rd_sel = eff_addr[0:8]

        # ── Combinational read-response value (non-SPRAM) ────────────────
        reg_rdval = Signal(8)
        with m.Switch(rd_sel):
            with m.Case(_NCRA): m.d.comb += reg_rdval.eq(r_ncra)
            with m.Case(_IMR): m.d.comb += reg_rdval.eq(r_imr)
            with m.Case(_IR): m.d.comb += reg_rdval.eq(r_ir)
            with m.Case(_RWP_LO): m.d.comb += reg_rdval.eq(r_rwp_lo)
            with m.Case(_RRP_LO): m.d.comb += reg_rdval.eq(r_rrp_lo)
            with m.Case(_PAR0, _PAR1, _PAR2, _PAR3, _PAR4, _PAR5):
                # PAR0..PAR5 sit at 0x20..0x25, so the low 3 address bits
                # index the array directly.
                m.d.comb += reg_rdval.eq(r_par[eff_addr[0:3]])
            with m.Case(_TWD_LO): m.d.comb += reg_rdval.eq(r_twd_lo)
            with m.Case(_TWD_HI): m.d.comb += reg_rdval.eq(r_twd_hi)
            with m.Case(_NWAYS): m.d.comb += reg_rdval.eq(_NWAYS_VAL)
            with m.Case(_HIPR): m.d.comb += reg_rdval.eq(_HIPR_VAL)
            with m.Default(): m.d.comb += reg_rdval.eq(0)

        # Device-ID bytes (addr 0 read while id_phase): 0x04 0x02 0x02 0x00
        devid = Signal(8)
        with m.Switch(byte_ctr):
            with m.Case(0): m.d.comb += devid.eq(0x04)
            with m.Case(1): m.d.comb += devid.eq(0x02)
            with m.Case(2): m.d.comb += devid.eq(0x02)
            with m.Case(3): m.d.comb += devid.eq(0x00)

        rd_val = Signal(8)  # response for the current non-SPRAM read byte
        # The test uses the transaction's base `addr`, not eff_addr, so all
        # bytes of an addr-0 read come from the device ID while id_phase.
        with m.If((addr == 0) & id_phase):
            m.d.comb += rd_val.eq(devid)
        with m.Else():
            m.d.comb += rd_val.eq(reg_rdval)

        # ── Default strobes ──────────────────────────────────────────────
        # Registered strobes default to 0 each cycle; later assignments in
        # the FSM / tail of this method override them for one cycle.
        m.d.exi += [
            spram_req.w_en .eq(0),
            tx_bytes.w_en .eq(0),
            tx_ctrl.w_en .eq(0),
            rx_rptr.w_en .eq(0),
            rx_wptr.r_en .eq(0),
            ncra_rst_ps.i .eq(0),
        ]
        m.d.comb += [
            self.rx_en .eq(0),
            self.tx_en .eq(0),
            self.tx_data.eq(0),
            # Combinational so the FIFO advances in the SAME cycle as the pop —
            # a registered r_en would let `pop` re-fire on the same byte.
            spram_rsp.r_en.eq(0),
        ]

        # ── Transaction FSM (proactive push/pull over byte FIFOs) ────────
        # The SPI bit cadence lives in the capture domain; here we just consume
        # received bytes and, for reads, push response bytes into tx_fifo during
        # the EXI clock-idle gap before the GC clocks the data phase.
        with m.FSM(domain="exi", name="exi_fsm"):

            with m.State("HEADER0"):
                # First header byte: latched whole; decoded in HEADER1.
                with m.If(self.rx_rdy):
                    m.d.comb += self.rx_en.eq(1)
                    m.d.exi += hdr0.eq(self.rx_data)
                    m.next = "HEADER1"

            with m.State("HEADER1"):
                with m.If(self.rx_rdy):
                    m.d.comb += self.rx_en.eq(1)
                    new_addr = Cat(self.rx_data[2:8], hdr0[0:7])  # 13-bit addr
                    new_len = self.rx_data[0:2]
                    new_write = hdr0[7]

                    m.d.exi += addr.eq(new_addr)
                    m.d.exi += eff_addr.eq(new_addr)  # running pointer init
                    m.d.exi += xfer_len.eq(new_len)
                    m.d.exi += is_write.eq(new_write)
                    m.d.exi += byte_ctr.eq(0)
                    m.d.exi += sp_req.eq(0)
                    m.d.exi += drain_ctr.eq(0)

                    with m.If(new_write):
                        m.next = "WRITE"
                    with m.Elif(new_addr >= 0x100):
                        # SPRAM region: stream until CS deasserts (DMA-capable).
                        m.d.exi += sp_addr.eq(new_addr)
                        m.d.exi += outstanding.eq(0)
                        m.next = "SPRAM_STREAM"
                    with m.Else():
                        m.next = "REG_READ"

            with m.State("WRITE"):
                # Consume xfer_len+1 data bytes, writing the register file.
                # Writes are dispatched on the low 8 bits of eff_addr only, so
                # a write header addressing >= 0x100 lands on the register
                # with the same low byte.
                with m.If(self.rx_rdy):
                    m.d.comb += self.rx_en.eq(1)
                    with m.Switch(rd_sel):
                        with m.Case(_NCRA):
                            m.d.exi += r_ncra.eq(self.rx_data)
                            with m.If(self.rx_data[0]):
                                m.d.exi += r_ncra[0].eq(0)  # RESET self-clears
                                m.d.exi += ncra_rst_ps.i.eq(1)
                                m.d.exi += id_phase.eq(0)
                            with m.If(self.rx_data[1:3].any()):
                                # Either ST bit starts a transmit: hand the
                                # frame length to the sync domain. If tx_ctrl
                                # has no room the request is not queued.
                                with m.If(tx_ctrl.w_rdy):
                                    m.d.exi += tx_ctrl.w_data.eq(tx_frame_len)
                                    m.d.exi += tx_ctrl.w_en.eq(1)
                        with m.Case(_IMR):
                            m.d.exi += r_imr.eq(self.rx_data)
                        with m.Case(_IR):
                            m.d.exi += r_ir.eq(r_ir & ~self.rx_data)  # write-1-clear
                        with m.Case(_RRP_LO):
                            m.d.exi += r_rrp_lo.eq(self.rx_data)
                            # Forwarded only when rx_rptr has room; the local
                            # register is updated regardless.
                            with m.If(rx_rptr.w_rdy):
                                m.d.exi += rx_rptr.w_data.eq(self.rx_data)
                                m.d.exi += rx_rptr.w_en.eq(1)
                        with m.Case(_PAR0, _PAR1, _PAR2, _PAR3, _PAR4, _PAR5):
                            m.d.exi += r_par[eff_addr[0:3]].eq(self.rx_data)
                        with m.Case(_TWD_LO):
                            m.d.exi += r_twd_lo.eq(self.rx_data)
                            m.d.exi += tx_frame_len[0:8].eq(self.rx_data)
                        with m.Case(_TWD_HI):
                            m.d.exi += r_twd_hi.eq(self.rx_data)
                            m.d.exi += tx_frame_len[8:16].eq(self.rx_data)
                        with m.Case(_TXDATA):
                            # NOTE(review): eff_addr increments after every
                            # byte, so in a multi-byte write starting at
                            # _TXDATA only the first byte matches this case;
                            # confirm the GC driver writes TX data one byte
                            # per transaction.
                            with m.If(tx_bytes.w_rdy):
                                m.d.exi += tx_bytes.w_data.eq(self.rx_data)
                                m.d.exi += tx_bytes.w_en.eq(1)
                        # All other addresses silently ignored

                    with m.If(byte_ctr == xfer_len):
                        m.next = "HEADER0"
                    with m.Else():
                        m.d.exi += byte_ctr.eq(byte_ctr + 1)
                        m.d.exi += eff_addr.eq(eff_addr + 1)

            with m.State("REG_READ"):
                # Register / device-ID read (addr < 0x100): value available
                # immediately, bounded by the header's xfer_len (≤4 bytes).
                with m.If(self.tx_rdy):
                    m.d.comb += self.tx_data.eq(rd_val)
                    m.d.comb += self.tx_en.eq(1)
                    with m.If(byte_ctr == xfer_len):
                        m.next = "READ_DRAIN"
                    with m.Else():
                        m.d.exi += byte_ctr.eq(byte_ctr + 1)
                        m.d.exi += eff_addr.eq(eff_addr + 1)

            with m.State("READ_DRAIN"):
                # Discard the xfer_len+1 dummy bytes the GC clocks while reading.
                with m.If(self.rx_rdy):
                    m.d.comb += self.rx_en.eq(1)
                    with m.If(drain_ctr == xfer_len):
                        m.next = "HEADER0"
                    with m.Else():
                        m.d.exi += drain_ctr.eq(drain_ctr + 1)

            with m.State("SPRAM_STREAM"):
                # Stream SPRAM bytes until CS deasserts — handles both ≤4-byte
                # immediate reads and arbitrary-length DMA reads uniformly.
                # Issue read requests ahead (prefetch, bounded by SP_LIMIT) and
                # push responses into tx_fifo; the capture domain pops them as
                # the GC clocks. Drain rx dummies as they arrive.
                issue = Signal()
                pop = Signal()
                m.d.comb += issue.eq(self.cs_active & spram_req.w_rdy
                                     & (outstanding < SP_LIMIT))
                m.d.comb += pop.eq(spram_rsp.r_rdy & self.tx_rdy)

                with m.If(issue):
                    m.d.exi += spram_req.w_data.eq(sp_addr)
                    m.d.exi += spram_req.w_en.eq(1)
                    m.d.exi += sp_addr.eq(sp_addr + 1)
                with m.If(pop):
                    m.d.comb += self.tx_data.eq(spram_rsp.r_data)
                    m.d.comb += self.tx_en.eq(1)
                    m.d.comb += spram_rsp.r_en.eq(1)
                # Net change this cycle: +1 per request issued, -1 per
                # response forwarded.
                m.d.exi += outstanding.eq(outstanding + issue - pop)

                with m.If(self.rx_rdy):
                    m.d.comb += self.rx_en.eq(1)  # drain dummy bytes

                with m.If(~self.cs_active):
                    m.next = "SPRAM_END"

            with m.State("SPRAM_END"):
                # CS deasserted: drain in-flight SPRAM responses and rx dummies,
                # then idle. Leftover prefetch in tx_fifo is flushed by
                # ExiCapture on the next CS assertion.
                with m.If(spram_rsp.r_rdy):
                    m.d.comb += spram_rsp.r_en.eq(1)
                    m.d.exi += outstanding.eq(outstanding - 1)
                with m.If(self.rx_rdy):
                    m.d.comb += self.rx_en.eq(1)
                with m.If((outstanding == 0) & ~self.rx_rdy & ~spram_rsp.r_rdy):
                    m.next = "HEADER0"

        # ── Interrupt output ─────────────────────────────────────────────
        # Registered: INT_N follows (IR & IMR) one exi cycle later.
        m.d.exi += self.exi_int_n.eq(~(r_ir & r_imr).any())

        # ── Consume RWP updates from sync domain ──────────────────────────
        # r_en is registered here, so the pop lands one cycle after the
        # capture; r_rwp_lo simply tracks the value at the FIFO head.
        with m.If(rx_wptr.r_rdy):
            m.d.exi += rx_wptr.r_en.eq(1)
            m.d.exi += r_rwp_lo.eq(rx_wptr.r_data)

        # ── PulseSynchronizer arrivals ────────────────────────────────────
        # Placed last so they override a same-cycle register write above.
        with m.If(rx_irq_ps.o):
            m.d.exi += r_ir[1].eq(1)  # RI bit
        with m.If(tx_irq_ps.o):
            m.d.exi += r_ir[2].eq(1)  # TI bit
            m.d.exi += r_ncra[1:3].eq(0)  # clear ST bits

        return m
+ async def push_rx(ctx, b): + """Present one received byte and wait for the register file to pop it.""" + ctx.set(reg.rx_data, b) + ctx.set(reg.rx_rdy, 1) + while True: + en = ctx.get(reg.rx_en) + await ctx.tick("exi") + if en: + break + ctx.set(reg.rx_rdy, 0) + + async def collect_tx(ctx, n): + """Collect n response bytes pushed by the register file (bounded).""" + out = [] + for _ in range(3000): + if ctx.get(reg.tx_en): + out.append(ctx.get(reg.tx_data)) + if len(out) >= n: + break + await ctx.tick("exi") + return out + + async def exi_read(ctx, addr, length=1): + hdr0 = (addr >> 6) & 0x7F + hdr1 = ((addr & 0x3F) << 2) | (length - 1) + await push_rx(ctx, hdr0) + await push_rx(ctx, hdr1) + result = await collect_tx(ctx, length) # READ pushes `length` bytes + for _ in range(length): # READ_DRAIN dummies + await push_rx(ctx, 0x00) + return result + + async def exi_write(ctx, addr, data): + hdr0 = 0x80 | ((addr >> 6) & 0x7F) + hdr1 = ((addr & 0x3F) << 2) | (len(data) - 1) + await push_rx(ctx, hdr0) + await push_rx(ctx, hdr1) + for b in data: + await push_rx(ctx, b) + + # SPRAM contents the streaming-read test reads back (byte i = 0xA0+i). + spram_mem = {0x100 + i: (0xA0 + i) & 0xFF for i in range(64)} + + async def spram_model(ctx): + """Model the SPRAM (sync side): answer spram_req with mem[addr]. + + One request at a time, with cleanly-pulsed r_en/w_en so the FIFO pop + and the response push stay in lock-step (no double-response races). 
+ """ + state = "POP" + held = 0 + async for vals in ctx.tick("sync").sample( + reg.spram_req_r_rdy, reg.spram_req_r_data, reg.spram_rsp_w_rdy): + rdy, addr, rsp_rdy = vals[-3:] + ctx.set(reg.spram_req_r_en, 0) + ctx.set(reg.spram_rsp_w_en, 0) + if state == "POP": + if rdy: + held = spram_mem.get(addr, 0) + ctx.set(reg.spram_req_r_en, 1) # consume the request + state = "RESP" + else: # RESP + if rsp_rdy: + ctx.set(reg.spram_rsp_w_data, held) + ctx.set(reg.spram_rsp_w_en, 1) # deliver the response + state = "POP" + + errors = [] + + async def testbench(ctx): + ctx.set(reg.tx_rdy, 1) # tx_fifo always has room in this model + await ctx.tick("exi").repeat(8) + + # T1: Device ID (addr=0, 4-byte read) + result = await exi_read(ctx, 0x0000, length=4) + if result != _DEVICE_ID: + errors.append(f"T1 device ID: expected {_DEVICE_ID}, got {result}") + print(f"T1 device ID: {[f'0x{b:02X}' for b in result]}") + await ctx.tick("exi").repeat(4) + + # T2: Write and read back PAR0-PAR3 + await exi_write(ctx, _PAR0, [0xDE, 0xAD, 0xBE, 0xEF]) + await ctx.tick("exi").repeat(4) + result = await exi_read(ctx, _PAR0, length=4) + if result != [0xDE, 0xAD, 0xBE, 0xEF]: + errors.append(f"T2 PAR readback: {result}") + print(f"T2 PAR0-3: {[f'0x{b:02X}' for b in result]}") + await ctx.tick("exi").repeat(4) + + # T3: NWAYS hardcoded 0x17 + result = await exi_read(ctx, _NWAYS, length=1) + if result != [0x17]: + errors.append(f"T3 NWAYS: expected 0x17, got {result}") + print(f"T3 NWAYS: 0x{result[0]:02X}") + await ctx.tick("exi").repeat(4) + + # T4: HIPR hardcoded 0x01 + result = await exi_read(ctx, _HIPR, length=1) + if result != [0x01]: + errors.append(f"T4 HIPR: expected 0x01, got {result}") + print(f"T4 HIPR: 0x{result[0]:02X}") + await ctx.tick("exi").repeat(4) + + # T5: IMR write, rx_irq pulse, INT_N asserts, then IR clear + await exi_write(ctx, _IMR, [0x02]) # enable RI (bit 1) + await ctx.tick("exi").repeat(4) + ctx.set(reg.rx_irq_i, 1) + await ctx.tick("sync").repeat(1) + 
ctx.set(reg.rx_irq_i, 0) + await ctx.tick("exi").repeat(12) # wait for PS propagation + int_n = ctx.get(reg.exi_int_n) + if int_n != 0: + errors.append(f"T5 INT_N after RI: expected 0, got {int_n}") + print(f"T5 INT_N after RI pulse: {int_n} (want 0)") + await exi_write(ctx, _IR, [0x02]) # write-1-to-clear RI + await ctx.tick("exi").repeat(4) + int_n = ctx.get(reg.exi_int_n) + if int_n != 1: + errors.append(f"T5 INT_N after clear: expected 1, got {int_n}") + print(f"T5 INT_N after IR clear: {int_n} (want 1)") + + # T6: streaming SPRAM read (DMA) — read N>4 bytes from 0x100 by holding + # cs_active and clocking past the header's 4-byte length field. + N = 12 + ctx.set(reg.cs_active, 1) + await push_rx(ctx, 0x04) # hdr0 → addr[12:6]; addr 0x100, read + await push_rx(ctx, 0x00) # hdr1 → addr[5:0]=0, len field ignored + got = [] + for _ in range(5000): + if ctx.get(reg.tx_en): + got.append(ctx.get(reg.tx_data)) + if len(got) >= N: + break + await ctx.tick("exi") + ctx.set(reg.cs_active, 0) # end the transaction + await ctx.tick("exi").repeat(40) # let SPRAM_END drain/clean up + want = [spram_mem[0x100 + i] for i in range(N)] + print(f"T6 DMA read {N}B: {[f'0x{b:02X}' for b in got]}") + if got != want: + errors.append(f"T6 streaming SPRAM read: got {got}, want {want}") + + # T7: a normal register read still works after the streaming transaction + # (FSM cleaned up and returned to HEADER0) + result = await exi_read(ctx, _NWAYS, length=1) + if result != [0x17]: + errors.append(f"T7 NWAYS after DMA: got {result}") + print(f"T7 NWAYS after DMA read: 0x{result[0]:02X}") + + sim = Simulator(reg) + sim.add_clock(Period(MHz=24), domain="exi") + sim.add_clock(Period(MHz=24), domain="sync") + sim.add_testbench(testbench) + sim.add_process(spram_model) + + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/bba_top.py b/exi_bba/bba_top.py new file mode 100644 index 
class BBATop(Elaboratable):
    """Top-level module. Wires all submodules and defines clock domains.

    Parameters
    ----------
    eth : str
        Ethernet back-end: ``"w5100"`` (indirect parallel bus, the default)
        or ``"w5500"`` (SPI). Any other value raises ``ValueError``.
    reset_cycles : int
        Reset settle wait handed to the ethernet master (~1 ms on hardware;
        testbenches pass a small value).
    status_panel : bool
        When true, a StatusPanel is instantiated and ``panel_led`` /
        ``panel_btn`` are exposed.

    External ports (exposed for platform or testbench connection)
    -------------------------------------------------------------
    EXI / GC interface (SPI Mode 3)
        exi_clk / exi_mosi / exi_cs_n : inputs from GC
        exi_miso                      : output to GC
        int_n                         : interrupt output (active low)

    W5500 SPI interface (only when eth="w5500")
        w5500_clk / w5500_mosi / w5500_cs_n : outputs to W5500
        w5500_miso                          : input from W5500
        w5500_int_n                         : W5500 interrupt (input, active low)
        w5500_rst_n                         : W5500 hardware reset (output, active low)

    W5100 indirect parallel bus (only when eth="w5100")
        w5100_addr                       : A[1:0] output
        w5100_data_o / w5100_data_oe     : D[7:0] output value and output enable
        w5100_data_i                     : D[7:0] input value
        w5100_cs_n / w5100_rd_n / w5100_wr_n : bus strobes (outputs, init 1)
        w5100_int_n                      : W5100 interrupt (input)
        w5100_rst_n                      : W5100 hardware reset (output)

    Status panel (only when status_panel=True)
        panel_led : 5-bit output to the onboard LEDs
        panel_btn : 3-bit input from the onboard button(s)
    """

    def __init__(self, eth="w5100", reset_cycles=24000, status_panel=False):
        # Reject unknown back-ends up front: previously any value other than
        # "w5500" silently built the W5100 variant with a different pin set.
        if eth not in ("w5100", "w5500"):
            raise ValueError(f"eth must be 'w5100' or 'w5500', got {eth!r}")

        # Ethernet back-end: "w5100" (indirect parallel bus, reaches the EXI
        # ceiling) or "w5500" (SPI, ~12 Mbit/s). Both expose the identical
        # tx/rx/init/par interface, so only the physical pins differ.
        self._eth = eth
        # MR-reset settle wait passed to the ethernet master (~1 ms on hardware;
        # the testbench overrides with a small value for fast simulation).
        self._reset_cycles = reset_cycles
        # Optional bring-up status panel (drives onboard LEDs/button on the
        # iCEbreaker — see synth.py). panel_led bit order matches StatusPanel.
        self._status_panel = status_panel

        # EXI (GC side)
        self.exi_clk = Signal(init=1)
        self.exi_mosi = Signal()
        self.exi_cs_n = Signal(init=1)
        self.exi_miso = Signal()
        self.int_n = Signal(init=1)

        if eth == "w5500":
            # W5500 SPI
            self.w5500_clk = Signal()
            self.w5500_mosi = Signal()
            self.w5500_miso = Signal()
            self.w5500_cs_n = Signal(init=1)
            self.w5500_int_n = Signal(init=1)
            self.w5500_rst_n = Signal(init=1)
        else:
            # W5100 indirect parallel bus. data_o/data_oe/data_i are the FPGA
            # side of a bidirectional D[7:0] (wrapped in a tristate SB_IO at the
            # platform level); a board ties the upper address lines to 0 so only
            # A[1:0] are wired.
            self.w5100_addr = Signal(2)
            self.w5100_data_o = Signal(8)
            self.w5100_data_oe = Signal()
            self.w5100_data_i = Signal(8)
            self.w5100_cs_n = Signal(init=1)
            self.w5100_rd_n = Signal(init=1)
            self.w5100_wr_n = Signal(init=1)
            self.w5100_int_n = Signal(init=1)
            self.w5100_rst_n = Signal(init=1)

        if status_panel:
            self.panel_led = Signal(5)  # to onboard LEDs (see StatusPanel)
            self.panel_btn = Signal(3)  # from onboard button(s)

    def elaborate(self, platform):
        """Build the design: clock generation (platform builds only),
        submodule instantiation and all inter-module wiring.

        With ``platform is None`` (simulation) no clock primitives are
        instantiated; the test harness supplies the clocks via add_clock.
        """
        m = Module()

        # ── Clock domain generation ───────────────────────────────────────
        # Three domains, two physical sources (1 PLL + 1 internal HFOSC):
        #   capture @ 54 MHz (PLL)   — SPI bit engine only; oversamples the
        #                              27 MHz EXI clock 2× (robust Mode-3).
        #   exi     @ 24 MHz (HFOSC) — register file / transaction FSM.
        #   sync    @ 24 MHz (HFOSC) — SPRAM, RX/TX engines, ethernet master.
        # exi and sync share the HFOSC net (frequency- and phase-matched); the
        # AsyncFIFOs between them are still valid CDC and keep the module
        # boundaries clean. Only the tiny capture front-end needs the fast
        # clock — which is why 27 MHz-EXI / OG performance is reachable on the
        # iCE40UP5K even though the register-file logic tops out ~44 MHz.
        if platform is not None:
            # capture @ 54 MHz: icepll -i 12 -o 54 → DIVR=0 DIVF=71 DIVQ=4.
            # 54 MHz = 2× the 27 MHz EXI clock — the minimum oversampling that
            # cleanly implements SPI Mode 3. The isolated SPI bit engine closes
            # ~91 MHz on this device; the byte-FIFO read path brings the
            # integrated capture domain to ~62 MHz, so 54 closes with margin.
            m.domains += ClockDomain("capture")
            platform.lookup(platform.default_clk).attrs["GLOBAL"] = False
            m.submodules.pll = Instance(
                "SB_PLL40_PAD",
                p_FEEDBACK_PATH = "SIMPLE",
                p_DIVR = 0,
                p_DIVF = 71,
                p_DIVQ = 4,
                p_FILTER_RANGE = 1,
                i_PACKAGEPIN = platform.request("clk12", dir="-").io,
                i_RESETB = Const(1, 1),
                i_BYPASS = Const(0, 1),
                o_PLLOUTGLOBAL = ClockSignal("capture"),
            )

            # exi & sync @ 24 MHz: one SB_HFOSC (÷2) drives both slow domains.
            # The bulky register-file / SPRAM / W5500 logic is routing-bound at
            # ~33–44 MHz on the UP5K; 24 MHz closes with large margin. The byte
            # rate (27 MHz EXI ÷ 8 ≈ 3.4 MHz) leaves ~7 slow cycles per byte.
            m.domains += ClockDomain("exi")
            m.domains += ClockDomain("sync")
            m.submodules.hfosc = Instance(
                "SB_HFOSC",
                p_CLKHF_DIV = "0b01",  # 48 ÷ 2 → 24 MHz
                i_CLKHFEN = Const(1, 1),
                i_CLKHFPU = Const(1, 1),
                o_CLKHF = ClockSignal("exi"),
            )
            m.d.comb += ClockSignal("sync").eq(ClockSignal("exi"))
        # (simulation: test harness provides capture/exi/sync clocks via add_clock)

        # ── Submodules ────────────────────────────────────────────────────
        cap = ExiCapture()  # SPI bit engine (capture) + byte FIFOs
        reg = BBARegisterFile()
        arb = SPRAMArbiter()
        asm = RXFrameAssembler()
        drain = TXFrameDrain()
        eth = (W5500SPIMaster(reset_cycles=self._reset_cycles)
               if self._eth == "w5500"
               else W5100ParallelMaster(reset_cycles=self._reset_cycles))

        m.submodules.cap = cap
        m.submodules.reg = reg
        m.submodules.arb = arb
        m.submodules.asm = asm
        m.submodules.drain = drain
        m.submodules.eth = eth

        # ── External pin connections ──────────────────────────────────────
        m.d.comb += [
            # EXI inputs (to capture-domain front-end)
            cap.spi_clk.eq(self.exi_clk),
            cap.spi_mosi.eq(self.exi_mosi),
            cap.spi_cs_n.eq(self.exi_cs_n),
            # EXI outputs
            self.exi_miso.eq(cap.spi_miso),
            self.int_n.eq(reg.exi_int_n),
        ]

        # Ethernet back-end physical pins
        if self._eth == "w5500":
            m.d.comb += [
                self.w5500_clk.eq(eth.spi_clk),
                self.w5500_mosi.eq(eth.spi_mosi),
                self.w5500_cs_n.eq(eth.spi_cs_n),
                eth.spi_miso.eq(self.w5500_miso),
                eth.w5500_int_n.eq(self.w5500_int_n),
                self.w5500_rst_n.eq(eth.w5500_rst_n),
            ]
        else:
            m.d.comb += [
                self.w5100_addr.eq(eth.bus_addr),
                self.w5100_data_o.eq(eth.bus_data_o),
                self.w5100_data_oe.eq(eth.bus_data_oe),
                eth.bus_data_i.eq(self.w5100_data_i),
                self.w5100_cs_n.eq(eth.cs_n),
                self.w5100_rd_n.eq(eth.rd_n),
                self.w5100_wr_n.eq(eth.wr_n),
                eth.w5100_int_n.eq(self.w5100_int_n),
                self.w5100_rst_n.eq(eth.w5100_rst_n),
            ]

        # ── ExiCapture byte stream ↔ BBARegisterFile (exi domain) ────────
        m.d.comb += [
            reg.rx_data.eq(cap.rx_data),
            reg.rx_rdy.eq(cap.rx_rdy),
            cap.rx_en.eq(reg.rx_en),

            cap.tx_data.eq(reg.tx_data),
            cap.tx_en.eq(reg.tx_en),
            reg.tx_rdy.eq(cap.tx_rdy),

            reg.cs_active.eq(cap.cs_active),  # transaction-active (for DMA reads)
        ]

        # ── BBARegisterFile ↔ SPRAMArbiter (sync domain FIFO sides) ──────
        # SPRAM request: reg exi→sync FIFO read side → arb
        m.d.comb += [
            arb.exi_req_addr.eq(reg.spram_req_r_data),
            arb.exi_req_valid.eq(reg.spram_req_r_rdy),
            reg.spram_req_r_en.eq(arb.exi_req_ready),
        ]
        # SPRAM response: arb result → reg sync→exi FIFO write side
        m.d.comb += [
            reg.spram_rsp_w_data.eq(arb.exi_rsp_data),
            reg.spram_rsp_w_en.eq(arb.exi_rsp_valid),
            # arb does not need w_rdy feedback (spram_rsp FIFO is deeper than latency)
        ]

        # ── BBARegisterFile ↔ TXFrameDrain (sync domain FIFO sides) ──────
        m.d.comb += [
            drain.tx_bytes_r_data.eq(reg.tx_bytes_r_data),
            drain.tx_bytes_r_rdy.eq(reg.tx_bytes_r_rdy),
            reg.tx_bytes_r_en.eq(drain.tx_bytes_r_en),

            drain.tx_ctrl_r_data.eq(reg.tx_ctrl_r_data),
            drain.tx_ctrl_r_rdy.eq(reg.tx_ctrl_r_rdy),
            reg.tx_ctrl_r_en.eq(drain.tx_ctrl_r_en),
        ]

        # ── TXFrameDrain ↔ ethernet master (sync domain) ──────────────────
        m.d.comb += [
            eth.tx_data.eq(drain.tx_data),
            eth.tx_valid.eq(drain.tx_valid),
            drain.tx_ready.eq(eth.tx_ready),
            eth.tx_sof.eq(drain.tx_sof),
            eth.tx_eof.eq(drain.tx_eof),
        ]

        # ── ethernet master → RXFrameAssembler (sync domain) ─────────────
        m.d.comb += [
            asm.rx_data.eq(eth.rx_data),
            asm.rx_valid.eq(eth.rx_valid),
            eth.rx_ready.eq(asm.rx_ready),
            asm.rx_sof.eq(eth.rx_sof),
            asm.rx_eof.eq(eth.rx_eof),
        ]

        # ── RXFrameAssembler → SPRAMArbiter (ETH write, sync domain) ─────
        m.d.comb += [
            arb.eth_wr_addr.eq(asm.eth_wr_addr),
            arb.eth_wr_data.eq(asm.eth_wr_data),
            arb.eth_wr_valid.eq(asm.eth_wr_valid),
            asm.eth_wr_ready.eq(arb.eth_wr_ready),
        ]

        # ── RXFrameAssembler → BBARegisterFile (rx_wptr FIFO write side) ─
        m.d.comb += [
            reg.rx_wptr_w_data.eq(asm.rx_wptr_w_data),
            reg.rx_wptr_w_en.eq(asm.rx_wptr_w_en),
            asm.rx_wptr_w_rdy.eq(reg.rx_wptr_w_rdy),
        ]

        # ── Pulse synchronizer connections ────────────────────────────────
        m.d.comb += [
            # RX irq: sync → exi (RXFrameAssembler → reg → PS → exi domain)
            reg.rx_irq_i.eq(asm.rx_irq),
            # TX irq: sync → exi
            reg.tx_irq_i.eq(drain.tx_irq),
            # MAC address (PAR0–5) → SHAR. exi and sync share the HFOSC net,
            # and par is quasi-static (sampled by the master at init_req).
            eth.par.eq(reg.par),
        ]

        # ── RX enabled gate (NCRA SR / start-receive bit) ─────────────────
        # The RX ring-buffer path is active only after the GC sets NCRA[3].
        m.d.comb += asm.rx_enabled.eq(reg.ncra_sr)

        # ── Optional bring-up status panel (sync domain) ──────────────────
        # init_req = NCRA reset (exi→sync PS), OR'd with the panel's manual
        # re-init button when the panel is present.
        if self._status_panel:
            panel = StatusPanel()
            m.submodules.panel = panel

            # cs_active lives in the exi domain; bring it to sync for the LED.
            cs_a_sync = Signal()
            m.submodules.panel_cs = FFSynchronizer(
                cap.cs_active, cs_a_sync, o_domain="sync")

            # "ready" = ethernet init complete (latched until the next init).
            # init_done takes priority over a simultaneous re-init request.
            ready = Signal()
            with m.If(eth.init_done):
                m.d.sync += ready.eq(1)
            with m.Elif(reg.ncra_rst_o | panel.reinit):
                m.d.sync += ready.eq(0)

            m.d.comb += [
                panel.cs_active.eq(cs_a_sync),
                panel.rx_pulse.eq(asm.rx_irq),
                panel.tx_pulse.eq(drain.tx_irq),
                panel.ready.eq(ready),
                panel.btn.eq(self.panel_btn),
                self.panel_led.eq(panel.led),
                eth.init_req.eq(reg.ncra_rst_o | panel.reinit),
            ]
        else:
            m.d.comb += eth.init_req.eq(reg.ncra_rst_o)

        return m
async def exi_write(ctx, addr, data):
    """EXI immediate write: 2-byte header then the data bytes.

    Parameters
    ----------
    ctx  : simulator testbench context.
    addr : register/SPRAM address; addr[12:6] goes in hdr0, addr[5:0] in hdr1.
    data : iterable of byte values, clocked out in order after the header.

    The header length field is only 2 bits wide, so ``len(data) - 1`` is
    masked exactly as ``exi_read`` does. Without the mask a write longer than
    4 bytes would spill into the addr[5:0] bits, and an empty write would
    produce a negative header byte.
    """
    hdr0 = 0x80 | ((addr >> 6) & 0x7F)          # bit 7 = write flag
    hdr1 = ((addr & 0x3F) << 2) | ((len(data) - 1) & 0x3)
    ctx.set(dut.exi_cs_n, 0)
    ctx.set(dut.exi_clk, 1)                     # Mode 3: clock idles high
    await ctx.tick("capture").repeat(HALF)
    await spi_byte(ctx, hdr0)
    await spi_byte(ctx, hdr1)
    for b in data:
        await spi_byte(ctx, b)
    ctx.set(dut.exi_cs_n, 1)                    # CS high ends the transaction
    await ctx.tick("capture").repeat(HALF)
# Frame the modelled W5100 holds in its RX buffer, plus the register map and
# indirect-bus address selects the model understands.
RX_FRAME = [0xDE, 0xAD, 0xBE, 0xEF, 0x01, 0x02, 0x03, 0x04]
_W_RX_BASE = 0x6000
_W_S0_CR = 0x0401
_W_S0_RX_RSR = 0x0426
_W_S0_RX_RD = 0x0428
_W_CR_RECV = 0x40
_A_MR, _A_AR0, _A_AR1, _A_DR = 0b00, 0b01, 0b10, 0b11


def w5100_preload():
    """Return the model's initial memory image as ``{address: byte}``.

    The RX buffer at _W_RX_BASE holds a 2-byte big-endian MACRAW length (which
    counts its own two bytes) followed by RX_FRAME; S0_RX_RSR reports that
    same length and S0_RX_RD starts at zero.
    """
    plen = len(RX_FRAME) + 2
    len_hi, len_lo = (plen >> 8) & 0xFF, plen & 0xFF
    image = dict(enumerate([len_hi, len_lo, *RX_FRAME], start=_W_RX_BASE))
    image.update({
        _W_S0_RX_RSR: len_hi,
        _W_S0_RX_RSR + 1: len_lo,
        _W_S0_RX_RD: 0,
        _W_S0_RX_RD + 1: 0,
    })
    return image


w5100_mem = w5100_preload()


async def w5100_model(ctx):
    """Behavioural W5100 indirect-bus slave, sampled on every sync tick.

    While /CS and /RD are low it drives `dut.w5100_data_i` from MR, the
    address register halves, or memory at the address register. Writes are
    latched on the rising edge of /WR. With MR bit 1 set, every data-register
    access post-increments the address register. Writing the RECV command to
    S0_CR zeroes S0_RX_RSR.
    """
    addr_reg = 0
    mode_reg = 0
    prev_cs = prev_rd = prev_wr = 1
    pins = (dut.w5100_cs_n, dut.w5100_rd_n, dut.w5100_wr_n,
            dut.w5100_addr, dut.w5100_data_o)
    async for sample in ctx.tick("sync").sample(*pins):
        cs, rd, wr, sel, wdata = sample[-5:]
        auto_inc = (mode_reg >> 1) & 1      # sampled before this tick's write
        selected = cs == 0

        # Read: drive the data bus for as long as /RD is held low.
        if selected and rd == 0:
            fixed = {
                _A_MR: mode_reg,
                _A_AR0: (addr_reg >> 8) & 0xFF,
                _A_AR1: addr_reg & 0xFF,
            }
            rdata = fixed[sel] if sel in fixed else w5100_mem.get(addr_reg, 0)
            ctx.set(dut.w5100_data_i, rdata)

        # Write: latch on the /WR rising edge.
        if selected and prev_wr == 0 and wr == 1:
            if sel == _A_MR:
                mode_reg = wdata
            elif sel == _A_AR0:
                addr_reg = (addr_reg & 0x00FF) | (wdata << 8)
            elif sel == _A_AR1:
                addr_reg = (addr_reg & 0xFF00) | wdata
            else:
                w5100_mem[addr_reg] = wdata
                if addr_reg == _W_S0_CR and wdata == _W_CR_RECV:
                    # RECV consumes the packet: nothing left to receive.
                    w5100_mem[_W_S0_RX_RSR] = 0
                    w5100_mem[_W_S0_RX_RSR + 1] = 0
                if auto_inc:
                    addr_reg = (addr_reg + 1) & 0xFFFF

        # Data-register read completes on the /RD rising edge.
        if selected and prev_rd == 0 and rd == 1 and sel == _A_DR and auto_inc:
            addr_reg = (addr_reg + 1) & 0xFFFF

        prev_cs, prev_rd, prev_wr = cs, rd, wr
+ if dev != [0x04, 0x02, 0x02, 0x00]: + errors.append(f"T1 device ID: got {dev}") + await ctx.tick("capture").repeat(HALF) + + # T2: write PAR0–3, read them back through the full chain + await exi_write(ctx, 0x20, [0xDE, 0xAD, 0xBE, 0xEF]) + await ctx.tick("capture").repeat(HALF * 4) + par = await exi_read(ctx, 0x20, 4) + print(f"T2 PAR0-3 readback: {[f'0x{b:02X}' for b in par]}") + if par != [0xDE, 0xAD, 0xBE, 0xEF]: + errors.append(f"T2 PAR readback: got {par}") + await ctx.tick("capture").repeat(HALF) + + # T3: NWAYS must read back the hardcoded 0x17 (link-up sentinel) + nways = await exi_read(ctx, 0x31, 1) + print(f"T3 NWAYS: 0x{nways[0]:02X} (want 0x17)") + if nways != [0x17]: + errors.append(f"T3 NWAYS: got {nways}") + await ctx.tick("capture").repeat(HALF) + + # T4: DMA-style SPRAM read — clock 8 data bytes (past the 4-byte header + # limit) within one CS. Exercises the integrated streaming path: + # ExiCapture(cs_active) → register file SPRAM_STREAM → SPRAMArbiter → + # real SPRAM → MISO, plus the SPRAM_END cleanup. SPRAM is uninitialised + # here, so we check the stream completes (8 bytes, no underrun/hang) + # rather than specific data. + dma = await exi_read(ctx, 0x0100, 8) + print(f"T4 DMA read (8B from 0x100): {[f'0x{b:02X}' for b in dma]}") + if len(dma) != 8: + errors.append(f"T4 DMA read length: got {len(dma)}") + await ctx.tick("capture").repeat(HALF) + + # T5: a register read after the streaming read confirms the FSM cleaned + # up (SPRAM_END → HEADER0) and the device is responsive again. 
+ nways2 = await exi_read(ctx, 0x31, 1) + print(f"T5 NWAYS after DMA: 0x{nways2[0]:02X} (want 0x17)") + if nways2 != [0x17]: + errors.append(f"T5 NWAYS after DMA read: got {nways2}") + await ctx.tick("capture").repeat(HALF) + + # ── T6: FULL ETHERNET→SPRAM→GC LOOP ────────────────────────────── + # A frame arrives from the network (W5500 model) → W5500 master reads it + # → RXFrameAssembler writes it to the SPRAM ring → GC reads RWP then + # DMA-reads the descriptor+frame back. Exercises the entire RX path. + # The W5100 needs its init sequence (which sets MR.AI / opens socket 0) + # before multi-byte bus accesses work — trigger it via NCRA reset, as + # the real GC driver does, and let it run before enabling RX. + await exi_write(ctx, 0x00, [0x01]) # NCRA reset → init_req pulse + await ctx.tick("capture").repeat(2000) # let W5100 init run + await exi_write(ctx, 0x00, [0x08]) # NCRA SR bit → enable RX + await ctx.tick("capture").repeat(HALF * 2) + ctx.set(dut.w5100_int_n, 0) # W5100: a packet was received + await ctx.tick("capture").repeat(4000) # let the W5100 RX + SPRAM write run + ctx.set(dut.w5100_int_n, 1) + await ctx.tick("capture").repeat(HALF * 2) + + rwp = await exi_read(ctx, 0x16, 1) # RX write pointer (page) + total_len = len(RX_FRAME) + 4 + got = await exi_read(ctx, 0x0100, total_len) # descriptor + frame + want = [0x00, 0x00, (total_len >> 8) & 0xFF, total_len & 0xFF] + RX_FRAME + print(f"T6 RWP=0x{rwp[0]:02X} (want 0x02)") + print(f"T6 SPRAM[0x100]: {[f'0x{b:02X}' for b in got]}") + print(f"T6 expected : {[f'0x{b:02X}' for b in want]}") + if rwp != [0x02]: + errors.append(f"T6 RWP: got {rwp}, want [0x02]") + if got != want: + errors.append(f"T6 RX frame mismatch:\n got {got}\n want {want}") + + # T7: status-panel integration — after all the EXI traffic above, the + # EXI-activity LED (panel led[1] = stretched cs_active) must be lit, + # proving cap.cs_active → FFSync → StatusPanel → LED is wired end-to-end. 
+ leds = ctx.get(dut.panel_led) + if not (leds >> 1) & 1: + errors.append(f"T7 panel: EXI-activity LED not lit (led=0b{leds:05b})") + print(f"T7 panel led=0b{leds:05b} (bit1=EXI activity, expect 1)") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=54), domain="capture") + sim.add_clock(Period(MHz=24), domain="exi") + sim.add_clock(Period(MHz=24), domain="sync") + sim.add_testbench(testbench) + sim.add_process(w5100_model) + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll BBATop integration tests passed.") diff --git a/exi_bba/eeprom_model.py b/exi_bba/eeprom_model.py new file mode 100644 index 0000000..6e3cd75 --- /dev/null +++ b/exi_bba/eeprom_model.py @@ -0,0 +1,222 @@ +"""EEPROM model — exi domain. + +Emulates the MX98730EC's 93C46 serial EEPROM. + +93C46 protocol (Microwire, bit-bang) +------------------------------------- +CS=1 activates the device. +Data clocked on rising SK edge, 9-bit header then data: + Bit 0: start (always 1) + Bit 1: opcode MSB } READ = 10 + Bit 2: opcode LSB } + Bits 3–8: 6-bit address (MSB first) + +After the 9th rising SK the DO line presents the MSB of the 16-bit word. +Each subsequent rising SK advances one bit (MSB→LSB). + +Shift register `shift_in` convention +-------------------------------------- +`Cat(di_s, shift_in[:-1])` places di_s at bit 0 and shifts existing bits up. 
class EEPROMModel(Elaboratable):
    """93C46 serial EEPROM model in the exi domain (read-only).

    Only the READ opcode is acted on; any other opcode is shifted in and
    ignored. As on a real 93C46, DI=0 clocks that arrive after CS rises but
    before the start bit are ignored, so the 9-bit header count begins at the
    first DI=1.

    Ports
    -----
    sk / cs / di : bit-bang inputs (raw async; synchronized internally)
    do           : serial data output (0 whenever no word is being read out)
    """

    def __init__(self):
        self.sk = Signal()
        self.cs = Signal()
        self.di = Signal()
        self.do = Signal()

    def elaborate(self, platform):
        """Build the synchronizers, header shift-in and word shift-out logic."""
        m = Module()

        # 64 x 16-bit content, selected by the decoded 6-bit address.
        words = Array([Signal(16, init=v, name=f"e{i}") for i, v in enumerate(_EEPROM_WORDS)])

        # ── Input synchronization (async → exi, 2 stages) ────────────────
        sk_s = Signal()
        cs_s = Signal()
        di_s = Signal()
        m.submodules.sync_sk = FFSynchronizer(self.sk, sk_s, o_domain="exi")
        m.submodules.sync_cs = FFSynchronizer(self.cs, cs_s, o_domain="exi")
        m.submodules.sync_di = FFSynchronizer(self.di, di_s, o_domain="exi")

        # One-cycle pulse on each synchronized SK rising edge.
        sk_prev = Signal()
        m.d.exi += sk_prev.eq(sk_s)
        rising_sk = Signal()
        m.d.comb += rising_sk.eq(sk_s & ~sk_prev)

        # ── State ─────────────────────────────────────────────────────────
        shift_in = Signal(9)
        bit_ctr = Signal(4)     # 0..8 during header receive

        shift_out = Signal(16)  # data word being shifted out MSB-first
        out_ctr = Signal(4)     # 0..15, counts bits shifted out
        in_read = Signal()      # 1 while outputting a word

        # DO is combinatorial: MSB of shift_out while in read-out phase
        m.d.comb += self.do.eq(Mux(in_read, shift_out[15], 0))

        with m.If(~cs_s):
            # CS low: abort any transfer and re-arm for a new header.
            m.d.exi += bit_ctr.eq(0)
            m.d.exi += in_read.eq(0)
            m.d.exi += out_ctr.eq(0)

        with m.Elif(rising_sk):
            with m.If(in_read):
                # Shift out next bit (MSB first: left shift, zero into LSB)
                m.d.exi += shift_out.eq(Cat(0, shift_out[:-1]))
                with m.If(out_ctr == 15):
                    m.d.exi += in_read.eq(0)
                    m.d.exi += out_ctr.eq(0)
                with m.Else():
                    m.d.exi += out_ctr.eq(out_ctr + 1)

            # Start-bit gate: while no header bit has been accepted yet
            # (bit_ctr == 0), a DI=0 clock is not a start bit and is skipped.
            with m.Elif((bit_ctr != 0) | di_s):
                # Shift di_s in at bit 0 (existing bits move up)
                m.d.exi += shift_in.eq(Cat(di_s, shift_in[:-1]))
                m.d.exi += bit_ctr.eq(bit_ctr + 1)

                with m.If(bit_ctr == 8):
                    # 9th bit (di_s = addr[0]) arrives.
                    # shift_in[7] = start, [6]=op_MSB, [5]=op_LSB, [4:0]=addr[5:1]
                    op = Cat(shift_in[5], shift_in[6])  # 0b10 for READ
                    adr = Cat(di_s, shift_in[0:5])      # addr[0..5]
                    with m.If(op == _OP_READ):
                        m.d.exi += shift_out.eq(words[adr])
                        m.d.exi += in_read.eq(1)
                        m.d.exi += out_ctr.eq(0)

        return m
+ result = 0 + for _ in range(16): + result = (result << 1) | ctx.get(dut.do) # sample before rising SK + ctx.set(dut.sk, 1) + await ctx.tick("exi").repeat(HALF) + ctx.set(dut.sk, 0) + await ctx.tick("exi").repeat(HALF) + + ctx.set(dut.cs, 0) + await ctx.tick("exi").repeat(HALF) + return result + + async def testbench(ctx): + await ctx.tick("exi").repeat(4) + ctx.set(dut.cs, 0) + ctx.set(dut.sk, 0) + ctx.set(dut.di, 0) + await ctx.tick("exi").repeat(4) + + w0 = await eeprom_read(ctx, 0) + print(f"T1 word 0 = 0x{w0:04X} (expected 0x0009)") + if w0 != 0x0009: + errors.append(f"T1: word 0 = 0x{w0:04X}, expected 0x0009") + + w1 = await eeprom_read(ctx, 1) + print(f"T2 word 1 = 0x{w1:04X} (expected 0xBFAA)") + if w1 != 0xBFAA: + errors.append(f"T2: word 1 = 0x{w1:04X}, expected 0xBFAA") + + w2 = await eeprom_read(ctx, 2) + print(f"T3 word 2 = 0x{w2:04X} (expected 0xBBCC)") + if w2 != 0xBBCC: + errors.append(f"T3: word 2 = 0x{w2:04X}, expected 0xBBCC") + + # T4: word 3 → 0x0000 + w3 = await eeprom_read(ctx, 3) + print(f"T4 word 3 = 0x{w3:04X} (expected 0x0000)") + if w3 != 0x0000: + errors.append(f"T4: word 3 = 0x{w3:04X}, expected 0x0000") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=24), domain="exi") + sim.add_testbench(testbench) + + with sim.write_vcd("EEPROMModel.vcd"): + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/exi_capture.py b/exi_bba/exi_capture.py new file mode 100644 index 0000000..93148d0 --- /dev/null +++ b/exi_bba/exi_capture.py @@ -0,0 +1,269 @@ +"""ExiCapture — fast EXI byte-capture front-end (capture domain, 54 MHz). 
class ExiCapture(Elaboratable):
    """EXI front-end: SPI bit engine (capture domain) + byte FIFOs to core.

    "Core domain" below is the `exi` clock domain: both FIFOs have their
    core-facing side in `exi` and their SPI-facing side in `capture`.

    Parameters
    ----------
    rx_depth : depth of the capture→exi received-byte AsyncFIFO (default 4)
    tx_depth : depth of the exi→capture response-byte AsyncFIFO (default 2)

    Physical SPI pins (capture domain)
    ----------------------------------
    spi_clk / spi_mosi / spi_cs_n : raw async inputs from the GC
    spi_miso                      : output to the GC

    Core-facing RX byte stream (core domain, FWFT read side of rx_fifo)
    ------------------------------------------------------------------
    rx_data : current received byte
    rx_rdy  : a received byte is available
    rx_en   : pop (assert for one core cycle to consume rx_data)

    Core-facing TX byte stream (core domain, write side of tx_fifo)
    --------------------------------------------------------------
    tx_data : response byte to enqueue
    tx_en   : write strobe
    tx_rdy  : tx_fifo has room

    Core-facing status (core domain)
    --------------------------------
    cs_active : the bit engine's cs_active, synchronized into `exi`
    """

    def __init__(self, rx_depth=4, tx_depth=2):
        # FIFO depths are only stored here; the FIFOs are built in elaborate().
        self._rx_depth = rx_depth
        self._tx_depth = tx_depth

        # Physical SPI (capture domain, wired to pins by BBATop)
        self.spi_clk = Signal(init=1)
        self.spi_mosi = Signal()
        self.spi_cs_n = Signal(init=1)
        self.spi_miso = Signal()

        # Core-facing RX read side
        self.rx_data = Signal(8)
        self.rx_rdy = Signal()
        self.rx_en = Signal()

        # Core-facing TX write side
        self.tx_data = Signal(8)
        self.tx_en = Signal()
        self.tx_rdy = Signal()

        # Core-facing: high (exi domain) while a transaction is in progress.
        # The register file uses it to stream variable-length (DMA) reads until
        # CS deasserts.
        self.cs_active = Signal()

    def elaborate(self, platform):
        """Instantiate the bit engine and both FIFOs, then wire the RX path,
        the TX path, and the capture-domain pop/flush gating for tx_fifo."""
        m = Module()

        spi = SPIMode3Slave(domain="capture")
        m.submodules.spi = spi

        # rx_fifo carries received bytes capture→exi; tx_fifo carries response
        # bytes exi→capture.
        rx_fifo = AsyncFIFO(width=8, depth=self._rx_depth,
                            w_domain="capture", r_domain="exi")
        tx_fifo = AsyncFIFO(width=8, depth=self._tx_depth,
                            w_domain="exi", r_domain="capture")
        m.submodules.rx_fifo = rx_fifo
        m.submodules.tx_fifo = tx_fifo

        # cs_active (capture) → exi domain for the register file
        m.submodules.cs_sync = FFSynchronizer(spi.cs_active, self.cs_active,
                                              o_domain="exi")

        # ── Physical pins ↔ bit engine ───────────────────────────────────
        m.d.comb += [
            spi.spi_clk .eq(self.spi_clk),
            spi.spi_mosi.eq(self.spi_mosi),
            spi.spi_cs_n.eq(self.spi_cs_n),
            self.spi_miso.eq(spi.spi_miso),
        ]

        # ── RX: every received byte → rx_fifo (capture write side) ───────
        # NOTE(review): rx_fifo.w_rdy is not consulted here, so this relies
        # on the core draining rx_fifo fast enough — confirm.
        m.d.comb += [
            rx_fifo.w_data.eq(spi.rx_byte),
            rx_fifo.w_en .eq(spi.rx_valid),
        ]
        # Core read side
        m.d.comb += [
            self.rx_data .eq(rx_fifo.r_data),
            self.rx_rdy .eq(rx_fifo.r_rdy),
            rx_fifo.r_en .eq(self.rx_en),
        ]

        # ── TX: core write side ──────────────────────────────────────────
        m.d.comb += [
            tx_fifo.w_data.eq(self.tx_data),
            tx_fifo.w_en .eq(self.tx_en),
            self.tx_rdy .eq(tx_fifo.w_rdy),
        ]

        # ── TX response gating (capture domain) ──────────────────────────
        # The bit engine drives MISO LIVE from tx_byte = tx_fifo head, so the
        # response byte at the head is what gets sent for the current data byte.
        # `txld_cnt` counts completed bytes within the transaction (tx_load
        # pulses at each byte completion):
        #   completion 0,1 → header bytes (no pop)
        #   completion ≥2  → a data byte finished → pop to advance the head
        # The first data byte (data0) is served live from the head without a
        # pop; the pop after it advances the head to data1's response, etc.
        txld_cnt = Signal(2)

        m.d.comb += spi.tx_byte.eq(tx_fifo.r_data)

        # Pop depends ONLY on the registered tx_load and txld_cnt — NOT on
        # frame_start. (frame_start precedes byte-0's tx_load by a cycle and
        # has already reset txld_cnt to 0, so byte 0 is never a data byte.)
        # Keeping cs_fall/frame_start off the pop path shortens the capture-
        # domain critical path through the FIFO consume pointer.
        #
        # `flushing` clears prefetch over-push left in tx_fifo by the previous
        # transaction: the register file streams response bytes ahead of the GC
        # clock for DMA reads, so when CS deasserts mid-stream a few unsent
        # bytes remain. On CS-fall (frame_start) drain tx_fifo to empty before
        # the new transaction's data phase, so stale bytes never reach MISO.
        flushing = Signal()
        m.d.comb += tx_fifo.r_en.eq(
            (spi.tx_load & (txld_cnt >= 2)) | (flushing & tx_fifo.r_rdy)
        )
        # flushing is set by frame_start and cleared on the first capture
        # cycle in which tx_fifo reports no readable byte.
        with m.If(spi.frame_start):
            m.d.capture += flushing.eq(1)
        with m.Elif(~tx_fifo.r_rdy):
            m.d.capture += flushing.eq(0)

        # txld_cnt restarts at each frame_start and saturates at 3, so every
        # completion from the third byte onward satisfies `txld_cnt >= 2`.
        with m.If(spi.frame_start):
            m.d.capture += txld_cnt.eq(0)
        with m.Elif(spi.tx_load & (txld_cnt < 3)):
            m.d.capture += txld_cnt.eq(txld_cnt + 1)

        return m
+ HALF = 4 + + async def spi_byte(ctx, mosi_val): + """Clock one SPI Mode 3 byte; return the assembled MISO byte.""" + miso = 0 + for bit in range(7, -1, -1): + ctx.set(dut.spi_mosi, (mosi_val >> bit) & 1) + ctx.set(dut.spi_clk, 0) + await ctx.tick("capture").repeat(HALF) + miso = (miso << 1) | ctx.get(dut.spi_miso) + ctx.set(dut.spi_clk, 1) + await ctx.tick("capture").repeat(HALF) + return miso + + async def core_drain_rx(ctx, into): + """Pop one byte from the core RX side if available.""" + if ctx.get(dut.rx_rdy): + into.append(ctx.get(dut.rx_data)) + ctx.set(dut.rx_en, 1) + await ctx.tick("exi").repeat(1) + ctx.set(dut.rx_en, 0) + return True + return False + + async def push_tx(ctx, b): + ctx.set(dut.tx_data, b) + ctx.set(dut.tx_en, 1) + await ctx.tick("exi").repeat(1) + ctx.set(dut.tx_en, 0) + + async def do_txn(ctx, hdr, responses, n_data, rx_seen): + """One EXI transaction: clock `hdr` bytes, model the clock-idle gap + (drain rx + prefetch `responses` into tx_fifo), then clock `n_data` + data bytes; return the MISO data bytes read.""" + ctx.set(dut.spi_cs_n, 0) + ctx.set(dut.spi_clk, 1) + await ctx.tick("capture").repeat(HALF) + for h in hdr: + await spi_byte(ctx, h) + for _ in range(20): # clock-idle gap + await core_drain_rx(ctx, rx_seen) + await ctx.tick("exi").repeat(1) + for r in responses: + await push_tx(ctx, r) + await ctx.tick("capture").repeat(2) + miso = [await spi_byte(ctx, 0x00) for _ in range(n_data)] + ctx.set(dut.spi_cs_n, 1) + await ctx.tick("capture").repeat(HALF) + for _ in range(20): # drain data-phase dummies + await core_drain_rx(ctx, rx_seen) + await ctx.tick("exi").repeat(1) + return miso + + async def testbench(ctx): + rx_seen = [] + await ctx.tick("capture").repeat(2) + + # ── T1: header + 2 data bytes read back ────────────────────────── + miso = await do_txn(ctx, [0x12, 0x34], [0xA5, 0x5A], 2, rx_seen) + print(f"T1 rx={[hex(b) for b in rx_seen[:2]]} MISO={[f'0x{b:02X}' for b in miso]}") + if rx_seen[:2] != [0x12, 0x34]: + 
errors.append(f"T1 header rx wrong: {rx_seen[:2]}") + if miso != [0xA5, 0x5A]: + errors.append(f"T1 MISO wrong: {[hex(b) for b in miso]}") + + # ── T2: prefetch over-push must NOT leak into the next transaction ─ + # Txn A pushes 2 responses but the GC clocks only 1 data byte, leaving + # one stale byte in tx_fifo. Txn B must read its OWN fresh responses, + # proving the CS-fall flush cleared the stale prefetch. + rx_seen.clear() + await do_txn(ctx, [0x12, 0x34], [0xA5, 0x5A], 1, rx_seen) # leaves 0x5A + misoB = await do_txn(ctx, [0x12, 0x34], [0x11, 0x22], 2, rx_seen) + print(f"T2 MISO after over-push: {[f'0x{b:02X}' for b in misoB]} (want 0x11 0x22)") + if misoB != [0x11, 0x22]: + errors.append(f"T2 flush failed — stale byte leaked: {[hex(b) for b in misoB]}") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=54), domain="capture") + sim.add_clock(Period(MHz=24), domain="exi") + sim.add_testbench(testbench) + + with sim.write_vcd("ExiCapture.vcd"): + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/rx_frame_assembler.py b/exi_bba/rx_frame_assembler.py new file mode 100644 index 0000000..e08a5a1 --- /dev/null +++ b/exi_bba/rx_frame_assembler.py @@ -0,0 +1,312 @@ +"""RX frame assembler — sync domain (24 MHz). + +Receives raw ethernet frames from W5500SPIMaster and writes them into the SPRAM +ring buffer in MX98730EC format. + +Ring buffer layout (SPRAM byte addresses) +------------------------------------------ +0x0100–0x0FFF 15 pages × 256 bytes = 3840 bytes +Pages 0x01–0x0F; page 0x00 is reserved. +Page wrap: after 0x0F → 0x01 (skip 0x00). 
class RXFrameAssembler(Elaboratable):
    """Writes incoming ethernet frames into the SPRAM ring buffer.

    Each frame is stored as a 4-byte descriptor followed by the raw frame
    bytes, starting at the byte address ``rwp << 8`` (the start of the current
    write page). Data addresses wrap from the last byte of page
    ``_RX_PAGE_LAST`` back to the first byte of page ``_RX_PAGE_FIRST``, the
    same way ``rwp`` itself wraps when it is advanced, so a frame that reaches
    the end of the ring continues at its start instead of running past it.

    W5500 streaming interface (sync domain)
    ----------------------------------------
    rx_data / rx_valid / rx_ready : byte stream
    rx_sof / rx_eof               : frame delimiters (same cycle as rx_valid)

    SPRAM write interface (to SPRAMArbiter, sync domain)
    -----------------------------------------------------
    eth_wr_addr / eth_wr_data / eth_wr_valid / eth_wr_ready

    CDC outputs (wired by BBATop)
    -----------------------------
    rx_wptr_w_data / rx_wptr_w_en / rx_wptr_w_rdy
    rx_irq      : 1-cycle pulse → PulseSynchronizer input
    rx_enabled  : controlled by NCRA SR bit (from BBARegisterFile)

    Notes
    -----
    ``rx_ready`` is low in every state except RECV_AND_WRITE, so the byte
    presented together with ``rx_sof`` only triggers the frame start; it is
    not itself handshaken in that cycle.
    """

    def __init__(self):
        # W5500 stream in
        self.rx_data  = Signal(8)
        self.rx_valid = Signal()
        self.rx_ready = Signal()
        self.rx_sof   = Signal()
        self.rx_eof   = Signal()

        # SPRAM write out
        self.eth_wr_addr  = Signal(16)
        self.eth_wr_data  = Signal(8)
        self.eth_wr_valid = Signal()
        self.eth_wr_ready = Signal()

        # RWP FIFO write-side (sync→exi)
        self.rx_wptr_w_data = Signal(8)
        self.rx_wptr_w_en   = Signal()
        self.rx_wptr_w_rdy  = Signal()

        # rx_irq pulse (→ PulseSynchronizer)
        self.rx_irq = Signal()

        # RX gate from NCRA SR bit
        self.rx_enabled = Signal()

    def elaborate(self, platform):
        """Build the frame-assembly FSM and return the populated ``Module``."""
        m = Module()

        # Byte-address bounds of the ring (first byte of the first page and
        # last byte of the last page), derived from the page constants.
        ring_first_addr = _RX_PAGE_FIRST << 8
        ring_last_addr  = (_RX_PAGE_LAST << 8) | 0xFF

        # ── Ring-buffer state ─────────────────────────────────────────────
        rwp = Signal(8, init=_RX_PAGE_FIRST)   # current RX write page (1–15)

        # Write address within current frame
        wr_addr = Signal(16)
        # Successor of wr_addr inside the ring: wraps last byte → first byte.
        wr_addr_next = Signal(16)
        # Number of frame data bytes received
        data_ctr = Signal(12)
        # Total length = data_ctr + 4
        total_len = Signal(12)

        # Descriptor base (rwp*256) — saved when frame starts
        desc_base = Signal(16)
        # Byte address of the current write page (rwp in the high byte)
        frame_base = Signal(16)

        # Placeholder descriptor byte counter (0..3)
        desc_ctr = Signal(2)

        # Number of pages consumed by this frame (rounded up)
        pages_used = Signal(5)

        # Default: no pulses. The FSM assignments further down take
        # precedence in the cycles where a pulse is emitted.
        m.d.sync += self.rx_irq.eq(0)
        m.d.sync += self.rx_wptr_w_en.eq(0)

        # Combinatorial helpers
        m.d.comb += total_len.eq(data_ctr + 4)
        m.d.comb += frame_base.eq(Cat(Const(0, 8), rwp))
        m.d.comb += wr_addr_next.eq(
            Mux(wr_addr == ring_last_addr, ring_first_addr, wr_addr + 1)
        )

        with m.FSM(domain="sync", name="rx_fsm"):

            with m.State("IDLE"):
                m.d.comb += self.rx_ready.eq(0)
                m.d.sync += self.eth_wr_valid.eq(0)
                with m.If(self.rx_valid & self.rx_sof & self.rx_enabled):
                    m.d.sync += desc_base.eq(frame_base)
                    m.d.sync += wr_addr.eq(frame_base)
                    m.d.sync += data_ctr.eq(0)
                    m.d.sync += desc_ctr.eq(0)
                    m.next = "WRITE_PLACEHOLDER"

            with m.State("WRITE_PLACEHOLDER"):
                # Write 0x00 placeholder bytes for the descriptor. The state's
                # cycle count is deliberately left as in the original design;
                # descriptor bytes 2 and 3 are rewritten with the real length
                # in WRITE_LEN_HI / WRITE_LEN_LO once the frame is complete.
                m.d.sync += self.eth_wr_addr.eq(wr_addr)
                m.d.sync += self.eth_wr_data.eq(0x00)
                m.d.sync += self.eth_wr_valid.eq(1)
                with m.If(self.eth_wr_ready):
                    # The descriptor sits at the start of a page, so this
                    # never actually hits the wrap; wr_addr_next is used only
                    # for consistency with the data path.
                    m.d.sync += wr_addr.eq(wr_addr_next)
                    with m.If(desc_ctr == 3):
                        m.d.sync += self.eth_wr_valid.eq(0)
                        m.next = "RECV_AND_WRITE"
                    with m.Else():
                        m.d.sync += desc_ctr.eq(desc_ctr + 1)

            with m.State("RECV_AND_WRITE"):
                # Accept bytes from W5500 and write each to SPRAM immediately.
                # A new byte is taken whenever the write register is free or
                # is being drained in this same cycle.
                m.d.comb += self.rx_ready.eq(~self.eth_wr_valid | self.eth_wr_ready)
                with m.If(self.rx_valid & (~self.eth_wr_valid | self.eth_wr_ready)):
                    m.d.sync += self.eth_wr_addr.eq(wr_addr)
                    m.d.sync += self.eth_wr_data.eq(self.rx_data)
                    m.d.sync += self.eth_wr_valid.eq(1)
                    # Wrap inside the ring instead of running past its end.
                    m.d.sync += wr_addr.eq(wr_addr_next)
                    m.d.sync += data_ctr.eq(data_ctr + 1)
                    with m.If(self.rx_eof):
                        m.next = "WAIT_LAST_WRITE"
                with m.Elif(self.eth_wr_valid & self.eth_wr_ready):
                    m.d.sync += self.eth_wr_valid.eq(0)

            with m.State("WAIT_LAST_WRITE"):
                # Wait for the last data byte write to be accepted
                with m.If(~self.eth_wr_valid | self.eth_wr_ready):
                    m.d.sync += self.eth_wr_valid.eq(0)
                    # Compute pages used: ceil((data_ctr + 4) / 256)
                    #   = total_len[11:8] + (total_len[7:0] != 0)
                    m.d.sync += pages_used.eq(total_len[8:12] + (total_len[:8] != 0))
                    m.next = "WRITE_LEN_HI"

            with m.State("WRITE_LEN_HI"):
                # Overwrite descriptor byte 2 with the high bits of total_len
                m.d.sync += self.eth_wr_addr.eq(desc_base + 2)
                m.d.sync += self.eth_wr_data.eq(total_len[8:12])
                m.d.sync += self.eth_wr_valid.eq(1)
                with m.If(self.eth_wr_ready):
                    m.d.sync += self.eth_wr_valid.eq(0)
                    m.next = "WRITE_LEN_LO"

            with m.State("WRITE_LEN_LO"):
                # Overwrite descriptor byte 3 with total_len[7:0]
                m.d.sync += self.eth_wr_addr.eq(desc_base + 3)
                m.d.sync += self.eth_wr_data.eq(total_len[:8])
                m.d.sync += self.eth_wr_valid.eq(1)
                with m.If(self.eth_wr_ready):
                    m.d.sync += self.eth_wr_valid.eq(0)
                    m.next = "ADVANCE_RWP"

            with m.State("ADVANCE_RWP"):
                # next_rwp = ((rwp - 1 + pages_used) % 15) + 1
                next_rwp_raw = Signal(8)
                m.d.comb += next_rwp_raw.eq(rwp + pages_used)
                with m.If(next_rwp_raw > _RX_PAGE_LAST):
                    m.d.sync += rwp.eq(next_rwp_raw - _PAGES_TOTAL)
                with m.Else():
                    m.d.sync += rwp.eq(next_rwp_raw)
                m.next = "PUSH_WPT"

            with m.State("PUSH_WPT"):
                # Publish the already-advanced rwp and raise the interrupt
                # pulse once the pointer FIFO can take it.
                with m.If(self.rx_wptr_w_rdy):
                    m.d.sync += self.rx_wptr_w_data.eq(rwp)
                    m.d.sync += self.rx_wptr_w_en.eq(1)
                    m.d.sync += self.rx_irq.eq(1)
                    m.next = "IDLE"

        return m
amaranth.sim import Simulator, Period + + dut = RXFrameAssembler() + errors = [] + + # Track all SPRAM writes issued by the DUT + spram_writes = [] + + async def testbench(ctx): + # Setup: acknowledge all SPRAM writes immediately + ctx.set(dut.eth_wr_ready, 1) + ctx.set(dut.rx_wptr_w_rdy, 1) + ctx.set(dut.rx_enabled, 1) + await ctx.tick("sync").repeat(2) + + # ── T1: 10-byte frame → pages_used=1, rwp advances 1→2 ────────────── + # Send SOF + first byte + frame = [0xAA, 0xBB, 0xCC, 0xDD, 0x08, 0x00, 0x45, 0x00, 0x00, 0x01] + + ctx.set(dut.rx_data, frame[0]) + ctx.set(dut.rx_valid, 1) + ctx.set(dut.rx_sof, 1) + await ctx.tick("sync").repeat(1) + ctx.set(dut.rx_sof, 0) + + for i, b in enumerate(frame[1:], start=1): + ctx.set(dut.rx_data, b) + ctx.set(dut.rx_eof, 1 if i == len(frame) - 1 else 0) + await ctx.tick("sync").repeat(1) + + ctx.set(dut.rx_valid, 0) + ctx.set(dut.rx_eof, 0) + + # Poll for up to 30 ticks until rx_irq pulses (1-cycle pulse) + t1_irq_seen = False + t1_wptr_d = 0 + for _ in range(30): + await ctx.tick("sync").repeat(1) + if ctx.get(dut.rx_irq): + t1_irq_seen = True + t1_wptr_d = ctx.get(dut.rx_wptr_w_data) + break + + print(f"T1 rx_irq_seen={t1_irq_seen} wptr_data=0x{t1_wptr_d:02X}") + if not t1_irq_seen: + errors.append("T1: rx_irq never pulsed") + if t1_wptr_d != 2: + errors.append(f"T1: rwp should be 2 (page 1→2), got {t1_wptr_d}") + + await ctx.tick("sync").repeat(4) + + # ── T2: Send a second frame; verify rwp advances further ──────────── + frame2 = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66] + ctx.set(dut.rx_data, frame2[0]) + ctx.set(dut.rx_valid, 1) + ctx.set(dut.rx_sof, 1) + await ctx.tick("sync").repeat(1) + ctx.set(dut.rx_sof, 0) + + for i, b in enumerate(frame2[1:], start=1): + ctx.set(dut.rx_data, b) + ctx.set(dut.rx_eof, 1 if i == len(frame2) - 1 else 0) + await ctx.tick("sync").repeat(1) + + ctx.set(dut.rx_valid, 0) + ctx.set(dut.rx_eof, 0) + + t2_irq_seen = False + t2_wptr_d = 0 + for _ in range(30): + await ctx.tick("sync").repeat(1) + 
class SPIMode3Slave(Elaboratable):
    """Byte-oriented SPI Mode 3 slave.

    The whole engine runs in the clock domain named by the ``domain``
    constructor argument; the three raw inputs are synchronized into that
    domain internally.

    Ports
    -----
    spi_clk / spi_mosi / spi_cs_n : raw async inputs from GC (synchronized internally)
    spi_miso  : output to GC; idles HIGH when CS deasserted
    rx_byte   : last complete received byte (valid when rx_valid pulses)
    rx_valid  : 1-cycle pulse (engine domain) when rx_byte contains a new byte
    tx_byte   : byte to transmit; its MSB is driven live on MISO while the
                engine is armed, and it is latched on the first falling edge
                of each byte
    tx_load   : 1-cycle pulse (engine domain) when a byte completes,
                requesting the next TX byte from upstream
    frame_start : 1-cycle pulse on CS assertion
    cs_active   : level, high while the synchronized CS is asserted
    """

    def __init__(self, domain="capture"):
        # Clock domain this byte engine runs in. Split-domain design puts the
        # bit engine in a fast `capture` domain (54 MHz) so it can oversample
        # a 27 MHz EXI clock; the register file lives in a slower domain.
        self._domain = domain

        self.spi_clk  = Signal(init=1)   # idles HIGH
        self.spi_mosi = Signal()
        self.spi_cs_n = Signal(init=1)   # active LOW

        self.spi_miso = Signal()         # combinatorial output

        self.rx_byte  = Signal(8)
        self.rx_valid = Signal()
        self.tx_byte  = Signal(8)
        self.tx_load  = Signal()

        # 1-cycle pulse on CS assertion (transaction start). The capture
        # wrapper uses it to reset its per-transaction TX byte counter.
        self.frame_start = Signal()

        # Level: high while CS is asserted (a transaction is in progress).
        # Lets downstream logic detect variable-length (DMA) transaction ends.
        self.cs_active = Signal()

    def elaborate(self, platform):
        """Build the bit engine and return the populated ``Module``."""
        m = Module()
        d = self._domain

        # ── Input synchronization (async → engine domain, 2 stages) ───────
        clk_s  = Signal(init=1)
        mosi_s = Signal()
        cs_s   = Signal(init=1)

        m.submodules.sync_clk  = FFSynchronizer(self.spi_clk,  clk_s,  o_domain=d, init=1)
        m.submodules.sync_mosi = FFSynchronizer(self.spi_mosi, mosi_s, o_domain=d)
        m.submodules.sync_cs   = FFSynchronizer(self.spi_cs_n, cs_s,   o_domain=d, init=1)

        # ── Edge detection ────────────────────────────────────────────────
        clk_prev = Signal(init=1)
        cs_prev  = Signal(init=1)
        m.d[d] += clk_prev.eq(clk_s)
        m.d[d] += cs_prev.eq(cs_s)

        falling_clk = Signal()
        rising_clk  = Signal()
        cs_fall     = Signal()
        cs_rise     = Signal()
        m.d.comb += falling_clk.eq(~clk_s &  clk_prev)
        m.d.comb += rising_clk .eq( clk_s & ~clk_prev)
        m.d.comb += cs_fall    .eq(~cs_s  &  cs_prev)
        m.d.comb += cs_rise    .eq( cs_s  & ~cs_prev)
        m.d.comb += self.frame_start.eq(cs_fall)
        m.d.comb += self.cs_active.eq(~cs_s)

        # ── Shift registers ───────────────────────────────────────────────
        rx_shift = Signal(8)
        tx_shift = Signal(8)
        bit_ctr  = Signal(4)        # counts 0..7; 7 means "8th (last) bit"
        armed    = Signal(init=1)   # between bytes: drive the LIVE tx_byte MSB
        rearm    = Signal()         # arm for next byte on the next rising edge

        # MISO: idle HIGH when CS deasserted. While "armed" — i.e. at the start
        # of a byte, including the inter-byte / clock-idle gap before the first
        # falling edge — drive the LIVE tx_byte MSB. This is what lets a
        # response that upstream pushes DURING the EXI clock-idle gap reach MISO
        # in time: there is no clock edge during the gap to latch it, so MISO
        # must be combinational on tx_byte until the byte actually starts. Once
        # shifting (after the first falling edge) drive the latched shift reg.
        m.d.comb += self.spi_miso.eq(
            Mux(cs_s, 1, Mux(armed, self.tx_byte[7], tx_shift[7]))
        )

        # Default: deassert single-cycle pulses every cycle
        m.d[d] += self.rx_valid.eq(0)
        m.d[d] += self.tx_load.eq(0)

        with m.If(cs_fall):
            # Transaction start: first byte drives its MSB live (armed).
            # `rearm` is cleared as well so that a pending re-arm left over
            # from an aborted transaction (CS released after the 8th falling
            # edge but before the following rising edge) cannot fire in the
            # middle of this transaction's first byte.
            m.d[d] += bit_ctr.eq(0)
            m.d[d] += armed.eq(1)
            m.d[d] += rearm.eq(0)

        with m.Elif(cs_rise | cs_s):
            # CS deasserted / idle: reset all per-byte state
            m.d[d] += bit_ctr.eq(0)
            m.d[d] += armed.eq(1)
            m.d[d] += rearm.eq(0)

        with m.Else():
            # CS asserted: run bit engine
            with m.If(falling_clk):
                # Sample MOSI (MSB first: left-shift, new bit enters at LSB)
                # Cat(a, b) → a at lower bits; so Cat(mosi, rx[6:0]) = {rx[6:0], mosi}
                m.d[d] += rx_shift.eq(Cat(mosi_s, rx_shift[:-1]))

                with m.If(armed):
                    # First falling edge of this byte: master has just sampled
                    # the MSB (driven live above). Latch tx_byte so the
                    # remaining 7 bits shift out of a stable register.
                    m.d[d] += tx_shift.eq(self.tx_byte)
                    m.d[d] += armed.eq(0)

                with m.If(bit_ctr == 7):
                    # 8th falling edge: byte complete. The master samples the
                    # LSB on THIS edge, so MISO must still hold tx_shift[7].
                    # Defer arming to the next rising edge (rearm) so MISO is
                    # not switched to the next byte's live MSB too early.
                    m.d[d] += self.rx_byte.eq(Cat(mosi_s, rx_shift[:-1]))
                    m.d[d] += self.rx_valid.eq(1)
                    m.d[d] += bit_ctr.eq(0)
                    m.d[d] += self.tx_load.eq(1)   # advance source to next byte
                    m.d[d] += rearm.eq(1)          # arm on the next rising edge
                with m.Else():
                    m.d[d] += bit_ctr.eq(bit_ctr + 1)

            with m.If(rising_clk):
                with m.If(rearm):
                    # Byte boundary: arm for the next byte (live MSB drive).
                    m.d[d] += armed.eq(1)
                    m.d[d] += rearm.eq(0)
                with m.Elif(~armed):
                    # Shift left: next bit into MSB position
                    # Cat(0, tx[6:0]) = {tx[6:0], 0} — left shift
                    m.d[d] += tx_shift.eq(Cat(0, tx_shift[:-1]))

        return m
+ if bit == 0 and next_tx_byte is not None: + ctx.set(dut.tx_byte, next_tx_byte) + ctx.set(dut.spi_clk, 1) # rising edge + await ctx.tick("capture").repeat(HALF) + return miso_byte + + errors = [] + + async def testbench(ctx): + # ── Test 1: Single byte TX/RX ────────────────────────────────────── + ctx.set(dut.spi_cs_n, 0) + ctx.set(dut.spi_clk, 1) + ctx.set(dut.tx_byte, 0xA5) # pre-load before CS fall is detected + await ctx.tick("capture").repeat(HALF) + + miso = await spi_send_byte(ctx, 0x37) + await ctx.tick("capture").repeat(2) + rx = ctx.get(dut.rx_byte) + + ctx.set(dut.spi_cs_n, 1) + await ctx.tick("capture").repeat(HALF) + + if rx != 0x37: + errors.append(f"Test1 rx_byte: expected 0x37, got 0x{rx:02X}") + if miso != 0xA5: + errors.append(f"Test1 miso: expected 0xA5, got 0x{miso:02X}") + print(f"Test1 – MOSI→rx_byte: 0x{rx:02X} MISO←tx_byte: 0x{miso:02X}") + + await ctx.tick("capture").repeat(HALF) + + # ── Test 2: Two-byte transaction; second byte loaded via need_reload ─ + ctx.set(dut.spi_cs_n, 0) + ctx.set(dut.tx_byte, 0xBE) # first response byte + await ctx.tick("capture").repeat(HALF) + + # Pass next_tx_byte=0xEF so it's set after last falling edge of byte 0, + # giving need_reload time to load it on the subsequent rising edge. 
+ miso0 = await spi_send_byte(ctx, 0x00, next_tx_byte=0xEF) + miso1 = await spi_send_byte(ctx, 0xFF) + + await ctx.tick("capture").repeat(2) + rx1 = ctx.get(dut.rx_byte) + + ctx.set(dut.spi_cs_n, 1) + await ctx.tick("capture").repeat(HALF) + + if miso0 != 0xBE: + errors.append(f"Test2 miso0: expected 0xBE, got 0x{miso0:02X}") + if miso1 != 0xEF: + errors.append(f"Test2 miso1: expected 0xEF, got 0x{miso1:02X}") + if rx1 != 0xFF: + errors.append(f"Test2 rx1: expected 0xFF, got 0x{rx1:02X}") + print(f"Test2 – byte0 MISO: 0x{miso0:02X} byte1 MISO: 0x{miso1:02X} rx1: 0x{rx1:02X}") + + await ctx.tick("capture").repeat(HALF) + + # ── Test 3: MISO idles HIGH when CS deasserted ───────────────────── + miso_idle = ctx.get(dut.spi_miso) + if miso_idle != 1: + errors.append(f"Test3 MISO idle: expected 1, got {miso_idle}") + print(f"Test3 – MISO idle (CS=1): {miso_idle}") + + # ── Test 4: All-zeros byte (0x00) TX and RX ──────────────────────── + ctx.set(dut.spi_cs_n, 0) + ctx.set(dut.tx_byte, 0x00) + await ctx.tick("capture").repeat(HALF) + + miso = await spi_send_byte(ctx, 0xFF) + await ctx.tick("capture").repeat(2) + rx = ctx.get(dut.rx_byte) + ctx.set(dut.spi_cs_n, 1) + await ctx.tick("capture").repeat(HALF) + + if miso != 0x00: + errors.append(f"Test4 miso: expected 0x00, got 0x{miso:02X}") + if rx != 0xFF: + errors.append(f"Test4 rx: expected 0xFF, got 0x{rx:02X}") + print(f"Test4 – 0x00 TX / 0xFF RX: MISO=0x{miso:02X} rx=0x{rx:02X}") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=54), domain="capture") + sim.add_testbench(testbench) + + with sim.write_vcd("SPIMode3Slave.vcd"): + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + raise SystemExit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/spram_arbiter.py b/exi_bba/spram_arbiter.py new file mode 100644 index 0000000..f1ddd92 --- /dev/null +++ b/exi_bba/spram_arbiter.py @@ -0,0 +1,276 @@ +"""SPRAM arbiter — sync domain (24 MHz). 
class SPRAMArbiter(Elaboratable):
    """Two-client SPRAM controller for the sync domain.

    An ETH byte write and an EXI byte read compete for one memory port each
    cycle. A pending write always wins; a read is granted only in cycles with
    no write request. A granted read produces its byte one cycle after the
    response-valid flag is scheduled, i.e. ``exi_rsp_valid`` / ``exi_rsp_data``
    are both registered and appear two clock edges after the request is
    accepted.

    EXI read interface (from BBARegisterFile spram_req / spram_rsp FIFOs)
    ----------------------------------------------------------------------
    exi_req_addr  : 16-bit byte address to read
    exi_req_valid : a request is waiting
    exi_req_ready : high in the cycle the request is accepted
    exi_rsp_data  : 8-bit result byte (registered)
    exi_rsp_valid : 1-cycle strobe marking exi_rsp_data as new

    ETH write interface (from RXFrameAssembler)
    -------------------------------------------
    eth_wr_addr   : 16-bit byte address to write
    eth_wr_data   : 8-bit byte value
    eth_wr_valid  : write request present
    eth_wr_ready  : write accepted this cycle (combinational on eth_wr_valid)
    """

    def __init__(self):
        # Read client (EXI side)
        self.exi_req_addr  = Signal(16)
        self.exi_req_valid = Signal()
        self.exi_req_ready = Signal()
        self.exi_rsp_data  = Signal(8)
        self.exi_rsp_valid = Signal()

        # Write client (ethernet side)
        self.eth_wr_addr  = Signal(16)
        self.eth_wr_data  = Signal(8)
        self.eth_wr_valid = Signal()
        self.eth_wr_ready = Signal()

    def elaborate(self, platform):
        """Build memory (primitive or behavioural model) plus the arbiter."""
        m = Module()

        # ── Memory port signals ───────────────────────────────────────────
        # NOTE(review): the word address is 14 bits wide while the byte
        # address shifted right by one is 15 bits, so the top bit is dropped
        # on assignment — confirm the address map never relies on it.
        spram_addr = Signal(14)   # word address (byte_addr >> 1)
        spram_din  = Signal(16)
        spram_dout = Signal(16)
        spram_wren = Signal()
        spram_mask = Signal(4)    # MASKWREN

        if platform is not None:
            # Real hardware: a single SB_SPRAM256KA primitive.
            m.submodules.spram = Instance(
                "SB_SPRAM256KA",
                i_ADDRESS    = spram_addr,
                i_DATAIN     = spram_din,
                i_MASKWREN   = spram_mask,
                i_WREN       = spram_wren,
                i_CHIPSELECT = Const(1, 1),
                i_CLOCK      = ClockSignal("sync"),
                i_STANDBY    = Const(0, 1),
                i_SLEEP      = Const(0, 1),
                i_POWEROFF   = Const(1, 1),
                o_DATAOUT    = spram_dout,
            )
        else:
            # Simulation: behavioural memory with a synchronous read port and
            # a byte-granular write port.
            mem = Memory(shape=16, depth=_SPRAM_WORDS, init=[])
            m.submodules.mem = mem
            mem_rd = mem.read_port(domain="sync", transparent_for=[])
            mem_wr = mem.write_port(domain="sync", granularity=8)

            # One enable per byte lane, derived from the nibble mask pairs.
            byte0_en = Signal()
            byte1_en = Signal()
            m.d.comb += byte0_en.eq(spram_wren & (spram_mask[0] | spram_mask[1]))
            m.d.comb += byte1_en.eq(spram_wren & (spram_mask[2] | spram_mask[3]))

            m.d.comb += mem_rd.addr.eq(spram_addr)
            m.d.comb += mem_rd.en.eq(1)
            m.d.comb += spram_dout.eq(mem_rd.data)

            m.d.comb += mem_wr.addr.eq(spram_addr)
            m.d.comb += mem_wr.data.eq(spram_din)
            m.d.comb += mem_wr.en.eq(Cat(byte0_en, byte1_en))

        # ── Arbitration ───────────────────────────────────────────────────
        read_pending = Signal()   # a read address was issued last cycle
        read_was_odd = Signal()   # byte address bit 0 of the pending read
        rsp_buf      = Signal(8)  # registered response byte

        # Grants: the write wins outright, the read takes the leftover cycles.
        grant_wr = self.eth_wr_valid
        grant_rd = ~self.eth_wr_valid & self.exi_req_valid

        # Byte-lane steering for the write: odd addresses use the upper byte.
        wr_is_odd  = self.eth_wr_addr[0]
        lane_data  = Mux(wr_is_odd, Cat(Const(0, 8), self.eth_wr_data), self.eth_wr_data)
        lane_mask  = Mux(wr_is_odd, 0b1100, 0b0011)

        m.d.comb += [
            self.eth_wr_ready .eq(grant_wr),
            self.exi_req_ready.eq(grant_rd),
            spram_wren.eq(grant_wr),
            spram_mask.eq(Mux(grant_wr, lane_mask, 0)),
            spram_din .eq(Mux(grant_wr, lane_data, 0)),
            spram_addr.eq(
                Mux(grant_wr, self.eth_wr_addr[1:],
                    Mux(grant_rd, self.exi_req_addr[1:], 0))
            ),
            # The response data output always mirrors the registered buffer.
            self.exi_rsp_data.eq(rsp_buf),
        ]

        # ── Read pipeline ─────────────────────────────────────────────────
        # Cycle N  : read granted, address presented, read_pending scheduled.
        # Cycle N+1: memory output valid; select the byte and register it.
        m.d.sync += read_pending.eq(grant_rd)
        with m.If(grant_rd):
            m.d.sync += read_was_odd.eq(self.exi_req_addr[0])

        m.d.sync += self.exi_rsp_valid.eq(read_pending)
        with m.If(read_pending):
            m.d.sync += rsp_buf.eq(
                Mux(read_was_odd, spram_dout[8:16], spram_dout[0:8])
            )

        return m
data=0x{rdata:02X} valid={rvalid}") + + await ctx.tick("sync").repeat(2) + + # T2: ETH write to ODD byte address 0x0101, read back + ctx.set(dut.eth_wr_addr, 0x0101) + ctx.set(dut.eth_wr_data, 0xCD) + ctx.set(dut.eth_wr_valid, 1) + await ctx.tick("sync").repeat(1) + ctx.set(dut.eth_wr_valid, 0) + await ctx.tick("sync").repeat(1) + + ctx.set(dut.exi_req_addr, 0x0101) + ctx.set(dut.exi_req_valid, 1) + await ctx.tick("sync").repeat(1) + ctx.set(dut.exi_req_valid, 0) + await ctx.tick("sync").repeat(1) + + rdata = ctx.get(dut.exi_rsp_data) + if rdata != 0xCD: + errors.append(f"T2 odd addr read-back: expected 0xCD, got 0x{rdata:02X}") + print(f"T2 odd addr read-back: data=0x{rdata:02X}") + + await ctx.tick("sync").repeat(2) + + # T3: ETH write wins when both clients active simultaneously + # Write 0xEE to 0x0200 + ctx.set(dut.eth_wr_addr, 0x0200) + ctx.set(dut.eth_wr_data, 0xEE) + ctx.set(dut.eth_wr_valid, 1) + ctx.set(dut.exi_req_addr, 0x0100) # also wants to read + ctx.set(dut.exi_req_valid, 1) + await ctx.tick("sync").repeat(1) + + eth_won = ctx.get(dut.eth_wr_ready) + exi_blocked = not ctx.get(dut.exi_req_ready) + ctx.set(dut.eth_wr_valid, 0) + ctx.set(dut.exi_req_valid, 0) + + if not eth_won: + errors.append("T3 ETH priority: ETH write not accepted") + if not exi_blocked: + errors.append("T3 ETH priority: EXI read was not blocked") + print(f"T3 ETH priority: eth_won={eth_won} exi_blocked={exi_blocked}") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=24), domain="sync") + sim.add_testbench(testbench) + + with sim.write_vcd("SPRAMArbiter.vcd"): + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/status_panel.py b/exi_bba/status_panel.py new file mode 100644 index 0000000..fdd66fa --- /dev/null +++ b/exi_bba/status_panel.py @@ -0,0 +1,227 @@ +"""StatusPanel — 5-LED / 3-button bring-up panel (sync domain). 
class StatusPanel(Elaboratable):
    """Bring-up front panel: five status LEDs and three debounced buttons.

    Everything runs in the ``sync`` domain.

    Parameters
    ----------
    hb_bit : int
        Index of the free-running counter bit that drives the heartbeat LED.
    stretch_cycles : int
        Number of sync cycles an activity LED stays lit after its trigger.
    debounce_cycles : int
        Number of consecutive differing samples required before a button
        state change is accepted.
    led_active_low : bool
        Invert all LED outputs (for LEDs that light when the pin is low).
    btn_active_low : bool
        Treat a low pin level as "pressed" (pull-up wiring).

    Ports
    -----
    cs_active, rx_pulse, tx_pulse, ready : in
        Status inputs. ``rx_pulse``/``tx_pulse`` are single-cycle events,
        ``cs_active`` and ``ready`` are levels.
    btn : in, 3 bits
        Raw button pin levels. They may be asynchronous to ``sync``; they are
        synchronised here before being debounced.
    led : out, 5 bits
        ``[heartbeat, exi_active, rx_act, tx_act, ready]``.
    eth_rst_n : out
        Driven low while button 0 is (debounced) pressed.
    reinit : out
        One-cycle pulse on the rising edge of (debounced) button 1.
    """

    def __init__(self, hb_bit=23, stretch_cycles=1_440_000,
                 debounce_cycles=240_000, led_active_low=False,
                 btn_active_low=True):
        # Heartbeat = bit `hb_bit` of a free-running counter. That bit has a
        # period of 2**(hb_bit + 1) cycles, so the default blinks at
        # 24 MHz / 2**24 ≈ 1.4 Hz. stretch_cycles ≈ 60 ms and
        # debounce_cycles ≈ 10 ms at 24 MHz.
        self._hb_bit = hb_bit
        self._stretch = stretch_cycles
        self._deb = debounce_cycles
        self._led_inv = led_active_low
        self._btn_inv = btn_active_low

        # Status inputs (sync domain)
        self.cs_active = Signal()   # level: EXI transaction in progress
        self.rx_pulse = Signal()    # 1-cycle: frame received
        self.tx_pulse = Signal()    # 1-cycle: frame sent
        self.ready = Signal()       # level: ethernet ready

        # Raw button inputs (from pins)
        self.btn = Signal(3)

        # Outputs
        self.led = Signal(5)
        self.eth_rst_n = Signal(init=1)  # btn0 held → 0
        self.reinit = Signal()           # btn1 press → 1-cycle pulse

    def elaborate(self, platform):
        # Local import: keeps this block independent of the file's import list.
        from amaranth.lib.cdc import FFSynchronizer

        m = Module()

        # ── Heartbeat ────────────────────────────────────────────────────
        hb = Signal(self._hb_bit + 1)
        m.d.sync += hb.eq(hb + 1)
        heartbeat = hb[self._hb_bit]

        # ── Button conditioning (synchronise → normalise → debounce) ─────
        # The button pins are asynchronous to `sync`. Feeding them straight
        # into the multi-bit debounce counters lets different flops resolve a
        # marginal edge differently, so pass them through a 2-FF synchroniser
        # first. The synchroniser resets to the idle (not pressed) level so no
        # phantom press is seen right after configuration.
        idle_level = 0b111 if self._btn_inv else 0
        btn_sync = Signal(3)
        m.submodules.btn_sync = FFSynchronizer(self.btn, btn_sync,
                                               init=idle_level)

        braw = Signal(3)  # 1 = pressed, regardless of pin polarity
        m.d.comb += braw.eq(btn_sync ^ C(idle_level, 3))

        bdeb = Signal(3)  # debounced button state
        for i in range(3):
            cnt = Signal(range(self._deb + 1), name=f"deb_cnt{i}")
            with m.If(braw[i] == bdeb[i]):
                m.d.sync += cnt.eq(0)            # input agrees: restart count
            with m.Else():
                m.d.sync += cnt.eq(cnt + 1)      # input differs: count samples
                with m.If(cnt == self._deb - 1):
                    # Differed for `debounce_cycles` samples in a row: accept.
                    m.d.sync += [bdeb[i].eq(braw[i]), cnt.eq(0)]

        # btn0: hold → ethernet reset asserted (active-low output)
        m.d.comb += self.eth_rst_n.eq(~bdeb[0])

        # btn1: rising edge → reinit pulse
        b1_prev = Signal()
        m.d.sync += b1_prev.eq(bdeb[1])
        m.d.comb += self.reinit.eq(bdeb[1] & ~b1_prev)

        # btn2: rising edge toggles freeze
        b2_prev = Signal()
        freeze = Signal()
        m.d.sync += b2_prev.eq(bdeb[2])
        with m.If(bdeb[2] & ~b2_prev):
            m.d.sync += freeze.eq(~freeze)

        # ── Activity stretchers (rx/tx), sticky while frozen ─────────────
        def stretch(pulse, name):
            """Return an LED expression that stays lit after `pulse`."""
            cnt = Signal(range(self._stretch + 1), name=f"{name}_cnt")
            sticky = Signal(name=f"{name}_sticky")
            with m.If(pulse):
                m.d.sync += cnt.eq(self._stretch)
                with m.If(freeze):
                    m.d.sync += sticky.eq(1)     # latch a one-shot when frozen
            with m.Elif(cnt != 0):
                m.d.sync += cnt.eq(cnt - 1)
            # Evaluated every cycle (not only while counting) so the latched
            # LED clears once freeze is released, even after the count expired.
            with m.If(~freeze):
                m.d.sync += sticky.eq(0)
            return (cnt != 0) | sticky

        rx_led = stretch(self.rx_pulse, "rx")
        tx_led = stretch(self.tx_pulse, "tx")

        # ── cs_active: level, re-triggered while high, then stretched ────
        cs_cnt = Signal(range(self._stretch + 1))
        with m.If(self.cs_active):
            m.d.sync += cs_cnt.eq(self._stretch)
        with m.Elif(cs_cnt != 0):
            m.d.sync += cs_cnt.eq(cs_cnt - 1)
        cs_led = cs_cnt != 0

        leds = Cat(heartbeat, cs_led, rx_led, tx_led, self.ready)
        m.d.comb += self.led.eq(leds ^ C(0b11111 if self._led_inv else 0, 5))

        return m
drives led[4] + ctx.set(dut.ready, 1) + await settle(ctx, 1) + if not ((ctx.get(dut.led) >> 4) & 1): + errors.append("T3 ready LED not lit") + ctx.set(dut.ready, 0) + print("T3 ready LED follows level") + + # T4: btn0 held (active-low → drive 0) asserts eth_rst_n low after debounce + ctx.set(dut.btn, 0b110) # btn0 pressed + await settle(ctx, 6) # > debounce + if ctx.get(dut.eth_rst_n) != 0: + errors.append("T4 eth_rst_n not asserted while btn0 held") + ctx.set(dut.btn, 0b111) # release + await settle(ctx, 6) + if ctx.get(dut.eth_rst_n) != 1: + errors.append("T4 eth_rst_n not released") + print("T4 btn0 → eth_rst_n hold/release ok") + + # T5: btn1 press emits exactly one reinit pulse + pulses = 0 + ctx.set(dut.btn, 0b101) # btn1 pressed + for _ in range(10): + await settle(ctx, 1) + pulses += (ctx.get(dut.reinit) & 1) + ctx.set(dut.btn, 0b111) + await settle(ctx, 6) + if pulses != 1: + errors.append(f"T5 reinit pulses: got {pulses}, want 1") + print(f"T5 btn1 → reinit pulses={pulses}") + + # T6: freeze (btn2) makes a single rx pulse stick + ctx.set(dut.btn, 0b011) # btn2 press → toggle freeze on + await settle(ctx, 6) + ctx.set(dut.btn, 0b111) + await settle(ctx, 2) + ctx.set(dut.rx_pulse, 1) # one-shot while frozen + await settle(ctx, 1) + ctx.set(dut.rx_pulse, 0) + await settle(ctx, 20) # well past stretch + stuck = (ctx.get(dut.led) >> 2) & 1 + if not stuck: + errors.append("T6 frozen rx LED did not stick") + ctx.set(dut.btn, 0b011) # toggle freeze off + await settle(ctx, 6) + ctx.set(dut.btn, 0b111) + await settle(ctx, 2) + cleared = ((ctx.get(dut.led) >> 2) & 1) == 0 + if not cleared: + errors.append("T6 rx LED did not clear after unfreeze") + print(f"T6 freeze: stuck={stuck} cleared_after_unfreeze={cleared}") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=24), domain="sync") + sim.add_testbench(testbench) + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git 
a/exi_bba/synth.py b/exi_bba/synth.py new file mode 100644 index 0000000..8dbb171 --- /dev/null +++ b/exi_bba/synth.py @@ -0,0 +1,197 @@ +"""Synthesis script for BBATop → iCEbreaker (iCE40UP5K SG48). + +Run from workspace root: + python -m exi_bba.synth # synthesize only + python -m exi_bba.synth --flash # synthesize and flash + +This file re-declares IceBreakerPlatform inline so that importing +rebbarb/rebbarb.py (which has a module-level platform.build() call) is avoided. +""" + +import os +import subprocess +import sys + +from amaranth import * +from amaranth.build import * +from amaranth.vendor import LatticeICE40Platform + +from exi_bba.bba_top import BBATop + + +# ── Platform definition ─────────────────────────────────────────────────── +# Pin assignments use the iCEbreaker PMOD connectors as placeholders. +# Replace with actual SP1-interposer pin numbers once PCB is finalised. +# +# PMOD1A (J2): pins 4 2 47 45 / 3 48 46 44 (top/bottom) +# PMOD1B (J3): pins 43 38 34 31 / 42 36 32 28 +# PMOD2 (J4): pins 27 25 21 19 / 26 23 20 18 +# +# EXI : CLK=4 MOSI=2 MISO=47 CS_N=45 INT_N=3 (PMOD1A) +# W5100 : indirect parallel bus — 15 pins across PMOD1B + PMOD2. +# ADDR[1:0]=43 38 DATA[7:0]=34 31 42 36 32 28 27 25 +# CS_N=21 RD_N=19 WR_N=26 INT_N=23 RST_N=20 (pin 18 free) +# Board: tie the W5100's upper address lines A[14:2] to 0 (only A[1:0] wired); +# DATA[7:0] is bidirectional (SB_IO tristate, single shared output-enable). 
class IceBreakerPlatform(LatticeICE40Platform):
    """iCEbreaker board description (iCE40UP5K, SG48) for the BBA build.

    Declares the 12 MHz clock input, the EXI and W5100 bus pins, and the
    onboard LEDs/button used by the bring-up status panel.
    """

    device = "iCE40UP5K"
    package = "SG48"
    default_clk = "clk12"

    resources = [
        # 12 MHz board oscillator
        Resource(
            "clk12", 0,
            Pins("35", dir="i"),
            Clock(12e6),
            Attrs(GLOBAL=True, IO_STANDARD="SB_LVCMOS"),
        ),

        # GameCube-side EXI port (SPI Mode 3) on the PMOD1A pins
        Resource(
            "exi", 0,
            Subsignal("clk", Pins("4", dir="i")),
            Subsignal("mosi", Pins("2", dir="i")),
            Subsignal("miso", Pins("47", dir="o")),
            Subsignal("cs_n", Pins("45", dir="i")),
            Subsignal("int_n", Pins("3", dir="o")),
            Attrs(IO_STANDARD="SB_LVCMOS"),
        ),

        # W5100 indirect parallel bus spread over the PMOD1B + PMOD2 pins
        Resource(
            "w5100", 0,
            Subsignal("addr", Pins("43 38", dir="o")),
            Subsignal("data", Pins("34 31 42 36 32 28 27 25", dir="io")),
            Subsignal("cs_n", Pins("21", dir="o")),
            Subsignal("rd_n", Pins("19", dir="o")),
            Subsignal("wr_n", Pins("26", dir="o")),
            Subsignal("int_n", Pins("23", dir="i")),
            Subsignal("rst_n", Pins("20", dir="o")),
            Attrs(IO_STANDARD="SB_LVCMOS"),
        ),

        # Status panel on the board's own parts: these pins are not routed to
        # any PMOD, so they do not collide with the EXI or W5100 wiring.
        # LEDR/LEDG are active-low discrete LEDs; BTN_N is the user button.
        # The onboard RGB LED (pins 39/40/41) is intentionally not declared:
        # it would need an SB_RGBA_DRV instance wired to raw pads.
        Resource("ledr", 0, Pins("11", dir="o"), Attrs(IO_STANDARD="SB_LVCMOS")),
        Resource("ledg", 0, Pins("37", dir="o"), Attrs(IO_STANDARD="SB_LVCMOS")),
        Resource("btn", 0, Pins("10", dir="i"), Attrs(IO_STANDARD="SB_LVCMOS")),
    ]

    connectors = []

    def toolchain_program(self, products, name):
        """Flash the built ``<name>.bin`` with iceprog.

        The programmer executable can be overridden through the ``ICEPROG``
        environment variable.
        """
        programmer = os.getenv("ICEPROG", "iceprog")
        bitstream = f"{name}.bin"
        with products.extract(bitstream) as bitstream_path:
            command = [programmer, bitstream_path]
            subprocess.check_call(command)
+ if self._status_panel: + ledr = platform.request("ledr", 0) + ledg = platform.request("ledg", 0) + btn = platform.request("btn", 0) + led = self.panel_led + + m.d.comb += [ + ledg.o.eq(~led[0]), # heartbeat (active-low LED) + ledr.o.eq(~led[1]), # EXI activity (active-low LED) + # btn[0]/[2] held released (active-low idle = 1) + self.panel_btn.eq(Cat(C(1, 1), btn.i, C(1, 1))), + ] + + return m + + +# ── Entry point ─────────────────────────────────────────────────────────── +# +# Seed sweep: nextpnr placement is stochastic. With ~22% LC utilisation +# routing dominates timing, so different seeds can vary fmax by ±20%. +# Pass --seeds N to try N seeds (default 1, i.e. seed 1 only). +# The build directory is reused across seeds; the final artefact in +# build/top.bin is the result of the last (or best) seed tried. + +if __name__ == "__main__": + do_flash = "--flash" in sys.argv + n_seeds = next((int(sys.argv[i+1]) for i, a in enumerate(sys.argv) + if a == "--seeds"), 1) + + platform = IceBreakerPlatform() + print(f"Synthesizing BBATop for {platform.device}-{platform.package} " + f"(do_program={do_flash}, seeds=1..{n_seeds})") + + best_seed = 1 + best_fmax = 0.0 + for seed in range(1, n_seeds + 1): + print(f"\n{'='*60}") + print(f" Seed {seed}/{n_seeds}") + print(f"{'='*60}") + opts = (f"--opt-timing --seed {seed} --timing-allow-fail") + try: + platform.build(BBATopSynth(status_panel=True), do_program=False, + verbose=True, nextpnr_opts=opts) + except Exception as exc: + # nextpnr exits non-zero even with --timing-allow-fail on some + # versions; treat as non-fatal timing failure. 
+ print(f" [seed {seed}] build exception (timing?): {exc}") + + # Parse fmax from nextpnr log in build/top.tim (if present) + import glob, re + tim_files = glob.glob("build/top.tim") + glob.glob("build/*.tim") + fmax_exi = 0.0 + for tf in tim_files: + try: + with open(tf) as f: + for line in f: + m_ = re.search( + r"Max frequency.*exi.*?:\s*([\d.]+)\s*MHz", line) + if m_: + fmax_exi = float(m_.group(1)) + except OSError: + pass + print(f" [seed {seed}] exi fmax extracted: {fmax_exi:.1f} MHz") + if fmax_exi > best_fmax: + best_fmax = fmax_exi + best_seed = seed + + print(f"\nBest seed: {best_seed} exi fmax: {best_fmax:.1f} MHz") + + if do_flash: + print(f"\nFlashing with seed {best_seed}...") + opts = f"--opt-timing --seed {best_seed} --timing-allow-fail" + platform.build(BBATopSynth(status_panel=True), do_program=True, + verbose=True, nextpnr_opts=opts) + + print("Done.") diff --git a/exi_bba/tx_frame_drain.py b/exi_bba/tx_frame_drain.py new file mode 100644 index 0000000..5e6bc9e --- /dev/null +++ b/exi_bba/tx_frame_drain.py @@ -0,0 +1,253 @@ +"""TX frame drain — sync domain (24 MHz). + +Drains the tx_bytes AsyncFIFO (written by BBARegisterFile in the exi domain), +forwards each byte to W5500SPIMaster with SOF/EOF framing, then pulses tx_irq +to notify the GC that the transmit is complete. + +Flow +---- +1. Wait for tx_len FIFO to have a length word (signals a complete frame queued). +2. Pop the length from tx_len FIFO. +3. Assert tx_sof on first byte, tx_eof on last byte, consuming tx_bytes FIFO. +4. When W5500SPIMaster accepts the final byte: pulse tx_irq. + +The tx_bytes AsyncFIFO (exi→sync, 8-bit, depth=16) and tx_ctrl FIFO (exi→sync, +16-bit, depth=4) are instantiated in BBARegisterFile and their sync-domain read +sides are exposed as ports wired here by BBATop. +""" + +from amaranth import * + +__all__ = ["TXFrameDrain"] + + +class TXFrameDrain(Elaboratable): + """Drains BBA TX FIFOs and forwards frames to W5500SPIMaster. 
def elaborate(self, platform):
    """Build the drain FSM: pop a frame length, then stream that many bytes.

    IDLE pops one word from the tx_ctrl FIFO (the frame length). DRAIN moves
    bytes one at a time from the tx_bytes FIFO into a registered holding
    stage that drives tx_data/tx_valid/tx_sof/tx_eof, and pulses ``tx_irq``
    for one cycle when the byte flagged EOF is accepted (``tx_ready``).

    A length word of 0 is completed immediately in IDLE: there is no byte
    that could ever carry EOF, so entering DRAIN would hang the FSM forever.
    """
    m = Module()

    frame_len = Signal(16)    # bytes still to LOAD from FIFO (incl. held one)
    is_first = Signal()       # next byte loaded is the first (SOF)
    load_pending = Signal()   # 1-bit "more bytes to load" flag (replaces
                              # a 16-bit frame_len!=0 compare in the
                              # combinational FIFO read-enable path)

    # ── Registered holding stage presented to W5500 ──────────────────
    # All W5500-facing outputs are driven from these registers, so the
    # FIFO read-enable depends only on the registered hold_valid and the
    # FIFO's own r_rdy, never on tx_ready.
    hold_data = Signal(8)
    hold_valid = Signal()
    hold_sof = Signal()
    hold_eof = Signal()

    m.d.sync += self.tx_irq.eq(0)   # default; overridden below for 1 cycle

    m.d.comb += [
        self.tx_data .eq(hold_data),
        self.tx_valid.eq(hold_valid),
        self.tx_sof  .eq(hold_sof),
        self.tx_eof  .eq(hold_eof),
    ]

    # W5500 took the currently-held byte this cycle
    hold_consumed = Signal()
    m.d.comb += hold_consumed.eq(hold_valid & self.tx_ready)

    # FIFO read-enable defaults (combinational, no W5500 dependency)
    m.d.comb += self.tx_bytes_r_en.eq(0)
    m.d.comb += self.tx_ctrl_r_en .eq(0)

    with m.FSM(domain="sync", name="tx_fsm"):

        with m.State("IDLE"):
            # Wait for a complete frame length in tx_ctrl FIFO
            with m.If(self.tx_ctrl_r_rdy):
                m.d.comb += self.tx_ctrl_r_en.eq(1)
                with m.If(self.tx_ctrl_r_data == 0):
                    # Empty frame: nothing to load, so DRAIN could never see
                    # an EOF byte and would never leave. Consume the length
                    # word, report completion and stay in IDLE.
                    # NOTE(review): pulsing tx_irq assumes the EXI side
                    # wants a completion event for an empty frame too —
                    # confirm against the register file.
                    m.d.sync += self.tx_irq.eq(1)
                with m.Else():
                    m.d.sync += frame_len.eq(self.tx_ctrl_r_data)
                    m.d.sync += is_first.eq(1)
                    m.d.sync += load_pending.eq(1)
                    m.next = "DRAIN"

        with m.State("DRAIN"):
            # Load the next byte into the holding register only when it is
            # empty. This costs one idle sync cycle per byte and keeps
            # tx_ready off the FIFO read-enable path entirely.
            do_load = Signal()
            m.d.comb += do_load.eq(
                ~hold_valid & self.tx_bytes_r_rdy & load_pending
            )
            m.d.comb += self.tx_bytes_r_en.eq(do_load)

            # Downstream accepted the held byte: free the holding stage,
            # and finish the frame if that byte was the last one.
            with m.If(hold_consumed):
                m.d.sync += hold_valid.eq(0)
                with m.If(hold_eof):
                    m.d.sync += self.tx_irq.eq(1)
                    m.next = "IDLE"

            # do_load requires ~hold_valid, so it never coincides with
            # hold_consumed; the two branches cannot fight over hold_valid.
            with m.If(do_load):
                m.d.sync += hold_data .eq(self.tx_bytes_r_data)
                m.d.sync += hold_valid.eq(1)
                m.d.sync += hold_sof  .eq(is_first)
                m.d.sync += hold_eof  .eq(frame_len == 1)
                m.d.sync += is_first  .eq(0)
                m.d.sync += frame_len .eq(frame_len - 1)
                # Last byte just loaded → stop further loads (registered).
                with m.If(frame_len == 1):
                    m.d.sync += load_pending.eq(0)

    return m
+ """ + ctx.set(dut.tx_ctrl_r_data, len(frame)) + ctx.set(dut.tx_ctrl_r_rdy, 1) + ctx.set(dut.tx_bytes_r_data, frame[0]) + ctx.set(dut.tx_bytes_r_rdy, 1) + + # Tick 0: IDLE pops ctrl word (comb), FSM→DRAIN, frame_len registered + await ctx.tick("sync").repeat(1) + # Deassert ctrl FIFO so FSM doesn't re-pop when it returns to IDLE + ctx.set(dut.tx_ctrl_r_rdy, 0) + + received = [] + seen_sof = False + seen_eof = False + saw_irq = False + + for _ in range(len(frame) + 10): + # Read comb signals BEFORE the tick (is_first and frame_len still + # reflect pre-tick registered values, so sof/eof are correct) + if ctx.get(dut.tx_valid): + d = ctx.get(dut.tx_data) + sof = ctx.get(dut.tx_sof) + eof = ctx.get(dut.tx_eof) + received.append(d) + seen_sof = seen_sof or sof + seen_eof = seen_eof or eof + + await ctx.tick("sync").repeat(1) + + if ctx.get(dut.tx_irq): + saw_irq = True + break + + # Advance FIFO AFTER the tick: present next byte for next tick + if len(received) < len(frame): + ctx.set(dut.tx_bytes_r_data, frame[len(received)]) + elif len(received) == len(frame): + ctx.set(dut.tx_bytes_r_rdy, 0) + + return received, seen_sof, seen_eof, saw_irq + + async def testbench(ctx): + await ctx.tick("sync").repeat(2) + ctx.set(dut.tx_ready, 1) + + # ── T1: 4-byte frame ───────────────────────────────────────────────── + frame = [0xDE, 0xAD, 0xBE, 0xEF] + received, seen_sof, seen_eof, saw_irq = await _send_frame(ctx, frame) + + print(f"T1 received={[hex(b) for b in received]} sof={seen_sof} eof={seen_eof} tx_irq={saw_irq}") + + if received != frame: + errors.append(f"T1 bytes mismatch: got {received}, want {frame}") + if not seen_sof: + errors.append("T1: SOF never seen") + if not seen_eof: + errors.append("T1: EOF never seen") + if not saw_irq: + errors.append("T1: tx_irq never pulsed") + + await ctx.tick("sync").repeat(4) + + # ── T2: Single-byte frame — SOF and EOF on same byte ───────────────── + frame2 = [0x42] + received2, s2_sof, s2_eof, s2_irq = await _send_frame(ctx, 
frame2) + + print(f"T2 byte=0x{received2[0] if received2 else 0:02X} sof={s2_sof} eof={s2_eof} tx_irq={s2_irq}") + + if received2 != frame2: + errors.append(f"T2: bytes wrong, got {received2}") + if not (s2_sof and s2_eof): + errors.append("T2: SOF+EOF both must be set for 1-byte frame") + if not s2_irq: + errors.append("T2: tx_irq not seen for 1-byte frame") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=24), domain="sync") + sim.add_testbench(testbench) + + with sim.write_vcd("TXFrameDrain.vcd"): + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/w5100_parallel_master.py b/exi_bba/w5100_parallel_master.py new file mode 100644 index 0000000..ca354bf --- /dev/null +++ b/exi_bba/w5100_parallel_master.py @@ -0,0 +1,840 @@ +"""W5100 parallel-bus master — sync domain. + +A drop-in alternative to `W5500SPIMaster` that talks to a WIZnet **W5100** over +its **indirect parallel bus** instead of SPI. The external streaming interface +(init_req/init_done/par, tx_*, rx_*) is identical, so BBATop wiring is unchanged; +only the physical pins differ (a parallel bus instead of 4 SPI wires). + +Why parallel +------------ +SPI serialises 8 bits per byte, so on this UP5K (whose W5500-operating logic +closes only ~40 MHz) the SPI byte rate caps at ~12 Mbit/s. A parallel bus moves +a whole byte per access, so the same ~24 MHz sync logic clears the 27 Mbit/s EXI +ceiling — the real hard limit — with margin. See CLAUDE.md. 
+ +W5100 indirect bus interface (IDM) +---------------------------------- +Only two address lines A[1:0] are wired (the upper address lines are tied to 0 +on the board, so a power-up *direct*-mode access at A=00 still lands on MR): + + A[1:0] register + 00 MR (Mode Register — also reachable directly at power-up) + 01 IDM_AR0 (indirect address, high byte) + 10 IDM_AR1 (indirect address, low byte) + 11 IDM_DR (indirect data — accesses mem[IDM_AR]; auto-increments + IDM_AR when MR.AI is set) + +So a register/buffer access is: write IDM_AR0/AR1 with the 16-bit address, then +read/write IDM_DR. With MR.AI=1 a multi-byte block is one address-set followed +by a burst of IDM_DR accesses (the chip auto-increments) — used for SHAR and for +streaming frame data. + +A bus cycle drives A + (for writes) D with /CS and /RD or /WR asserted for +`strobe_cycles` sync clocks (≥ the W5100's ~80 ns access time at 24 MHz). + +Phase status +------------ +Phase 1 (this file): bus access engine + transaction engine + init sequence, +verified against a W5100 bus model. TX/RX MACRAW (with socket-buffer ring +wraparound) land in phases 2–3. 
+""" + +from amaranth import * + +__all__ = ["W5100ParallelMaster"] + +# ── W5100 register addresses (indirect 16-bit address space) ──────────────── +_MR = 0x0000 # Mode register (common) +_SHAR0 = 0x0009 # Source MAC, 6 bytes +_IR = 0x0015 # Interrupt register +_IMR = 0x0016 # Interrupt mask +_RMSR = 0x001A # RX memory size (2 bits/socket) +_TMSR = 0x001B # TX memory size +_S0_MR = 0x0400 # Socket 0 mode +_S0_CR = 0x0401 # Socket 0 command +_S0_IR = 0x0402 # Socket 0 interrupt +_S0_SR = 0x0403 # Socket 0 status +_S0_TX_FSR = 0x0420 # Socket 0 TX free size (2 bytes) +_S0_TX_RD = 0x0422 # Socket 0 TX read pointer +_S0_TX_WR = 0x0424 # Socket 0 TX write pointer +_S0_RX_RSR = 0x0426 # Socket 0 RX received size (2 bytes) +_S0_RX_RD = 0x0428 # Socket 0 RX read pointer + +_TX_BASE = 0x4000 # Socket 0 TX buffer base (default 2 KB window) +_RX_BASE = 0x6000 # Socket 0 RX buffer base +_S0_TX_MASK = 0x07FF # 2 KB ring mask +_S0_RX_MASK = 0x07FF + +# MR bits / command / mode values +_MR_RST = 0x80 +_MR_AI = 0x02 # address auto-increment (indirect mode) +_MR_IND = 0x01 # indirect bus interface mode +_S0_MR_MACRAW = 0x04 +_CR_OPEN = 0x01 +_CR_SEND = 0x20 +_CR_RECV = 0x40 + +# Indirect-mode address selects (A[1:0]) +_A_MR = 0b00 +_A_AR0 = 0b01 # IDM_AR high byte +_A_AR1 = 0b10 # IDM_AR low byte +_A_DR = 0b11 # IDM_DR (data) + + +class W5100ParallelMaster(Elaboratable): + """W5100 master over the indirect parallel bus, sync clock domain. + + Physical bus pins + ----------------- + bus_addr : A[1:0] output + bus_data_o : D[7:0] output value (drive when bus_data_oe=1) + bus_data_oe: data-bus output enable (1=FPGA drives D, 0=W5100 drives D) + bus_data_i : D[7:0] input value (sampled during reads) + cs_n / rd_n / wr_n : bus control (active low) + w5100_int_n : W5100 INT_N input (active low) + w5100_rst_n : W5100 hardware reset (active low) + + Init / TX / RX interfaces are identical to W5500SPIMaster. 
+ """ + + def __init__(self, strobe_cycles=3, reset_cycles=24000): + # /RD//WR strobe width in sync cycles (≥ W5100 access time). + self._strobe = strobe_cycles + # MR-reset settle wait; testbench overrides with a small value. + self._reset_cycles = reset_cycles + + # Physical parallel bus + self.bus_addr = Signal(2) + self.bus_data_o = Signal(8) + self.bus_data_oe = Signal() + self.bus_data_i = Signal(8) + self.cs_n = Signal(init=1) + self.rd_n = Signal(init=1) + self.wr_n = Signal(init=1) + self.w5100_int_n = Signal(init=1) + self.w5100_rst_n = Signal(init=1) + + # Init control + self.init_req = Signal() + self.init_done = Signal() + self.par = Signal(48) # MAC address (PAR0..5 packed) + + # TX stream + self.tx_data = Signal(8) + self.tx_valid = Signal() + self.tx_ready = Signal() + self.tx_sof = Signal() + self.tx_eof = Signal() + + # RX stream + self.rx_data = Signal(8) + self.rx_valid = Signal() + self.rx_ready = Signal() + self.rx_sof = Signal() + self.rx_eof = Signal() + + def elaborate(self, platform): + m = Module() + STROBE = self._strobe + + # ── Bus access engine: one indirect-bus read or write cycle ────────── + bus_go = Signal() + bus_rw = Signal() # 1 = write, 0 = read + bus_a = Signal(2) + bus_wdata = Signal(8) + bus_rdata = Signal(8) + bus_done = Signal() + bus_ctr = Signal(range(STROBE + 2)) + rw_r = Signal() + + # registered physical outputs + a_o = Signal(2) + d_o = Signal(8) + d_oe = Signal() + cs_r = Signal(init=1) + rd_r = Signal(init=1) + wr_r = Signal(init=1) + m.d.comb += [ + self.bus_addr .eq(a_o), + self.bus_data_o .eq(d_o), + self.bus_data_oe.eq(d_oe), + self.cs_n .eq(cs_r), + self.rd_n .eq(rd_r), + self.wr_n .eq(wr_r), + ] + + m.d.sync += bus_done.eq(0) + with m.FSM(domain="sync", name="bus_fsm"): + with m.State("IDLE"): + m.d.sync += [cs_r.eq(1), rd_r.eq(1), wr_r.eq(1), d_oe.eq(0)] + with m.If(bus_go): + m.d.sync += [a_o.eq(bus_a), rw_r.eq(bus_rw), + cs_r.eq(0), bus_ctr.eq(0)] + with m.If(bus_rw): + m.d.sync += [d_o.eq(bus_wdata), 
d_oe.eq(1), wr_r.eq(0)] + with m.Else(): + m.d.sync += rd_r.eq(0) + m.next = "STROBE" + with m.State("STROBE"): + m.d.sync += bus_ctr.eq(bus_ctr + 1) + with m.If(bus_ctr == STROBE - 1): + with m.If(~rw_r): + m.d.sync += bus_rdata.eq(self.bus_data_i) # sample read + m.d.sync += [rd_r.eq(1), wr_r.eq(1)] + m.next = "FINISH" + with m.State("FINISH"): + m.d.sync += [cs_r.eq(1), d_oe.eq(0), bus_done.eq(1)] + m.next = "IDLE" + + # ── Transaction engine: address-set + payload over the bus engine ──── + WBUF = 8 + xfer_start = Signal() + xfer_direct = Signal() # 1 = single A=00 access (MR), addr ignored + xfer_addr = Signal(16) + xfer_rw = Signal() # payload direction: 1=write, 0=read + xfer_len = Signal(range(WBUF + 1)) + xfer_stream = Signal() # stream-write payload from s_* + xfer_sread = Signal() # stream-read payload to r_* + xfer_rcount = Signal(16) + xfer_done = Signal() + + wbuf = Array([Signal(8, name=f"wbuf{i}") for i in range(WBUF)]) + rbuf = Array([Signal(8, name=f"rbuf{i}") for i in range(WBUF)]) + s_count = Signal(16) # bytes streamed-written (advances pointers) + xfer_idx = Signal(range(WBUF + 1)) + s_last_r = Signal() + r_idx = Signal(16) + + # Streaming payload interfaces. + s_data, s_valid, s_last, s_consume = Signal(8), Signal(), Signal(), Signal() + r_data, r_valid, r_first, r_last, r_ready = ( + Signal(8), Signal(), Signal(), Signal(), Signal()) + # TX stream source = external tx interface (Phase 2). + m.d.comb += [s_data.eq(self.tx_data), s_valid.eq(self.tx_valid), + s_last.eq(self.tx_eof), self.tx_ready.eq(s_consume)] + # RX stream sink = external rx interface (Phase 3). + m.d.comb += [self.rx_data.eq(r_data), self.rx_valid.eq(r_valid), + self.rx_sof.eq(r_first), self.rx_eof.eq(r_last), + r_ready.eq(self.rx_ready)] + + # Socket-buffer ring wraparound. Unlike the W5500, the W5100's IDM + # address does NOT auto-wrap at the socket-buffer boundary — it just + # increments linearly into the next region. 
So when a streamed access + # reaches `xfer_wend`, the engine re-sets IDM_AR back to `xfer_wbase`. + xfer_wrap = Signal() + xfer_wbase = Signal(16) + xfer_wend = Signal(16) + cur_addr = Signal(16) + + m.d.comb += [bus_go.eq(0), bus_rw.eq(0), bus_a.eq(0), bus_wdata.eq(0)] + m.d.comb += [s_consume.eq(0), r_valid.eq(0), r_data.eq(0), + r_first.eq(0), r_last.eq(0)] + m.d.sync += xfer_done.eq(0) + + def bus_write(a, data): + m.d.comb += [bus_go.eq(1), bus_rw.eq(1), bus_a.eq(a), bus_wdata.eq(data)] + + def bus_read(a): + m.d.comb += [bus_go.eq(1), bus_rw.eq(0), bus_a.eq(a)] + + with m.FSM(domain="sync", name="xfer_fsm"): + with m.State("IDLE"): + with m.If(xfer_start): + m.d.sync += [xfer_idx.eq(0), s_count.eq(0), r_idx.eq(0), + cur_addr.eq(xfer_addr)] + with m.If(xfer_direct): + m.next = "DIRECT" + with m.Else(): + m.next = "AR_HI" + + # Direct MR write (A=00) + with m.State("DIRECT"): + bus_write(_A_MR, wbuf[0]) + m.next = "DIRECT_W" + with m.State("DIRECT_W"): + with m.If(bus_done): + m.next = "FINISH" + + # Set indirect address IDM_AR (high then low) + with m.State("AR_HI"): + bus_write(_A_AR0, xfer_addr[8:16]) + m.next = "AR_HI_W" + with m.State("AR_HI_W"): + with m.If(bus_done): + m.next = "AR_LO" + with m.State("AR_LO"): + bus_write(_A_AR1, xfer_addr[0:8]) + m.next = "AR_LO_W" + with m.State("AR_LO_W"): + with m.If(bus_done): + with m.If(xfer_stream): + m.next = "SW_LOAD" + with m.Elif(xfer_sread): + m.next = "SR_LOAD" + with m.Elif(xfer_rw): + m.next = "WB_ISSUE" + with m.Else(): + m.next = "RB_ISSUE" + + # Fixed-length write from wbuf (IDM_DR burst, auto-increment) + with m.State("WB_ISSUE"): + bus_write(_A_DR, wbuf[xfer_idx]) + m.next = "WB_WAIT" + with m.State("WB_WAIT"): + with m.If(bus_done): + m.d.sync += xfer_idx.eq(xfer_idx + 1) + with m.If(xfer_idx + 1 == xfer_len): + m.next = "FINISH" + with m.Else(): + m.next = "WB_ISSUE" + + # Fixed-length read into rbuf (with ring wrap, for the length header) + with m.State("RB_ISSUE"): + with m.If(xfer_wrap & 
(cur_addr == xfer_wend)): + m.next = "RB_WRAP_HI" + with m.Else(): + bus_read(_A_DR) + m.next = "RB_WAIT" + with m.State("RB_WAIT"): + with m.If(bus_done): + m.d.sync += rbuf[xfer_idx].eq(bus_rdata) + m.d.sync += [xfer_idx.eq(xfer_idx + 1), cur_addr.eq(cur_addr + 1)] + with m.If(xfer_idx + 1 == xfer_len): + m.next = "FINISH" + with m.Else(): + m.next = "RB_ISSUE" + with m.State("RB_WRAP_HI"): + bus_write(_A_AR0, xfer_wbase[8:16]) + m.next = "RB_WRAP_HI_W" + with m.State("RB_WRAP_HI_W"): + with m.If(bus_done): + m.next = "RB_WRAP_LO" + with m.State("RB_WRAP_LO"): + bus_write(_A_AR1, xfer_wbase[0:8]) + m.next = "RB_WRAP_LO_W" + with m.State("RB_WRAP_LO_W"): + with m.If(bus_done): + m.d.sync += cur_addr.eq(xfer_wbase) + m.next = "RB_ISSUE" + + # Stream-write payload from s_* until s_last (with ring wrap) + with m.State("SW_LOAD"): + with m.If(xfer_wrap & (cur_addr == xfer_wend)): + m.next = "SW_WRAP_HI" + with m.Elif(s_valid): + bus_write(_A_DR, s_data) + m.d.sync += s_last_r.eq(s_last) + m.next = "SW_WAIT" + with m.State("SW_WAIT"): + with m.If(bus_done): + m.d.comb += s_consume.eq(1) + m.d.sync += [s_count.eq(s_count + 1), cur_addr.eq(cur_addr + 1)] + with m.If(s_last_r): + m.next = "FINISH" + with m.Else(): + m.next = "SW_LOAD" + with m.State("SW_WRAP_HI"): + bus_write(_A_AR0, xfer_wbase[8:16]) + m.next = "SW_WRAP_HI_W" + with m.State("SW_WRAP_HI_W"): + with m.If(bus_done): + m.next = "SW_WRAP_LO" + with m.State("SW_WRAP_LO"): + bus_write(_A_AR1, xfer_wbase[0:8]) + m.next = "SW_WRAP_LO_W" + with m.State("SW_WRAP_LO_W"): + with m.If(bus_done): + m.d.sync += cur_addr.eq(xfer_wbase) + m.next = "SW_LOAD" + + # Stream-read payload to r_* for rcount bytes (with ring wrap) + with m.State("SR_LOAD"): + with m.If(r_idx == xfer_rcount): + m.next = "FINISH" + with m.Elif(xfer_wrap & (cur_addr == xfer_wend)): + m.next = "SR_WRAP_HI" + with m.Else(): + bus_read(_A_DR) + m.next = "SR_WAIT" + with m.State("SR_WAIT"): + with m.If(bus_done): + m.next = "SR_PUSH" + with 
m.State("SR_PUSH"): + m.d.comb += [r_data.eq(bus_rdata), r_valid.eq(1), + r_first.eq(r_idx == 0), + r_last.eq(r_idx + 1 == xfer_rcount)] + with m.If(r_ready): + m.d.sync += [r_idx.eq(r_idx + 1), cur_addr.eq(cur_addr + 1)] + m.next = "SR_LOAD" + with m.State("SR_WRAP_HI"): + bus_write(_A_AR0, xfer_wbase[8:16]) + m.next = "SR_WRAP_HI_W" + with m.State("SR_WRAP_HI_W"): + with m.If(bus_done): + m.next = "SR_WRAP_LO" + with m.State("SR_WRAP_LO"): + bus_write(_A_AR1, xfer_wbase[0:8]) + m.next = "SR_WRAP_LO_W" + with m.State("SR_WRAP_LO_W"): + with m.If(bus_done): + m.d.sync += cur_addr.eq(xfer_wbase) + m.next = "SR_LOAD" + + with m.State("FINISH"): + m.d.sync += xfer_done.eq(1) + m.next = "IDLE" + + # ── Control regs ───────────────────────────────────────────────────── + mac_shadow = Array([Signal(8, name=f"mac{i}") for i in range(6)]) + wait_ctr = Signal(range(self._reset_cycles + 2)) + tx_wr = Signal(16) + rx_rsr = Signal(16) + rx_rd = Signal(16) + pkt_len = Signal(16) + + def write_reg(name, addr, payload, nxt, direct=False): + """Emit a 2-state block that writes `payload` (a list) to `addr`.""" + with m.State(name): + m.d.sync += [xfer_addr.eq(addr), xfer_rw.eq(1), + xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0), + xfer_direct.eq(1 if direct else 0), + xfer_len.eq(len(payload))] + for i, b in enumerate(payload): + m.d.sync += wbuf[i].eq(b) + m.d.sync += xfer_start.eq(1) + m.next = name + "_W" + with m.State(name + "_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = nxt + + # ── Main control FSM (Phase 1: init only) ──────────────────────────── + with m.FSM(domain="sync", name="main_fsm"): + with m.State("IDLE"): + m.d.sync += self.init_done.eq(0) + with m.If(self.init_req): + for i in range(6): + m.d.sync += mac_shadow[i].eq(self.par[i*8:(i+1)*8]) + m.next = "MR_RST" + with m.Elif(~self.w5100_int_n): + m.next = "RX_CHECK" + with m.Elif(self.tx_valid & self.tx_sof): + m.next = "TX_START" + + # MR = 0x80 software reset (direct A=00), then 
settle. + write_reg("MR_RST", _MR, [_MR_RST], "MR_WAIT", direct=True) + with m.State("MR_WAIT"): + with m.If(wait_ctr == self._reset_cycles): + m.d.sync += wait_ctr.eq(0) + m.next = "MR_MODE" + with m.Else(): + m.d.sync += wait_ctr.eq(wait_ctr + 1) + + # MR = indirect + auto-increment (direct A=00). + write_reg("MR_MODE", _MR, [_MR_IND | _MR_AI], "SHAR", direct=True) + + # SHAR = source MAC (6-byte auto-increment burst). + with m.State("SHAR"): + m.d.sync += [xfer_addr.eq(_SHAR0), xfer_rw.eq(1), + xfer_stream.eq(0), xfer_sread.eq(0), + xfer_direct.eq(0), xfer_len.eq(6)] + for i in range(6): + m.d.sync += wbuf[i].eq(mac_shadow[i]) + m.d.sync += xfer_start.eq(1) + m.next = "SHAR_W" + with m.State("SHAR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = "MEMSZ" + + # RMSR/TMSR = 0x55 (2 KB per socket — default; socket 0 used). + write_reg("MEMSZ", _RMSR, [0x55, 0x55], "S0_MODE") # RMSR then TMSR + # Socket 0: MACRAW mode, OPEN, enable interrupt. + write_reg("S0_MODE", _S0_MR, [_S0_MR_MACRAW], "S0_OPEN") + write_reg("S0_OPEN", _S0_CR, [_CR_OPEN], "S0_IMR") + write_reg("S0_IMR", _IMR, [0x01], "INIT_DONE") # enable S0 IRQ + + with m.State("INIT_DONE"): + m.d.sync += self.init_done.eq(1) + m.next = "IDLE" + + # ── TX MACRAW ──────────────────────────────────────────────────── + # read S0_TX_WR → stream frame into the TX buffer at that offset + # (ring-wrapping at the 2 KB boundary) → advance S0_TX_WR → SEND. 
+ with m.State("TX_START"): # read S0_TX_WR (2 bytes) + m.d.sync += [xfer_addr.eq(_S0_TX_WR), xfer_rw.eq(0), + xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0), + xfer_direct.eq(0), xfer_len.eq(2)] + m.d.sync += xfer_start.eq(1) + m.next = "TX_RDPTR_W" + with m.State("TX_RDPTR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += tx_wr.eq(Cat(rbuf[1], rbuf[0])) # big-endian + m.next = "TX_DATA" + + with m.State("TX_DATA"): # stream frame → TX buffer + m.d.sync += [xfer_addr.eq(_TX_BASE + (tx_wr & _S0_TX_MASK)), + xfer_rw.eq(1), xfer_stream.eq(1), xfer_sread.eq(0), + xfer_direct.eq(0), xfer_wrap.eq(1), + xfer_wbase.eq(_TX_BASE), + xfer_wend.eq(_TX_BASE + _S0_TX_MASK + 1)] + m.d.sync += xfer_start.eq(1) + m.next = "TX_DATA_W" + with m.State("TX_DATA_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += [xfer_stream.eq(0), xfer_wrap.eq(0), + tx_wr.eq(tx_wr + s_count)] # advanced pointer + m.next = "TX_UPDPTR" + + with m.State("TX_UPDPTR"): # write back S0_TX_WR + m.d.sync += [xfer_addr.eq(_S0_TX_WR), xfer_rw.eq(1), + xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0), + xfer_direct.eq(0), xfer_len.eq(2)] + m.d.sync += [wbuf[0].eq(tx_wr[8:16]), wbuf[1].eq(tx_wr[0:8])] + m.d.sync += xfer_start.eq(1) + m.next = "TX_UPDPTR_W" + with m.State("TX_UPDPTR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = "TX_SEND" + + # S0_CR = SEND + write_reg("TX_SEND", _S0_CR, [_CR_SEND], "IDLE") + + # ── RX MACRAW ──────────────────────────────────────────────────── + # On W5100 INT: read RX_RSR; if non-zero read RX_RD, read the 2-byte + # MACRAW length, stream (length−2) frame bytes out (ring-wrapping), + # advance RX_RD by the length, issue RECV, clear the RECV interrupt. 
+ with m.State("RX_CHECK"): # read S0_RX_RSR (2 bytes) + m.d.sync += [xfer_addr.eq(_S0_RX_RSR), xfer_rw.eq(0), + xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0), + xfer_direct.eq(0), xfer_len.eq(2)] + m.d.sync += xfer_start.eq(1) + m.next = "RX_RSR_W" + with m.State("RX_RSR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += rx_rsr.eq(Cat(rbuf[1], rbuf[0])) + m.next = "RX_RSR_CHK" + with m.State("RX_RSR_CHK"): + with m.If(rx_rsr == 0): + m.next = "IDLE" # nothing received + with m.Else(): + m.next = "RX_RDPTR" + + with m.State("RX_RDPTR"): # read S0_RX_RD (2 bytes) + m.d.sync += [xfer_addr.eq(_S0_RX_RD), xfer_rw.eq(0), + xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0), + xfer_direct.eq(0), xfer_len.eq(2)] + m.d.sync += xfer_start.eq(1) + m.next = "RX_RDPTR_W" + with m.State("RX_RDPTR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += rx_rd.eq(Cat(rbuf[1], rbuf[0])) + m.next = "RX_LEN" + + with m.State("RX_LEN"): # read 2-byte MACRAW length (wrap) + m.d.sync += [xfer_addr.eq(_RX_BASE + (rx_rd & _S0_RX_MASK)), + xfer_rw.eq(0), xfer_stream.eq(0), xfer_sread.eq(0), + xfer_direct.eq(0), xfer_len.eq(2), xfer_wrap.eq(1), + xfer_wbase.eq(_RX_BASE), + xfer_wend.eq(_RX_BASE + _S0_RX_MASK + 1)] + m.d.sync += xfer_start.eq(1) + m.next = "RX_LEN_W" + with m.State("RX_LEN_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += pkt_len.eq(Cat(rbuf[1], rbuf[0])) + m.next = "RX_FRAME" + + with m.State("RX_FRAME"): # stream (pkt_len−2) frame bytes + m.d.sync += [xfer_addr.eq(_RX_BASE + ((rx_rd + 2) & _S0_RX_MASK)), + xfer_rw.eq(0), xfer_stream.eq(0), xfer_sread.eq(1), + xfer_direct.eq(0), xfer_rcount.eq(pkt_len - 2), + xfer_wrap.eq(1), xfer_wbase.eq(_RX_BASE), + xfer_wend.eq(_RX_BASE + _S0_RX_MASK + 1)] + m.d.sync += xfer_start.eq(1) + m.next = "RX_FRAME_W" + with m.State("RX_FRAME_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += [xfer_sread.eq(0), xfer_wrap.eq(0)] + m.next = "RX_UPDRD" 
+ + with m.State("RX_UPDRD"): # S0_RX_RD += pkt_len, write back + m.d.sync += [xfer_addr.eq(_S0_RX_RD), xfer_rw.eq(1), + xfer_stream.eq(0), xfer_sread.eq(0), xfer_wrap.eq(0), + xfer_direct.eq(0), xfer_len.eq(2)] + m.d.sync += [wbuf[0].eq((rx_rd + pkt_len)[8:16]), + wbuf[1].eq((rx_rd + pkt_len)[0:8])] + m.d.sync += xfer_start.eq(1) + m.next = "RX_UPDRD_W" + with m.State("RX_UPDRD_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = "RX_RECV" + + # S0_CR = RECV, then clear the RECV interrupt bit (S0_IR[2]). + write_reg("RX_RECV", _S0_CR, [_CR_RECV], "RX_CLR_IR") + write_reg("RX_CLR_IR", _S0_IR, [0x04], "IDLE") + + return m + + +# ── Testbench ───────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import sys + from amaranth.sim import Simulator, Period + + dut = W5100ParallelMaster(strobe_cycles=3, reset_cycles=10) + errors = [] + + MAC = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66] + PAR = sum(b << (8 * i) for i, b in enumerate(MAC)) + + # Expected indirect-address writes captured by the model (addr, value). + # MR is written directly (A=00) → captured as ('MR', value). + EXPECTED = [ + ("MR", _MR_RST), + ("MR", _MR_IND | _MR_AI), + (_SHAR0 + 0, MAC[0]), (_SHAR0 + 1, MAC[1]), (_SHAR0 + 2, MAC[2]), + (_SHAR0 + 3, MAC[3]), (_SHAR0 + 4, MAC[4]), (_SHAR0 + 5, MAC[5]), + (_RMSR + 0, 0x55), (_RMSR + 1, 0x55), + (_S0_MR, _S0_MR_MACRAW), + (_S0_CR, _CR_OPEN), + (_IMR, 0x01), + ] + + writes = [] # captured (addr-or-'MR', value) — IDM_DR + MR writes + model_mem = {} # W5100 memory image (registers + TX/RX buffers) + + async def w5100_model(ctx): + """W5100 indirect-bus slave model: tracks MR/IDM_AR, records IDM_DR and + MR writes, and drives bus_data_i for reads. 
Mode-0 timing: a write is + latched on /WR rising while /CS low; reads driven while /RD low.""" + idm_ar = 0 + mr = 0 + prev_cs = prev_rd = prev_wr = 1 + async for vals in ctx.tick("sync").sample( + dut.cs_n, dut.rd_n, dut.wr_n, + dut.bus_addr, dut.bus_data_o, dut.bus_data_oe): + cs, rd, wr, a, do, doe = vals[-6:] + ai = (mr >> 1) & 1 # MR.AI + + # Drive read data while /RD asserted (combinational, before sample). + if cs == 0 and rd == 0: + if a == _A_MR: + val = mr + elif a == _A_AR0: + val = (idm_ar >> 8) & 0xFF + elif a == _A_AR1: + val = idm_ar & 0xFF + else: + val = model_mem.get(idm_ar, 0) + ctx.set(dut.bus_data_i, val) + + # Latch write on /WR rising edge. + if cs == 0 and prev_wr == 0 and wr == 1: + if a == _A_MR: + mr = do + writes.append(("MR", do)) + elif a == _A_AR0: + idm_ar = (idm_ar & 0x00FF) | (do << 8) + elif a == _A_AR1: + idm_ar = (idm_ar & 0xFF00) | do + else: # IDM_DR + model_mem[idm_ar] = do + writes.append((idm_ar, do)) + # RECV command consumes the RX data: clear RSR (mirrors HW). + if idm_ar == _S0_CR and do == _CR_RECV: + model_mem[_S0_RX_RSR] = 0 + model_mem[_S0_RX_RSR + 1] = 0 + if ai: + idm_ar = (idm_ar + 1) & 0xFFFF + # Auto-increment after a data read (/RD rising, A=DR). + if cs == 0 and prev_rd == 0 and rd == 1 and a == _A_DR and ai: + idm_ar = (idm_ar + 1) & 0xFFFF + + prev_cs, prev_rd, prev_wr = cs, rd, wr + + async def testbench(ctx): + ctx.set(dut.par, PAR) + await ctx.tick("sync").repeat(2) + + # T1: trigger init, wait for init_done. 
+ ctx.set(dut.init_req, 1) + await ctx.tick("sync").repeat(1) + ctx.set(dut.init_req, 0) + + done = False + for _ in range(4000): + await ctx.tick("sync").repeat(1) + if ctx.get(dut.init_done): + done = True + break + if not done: + errors.append("init_done never asserted") + + print(f"T1 init captured {len(writes)} writes") + if writes != EXPECTED: + errors.append("init write sequence mismatch") + for i in range(max(len(writes), len(EXPECTED))): + g = writes[i] if i < len(writes) else None + e = EXPECTED[i] if i < len(EXPECTED) else None + mark = "" if g == e else " <-- MISMATCH" + gs = f"({g[0]:#06x},{g[1]:#04x})" if g and isinstance(g[0], int) else str(g) + es = f"({e[0]:#06x},{e[1]:#04x})" if e and isinstance(e[0], int) else str(e) + print(f" [{i:2}] got {gs:20} exp {es:20}{mark}") + else: + print("T1 init sequence matches expected (MR, SHAR, mem sizes, " + "S0 MACRAW/OPEN, IMR)") + + # ── helper: stream one TX frame through the external tx interface ───── + async def feed_frame(ctx, frame): + for i, b in enumerate(frame): + ctx.set(dut.tx_data, b) + ctx.set(dut.tx_valid, 1) + ctx.set(dut.tx_sof, 1 if i == 0 else 0) + ctx.set(dut.tx_eof, 1 if i == len(frame) - 1 else 0) + got = False + for _ in range(400): + await ctx.tick("sync").repeat(1) + if ctx.get(dut.tx_ready): + got = True + break + if not got: + errors.append(f"feed_frame: byte {i} never consumed") + return + ctx.set(dut.tx_valid, 0) + ctx.set(dut.tx_sof, 0) + ctx.set(dut.tx_eof, 0) + # let TX_UPDPTR + SEND complete + for _ in range(200): + await ctx.tick("sync").repeat(1) + if model_mem.get(_S0_CR) == _CR_SEND: + break + + # ── T2: TX MACRAW frame (TX_WR=0, no wrap) ─────────────────────────── + FRAME = [0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x10, 0x20] + await feed_frame(ctx, FRAME) + + buf = [model_mem.get(_TX_BASE + i, None) for i in range(len(FRAME))] + if buf != FRAME: + errors.append(f"T2 TX buffer mismatch: {buf} != {FRAME}") + tx_wr_hi = model_mem.get(_S0_TX_WR, 0) + tx_wr_lo = 
model_mem.get(_S0_TX_WR + 1, 0) + adv = (tx_wr_hi << 8) | tx_wr_lo + if adv != len(FRAME): + errors.append(f"T2 S0_TX_WR advance: got {adv}, want {len(FRAME)}") + if model_mem.get(_S0_CR) != _CR_SEND: + errors.append("T2 SEND command not issued") + print(f"T2 TX: buffer={['0x%02X' % b for b in buf]} " + f"TX_WR={adv} SEND={model_mem.get(_S0_CR)==_CR_SEND}") + + # ── T3: TX MACRAW with ring wraparound (TX_WR near 2 KB boundary) ───── + # Pre-load S0_TX_WR = 0x07FE so a 6-byte frame straddles the boundary: + # offsets 0x7FE,0x7FF then wraps to 0x000,0x001,0x002,0x003. + model_mem[_S0_TX_WR] = 0x07 + model_mem[_S0_TX_WR + 1] = 0xFE + model_mem[_S0_CR] = 0x00 # clear so we can detect the new SEND + WFRAME = [0x41, 0x42, 0x43, 0x44, 0x45, 0x46] + await feed_frame(ctx, WFRAME) + + # expected physical layout + exp = { + _TX_BASE + 0x7FE: WFRAME[0], + _TX_BASE + 0x7FF: WFRAME[1], + _TX_BASE + 0x000: WFRAME[2], + _TX_BASE + 0x001: WFRAME[3], + _TX_BASE + 0x002: WFRAME[4], + _TX_BASE + 0x003: WFRAME[5], + } + for addr, want in exp.items(): + got = model_mem.get(addr) + if got != want: + errors.append(f"T3 wrap byte @0x{addr:04X}: got {got}, want 0x{want:02X}") + adv2 = (model_mem.get(_S0_TX_WR, 0) << 8) | model_mem.get(_S0_TX_WR + 1, 0) + want_wr = (0x07FE + len(WFRAME)) & 0xFFFF + if adv2 != want_wr: + errors.append(f"T3 wrap S0_TX_WR: got 0x{adv2:04X}, want 0x{want_wr:04X}") + ok = all(model_mem.get(a) == v for a, v in exp.items()) + print(f"T3 TX wrap: bytes_placed_ok={ok} TX_WR=0x{adv2:04X} (want 0x{want_wr:04X})") + + # ── helper: drive an RX event and collect the streamed-out frame ───── + def load_rx(rx_rd_off, frame): + """Place a MACRAW packet [len_hi,len_lo,frame...] 
in the RX buffer at + offset rx_rd_off (ring), set RX_RSR/RX_RD, return the 16-bit length.""" + plen = len(frame) + 2 + payload = [(plen >> 8) & 0xFF, plen & 0xFF] + list(frame) + for i, b in enumerate(payload): + off = (rx_rd_off + i) & _S0_RX_MASK + model_mem[_RX_BASE + off] = b + model_mem[_S0_RX_RSR] = (plen >> 8) & 0xFF + model_mem[_S0_RX_RSR + 1] = plen & 0xFF + model_mem[_S0_RX_RD] = (rx_rd_off >> 8) & 0xFF + model_mem[_S0_RX_RD + 1] = rx_rd_off & 0xFF + return plen + + async def do_rx(ctx, rx_rd_off, frame): + plen = load_rx(rx_rd_off, frame) + ctx.set(dut.rx_ready, 1) + collected = [] + ctx.set(dut.w5100_int_n, 0) # assert RX interrupt + for _ in range(1500): + await ctx.tick("sync").repeat(1) + if ctx.get(dut.rx_valid) and ctx.get(dut.rx_ready): + collected.append(ctx.get(dut.rx_data)) + if model_mem.get(_S0_CR) == _CR_RECV: + break + ctx.set(dut.w5100_int_n, 1) # deassert; let it finish + idle + for _ in range(300): + await ctx.tick("sync").repeat(1) + ctx.set(dut.rx_ready, 0) + return collected, plen + + # ── T4: RX MACRAW frame (RX_RD=0, no wrap) ─────────────────────────── + model_mem[_S0_CR] = 0x00 + RX_FRAME = [0xDE, 0xAD, 0xBE, 0xEF, 0x01, 0x02, 0x03] + got, plen = await do_rx(ctx, 0x0000, RX_FRAME) + if got != RX_FRAME: + errors.append(f"T4 RX frame mismatch: {['0x%02X'%b for b in got]} != " + f"{['0x%02X'%b for b in RX_FRAME]}") + new_rd = (model_mem.get(_S0_RX_RD, 0) << 8) | model_mem.get(_S0_RX_RD + 1, 0) + if new_rd != plen: + errors.append(f"T4 RX_RD advance: got 0x{new_rd:04X}, want 0x{plen:04X}") + print(f"T4 RX: frame={['0x%02X'%b for b in got]} RX_RD=0x{new_rd:04X} " + f"RECV={model_mem.get(_S0_CR)==_CR_RECV}") + + # ── T5: RX MACRAW with ring wraparound (RX_RD near 2 KB boundary) ───── + model_mem[_S0_CR] = 0x00 + RX_FRAME2 = [0x51, 0x52, 0x53, 0x54, 0x55] + # rx_rd = 0x07FD: [len_hi@7FD][len_lo@7FE][f0@7FF][f1@000][f2@001]... 
+ got2, plen2 = await do_rx(ctx, 0x07FD, RX_FRAME2) + if got2 != RX_FRAME2: + errors.append(f"T5 RX wrap frame mismatch: {['0x%02X'%b for b in got2]} != " + f"{['0x%02X'%b for b in RX_FRAME2]}") + new_rd2 = (model_mem.get(_S0_RX_RD, 0) << 8) | model_mem.get(_S0_RX_RD + 1, 0) + want_rd2 = (0x07FD + plen2) & 0xFFFF + if new_rd2 != want_rd2: + errors.append(f"T5 RX wrap RX_RD: got 0x{new_rd2:04X}, want 0x{want_rd2:04X}") + print(f"T5 RX wrap: frame={['0x%02X'%b for b in got2]} " + f"RX_RD=0x{new_rd2:04X} (want 0x{want_rd2:04X})") + + sim = Simulator(dut) + sim.add_clock(Period(MHz=24), domain="sync") + sim.add_testbench(testbench) + sim.add_process(w5100_model) + + sim.run() + + if errors: + print("\nFAILURES:") + for e in errors: + print(" ", e) + sys.exit(1) + else: + print("\nAll tests passed.") diff --git a/exi_bba/w5500_spi_master.py b/exi_bba/w5500_spi_master.py new file mode 100644 index 0000000..092e158 --- /dev/null +++ b/exi_bba/w5500_spi_master.py @@ -0,0 +1,760 @@ +"""W5500 SPI master — sync domain (24 MHz). + +SPI Mode 0 (CPOL=0, CPHA=0): CLK idles LOW, data captured on rising edge. +SCK = 12 MHz: the sync domain is 24 MHz and the bit engine toggles SCK via a +clock-enable (sync ÷ 2). + +W5500 frame format +------------------ +Byte 0–1 Address (16-bit big-endian) +Byte 2 Control: [7:3]=BSB [2]=R/W [1:0]=OM +Byte 3+ Data + +BSB values used here: + 0b00000 Common registers + 0b00001 Socket 0 registers + 0b00010 Socket 0 TX buffer + 0b00011 Socket 0 RX buffer + +After NCRA reset the driver issues the W5500 init sequence (MR reset, SHAR, +S0_MR MACRAW, S0_CR OPEN, S0_IMR). + +The module provides: + - A streaming TX interface (tx_data/tx_valid/tx_ready + sof/eof framing) + - A streaming RX interface (rx_data/rx_valid/rx_ready + sof/eof) + - init_req / init_done for the NCRA-triggered init sequence + - MAC source address shadow input (par[0..5]) for SHAR programming +""" + +from amaranth import * + +__all__ = ["W5500SPIMaster"] + +# W5500 register addresses. 
The 16-bit address is the OFFSET WITHIN A BLOCK; +# the block is selected by the BSB field of the control byte (see _CTRL_*), +# NOT by the address. So socket-0 registers use small offsets with BSB=1. +_W5500_MR = 0x0000 # Mode register (common block) +_W5500_SHAR = 0x0009 # Source MAC, 6 bytes (common block) +_W5500_S0_MR = 0x0000 # Socket 0 Mode (socket-0 block) +_W5500_S0_CR = 0x0001 # Socket 0 Command +_W5500_S0_IR = 0x0002 # Socket 0 Interrupt +_W5500_S0_RXBUF_SIZE = 0x001E # Socket 0 RX buffer size +_W5500_S0_TXBUF_SIZE = 0x001F # Socket 0 TX buffer size +_W5500_S0_TX_FSR = 0x0020 # Socket 0 TX Free Size (2 bytes) +_W5500_S0_TX_WR = 0x0024 # Socket 0 TX Write Pointer +_W5500_S0_RX_RSR = 0x0026 # Socket 0 RX Received Size (2 bytes) +_W5500_S0_RX_RD = 0x0028 # Socket 0 RX Read Pointer +_W5500_S0_IMR = 0x002C # Socket 0 Interrupt Mask + +# Control byte = (BSB << 3) | (RWB << 2) | OM. +# RWB: 1=write 0=read. OM=00 → Variable Data Mode (CS frames the length). +# BSB: 0=common, 1=socket0 reg, 2=socket0 TX buffer, 3=socket0 RX buffer. +_CTRL_WR_COMMON = (0 << 3) | (1 << 2) # 0x04 +_CTRL_WR_S0REG = (1 << 3) | (1 << 2) # 0x0C +_CTRL_RD_S0REG = (1 << 3) | (0 << 2) # 0x08 +_CTRL_WR_S0TX = (2 << 3) | (1 << 2) # 0x14 +_CTRL_RD_S0RX = (3 << 3) | (0 << 2) # 0x18 + + +class W5500SPIMaster(Elaboratable): + """W5500 SPI master in the sync clock domain. 
+ + Physical SPI pins + ----------------- + spi_clk / spi_mosi / spi_miso / spi_cs_n : to W5500 + w5500_int_n : W5500 INT_N input (active low) + w5500_rst_n : W5500 hardware reset (active low) + + Init interface (from BBARegisterFile / BBATop) + ---------------------------------------------- + init_req : pulse to trigger the W5500 init sequence + init_done : pulse when init sequence completes + par : 6-byte MAC address (sampled at init_req) + + TX streaming interface (from TXFrameDrain, sync domain) + ------------------------------------------------------- + tx_data / tx_valid / tx_ready : byte stream + tx_sof / tx_eof : frame delimiters on the same cycle as tx_valid + + RX streaming interface (to RXFrameAssembler, sync domain) + ---------------------------------------------------------- + rx_data / rx_valid / rx_ready : byte stream + rx_sof / rx_eof : frame delimiters + """ + + def __init__(self, clk_div=1, reset_cycles=24000): + # MR-reset settle wait (in sync cycles). ~1 ms; the testbench + # overrides with a small value for fast simulation. + self._reset_cycles = reset_cycles + + # SPI SCK = sync_clock / (2 * clk_div). clk_div=1 → full rate (SCK = + # sync/2): at the 24 MHz slow domain that is 12 MHz SCK (~12 Mbit/s), + # which comfortably exceeds real-world GC BBA TCP throughput. The W5500 + # tolerates up to 80 MHz SCK, so the divider exists only as a safety + # knob for board-level signal-integrity issues, not a functional need. 
+ self._clk_div = clk_div + + # Physical SPI + self.spi_clk = Signal() + self.spi_mosi = Signal() + self.spi_miso = Signal() + self.spi_cs_n = Signal(init=1) + self.w5500_int_n = Signal(init=1) + self.w5500_rst_n = Signal(init=1) + + # Init control + self.init_req = Signal() + self.init_done = Signal() + self.par = Signal(48) # MAC address (PAR0..5 packed) + + # TX stream + self.tx_data = Signal(8) + self.tx_valid = Signal() + self.tx_ready = Signal() + self.tx_sof = Signal() + self.tx_eof = Signal() + + # RX stream + self.rx_data = Signal(8) + self.rx_valid = Signal() + self.rx_ready = Signal() + self.rx_sof = Signal() + self.rx_eof = Signal() + + def elaborate(self, platform): + m = Module() + + # ── SPI clock enable ───────────────────────────────────────────── + # clk_en high every `clk_div` sync cycles. The bit engine toggles SCK + # on each enabled cycle, so SCK = sync / (2 * clk_div). + clk_en = Signal() + if self._clk_div <= 1: + m.d.comb += clk_en.eq(1) # full rate: SCK = sync/2 + else: + div_ctr = Signal(range(self._clk_div)) + with m.If(div_ctr == self._clk_div - 1): + m.d.sync += div_ctr.eq(0) + with m.Else(): + m.d.sync += div_ctr.eq(div_ctr + 1) + m.d.comb += clk_en.eq(div_ctr == self._clk_div - 1) + + # ── SPI pin registers (Mode 0: SCK idles LOW) ──────────────────── + sck_r = Signal() + cs_r = Signal(init=1) + shift_out = Signal(8) + shift_in = Signal(8) + m.d.comb += self.spi_clk .eq(sck_r) + m.d.comb += self.spi_cs_n.eq(cs_r) + m.d.comb += self.spi_mosi.eq(shift_out[7]) # MSB first; valid pre-rising + + # ── Byte-transfer engine (Mode 0) ──────────────────────────────── + # On byte_start, shift out byte_tx MSB-first (8 SCK cycles) and capture + # MISO into byte_rx; pulse byte_done. CS is owned by the xfer engine. 
+ byte_start = Signal() + byte_tx = Signal(8) + byte_rx = Signal(8) + byte_done = Signal() + bit_ctr = Signal(4) + + m.d.sync += byte_done.eq(0) + with m.FSM(domain="sync", name="byte_fsm"): + with m.State("IDLE"): + m.d.sync += sck_r.eq(0) + with m.If(byte_start): + m.d.sync += shift_out.eq(byte_tx) + m.d.sync += bit_ctr.eq(0) + m.next = "RUN" + with m.State("RUN"): + with m.If(clk_en): + with m.If(~sck_r): + # rising edge: slave samples MOSI, master samples MISO + m.d.sync += sck_r.eq(1) + m.d.sync += shift_in.eq(Cat(self.spi_miso, shift_in[:-1])) + with m.Else(): + # falling edge: advance / finish + m.d.sync += sck_r.eq(0) + with m.If(bit_ctr == 7): + m.d.sync += byte_rx.eq(shift_in) + m.d.sync += byte_done.eq(1) + m.next = "IDLE" + with m.Else(): + m.d.sync += shift_out.eq(Cat(0, shift_out[:-1])) + m.d.sync += bit_ctr.eq(bit_ctr + 1) + + # ── Generic register transaction engine (Variable Data Mode) ───── + # One CS-low frame: 3 header bytes (addr_hi, addr_lo, ctrl) then + # xfer_len payload bytes. Writes source payload from wbuf; reads + # capture MISO into rbuf. + WBUF = 8 + xfer_start = Signal() + xfer_addr = Signal(16) + xfer_ctrl = Signal(8) + xfer_len = Signal(range(WBUF + 1)) + xfer_done = Signal() + wbuf = Array([Signal(8, name=f"wbuf{i}") for i in range(WBUF)]) + rbuf = Array([Signal(8, name=f"rbuf{i}") for i in range(WBUF)]) + xfer_idx = Signal(range(WBUF + 3)) + + # Stream-write mode: after the 3-byte header, payload bytes are pulled + # from (s_data, s_valid, s_last) instead of wbuf, until s_last. Used to + # forward a frame straight into the W5500 TX buffer. s_consume pulses + # as each streamed byte is accepted; s_count tracks the byte count. 
+ xfer_stream = Signal() + s_data = Signal(8) + s_valid = Signal() + s_last = Signal() + s_consume = Signal() + s_count = Signal(16) + s_last_r = Signal() # latched s_last for the in-flight byte + + # Stream-read mode: after the header, read `xfer_rcount` payload bytes + # (sending 0x00 dummies) and push each out via (r_data, r_valid, + # r_first, r_last) with r_ready back-pressure. Used to pull a frame + # out of the W5500 RX buffer into RXFrameAssembler. + xfer_sread = Signal() + xfer_rcount = Signal(16) + r_data = Signal(8) + r_valid = Signal() + r_first = Signal() + r_last = Signal() + r_ready = Signal() + r_idx = Signal(16) + + x_byte = Signal(8) + with m.If(xfer_idx == 0): + m.d.comb += x_byte.eq(xfer_addr[8:16]) + with m.Elif(xfer_idx == 1): + m.d.comb += x_byte.eq(xfer_addr[0:8]) + with m.Elif(xfer_idx == 2): + m.d.comb += x_byte.eq(xfer_ctrl) + with m.Else(): + m.d.comb += x_byte.eq(wbuf[xfer_idx - 3]) + + m.d.comb += byte_start.eq(0) + m.d.comb += byte_tx.eq(0) + m.d.comb += s_consume.eq(0) + m.d.comb += r_valid.eq(0) + m.d.comb += r_data.eq(0) + m.d.comb += r_first.eq(0) + m.d.comb += r_last.eq(0) + + m.d.sync += xfer_done.eq(0) + with m.FSM(domain="sync", name="xfer_fsm"): + with m.State("IDLE"): + with m.If(xfer_start): + m.d.sync += cs_r.eq(0) # assert CS for the frame + m.d.sync += xfer_idx.eq(0) + m.d.sync += s_count.eq(0) + m.d.sync += r_idx.eq(0) + m.next = "LOAD" + with m.State("LOAD"): + m.d.comb += byte_tx.eq(x_byte) + m.d.comb += byte_start.eq(1) + m.next = "WAIT" + with m.State("WAIT"): + with m.If(byte_done): + with m.If(xfer_idx >= 3): + m.d.sync += rbuf[xfer_idx - 3].eq(byte_rx) + with m.If((xfer_idx == 2) & xfer_stream): + m.next = "SLOAD" # stream the payload (write) + with m.Elif((xfer_idx == 2) & xfer_sread): + m.next = "RLOAD" # stream the payload (read) + with m.Elif(~xfer_stream & ~xfer_sread + & (xfer_idx == (xfer_len + 2))): + m.next = "FINISH" # 3 header + len − 1 + with m.Else(): + m.d.sync += xfer_idx.eq(xfer_idx + 1) + m.next 
= "LOAD" + + # ── Streamed-payload sub-loop (TX buffer write) ────────────── + with m.State("SLOAD"): + with m.If(s_valid): + m.d.comb += byte_tx.eq(s_data) + m.d.comb += byte_start.eq(1) + m.d.sync += s_last_r.eq(s_last) + m.next = "SWAIT" + with m.State("SWAIT"): + with m.If(byte_done): + m.d.comb += s_consume.eq(1) # accept this frame byte + m.d.sync += s_count.eq(s_count + 1) + with m.If(s_last_r): + m.next = "FINISH" + with m.Else(): + m.next = "SLOAD" + + # ── Streamed-payload sub-loop (RX buffer read) ─────────────── + with m.State("RLOAD"): + with m.If(r_idx == xfer_rcount): + m.next = "FINISH" + with m.Else(): + m.d.comb += byte_tx.eq(0) # dummy MOSI during read + m.d.comb += byte_start.eq(1) + m.next = "RWAIT" + with m.State("RWAIT"): + with m.If(byte_done): + m.next = "RPUSH" + with m.State("RPUSH"): + m.d.comb += r_data .eq(byte_rx) + m.d.comb += r_valid.eq(1) + m.d.comb += r_first.eq(r_idx == 0) + m.d.comb += r_last .eq(r_idx == (xfer_rcount - 1)) + with m.If(r_ready): + m.d.sync += r_idx.eq(r_idx + 1) + m.next = "RLOAD" + + with m.State("FINISH"): + m.d.sync += cs_r.eq(1) # deassert CS + m.d.sync += xfer_done.eq(1) + m.next = "IDLE" + + # Saved MAC for SHAR programming; current W5500 TX write pointer. + mac_shadow = Array([Signal(8, name=f"mac{i}") for i in range(6)]) + wait_ctr = Signal(range(self._reset_cycles + 2)) + tx_wr = Signal(16) + rx_rsr = Signal(16) # RX received size + rx_rd = Signal(16) # RX read pointer + pkt_len = Signal(16) # MACRAW packet length (incl. 2-byte header) + + # Frame stream from TXFrameDrain feeds the xfer engine's stream port. + # tx_ready pulses (= s_consume) as each frame byte is taken into the + # TX-buffer write transaction. + m.d.comb += [ + s_data .eq(self.tx_data), + s_valid.eq(self.tx_valid), + s_last .eq(self.tx_eof), + self.tx_ready.eq(s_consume), + ] + # RX buffer read stream → RXFrameAssembler. 
+ m.d.comb += [ + self.rx_data .eq(r_data), + self.rx_valid.eq(r_valid), + self.rx_sof .eq(r_first), + self.rx_eof .eq(r_last), + r_ready .eq(self.rx_ready), + ] + + # Helper: a setup state that programs one register-write transaction + # then waits for it to complete and jumps to `nxt`. + def write_reg(name, addr, ctrl, payload, nxt): + with m.State(name): + m.d.sync += xfer_addr.eq(addr) + m.d.sync += xfer_ctrl.eq(ctrl) + m.d.sync += xfer_len.eq(len(payload)) + m.d.sync += xfer_stream.eq(0) + m.d.sync += xfer_sread.eq(0) + for i, b in enumerate(payload): + m.d.sync += wbuf[i].eq(b) + m.d.sync += xfer_start.eq(1) + m.next = name + "_W" + with m.State(name + "_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = nxt + + # ── Main control FSM ───────────────────────────────────────────── + with m.FSM(domain="sync", name="main_fsm"): + + with m.State("IDLE"): + m.d.sync += self.init_done.eq(0) + with m.If(self.init_req): + for i in range(6): + m.d.sync += mac_shadow[i].eq(self.par[i*8:(i+1)*8]) + m.next = "MR_RST" + with m.Elif(~self.w5500_int_n): + m.next = "RX_CHECK" + with m.Elif(self.tx_valid & self.tx_sof): + m.next = "TX_START" + + # Step 1: MR = 0x80 (software reset), then settle ~1 ms. + write_reg("MR_RST", _W5500_MR, _CTRL_WR_COMMON, [0x80], "MR_WAIT") + with m.State("MR_WAIT"): + with m.If(wait_ctr == self._reset_cycles): + m.d.sync += wait_ctr.eq(0) + m.next = "SHAR" + with m.Else(): + m.d.sync += wait_ctr.eq(wait_ctr + 1) + + # Step 2: SHAR = source MAC (6 bytes from PAR0–5). + with m.State("SHAR"): + m.d.sync += xfer_addr.eq(_W5500_SHAR) + m.d.sync += xfer_ctrl.eq(_CTRL_WR_COMMON) + m.d.sync += xfer_len.eq(6) + for i in range(6): + m.d.sync += wbuf[i].eq(mac_shadow[i]) + m.d.sync += xfer_start.eq(1) + m.next = "SHAR_W" + with m.State("SHAR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = "S0_MR" + + # Step 3–5: S0_MR=MACRAW, S0_CR=OPEN, S0_IMR=RECV|SEND_OK. 
+ write_reg("S0_MR", _W5500_S0_MR, _CTRL_WR_S0REG, [0x04], "S0_CR") + write_reg("S0_CR", _W5500_S0_CR, _CTRL_WR_S0REG, [0x01], "S0_IMR") + write_reg("S0_IMR", _W5500_S0_IMR, _CTRL_WR_S0REG, [0x05], "INIT_DONE") + + with m.State("INIT_DONE"): + m.d.sync += self.init_done.eq(1) + m.next = "IDLE" + + # ── TX path (MACRAW) ───────────────────────────────────────── + # 1) read S0_TX_WR, 2) stream the frame into the TX buffer at that + # offset, 3) advance S0_TX_WR by the byte count, 4) issue SEND. + with m.State("TX_START"): + m.d.sync += xfer_addr.eq(_W5500_S0_TX_WR) + m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0REG) + m.d.sync += xfer_len.eq(2) + m.d.sync += xfer_stream.eq(0) + m.d.sync += wbuf[0].eq(0) # read → send 0x00 dummies + m.d.sync += wbuf[1].eq(0) + m.d.sync += xfer_start.eq(1) + m.next = "TX_RDPTR_W" + with m.State("TX_RDPTR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += tx_wr.eq(Cat(rbuf[1], rbuf[0])) # big-endian + m.next = "TX_DATA" + + with m.State("TX_DATA"): + m.d.sync += xfer_addr.eq(tx_wr) + m.d.sync += xfer_ctrl.eq(_CTRL_WR_S0TX) # socket-0 TX buffer + m.d.sync += xfer_stream.eq(1) + m.d.sync += xfer_start.eq(1) + m.next = "TX_DATA_W" + with m.State("TX_DATA_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += xfer_stream.eq(0) + m.d.sync += tx_wr.eq(tx_wr + s_count) # advanced pointer + m.next = "TX_UPDPTR" + + with m.State("TX_UPDPTR"): + m.d.sync += xfer_addr.eq(_W5500_S0_TX_WR) + m.d.sync += xfer_ctrl.eq(_CTRL_WR_S0REG) + m.d.sync += xfer_len.eq(2) + m.d.sync += xfer_stream.eq(0) + m.d.sync += wbuf[0].eq(tx_wr[8:16]) # hi (already advanced) + m.d.sync += wbuf[1].eq(tx_wr[0:8]) # lo + m.d.sync += xfer_start.eq(1) + m.next = "TX_UPDPTR_W" + with m.State("TX_UPDPTR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.next = "TX_SEND" + + # S0_CR = SEND (0x20) + write_reg("TX_SEND", _W5500_S0_CR, _CTRL_WR_S0REG, [0x20], "IDLE") + + # ── RX path (MACRAW) ───────────────────────────────────────── + # 
Triggered by W5500 INT (w5500_int_n low): read RX_RSR, read + # RX_RD, read the 2-byte MACRAW length, stream the frame out, + # advance RX_RD, issue RECV. + with m.State("RX_CHECK"): # read S0_RX_RSR + m.d.sync += xfer_addr.eq(_W5500_S0_RX_RSR) + m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0REG) + m.d.sync += xfer_len.eq(2) + m.d.sync += xfer_stream.eq(0) + m.d.sync += xfer_sread.eq(0) + m.d.sync += wbuf[0].eq(0) + m.d.sync += wbuf[1].eq(0) + m.d.sync += xfer_start.eq(1) + m.next = "RX_RSR_W" + with m.State("RX_RSR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += rx_rsr.eq(Cat(rbuf[1], rbuf[0])) + m.next = "RX_RSR_CHK" + with m.State("RX_RSR_CHK"): + with m.If(rx_rsr == 0): + m.next = "IDLE" # nothing received + with m.Else(): + m.next = "RX_RDPTR" + + with m.State("RX_RDPTR"): # read S0_RX_RD + m.d.sync += xfer_addr.eq(_W5500_S0_RX_RD) + m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0REG) + m.d.sync += xfer_len.eq(2) + m.d.sync += xfer_start.eq(1) + m.next = "RX_RDPTR_W" + with m.State("RX_RDPTR_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += rx_rd.eq(Cat(rbuf[1], rbuf[0])) + m.next = "RX_LEN" + + with m.State("RX_LEN"): # read 2-byte MACRAW length + m.d.sync += xfer_addr.eq(rx_rd) + m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0RX) + m.d.sync += xfer_len.eq(2) + m.d.sync += xfer_start.eq(1) + m.next = "RX_LEN_W" + with m.State("RX_LEN_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += pkt_len.eq(Cat(rbuf[1], rbuf[0])) + m.next = "RX_FRAME" + + with m.State("RX_FRAME"): # stream pkt_len−2 frame bytes + m.d.sync += xfer_addr.eq(rx_rd + 2) + m.d.sync += xfer_ctrl.eq(_CTRL_RD_S0RX) + m.d.sync += xfer_sread.eq(1) + m.d.sync += xfer_rcount.eq(pkt_len - 2) + m.d.sync += xfer_start.eq(1) + m.next = "RX_FRAME_W" + with m.State("RX_FRAME_W"): + m.d.sync += xfer_start.eq(0) + with m.If(xfer_done): + m.d.sync += xfer_sread.eq(0) + m.next = "RX_UPDRD" + + with m.State("RX_UPDRD"): # S0_RX_RD += pkt_len + m.d.sync += 
# ── Testbench ─────────────────────────────────────────────────────────────
#
# Self-checking simulation of W5500SPIMaster, run when this module is executed
# as a script.  A software model of the W5500 (w5500_model) records every
# CS-framed SPI transaction and answers reads with canned data; the testbench
# then compares the recorded byte sequences against expected lists:
#   T1  SPI pins idle (CLK low, CS high)
#   T2  init_req → init_done
#   T3  init transaction sequence
#   T4  MACRAW transmit of a 4-byte frame
#   T5  MACRAW receive of a 6-byte frame
# Failures are collected in `errors`; the process exits with status 1 if any
# were recorded.

if __name__ == "__main__":
    import sys
    from amaranth.sim import Simulator, Period

    # Short reset wait so the init sequence runs quickly in simulation.
    dut = W5500SPIMaster(reset_cycles=10)
    errors = []  # human-readable failure messages, reported after sim.run()

    # MAC for SHAR: par[i*8:(i+1)*8] = mac byte i → mac = 11 22 33 44 55 66
    MAC = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]
    # Pack the MAC little-endian by byte index: MAC[0] lands in bits 0..7.
    PAR = sum(b << (8 * i) for i, b in enumerate(MAC))

    # Expected W5500 init transactions: [addr_hi, addr_lo, ctrl, *payload].
    # ctrl 0x04 = common-block write (VDM); 0x0C = socket-0-reg write (VDM).
    EXPECTED = [
        [0x00, 0x00, 0x04, 0x80],  # MR = 0x80 (reset)
        [0x00, 0x09, 0x04, *MAC],  # SHAR = MAC
        [0x00, 0x00, 0x0C, 0x04],  # S0_MR = MACRAW
        [0x00, 0x01, 0x0C, 0x01],  # S0_CR = OPEN
        # NOTE(review): 0x05 sets bits 2 and 0.  This file treats bit 2 (0x04)
        # as RECV; the W5500 datasheet lists SEND_OK as bit 4 (0x10) and bit 0
        # as CON, so "RECV|SEND_OK" would be 0x14 — confirm which is intended.
        [0x00, 0x2C, 0x0C, 0x05],  # S0_IMR = RECV|SEND_OK
    ]

    # Transactions captured by the W5500 slave model.  Each entry is the list
    # of MOSI bytes seen during one CS-low frame (header + payload/dummies).
    txns = []

    # RX frame the W5500 will hand back, and the MACRAW length it reports.
    RX_FRAME = [0xDE, 0xAD, 0xBE, 0xEF, 0x01, 0x02]
    RX_PKT_LEN = len(RX_FRAME) + 2  # MACRAW length includes the header

    def build_response(bsb, addr):
        """Bytes the W5500 drives on MISO for a read of (bsb, addr).

        `bsb` is the control byte shifted right by 3 and `addr` the 16-bit
        address assembled from the first two header bytes (both computed in
        w5500_model).  Recognised reads:

        * bsb 1 (ctrl 0x08, socket-0 registers): S0_RX_RSR returns
          RX_PKT_LEN high byte first; S0_RX_RD returns 0x0000.
        * bsb 3 (ctrl 0x18, the RX-buffer reads in RX_EXPECTED): address 0
          returns the 2-byte length header, address 2 returns a copy of
          RX_FRAME.  These addresses match an RX read pointer of 0.

        Any other read returns 64 zero bytes.
        """
        if bsb == 1 and addr == _W5500_S0_RX_RSR:
            return [(RX_PKT_LEN >> 8) & 0xFF, RX_PKT_LEN & 0xFF]
        if bsb == 1 and addr == _W5500_S0_RX_RD:
            return [0x00, 0x00]  # RX read pointer = 0
        if bsb == 3 and addr == 0x0000:
            return [(RX_PKT_LEN >> 8) & 0xFF, RX_PKT_LEN & 0xFF]  # length
        if bsb == 3 and addr == 0x0002:
            return list(RX_FRAME)  # frame payload
        return [0x00] * 64

    async def w5500_model(ctx):
        """W5500 SPI slave model: captures CS-framed transactions (MOSI) and,
        for reads, drives MISO with canned register/buffer data.  Mode 0:
        MOSI sampled on rising SCK, MISO shifted out MSB-first.

        The model samples spi_cs_n / spi_clk / spi_mosi once per sync tick and
        detects edges by comparing against the previous sample.  The first
        three bytes of a frame are the header (addr_hi, addr_lo, ctrl); a ctrl
        byte with bit 2 clear is treated as a read.  Every completed frame is
        appended to the enclosing `txns` list when CS rises.
        """
        prev_cs, prev_sck = 1, 0            # previous samples, for edge detection
        rx_byte = rx_bits = nbytes = 0      # MOSI shift register, bit count, bytes so far
        hdr = [0, 0, 0]                     # addr_hi, addr_lo, ctrl of the current frame
        is_read = False
        resp, ridx = [], 0                  # read response bytes and next index to load
        msr = msr_bits = 0                  # MISO shift register and bits left in it
        cur_txn = []                        # MOSI bytes of the frame in progress
        async for vals in ctx.tick("sync").sample(
                dut.spi_cs_n, dut.spi_clk, dut.spi_mosi):
            # Only the last three yielded values (the sampled signals) are used.
            cs, sck, mosi = vals[-3:]
            rising = (prev_sck == 0 and sck == 1)

            if prev_cs == 1 and cs == 0:  # CS falling: start frame
                cur_txn = []
                rx_byte = rx_bits = nbytes = 0
                is_read = False
                resp, ridx, msr, msr_bits = [], 0, 0, 0

            if cs == 0 and rising:
                # MISO bit just sampled by the master → advance shift register
                if is_read and nbytes >= 3:
                    msr = (msr << 1) & 0xFF
                    msr_bits -= 1
                    if msr_bits == 0:
                        # Byte exhausted: load the next response byte, or 0
                        # once the canned response has run out.
                        msr = resp[ridx] if ridx < len(resp) else 0
                        ridx += 1
                        msr_bits = 8
                # sample MOSI
                rx_byte = ((rx_byte << 1) | mosi) & 0xFF
                rx_bits += 1
                if rx_bits == 8:
                    cur_txn.append(rx_byte)
                    if nbytes < 3:
                        hdr[nbytes] = rx_byte
                    if nbytes == 2:  # header complete → decode
                        ctrl = hdr[2]
                        is_read = (ctrl & 0x04) == 0
                        bsb = ctrl >> 3
                        addr = (hdr[0] << 8) | hdr[1]
                        if is_read:
                            # Preload the first response byte so its MSB is on
                            # MISO before the first data-phase clock.
                            resp = build_response(bsb, addr)
                            msr, ridx, msr_bits = resp[0], 1, 8
                    nbytes += 1
                    rx_byte = rx_bits = 0

            if prev_cs == 0 and cs == 1:  # CS rising: end frame
                txns.append(list(cur_txn))

            # Present the current MSB of the shift register on MISO.
            ctx.set(dut.spi_miso, (msr >> 7) & 1)
            prev_cs, prev_sck = cs, sck

    rx_collected = []  # bytes observed on the DUT's rx_data stream

    async def rx_collector(ctx):
        """Append rx_data to `rx_collected` on every sync tick where both
        rx_valid and rx_ready are sampled high."""
        async for vals in ctx.tick("sync").sample(
                dut.rx_valid, dut.rx_ready, dut.rx_data):
            valid, ready, data = vals[-3:]
            if valid and ready:
                rx_collected.append(data)

    async def testbench(ctx):
        """Stimulus and checks T1–T5.  Mismatches are appended to `errors`
        rather than raised, so all five checks always run."""
        ctx.set(dut.par, PAR)
        await ctx.tick("sync").repeat(4)

        # T1: SPI idle — CLK low (Mode 0), CS high
        if ctx.get(dut.spi_clk) != 0:
            errors.append("T1 CLK idle != 0")
        if ctx.get(dut.spi_cs_n) != 1:
            errors.append("T1 CS idle != 1")
        print(f"T1 idle: CLK={ctx.get(dut.spi_clk)} CS={ctx.get(dut.spi_cs_n)}")

        # T2: run the init sequence
        # init_req is pulsed high for exactly one sync tick.
        ctx.set(dut.init_req, 1)
        await ctx.tick("sync").repeat(1)
        ctx.set(dut.init_req, 0)

        # Wait up to 4000 ticks for init_done.
        for _ in range(4000):
            await ctx.tick("sync").repeat(1)
            if ctx.get(dut.init_done):
                break
        if not ctx.get(dut.init_done):
            errors.append("T2 init_done never asserted")
        await ctx.tick("sync").repeat(4)
        print(f"T2 init_done: {ctx.get(dut.init_done)}")

        # T3: verify the captured init transaction sequence
        print(f"T3 captured {len(txns)} init transactions:")
        for t in txns:
            print(" ", [f"0x{b:02X}" for b in t])
        if txns != EXPECTED:
            errors.append(f"T3 init sequence mismatch:\n got {txns}\n want {EXPECTED}")

        # ── T4: TX a frame (MACRAW) ──────────────────────────────────────
        txns.clear()
        FRAME = [0xAA, 0xBB, 0xCC, 0xDD]
        # With MISO=0 the read returns S0_TX_WR = 0x0000.
        # (build_response has no entry for this register, so the model answers
        # with its all-zero default.)
        TX_EXPECTED = [
            [0x00, 0x24, 0x08, 0x00, 0x00],  # read S0_TX_WR (dummies)
            [0x00, 0x00, 0x14, *FRAME],  # write TX buffer @ 0x0000
            [0x00, 0x24, 0x0C, 0x00, len(FRAME)],  # S0_TX_WR += len
            [0x00, 0x01, 0x0C, 0x20],  # S0_CR = SEND
        ]

        async def send_frame(frame):
            """Feed `frame` to the DUT's TX stream one byte at a time.

            tx_valid is held high throughout; tx_sof is set with the first
            byte and tx_eof with the last.  For each byte the loop waits (at
            most 2000 ticks) until tx_ready reads high, then advances one more
            tick.  A wait that times out is not reported here — the loop just
            falls through and the T4 sequence comparison is relied on to catch
            it.
            """
            for i, b in enumerate(frame):
                ctx.set(dut.tx_data, b)
                ctx.set(dut.tx_valid, 1)
                ctx.set(dut.tx_sof, 1 if i == 0 else 0)
                ctx.set(dut.tx_eof, 1 if i == len(frame) - 1 else 0)
                # Check first, then tick: no extra tick if already ready.
                for _ in range(2000):
                    if ctx.get(dut.tx_ready):
                        break
                    await ctx.tick("sync").repeat(1)
                await ctx.tick("sync").repeat(1)  # complete the consume
            # Return the TX stream inputs to idle.
            ctx.set(dut.tx_valid, 0)
            ctx.set(dut.tx_sof, 0)
            ctx.set(dut.tx_eof, 0)

        await send_frame(FRAME)
        # let the pointer-update + SEND transactions finish
        for _ in range(2000):
            await ctx.tick("sync").repeat(1)
            if len(txns) >= len(TX_EXPECTED):
                break
        await ctx.tick("sync").repeat(4)

        print(f"T4 captured {len(txns)} TX transactions:")
        for t in txns:
            print(" ", [f"0x{b:02X}" for b in t])
        if txns != TX_EXPECTED:
            errors.append(f"T4 TX sequence mismatch:\n got {txns}\n want {TX_EXPECTED}")

        # ── T5: RX a frame (MACRAW) ──────────────────────────────────────
        # The model returns RSR=pkt_len, RD=0, MACRAW length=pkt_len, then the
        # frame. Expected transactions (read dummies are 0x00):
        RX_EXPECTED = [
            [0x00, 0x26, 0x08, 0x00, 0x00],  # read S0_RX_RSR
            [0x00, 0x28, 0x08, 0x00, 0x00],  # read S0_RX_RD
            [0x00, 0x00, 0x18, 0x00, 0x00],  # read MACRAW length
            [0x00, 0x02, 0x18, *([0x00] * len(RX_FRAME))],  # read frame
            [0x00, 0x28, 0x0C, 0x00, RX_PKT_LEN],  # S0_RX_RD += pkt_len
            [0x00, 0x01, 0x0C, 0x40],  # S0_CR = RECV
            [0x00, 0x02, 0x0C, 0x04],  # S0_IR clear RECV
        ]
        txns.clear()
        ctx.set(dut.rx_ready, 1)  # always accept bytes on the RX stream
        ctx.set(dut.w5500_int_n, 0)  # signal a received packet
        # INT_N stays low until every expected transaction has been captured
        # (or 4000 ticks elapse); only then is it released.
        for _ in range(4000):
            await ctx.tick("sync").repeat(1)
            if len(txns) >= len(RX_EXPECTED):
                break
        ctx.set(dut.w5500_int_n, 1)
        await ctx.tick("sync").repeat(8)

        print(f"T5 captured {len(txns)} RX transactions:")
        for t in txns:
            print(" ", [f"0x{b:02X}" for b in t])
        print(f"T5 rx frame: {[f'0x{b:02X}' for b in rx_collected]} "
              f"(want {[f'0x{b:02X}' for b in RX_FRAME]})")
        if txns != RX_EXPECTED:
            errors.append(f"T5 RX sequence mismatch:\n got {txns}\n want {RX_EXPECTED}")
        if rx_collected != RX_FRAME:
            errors.append(f"T5 RX frame mismatch: got {rx_collected}, want {RX_FRAME}")

    # Simulation setup: 24 MHz clock on the "sync" domain, the stimulus as a
    # testbench, and the W5500 model and RX collector as processes.
    sim = Simulator(dut)
    sim.add_clock(Period(MHz=24), domain="sync")
    sim.add_testbench(testbench)
    sim.add_process(w5500_model)
    sim.add_process(rx_collector)

    # Waveform dump for debugging; written to the current working directory.
    with sim.write_vcd("W5500SPIMaster.vcd"):
        sim.run()

    # Report collected failures and signal them through the exit status.
    if errors:
        print("\nFAILURES:")
        for e in errors:
            print(" ", e)
        sys.exit(1)
    else:
        print("\nAll tests passed.")
+amaranth-yosys==0.50.0.0.post124 +importlib_resources==7.1.0 Jinja2==3.1.6 jschon==0.11.1 -MarkupSafe==3.0.2 +MarkupSafe==3.0.3 pyvcd==0.4.1 rfc3986==2.0.0 -wasmtime==36.0.0 +wasmtime==45.0.0