Designing hardware for 1992-1993 retro classics
Design a portable console
Specialized graphics hardware
Early 1990s retro classics
On a Lattice iCE40 UP5K FPGA
Doom-Comanche crossover
by id Software
by NovaLogic
Doom on a lamp MCU
by Nicola Wrachien
Doom nRF5340
by Audun Wilhelmsen
We’re tasked with creating a specialized GPU
Adding a terrain
Btw, a notable Doom re-creation
iCEBreaker by @1bitsquared
Verilog / VHDL
Silice
// Minimal Silice example: shows the top byte of a free-running counter
// on the 8-bit 'leds' output.
algorithm main(output uint8 leds) {
// 28-bit counter, initialized to 0
uint28 counter(0);
// the always block is applied at every clock cycle
always {
// part-select [20,8]: 8 bits starting at bit 20, i.e. bits 20..27
leds = counter[20,8];
// increment; the 28-bit value wraps around on overflow
counter = counter + 1;
}
}
Let’s compare to MiSTer
DE10-Nano board, Cyclone V FPGA
by Sylvain ‘@tnt’ Munaut
by Sylvain ‘@tnt’ Munaut
Source port
Architected around RISC-V
Pseudo SRAM mod
FPGA 128KB SPRAM:
by Sylvain ‘@tnt’ Munaut
Sets quite a standard!
But there is no GPU 😢
🡆 Could we do faster at full 320x200?
🡆 Can we provide a drawing ‘API’?
🡆 Can we squeeze a voxel terrain in?
🡆 Without the PSRAM mod?
Constraints recap:
What we would need:
– Textures can only go in SPIflash, good news, RO
– Level is not strictly RO, could go to RAM
🡆 But this would leave us with < 32KB for runtime
128KB - 35KB(level) - 64KB(framebuffer)
A portable console
Still
Means rendering top/bottom or left/right
We’ll use left/right, rendering columns
Cost of a division
🡆 A standard design is 1 cycle per bit
🡆 More bits per cycle costs logic and/or MHz
🡆 Fine for a few vertices
You could choose to not be perspective correct
But … nah (PS1 anyone?)
Vertical / horizontal surfaces!
Z-constant along screen columns / rows
// R_DrawSpan
// With DOOM style restrictions on view orientation,
// the floors and ceilings consist of horizontal slices
// or spans with constant z depth.
// However, rotation around the world z axis is possible,
// thus [..] has to traverse the texture at an angle
of columns
of columns
Depth and visibility
Doom levels are organized in a BSP tree
Go through the scene BSP
Project candidate walls on screen
For each screen column
// Per-column wall traversal (excerpt; elided parts are marked '// ..').
// For every screen column, the candidate entries vis[v] are scanned and
// the lower / upper / middle parts of the matching wall segment are
// handled. The scan of a column stops at the first opaque middle wall.
// NOTE(review): v, v_end, seg, top and btm are set up in the elided code.
for (int c = 0 ; c != doomchip_width ; ++c) {
// ..
for ( ; v < v_end ; ++v ) {
// only entries whose screen interval [i0,i1] contains column c
if (c >= vis[v].i0 && c <= vis[v].i1) {
// lower wall
if (bspSegs[seg].lwr) {
// ..
}
// upper wall
if (bspSegs[seg].upr) {
// ..
}
// middle wall
if (bspSegs[seg].mid) {
// ..
// close column?
if ((bspSegs[seg].flags&1) == 0) {
// bit 0 of flags marks a transparent middle wall; here it is clear
top = btm; // opaque, close column
// nothing behind an opaque wall is visible: stop scanning this column
break;
}
}
}
} }
Our renderer design tradeoffs
// Column-draw command registers (memory mapped).
// A draw command is a pair of 32-bit words, one per register.
volatile unsigned int* const COLDRAW0 = (volatile unsigned int*)0x40014;
volatile unsigned int* const COLDRAW1 = (volatile unsigned int*)0x40010;

// Sends one draw command: t0 goes to COLDRAW0, then t1 goes to COLDRAW1.
// The order of the two stores is kept as is (COLDRAW0 first).
static inline void col_send(unsigned int t0,unsigned int t1) {
  COLDRAW0[0] = t0;
  COLDRAW1[0] = t1;
}
// Examples of draw commands. Each command is a pair of words:
//  - the first word (COLDRAW_FLAT / COLDRAW_WALL / COLDRAW_TERRAIN) carries
//    the texturing parameters specific to the primitive type,
//  - the second word (COLDRAW_COL) carries the texture id, the vertical span
//    of the column segment and a light level, OR-ed with the primitive type.
// NOTE(review): the exact bit packing lives in the COLDRAW_* macros, which
// are not shown here.

// ceiling with flat texturing
col_send(COLDRAW_FLAT(-sec_c_h,cx),
COLDRAW_COL(bspSectors[sec].c_T,c_h,top, seclight) | FLAT);
// upper wall
col_send(COLDRAW_WALL(y,tex_v,tc_u),
COLDRAW_COL(bspSegs[seg].upr, c_o,top, seclight) | WALL);
// terrain (here sent with a constant light level of 15)
col_send(COLDRAW_TERRAIN(start_dist,end_dist,pick),
COLDRAW_COL (terrain_texture_id, btm, top, 15) | TERRAIN);
// end of column (EOC): first word unused (0), second word is the EOC tag
col_send(0, COLDRAW_EOC);
// memory mapping
// Decodes CPU writes to memory-mapped peripherals (excerpt): a write is
// treated as a peripheral access when address bit 16 is set.
// NOTE(review): prev_mem_addr looks like a 32-bit word address (0x40010 and
// 0x40014 then fall in the case below) -- confirm against the CPU bus.
if ((prev_mem_wenable != 0) & prev_mem_addr[16,1]) {
// address bits 2..5 select the peripheral
switch (prev_mem_addr[2,4]) {
case 4b0001: {
// column drawer: address bit 0 selects which of the two command words
if (prev_mem_addr[0,1]) {
// received COLDRAW0
// only latched here; the command is issued when COLDRAW1 is received
coldraw.in_tex0 = prev_mem_wdata[0,32];
} else {
// received COLDRAW1
// span start: bits 10..17, span end: bits 18..25, end-of-column tag: bit 9
uint8 start <: prev_mem_wdata[10,8];
uint8 end <: prev_mem_wdata[18,8];
uint1 empty <: start == end;
uint1 eoc <: prev_mem_wdata[9,1];
// send segment to drawer
coldraw.in_tex1 = prev_mem_wdata[0,32];
// empty spans are dropped here, unless they carry the end-of-column tag
coldraw.in_ready = ~empty | eoc; // not null or eoc tag
}
}
// ..
// Column drawer (excerpt; elided parts are marked '// ..').
// Queues incoming draw commands in a FIFO, hands them one at a time to the
// segment drawer, and starts the column sender when an end-of-column (EOC)
// command reaches the head of the FIFO.
algorithm column_drawer(
input uint1 in_ready, // pulse
input uint32 in_tex1,
input uint32 in_tex0,
output uint1 scr_send(0),
output uint17 scr_data,
input uint1 scr_full,
output uint1 fifo_empty,
output uint1 fifo_full,
output uint8 pickedh,
spiflash_user sf,
input view vw,
) <autorun> {
$$log_n_fifo = 8
$$n_fifo = 1<<log_n_fifo
// command FIFO: 2^log_n_fifo = 256 entries of 64 bits (one command each)
simple_dualport_bram uint64 fifo [$n_fifo$] =uninitialized;
// column pixel buffers, 12 bits per entry
// NOTE(review): the size 2^(doomchip_height_p2+1), together with
// draw_buffer below, suggests two column buffers used alternately -- confirm
simple_dualport_bram uint12 colbufs[$1 << (doomchip_height_p2+1)$] =uninitialized;
// ...
// draws one segment of the current column into colbufs
segment_drawer drawer<reginputs>(
colbufs <:> colbufs,
sf <:> sf,
vw <:> vw,
pickedh :> pickedh,
);
// sends a column from colbufs towards the screen (scr_send / scr_data)
column_sender sender<reginputs>(
colbufs <:> colbufs,
scr_send :> scr_send,
scr_data :> scr_data
);
always {
// ..
if (in_ready) {
// store draw command in FIFO
// (port 1 is the write side; in_tex0 goes in the upper 32 bits)
fifo.wenable1 = 1;
fifo.wdata1 = {in_tex0,in_tex1};
fifo.addr1 = fifo.addr1 + 1;
// ..
} else {
// commands are only consumed on cycles where none is being stored
if ( ~is_empty & ~drawer.busy /*..*/ ) { // process next
// a regular command starts the drawer; an EOC command starts the
// sender, but only once the sender is free
uint1 draw_seg <:: ~eoc;
uint1 send_col <:: eoc & ~sender.busy;
// ..
// draw the next segment?
drawer.in_start = draw_seg;
// send the column?
sender.in_start = send_col;
// toggle draw_buffer each time a column is sent
draw_buffer = send_col ^ draw_buffer;
// ..
// advance the read side (port 0) only if the command was consumed; an
// EOC stays at the head of the FIFO while the sender is busy
fifo.addr0 = (draw_seg | send_col) ? fifo.addr0 + 1 : fifo.addr0;
}
}
}
Responsible for drawing
All of this is interleaved in the same logic
Pipeline running in parallel
// Segment drawer (excerpt; elided parts are marked '// ..').
// Draws one segment (a vertical span of a screen column) described by the
// command words in_tex0 / in_tex1, fetching texels through the texture
// sampler while the texture coordinates are computed in parallel.
algorithm segment_drawer(
input uint1 in_start(0), // pulse
input uint32 in_tex1,
input uint32 in_tex0,
input uint1 buffer, // which buffer?
simple_dualport_bram_port1 colbufs,
output uint1 busy(0),
output uint8 pickedh,
spiflash_user sf,
input view vw,
) {
// ..
// texture sampler, reading texture data through the SPI-flash interface sf
sampler2D sampler_io;
texture_sampler sampler(sf <:> sf, smplr <:> sampler_io);
// BRAM for single column depth buffer
simple_dualport_bram uint16 depths[$doomchip_height$] = uninitialized;
// BRAM for 1/y table (flats, terrain)
// 2048 entries generated by the preprocessor: entry 0 is 65535 and
// entry hscr is round(65535/hscr) for hscr in 1..2047
bram uint16 inv_y[2048] = {
65535,
$$for hscr=1,2047 do
$math.round(65535/hscr)$,
$$end
};
// fields of the second command word:
// texture id in bits 0..7, span start in bits 10..17, span end in bits 18..25
uint8 tex_id <: in_tex1[0,8];
uint8 col_start <: in_tex1[10,8];
// the span end is clamped to the last screen row (doomchip_height-1)
uint8 col_end <: in_tex1[18,8] > 8d$doomchip_height-1$
? 8d$doomchip_height-1$ : in_tex1[18,8];
// multiply and add
// single shared a*b+c unit; a, b and c are selected per state below
int24 result <:: (a * b) + c;
// goes through transform computations
// for both flats and terrain columns
// (sampler works in parallel)
always {
// the selector is the concatenation {terrain,state}: the cases shown here
// (1 and 2) therefore have terrain == 0
switch ({terrain,state})
{
case 1: { // ---- computes v (flats)
a = __signed(inv_y.rdata); //_ 1/y_screen
b = __signed(in_tex0[0,14]); //_ *h
c = {24{1b0}};
}
case 2: { // ---- computes u (flats)
// rescale the previous product before feeding it back in
mul_d = result >>> 6;
a = __signed(mul_d); //_ h/y_screen
b = __signed(in_tex0[16,16]); //_ *x_screen
c = {24{1b0}};
}
// ..
}
// state advances by one each cycle and holds once bit 3 is set
state = state[3,1] ? state : (state+1);
if (in_start) {
// start of a new segment
end = terrain ? col_start : col_end;
current = col_start;
drawing = 1;
// bind texture
// (a bind is only requested when the texture id changes)
sampler_io.do_bind = (tex_id != sampler_io.tex_id);
sampler_io.tex_id = tex_id;
// init tc_u and tc_v
// tc_v takes 11 bits (terrain) or 8 bits (otherwise) of in_tex0 from
// bit 16, padded with zeros on the right to 19 bits
tc_u = __signed( in_tex0[24,8] );
tc_v = terrain ? __signed({in_tex0[16,11], 8b0})
: __signed({in_tex0[16, 8],11b0});
// ..
} else {
// smplr_delay is a one-hot delay line: a sample is taken as available
// once the hot bit reaches position delay_bit
if (smplr_delay[$delay_bit$,1]) {
// a texture sample is available
drawing = still_drawing;
// request the next texel if the segment is not finished
sampler_io.do_fetch = still_drawing;
// restart the transform computations for the next pixel
state = 0;
// ..
} else {
// while drawing and the sampler is ready, rotate the hot bit left by one;
// otherwise reset the delay line to its initial value (1)
smplr_delay = (drawing & sampler_io.ready)
? {smplr_delay[0,$delay_bit$],smplr_delay[$delay_bit$,1]}
: 1;
} }
Dual-core 6.25 MHz RISC-V RV32IM-ish CPU
320x240 SPI screen (LCD)
16MB SPI flash
128KB fast RAM (FPGA SPRAM)
Column drawer GPU with walls, flats, terrains
Hardware: ~ 1700 lines of Silice
Firmware: ~ 1400 lines of C
(comments and all)
MHz: 23-24/50-60 for main design, runs at 30/60 MHz
How can we know the terrain height?
We can’t from the CPU
So we ask the hardware
CPU side
// Request a height pick from the hardware: the PICK flag is only set for
// column 160 and only on a terrain command starting at distance 0.
// NOTE(review): column 160 is presumably the screen center (320-wide
// screen) and distance 0 the viewer position -- confirm.
int pick = (col == 160 && start_dist == 0) ? PICK : 0;
// terrain draw command, carrying the pick flag in its first word
col_send(
COLDRAW_TERRAIN(start_dist,end_dist,pick),
COLDRAW_COL (terrain_texture_id, btm, top, 15) | TERRAIN
);
Hardware side
// Latch the sampled texel into pickedh when a pick is requested (pickh) and
// none has been captured yet (pickh_done clear); otherwise keep the value.
// NOTE(review): pickh_done is presumably cleared in code not shown here.
pickedh = pickh & ~pickh_done ? sampler_io.texel : pickedh;
pickh_done = 1;
Icarus Verilog and Verilator
SPI flash could run at 100 MHz (texturing x2!)
Sprites …
Fixed point is not robust everywhere
API, documentation
So stay tuned! Follow @sylefeb