Remove soft link to libdvi (copy full directory instead)

For Arduino Library Manager compliance
This commit is contained in:
Phillip Burgess 2023-03-09 15:00:54 -08:00
parent 506fca674a
commit bb7dc7c20d
20 changed files with 2661 additions and 3 deletions

View file

@ -19,8 +19,7 @@ RP2040 core).
Changes vs main PicoDVI repo: Changes vs main PicoDVI repo:
- Add library.properties file, src and examples directories per Arduino - Add library.properties file, src and examples directories per Arduino
requirements. requirements.
- software/libdvi is soft-linked into src so Arduino IDE can compile these - A full copy of software/libdvi is made in src (originally was soft-linked but Arduino Library Manager does not approve). If any updates are made in the original PicoDVI libdvi directory, copy them here!
parts.
- The file dvi_serialiser.pio.h, normally not part of the distribution and - The file dvi_serialiser.pio.h, normally not part of the distribution and
generated during the Pico SDK build process, is provided here for Arduino generated during the Pico SDK build process, is provided here for Arduino
build to work. If any changes are made in dvi_serialiser.pio (either here build to work. If any changes are made in dvi_serialiser.pio (either here

View file

@ -1 +0,0 @@
../software/libdvi

33
src/libdvi/CMakeLists.txt Normal file
View file

@ -0,0 +1,33 @@
# CMake definition of libdvi for Pico SDK (non-Arduino) builds.
# Note we are using INTERFACE so that the library can be configured per-app
# with compile-time defines
add_library(libdvi INTERFACE)
# Sources are attached to the INTERFACE target, so they are compiled as part
# of whichever target links libdvi and therefore see that target's defines.
target_sources(libdvi INTERFACE
${CMAKE_CURRENT_LIST_DIR}/dvi.c
${CMAKE_CURRENT_LIST_DIR}/dvi.h
${CMAKE_CURRENT_LIST_DIR}/dvi_config_defs.h
${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.c
${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.h
${CMAKE_CURRENT_LIST_DIR}/dvi_timing.c
${CMAKE_CURRENT_LIST_DIR}/dvi_timing.h
${CMAKE_CURRENT_LIST_DIR}/tmds_encode.S
${CMAKE_CURRENT_LIST_DIR}/tmds_encode.c
${CMAKE_CURRENT_LIST_DIR}/tmds_encode.h
${CMAKE_CURRENT_LIST_DIR}/tmds_table.h
${CMAKE_CURRENT_LIST_DIR}/tmds_table_fullres.h
${CMAKE_CURRENT_LIST_DIR}/util_queue_u32_inline.h
)
# Lets consumers include the libdvi headers without a directory prefix.
target_include_directories(libdvi INTERFACE ${CMAKE_CURRENT_LIST_DIR})
# Pico SDK libraries the sources above depend on.
target_link_libraries(libdvi INTERFACE
pico_base_headers
pico_util
hardware_dma
hardware_interp
hardware_pio
hardware_pwm
)
# Generate the *.pio.h headers from the PIO programs as part of the build.
pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.pio)
pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/tmds_encode_1bpp.pio)

255
src/libdvi/dvi.c Normal file
View file

@ -0,0 +1,255 @@
#include <stdlib.h>
#include "hardware/dma.h"
#include "hardware/irq.h"
#include "dvi.h"
#include "dvi_timing.h"
#include "dvi_serialiser.h"
#include "tmds_encode.h"
// Adafruit PicoDVI fork requires a couple global items run-time configurable:
// Number of times each TMDS buffer is shown before it is released. Used as a
// modulus in the DMA IRQ handler, so it must never be set to 0.
uint8_t dvi_vertical_repeat = DVI_VERTICAL_REPEAT;
// When true, dvi_init() allocates TMDS buffers sized for a single lane rather
// than three.
bool dvi_monochrome_tmds = DVI_MONOCHROME_TMDS;
// Time-critical functions pulled into RAM but each in a unique section to
// allow garbage collection
#define __dvi_func(f) __not_in_flash_func(f)
// As above, but places the function in a scratch_x section named after it.
#define __dvi_func_x(f) __scratch_x(__STRING(f)) f
// We require exclusive use of a DMA IRQ line. (you wouldn't want to share
// anyway). It's possible in theory to hook both IRQs and have two DVI outs.
// Index 0 is the instance served by DMA_IRQ_0, index 1 by DMA_IRQ_1.
static struct dvi_inst *dma_irq_privdata[2];
// Per-IRQ-line entry points, defined at the end of this file.
static void dvi_dma0_irq();
static void dvi_dma1_irq();
// Set up one DVI instance: timing state, serialiser, DMA channels, queues,
// DMA control block lists and (if DVI_N_TMDS_BUFFERS > 0) the TMDS buffers.
// inst->timing and inst->ser_cfg must already be filled in by the caller.
// The two spinlock numbers are shared by the TMDS queue pair and the colour
// queue pair respectively.
void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue) {
    dvi_timing_state_init(&inst->timing_state);
    dvi_serialiser_init(&inst->ser_cfg);

    // Each lane gets a control channel and a data channel, both feeding the
    // TX FIFO of that lane's PIO state machine.
    for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
        struct dvi_lane_dma_cfg *lane_cfg = &inst->dma_cfg[lane];
        lane_cfg->chan_ctrl = dma_claim_unused_channel(true);
        lane_cfg->chan_data = dma_claim_unused_channel(true);
        lane_cfg->tx_fifo = (void*)&inst->ser_cfg.pio->txf[inst->ser_cfg.sm_tmds[lane]];
        lane_cfg->dreq = pio_get_dreq(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[lane], true);
    }

    inst->late_scanline_ctr = 0;
    inst->tmds_buf_release_next = NULL;
    inst->tmds_buf_release = NULL;

    // All queues hold pointers and are 8 entries deep.
    queue_init_with_spinlock(&inst->q_tmds_valid, sizeof(void*), 8, spinlock_tmds_queue);
    queue_init_with_spinlock(&inst->q_tmds_free, sizeof(void*), 8, spinlock_tmds_queue);
    queue_init_with_spinlock(&inst->q_colour_valid, sizeof(void*), 8, spinlock_colour_queue);
    queue_init_with_spinlock(&inst->q_colour_free, sizeof(void*), 8, spinlock_colour_queue);

    // Pre-build the four DMA lists. The active list gets a placeholder data
    // pointer (SRAM_BASE); the IRQ handler patches in the real buffer each
    // line. The error list is built with a NULL buffer.
    dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, true, &inst->dma_list_vblank_sync);
    dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, false, &inst->dma_list_vblank_nosync);
#if defined(ARDUINO)
    dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (uint32_t*)SRAM_BASE, &inst->dma_list_active);
#else
    dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (void*)SRAM_BASE, &inst->dma_list_active);
#endif
    dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, NULL, &inst->dma_list_error);

    for (int n = 0; n < DVI_N_TMDS_BUFFERS; ++n) {
        // Monochrome mode stores one lane's worth of symbols, colour stores three.
        const int lanes_stored = dvi_monochrome_tmds ? 1 : 3;
        void *tmdsbuf = malloc(lanes_stored * inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t));
        if (!tmdsbuf)
            panic("TMDS buffer allocation failed");
        queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
    }
}
// Hook the DVI DMA interrupt on the calling core (which is why this is kept
// separate from dvi_init). irq_num selects the line: DMA_IRQ_0 uses INTE0 and
// handler slot 0; any other value is treated as DMA_IRQ_1.
void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num) {
    const uint32_t sync_mask = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data;
    uint32_t owned_mask = 0;
    for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
        owned_mask |= 1u << inst->dma_cfg[lane].chan_ctrl;
        owned_mask |= 1u << inst->dma_cfg[lane].chan_data;
    }

    // Drop any stale pending flag for the sync lane's data channel.
    dma_hw->ints0 = sync_mask;

    // Of all our channels, only the sync lane's data channel may raise the IRQ.
    const bool use_irq0 = (irq_num == DMA_IRQ_0);
    if (use_irq0)
        hw_write_masked(&dma_hw->inte0, sync_mask, owned_mask);
    else
        hw_write_masked(&dma_hw->inte1, sync_mask, owned_mask);

    // Publish the instance before the handler can possibly run.
    dma_irq_privdata[use_irq0 ? 0 : 1] = inst;
    if (use_irq0)
        irq_set_exclusive_handler(DMA_IRQ_0, dvi_dma0_irq);
    else
        irq_set_exclusive_handler(DMA_IRQ_1, dvi_dma1_irq);

    irq_set_enabled(irq_num, true);
}
// Point each lane's control channel at that lane's slice of the list `l`.
// The control channel writes into its data channel's register block but is
// NOT triggered here; that happens via the data channel's CHAIN_TO or via an
// initial write to MULTI_CHAN_TRIGGER.
static inline void __attribute__((always_inline)) _dvi_load_dma_op(const struct dvi_lane_dma_cfg dma_cfg[], struct dvi_scanline_dma_list *l) {
    for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
        dma_channel_config ctrl_cfg = dma_channel_get_default_config(dma_cfg[lane].chan_ctrl);
        // Write address wraps every 16 bytes (2^4) so each 4-word block lands
        // on the same four data-channel registers.
        channel_config_set_ring(&ctrl_cfg, true, 4);
        channel_config_set_read_increment(&ctrl_cfg, true);
        channel_config_set_write_increment(&ctrl_cfg, true);

        volatile void *write_target = &dma_hw->ch[dma_cfg[lane].chan_data];
        const volatile void *read_source = dvi_lane_from_list(l, lane);
        // 4 transfers: configure all 4 registers, then halt until next CHAIN_TO.
        dma_channel_configure(dma_cfg[lane].chan_ctrl, &ctrl_cfg, write_target, read_source, 4, false);
    }
}
// Load the first control block lists (vblank, no sync) into the control
// channels and trigger them. From then on the control channels are triggered
// only by DMA CHAIN_TO when a data channel completes. The IRQ handler *must*
// already be registered (dvi_register_irqs_this_core(), on either DMA IRQ
// line) before calling this.
void dvi_start(struct dvi_inst *inst) {
    _dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);

    // Start every lane's control channel in one write so the lanes stay
    // aligned. Built from N_TMDS_LANES rather than hard-coded indices, matching
    // the other per-lane loops in this file.
    uint32_t ctrl_chan_mask = 0;
    for (int i = 0; i < N_TMDS_LANES; ++i)
        ctrl_chan_mask |= 1u << inst->dma_cfg[i].chan_ctrl;
    dma_start_channel_mask(ctrl_chan_mask);

    // We really don't want the FIFOs to bottom out, so wait for full before
    // starting the shift-out.
    for (int i = 0; i < N_TMDS_LANES; ++i)
        while (!pio_sm_is_tx_fifo_full(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i]))
            tight_loop_contents();

    dvi_serialiser_enable(&inst->ser_cfg, true);
}
static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *inst, uint32_t *scanbuf) {
uint32_t *tmdsbuf;
queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
uint pixwidth = inst->timing->h_active_pixels;
uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
// Scanline buffers are half-resolution; the functions take the number of *input* pixels as parameter.
tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_8BPP_BLUE_MSB, DVI_8BPP_BLUE_LSB );
tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_8BPP_GREEN_MSB, DVI_8BPP_GREEN_LSB);
tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_8BPP_RED_MSB, DVI_8BPP_RED_LSB );
queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
}
static inline void __dvi_func_x(_dvi_prepare_scanline_16bpp)(struct dvi_inst *inst, uint32_t *scanbuf) {
uint32_t *tmdsbuf;
queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
uint pixwidth = inst->timing->h_active_pixels;
uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_16BPP_BLUE_MSB, DVI_16BPP_BLUE_LSB );
tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_16BPP_GREEN_MSB, DVI_16BPP_GREEN_LSB);
tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_16BPP_RED_MSB, DVI_16BPP_RED_LSB );
queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
}
// TMDS encode "worker thread", 8bpp flavour. The calling core never returns
// from here but keeps servicing IRQs. Each q_colour_valid record is a single
// scanline: encode it, then hand the scanline buffer back on q_colour_free.
void __dvi_func(dvi_scanbuf_main_8bpp)(struct dvi_inst *inst) {
    uint line = 0;
    for (;;) {
        uint32_t *scanbuf;
        queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf);
        _dvi_prepare_scanline_8bpp(inst, scanbuf);
        queue_add_blocking_u32(&inst->q_colour_free, &scanbuf);
        // Track position within the frame, wrapping at the last active line.
        if (++line == inst->timing->v_active_lines)
            line = 0;
    }
    __builtin_unreachable();
}
// 16bpp twin of dvi_scanbuf_main_8bpp. Deliberately duplicated rather than
// shared, so that the unused TMDS encoder can be garbage-collected out of
// .scratch_x.
void __dvi_func(dvi_scanbuf_main_16bpp)(struct dvi_inst *inst) {
    uint line = 0;
    for (;;) {
        uint32_t *scanbuf;
        queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf);
        _dvi_prepare_scanline_16bpp(inst, scanbuf);
        queue_add_blocking_u32(&inst->q_colour_free, &scanbuf);
        // Track position within the frame, wrapping at the last active line.
        if (++line == inst->timing->v_active_lines)
            line = 0;
    }
    __builtin_unreachable();
}
// Common body of both DMA IRQ entry points. Advances the vertical timing
// state, recycles TMDS buffers that are no longer in flight, picks the TMDS
// buffer (if any) for the upcoming scanline and reloads the control channels
// with the matching DMA list.
static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) {
// Every fourth interrupt marks the start of the horizontal active region. We
// now have until the end of this region to generate DMA blocklist for next
// scanline.
dvi_timing_state_advance(inst->timing, &inst->timing_state);
// Two-stage release pipeline: a buffer retired on the previous IRQ is only
// returned to the free queue now, one IRQ later.
if (inst->tmds_buf_release && !queue_try_add_u32(&inst->q_tmds_free, &inst->tmds_buf_release))
panic("TMDS free queue full in IRQ!");
inst->tmds_buf_release = inst->tmds_buf_release_next;
inst->tmds_buf_release_next = NULL;
// Make sure all three channels have definitely loaded their last block
// (should be within a few cycles of one another)
for (int i = 0; i < N_TMDS_LANES; ++i) {
while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
tight_loop_contents();
}
uint32_t *tmdsbuf;
// Catch up after underruns: discard one late-arriving buffer per scanline
// that was previously missed.
while (inst->late_scanline_ctr > 0 && queue_try_remove_u32(&inst->q_tmds_valid, &tmdsbuf)) {
// If we displayed this buffer then it would be in the wrong vertical
// position on-screen. Just pass it back.
queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
--inst->late_scanline_ctr;
}
if (inst->timing_state.v_state != DVI_STATE_ACTIVE) {
// Don't care
tmdsbuf = NULL;
}
else if (queue_try_peek_u32(&inst->q_tmds_valid, &tmdsbuf)) {
// Only dequeue on the last repeat of this buffer; earlier repeats just
// peek so the same buffer is shown again on the next line.
if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) {
queue_remove_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
inst->tmds_buf_release_next = tmdsbuf;
}
}
else {
// No valid scanline was ready (generates solid red scanline)
tmdsbuf = NULL;
// Count the miss once per source line, i.e. on the last repeat only.
if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1)
++inst->late_scanline_ctr;
}
switch (inst->timing_state.v_state) {
case DVI_STATE_ACTIVE:
if (tmdsbuf) {
// Patch the buffer address into the active list, then load it.
dvi_update_scanline_data_dma(inst->timing, tmdsbuf, &inst->dma_list_active);
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_active);
}
else {
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_error);
}
// Optional user hook, invoked once per source scanline (last repeat) and
// only if a callback has been set.
if (inst->scanline_callback && inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) {
inst->scanline_callback();
}
break;
case DVI_STATE_SYNC:
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_sync);
break;
default:
// Every other vertical state uses the blanking list without vsync.
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);
break;
}
}
// DMA_IRQ_0 entry point: acknowledge the sync lane's data channel on INTS0,
// then run the shared handler for the instance registered in slot 0.
static void __dvi_func(dvi_dma0_irq)() {
    struct dvi_inst *dvi = dma_irq_privdata[0];
    const uint32_t sync_chan_bit = 1u << dvi->dma_cfg[TMDS_SYNC_LANE].chan_data;
    dma_hw->ints0 = sync_chan_bit;
    dvi_dma_irq_handler(dvi);
}
// DMA_IRQ_1 entry point: acknowledge the sync lane's data channel on INTS1,
// then run the shared handler for the instance registered in slot 1.
static void __dvi_func(dvi_dma1_irq)() {
    struct dvi_inst *dvi = dma_irq_privdata[1];
    const uint32_t sync_chan_bit = 1u << dvi->dma_cfg[TMDS_SYNC_LANE].chan_data;
    dma_hw->ints1 = sync_chan_bit;
    dvi_dma_irq_handler(dvi);
}

81
src/libdvi/dvi.h Normal file
View file

@ -0,0 +1,81 @@
#ifndef _DVI_H
#define _DVI_H
#define N_TMDS_LANES 3
#define TMDS_SYNC_LANE 0 // blue!
#include "pico/util/queue.h"
#include "dvi_config_defs.h"
#include "dvi_timing.h"
#include "dvi_serialiser.h"
#include "util_queue_u32_inline.h"
// Signature of the optional per-scanline hook stored in struct dvi_inst.
typedef void (*dvi_callback_t)(void);
// All state for one DVI output. The "Config" fields timing and ser_cfg are
// read by dvi_init() and must be filled in by the caller first.
struct dvi_inst {
// Config ---
// Mode timing table in use.
const struct dvi_timing *timing;
// Per-lane DMA channel numbers, FIFO address and DREQ; written by dvi_init().
struct dvi_lane_dma_cfg dma_cfg[N_TMDS_LANES];
struct dvi_timing_state timing_state;
struct dvi_serialiser_cfg ser_cfg;
// Called in the DMA IRQ once per scanline -- careful with the run time!
// May be NULL (the IRQ handler checks before calling). dvi_init() does not
// assign this field, so it must be NULL or valid before output starts.
dvi_callback_t scanline_callback;
// State ---
// Pre-built DMA lists: blanking with/without vsync, active video, and the
// fallback list used when no TMDS buffer is ready.
struct dvi_scanline_dma_list dma_list_vblank_sync;
struct dvi_scanline_dma_list dma_list_vblank_nosync;
struct dvi_scanline_dma_list dma_list_active;
struct dvi_scanline_dma_list dma_list_error;
// After a TMDS buffer has been enqueued via a control block for the last
// time, two IRQs must go by before freeing. The first indicates the control
// block for this buf has been loaded, and the second occurs some time after
// the actual data DMA transfer has completed.
uint32_t *tmds_buf_release_next;
uint32_t *tmds_buf_release;
// Remember how far behind the source is on TMDS scanlines, so we can output
// solid colour until they catch up (rather than dying spectacularly)
uint late_scanline_ctr;
// Encoded scanlines:
queue_t q_tmds_valid;
queue_t q_tmds_free;
// Either scanline buffers or frame buffers:
queue_t q_colour_valid;
queue_t q_colour_free;
};
// Public libdvi entry points, given C linkage so C++ (e.g. Arduino) code can
// call them.
#if defined(__cplusplus)
extern "C"
{
#endif
// Set up data structures and hardware for DVI.
// The two spinlock numbers are used for the TMDS and colour queue pairs.
void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue);
// Call this after calling dvi_init(). DVI DMA interrupts will be routed to
// whichever core called this function. Registers an exclusive IRQ handler.
// irq_num is DMA_IRQ_0 or DMA_IRQ_1.
void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num);
// Start actually wiggling TMDS pairs. Call this once you have initialised the
// DVI, have registered the IRQs, and are producing rendered scanlines.
void dvi_start(struct dvi_inst *inst);
// TMDS encode worker function: core enters and doesn't leave, but still
// responds to IRQs. Repeatedly pop a scanline buffer from q_colour_valid,
// TMDS encode it, and pass it to the tmds valid queue.
void dvi_scanbuf_main_8bpp(struct dvi_inst *inst);
void dvi_scanbuf_main_16bpp(struct dvi_inst *inst);
// Same as above, but each q_colour_valid entry is a framebuffer
// NOTE(review): no definition of the framebuf variants is visible in dvi.c;
// confirm they exist elsewhere before calling them.
void dvi_framebuf_main_8bpp(struct dvi_inst *inst);
void dvi_framebuf_main_16bpp(struct dvi_inst *inst);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,151 @@
#ifndef _DVI_CONFIG_DEFS_H
#define _DVI_CONFIG_DEFS_H
// Compile-time configuration definitions for libdvi. This file provides
// defaults -- you can override using a board header, or setting compile
// definitions directly from the commandline (e.g. using CMake
// target_compile_definitions())
// Pull in base headers to make sure board definitions override the
// definitions provided here. Note this file is included in asm and C.
#include "hardware/platform_defs.h"
#include "pico/config.h"
// ----------------------------------------------------------------------------
// General DVI defines
// How many times to output the same TMDS buffer before recycling it onto the
// free queue. Pixels are repeated vertically if this is >1.
// In this fork it is only the initial value of the run-time variable
// dvi_vertical_repeat (see dvi.c).
#ifndef DVI_VERTICAL_REPEAT
#define DVI_VERTICAL_REPEAT 2
#endif
// Number of TMDS buffers to allocate (malloc()) in DVI init. You can set this
// to 0 if you want to allocate your own (e.g. if you want static buffers)
#ifndef DVI_N_TMDS_BUFFERS
#define DVI_N_TMDS_BUFFERS 3
#endif
// If 1, replace the DVI serialiser with a 10n1 UART (1 start bit, 10 data
// bits, 1 stop bit) so the stream can be dumped and analysed easily.
#ifndef DVI_SERIAL_DEBUG
#define DVI_SERIAL_DEBUG 0
#endif
// If 1, the same TMDS symbols are sent to all 3 lanes during the horizontal
// active period. This means only monochrome colour is available, but the TMDS
// buffers are 3 times smaller as a result, and the performance requirements
// for encode are also cut by 3.
// In this fork it is only the initial value of the run-time variable
// dvi_monochrome_tmds (see dvi.c).
#ifndef DVI_MONOCHROME_TMDS
#define DVI_MONOCHROME_TMDS 0
#endif
// By default, we assume each 32-bit word written to a PIO FIFO contains 2x
// 10-bit TMDS symbols, concatenated into the lower 20 bits, least-significant
// first. This is convenient if you are generating two or more pixels at once,
// e.g. using the pixel-doubling TMDS encode. You can change this value to 1
// (so each word contains 1 symbol) for e.g. full resolution RGB encode. Note
// that this value needs to divide the DVI horizontal timings, so is limited
// to 1 or 2.
#ifndef DVI_SYMBOLS_PER_WORD
#define DVI_SYMBOLS_PER_WORD 2
#endif
// Reject any other value at compile time.
#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
#endif
// ----------------------------------------------------------------------------
// Pixel component layout
// By default we go R, G, B from MSB -> LSB. Override to e.g. swap RGB <-> BGR
// Each component is given as an inclusive MSB..LSB bit range within the pixel.
// Default 8bpp layout: RGB332, {r[2:0], g[2:0], b[1:0]}
#ifndef DVI_8BPP_RED_MSB
#define DVI_8BPP_RED_MSB 7
#endif
#ifndef DVI_8BPP_RED_LSB
#define DVI_8BPP_RED_LSB 5
#endif
#ifndef DVI_8BPP_GREEN_MSB
#define DVI_8BPP_GREEN_MSB 4
#endif
#ifndef DVI_8BPP_GREEN_LSB
#define DVI_8BPP_GREEN_LSB 2
#endif
#ifndef DVI_8BPP_BLUE_MSB
#define DVI_8BPP_BLUE_MSB 1
#endif
#ifndef DVI_8BPP_BLUE_LSB
#define DVI_8BPP_BLUE_LSB 0
#endif
// Default 16bpp layout: RGB565, {r[4:0], g[5:0], b[4:0]}
#ifndef DVI_16BPP_RED_MSB
#define DVI_16BPP_RED_MSB 15
#endif
#ifndef DVI_16BPP_RED_LSB
#define DVI_16BPP_RED_LSB 11
#endif
#ifndef DVI_16BPP_GREEN_MSB
#define DVI_16BPP_GREEN_MSB 10
#endif
#ifndef DVI_16BPP_GREEN_LSB
#define DVI_16BPP_GREEN_LSB 5
#endif
#ifndef DVI_16BPP_BLUE_MSB
#define DVI_16BPP_BLUE_MSB 4
#endif
#ifndef DVI_16BPP_BLUE_LSB
#define DVI_16BPP_BLUE_LSB 0
#endif
// Default 1bpp layout: bitwise little-endian, i.e. least significant bit of
// each word is the first (leftmost) of a block of 32 pixels.
// If 1, reverse the order of pixels within each byte. Order of bytes within
// each word is still little-endian.
// Note this fork defaults the switch to 1, so the reversed order is what you
// get unless it is overridden.
#ifndef DVI_1BPP_BIT_REVERSE
#define DVI_1BPP_BIT_REVERSE 1 // Adafruit_GFX GFXcanvas1 requires this 1
#endif
// ----------------------------------------------------------------------------
// TMDS encode controls
// These tune the TMDS encode routines, which are not part of this header.
// Number of TMDS loop bodies between branches. cmp + branch costs 3 cycles,
// so you can easily save 10% of encode time by bumping this. Note that body
// will *already* produce multiple pixels, and total symbols per iteration
// must cleanly divide symbols per scanline, else the loop won't terminate.
// Point gun away from foot.
#ifndef TMDS_ENCODE_UNROLL
#define TMDS_ENCODE_UNROLL 1
#endif
// If 1, don't save/restore the interpolators on full-resolution TMDS encode.
// Speed hack. The TMDS code uses both interpolators, for each of the 3 data
// channels, so this define avoids 6 save/restores per scanline.
#ifndef TMDS_FULLRES_NO_INTERP_SAVE
#define TMDS_FULLRES_NO_INTERP_SAVE 0
#endif
// If 1, don't DC-balance the output of full resolution encode. Hilariously
// noncompliant, but Dell Ultrasharp -- the honey badger of computer monitors
// -- does not seem to mind (it helps that we DC-couple). Another speed hack,
// useful when you are trying to get everything else up to speed.
#ifndef TMDS_FULLRES_NO_DC_BALANCE
#define TMDS_FULLRES_NO_DC_BALANCE 0
#endif
#endif

View file

@ -0,0 +1,73 @@
#include "pico.h"
#include "hardware/pio.h"
#include "hardware/gpio.h"
#include "hardware/pwm.h"
#include "hardware/structs/padsbank0.h"
#include "dvi.h"
#include "dvi_serialiser.h"
#include "dvi_serialiser.pio.h"
// Configure the pad of one output GPIO and optionally invert its output.
// Clears the drive-strength, fast-slew and input-enable fields in one masked
// write: 2 mA drive with slew rate limiting (this seems fine even at 720p30,
// and the 3V3 LDO doesn't get warm like when turning all the GPIOs up to 11),
// and the digital receiver disabled.
static void dvi_configure_pad(uint gpio, bool invert) {
    const uint32_t pad_fields = PADS_BANK0_GPIO0_DRIVE_BITS | PADS_BANK0_GPIO0_SLEWFAST_BITS | PADS_BANK0_GPIO0_IE_BITS;
    const uint32_t pad_values = (0 << PADS_BANK0_GPIO0_DRIVE_LSB);
    hw_write_masked(&padsbank0_hw->io[gpio], pad_values, pad_fields);

    if (invert)
        gpio_set_outover(gpio, GPIO_OVERRIDE_INVERT);
    else
        gpio_set_outover(gpio, GPIO_OVERRIDE_NORMAL);
}
// Load the serialiser PIO program, claim and configure one state machine per
// TMDS lane, and prepare (but do not start) the PWM slice that generates the
// pixel clock pair. cfg->pio, sm_tmds, pins_tmds, pins_clk and
// invert_diffpairs are inputs; cfg->prog_offs is written here.
void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg) {
#if DVI_SERIAL_DEBUG
    const struct pio_program *program = &dvi_serialiser_debug_program;
#else
    const struct pio_program *program = &dvi_serialiser_program;
#endif
    uint offset = pio_add_program(cfg->pio, program);
    cfg->prog_offs = offset;

    for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
        pio_sm_claim(cfg->pio, cfg->sm_tmds[lane]);
        dvi_serialiser_program_init(cfg->pio, cfg->sm_tmds[lane], offset, cfg->pins_tmds[lane], DVI_SERIAL_DEBUG);
        // Each lane drives two adjacent GPIOs (the differential pair).
        dvi_configure_pad(cfg->pins_tmds[lane], cfg->invert_diffpairs);
        dvi_configure_pad(cfg->pins_tmds[lane] + 1, cfg->invert_diffpairs);
    }

    // Use a PWM slice to drive the pixel clock. Both GPIOs must be on the same
    // slice (lower-numbered GPIO must be even).
    assert(cfg->pins_clk % 2 == 0);
    uint slice = pwm_gpio_to_slice_num(cfg->pins_clk);

    // 5 cycles high, 5 low. Invert one channel so that we get complementary
    // outputs.
    pwm_config clk_cfg = pwm_get_default_config();
    pwm_config_set_output_polarity(&clk_cfg, true, false);
    pwm_config_set_wrap(&clk_cfg, 9);
    pwm_init(slice, &clk_cfg, false);
    pwm_set_both_levels(slice, 5, 5);

    for (uint k = 0; k < 2; ++k) {
        uint clk_pin = cfg->pins_clk + k;
        gpio_set_function(clk_pin, GPIO_FUNC_PWM);
        dvi_configure_pad(clk_pin, cfg->invert_diffpairs);
    }
}
// Start or stop all TMDS lane state machines with a single CTRL register
// write, then switch the pixel clock PWM slice to match.
void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable) {
    uint sm_enable_bits = 0;
    for (int lane = 0; lane < N_TMDS_LANES; ++lane)
        sm_enable_bits |= 1u << (cfg->sm_tmds[lane] + PIO_CTRL_SM_ENABLE_LSB);

    if (enable)
        hw_set_bits(&cfg->pio->ctrl, sm_enable_bits);
    else
        hw_clear_bits(&cfg->pio->ctrl, sm_enable_bits);

    pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), enable);
}

View file

@ -0,0 +1,22 @@
#ifndef _DVI_SERIALISER_H
#define _DVI_SERIALISER_H
#include "hardware/pio.h"
#include "dvi_config_defs.h"
#define N_TMDS_LANES 3
// Pin and PIO assignment for the TMDS serialiser. Everything except prog_offs
// is supplied by the caller before dvi_serialiser_init().
struct dvi_serialiser_cfg {
// PIO block that runs the serialiser program.
PIO pio;
// State machine index used for each TMDS lane.
uint sm_tmds[N_TMDS_LANES];
// First GPIO of each lane's pair; the lane also uses the next GPIO up.
uint pins_tmds[N_TMDS_LANES];
// First GPIO of the clock pair. Must be even (asserted in
// dvi_serialiser_init) so both pins share one PWM slice.
uint pins_clk;
// If true, the output of every DVI pad is inverted.
bool invert_diffpairs;
// Program load offset, written by dvi_serialiser_init().
uint prog_offs;
};
// The serialiser functions are implemented in C, so give them C linkage when
// this header is pulled into a C++ translation unit (dvi.h includes it before
// opening its own extern "C" block). Nesting inside another extern "C" is
// harmless.
#if defined(__cplusplus)
extern "C"
{
#endif
// Load the PIO program and configure state machines, pads and the clock PWM.
// Nothing is started until dvi_serialiser_enable().
void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg);
// Start (enable = true) or stop the lane state machines and the pixel clock.
void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable);
// NOTE(review): not defined in dvi_serialiser.c as seen here; confirm where
// (or whether) this is implemented.
uint32_t dvi_single_to_diff(uint32_t in);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,53 @@
; TMDS serialiser. Each instruction shifts one bit from the OSR into the
; program counter, so the bit value selects which of the two instructions runs
; next. The program must therefore sit at address 0 (.origin 0). The
; instruction at address 0 drives the two side-set pins to 0b10 and the one at
; address 1 drives them to 0b01.
.program dvi_serialiser
.side_set 2
.origin 0
; Single-ended -> differential serial
out pc, 1 side 0b10
out pc, 1 side 0b01
; Debug replacement for the serialiser, selected with DVI_SERIAL_DEBUG.
.program dvi_serialiser_debug
.side_set 1 opt
; The debug variant behaves as a UART with 1 start bit, 10 data bits, 1 stop
; bit, and 5/6ths the data throughput of the TMDS version.
pull ifempty side 1 ; Extend stop bit with FIFO stall
nop side 0 ; Start bit
out pins, 1 ; Unrolled because we require 1 bit / clk
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
% c-sdk {
#include "dvi_config_defs.h"
// Configure state machine `sm` of `pio` to run the serialiser (or, if `debug`
// is true, the UART debug program) loaded at `offset`, driving GPIOs
// data_pins and data_pins + 1. The state machine is left disabled.
static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
// Preset the pair to 0b10 and make both pins outputs before handing them to
// the PIO.
pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
pio_gpio_init(pio, data_pins);
pio_gpio_init(pio, data_pins + 1);
pio_sm_config c;
if (debug) {
c = dvi_serialiser_debug_program_get_default_config(offset);
}
else {
c = dvi_serialiser_program_get_default_config(offset);
}
sm_config_set_sideset_pins(&c, data_pins);
// Only the debug program uses OUT PINS.
if (debug)
sm_config_set_out_pins(&c, data_pins, 1);
// Shift right; autopull only for the non-debug program (the debug program
// pulls explicitly); threshold is 10 bits per symbol in the word.
sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
pio_sm_init(pio, sm, offset, &c);
pio_sm_set_enabled(pio, sm, false);
}
%}

View file

@ -0,0 +1,101 @@
// -------------------------------------------------- //
// This file is autogenerated by pioasm; do not edit! //
// -------------------------------------------------- //
#pragma once
#if !PICO_NO_HARDWARE
#include "hardware/pio.h"
#endif
// -------------- //
// dvi_serialiser //
// -------------- //
// Wrap range and encoded instructions of the dvi_serialiser program
// (generated from dvi_serialiser.pio; regenerate rather than hand-edit).
#define dvi_serialiser_wrap_target 0
#define dvi_serialiser_wrap 1
static const uint16_t dvi_serialiser_program_instructions[] = {
// .wrap_target
0x70a1, // 0: out pc, 1 side 2
0x68a1, // 1: out pc, 1 side 1
// .wrap
};
#if !PICO_NO_HARDWARE
// Program descriptor passed to pio_add_program(). origin = 0 pins the program
// to instruction address 0, matching ".origin 0" in the .pio source.
static const struct pio_program dvi_serialiser_program = {
.instructions = dvi_serialiser_program_instructions,
.length = 2,
.origin = 0,
};
// Default state machine config for dvi_serialiser loaded at `offset`: wrap
// range relocated by the offset, and 2 mandatory side-set bits.
static inline pio_sm_config dvi_serialiser_program_get_default_config(uint offset) {
pio_sm_config c = pio_get_default_sm_config();
sm_config_set_wrap(&c, offset + dvi_serialiser_wrap_target, offset + dvi_serialiser_wrap);
sm_config_set_sideset(&c, 2, false, false);
return c;
}
#endif
// -------------------- //
// dvi_serialiser_debug //
// -------------------- //
// Wrap range and encoded instructions of the dvi_serialiser_debug program
// (generated from dvi_serialiser.pio; regenerate rather than hand-edit).
#define dvi_serialiser_debug_wrap_target 0
#define dvi_serialiser_debug_wrap 11
static const uint16_t dvi_serialiser_debug_program_instructions[] = {
// .wrap_target
0x98e0, // 0: pull ifempty block side 1
0xb042, // 1: nop side 0
0x6001, // 2: out pins, 1
0x6001, // 3: out pins, 1
0x6001, // 4: out pins, 1
0x6001, // 5: out pins, 1
0x6001, // 6: out pins, 1
0x6001, // 7: out pins, 1
0x6001, // 8: out pins, 1
0x6001, // 9: out pins, 1
0x6001, // 10: out pins, 1
0x6001, // 11: out pins, 1
// .wrap
};
#if !PICO_NO_HARDWARE
// Program descriptor for the debug variant. origin = -1 because the .pio
// source gives this program no .origin directive.
static const struct pio_program dvi_serialiser_debug_program = {
.instructions = dvi_serialiser_debug_program_instructions,
.length = 12,
.origin = -1,
};
// Default state machine config for dvi_serialiser_debug loaded at `offset`.
// Side-set is optional here (".side_set 1 opt"), hence 2 bits with the
// optional flag set.
static inline pio_sm_config dvi_serialiser_debug_program_get_default_config(uint offset) {
pio_sm_config c = pio_get_default_sm_config();
sm_config_set_wrap(&c, offset + dvi_serialiser_debug_wrap_target, offset + dvi_serialiser_debug_wrap);
sm_config_set_sideset(&c, 2, true, false);
return c;
}
#include "dvi_config_defs.h"
// Copy of the c-sdk block from dvi_serialiser.pio; keep the two in step.
// Configures state machine `sm` for the serialiser (or the debug UART when
// `debug` is true) on GPIOs data_pins and data_pins + 1, leaving it disabled.
static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
// Preset the pair to 0b10 and make both pins outputs.
pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
pio_gpio_init(pio, data_pins);
pio_gpio_init(pio, data_pins + 1);
pio_sm_config c;
if (debug) {
c = dvi_serialiser_debug_program_get_default_config(offset);
}
else {
c = dvi_serialiser_program_get_default_config(offset);
}
sm_config_set_sideset_pins(&c, data_pins);
// Only the debug program uses OUT PINS.
if (debug)
sm_config_set_out_pins(&c, data_pins, 1);
// Shift right; autopull only for the non-debug program; threshold is 10 bits
// per symbol in the word.
sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
pio_sm_init(pio, sm, offset, &c);
pio_sm_set_enabled(pio, sm, false);
}
#endif

324
src/libdvi/dvi_timing.c Normal file
View file

@ -0,0 +1,324 @@
#include "dvi.h"
#include "dvi_timing.h"
#include "hardware/dma.h"
// This file contains:
// - Timing parameters for DVI modes (horizontal + vertical counts, best
// achievable bit clock from 12 MHz crystal)
// - Helper functions for generating DMA lists based on these timings
// Run-time monochrome switch (Adafruit fork).
extern bool dvi_monochrome_tmds; // In dvi.c
// Pull into RAM but apply unique section suffix to allow linker GC
#define __dvi_func(x) __not_in_flash_func(x)
// Same placement macro, applied to the constant tables below.
// NOTE(review): this reuses the *function* placement macro for data --
// presumably deliberate; confirm the resulting section is what is wanted.
#define __dvi_const(x) __not_in_flash_func(x)
// VGA -- we do this mode properly, with a pretty comfortable clk_sys (252 MHz)
// Totals including blanking: 800 pixels x 525 lines.
const struct dvi_timing __dvi_const(dvi_timing_640x480p_60hz) = {
.h_sync_polarity = false,
.h_front_porch = 16,
.h_sync_width = 96,
.h_back_porch = 48,
.h_active_pixels = 640,
.v_sync_polarity = false,
.v_front_porch = 10,
.v_sync_width = 2,
.v_back_porch = 33,
.v_active_lines = 480,
.bit_clk_khz = 252000
};
// SVGA -- completely by-the-book but requires 400 MHz clk_sys
// Totals including blanking: 1060 pixels x 628 lines.
const struct dvi_timing __dvi_const(dvi_timing_800x600p_60hz) = {
.h_sync_polarity = false,
.h_front_porch = 44,
.h_sync_width = 128,
.h_back_porch = 88,
.h_active_pixels = 800,
.v_sync_polarity = false,
.v_front_porch = 1,
.v_sync_width = 4,
.v_back_porch = 23,
.v_active_lines = 600,
.bit_clk_khz = 400000
};
// 800x480p 60 Hz (note this doesn't seem to be a CEA mode, I just used the
// output of `cvt 800 480 60`), 295 MHz bit clock
// Totals including blanking: 992 pixels x 500 lines.
const struct dvi_timing __dvi_const(dvi_timing_800x480p_60hz) = {
.h_sync_polarity = false,
.h_front_porch = 24,
.h_sync_width = 72,
.h_back_porch = 96,
.h_active_pixels = 800,
.v_sync_polarity = true,
.v_front_porch = 3,
.v_sync_width = 10,
.v_back_porch = 7,
.v_active_lines = 480,
.bit_clk_khz = 295200
};
// SVGA reduced blanking (355 MHz bit clock) -- valid CVT mode, less common
// than fully-blanked SVGA, but doesn't require such a high system clock
// Totals including blanking: 960 pixels x 618 lines. The table value is
// 354000 kHz, slightly under the 355 MHz quoted above.
const struct dvi_timing __dvi_const(dvi_timing_800x600p_reduced_60hz) = {
.h_sync_polarity = true,
.h_front_porch = 48,
.h_sync_width = 32,
.h_back_porch = 80,
.h_active_pixels = 800,
.v_sync_polarity = false,
.v_front_porch = 3,
.v_sync_width = 4,
.v_back_porch = 11,
.v_active_lines = 600,
.bit_clk_khz = 354000
};
// Also known as qHD, bit uncommon, but it's a nice modest-resolution 16:9
// aspect mode. Pixel clock 37.3 MHz
// Totals including blanking: 1104 pixels x 563 lines. The table's bit clock
// of 372000 kHz corresponds to a 37.2 MHz pixel clock.
const struct dvi_timing __dvi_const(dvi_timing_960x540p_60hz) = {
.h_sync_polarity = true,
.h_front_porch = 16,
.h_sync_width = 32,
.h_back_porch = 96,
.h_active_pixels = 960,
.v_sync_polarity = true,
.v_front_porch = 2,
.v_sync_width = 6,
.v_back_porch = 15,
.v_active_lines = 540,
.bit_clk_khz = 372000
};
// Note this is NOT the correct 720p30 CEA mode, but rather 720p60 run at half
// pixel clock. Seems to be commonly accepted (and is a valid CVT mode). The
// actual CEA mode is the same pixel clock as 720p60 but with >50% blanking,
// which would require a clk_sys of 742 MHz!
// Totals including blanking: 1650 pixels x 750 lines.
const struct dvi_timing __dvi_const(dvi_timing_1280x720p_30hz) = {
.h_sync_polarity = true,
.h_front_porch = 110,
.h_sync_width = 40,
.h_back_porch = 220,
.h_active_pixels = 1280,
.v_sync_polarity = true,
.v_front_porch = 5,
.v_sync_width = 5,
.v_back_porch = 20,
.v_active_lines = 720,
.bit_clk_khz = 372000
};
// Reduced-blanking (CVT) 720p. You aren't supposed to use reduced blanking
// modes below 60 Hz, but I won't tell anyone (and it works on the monitors
// I've tried). This nets a lower system clock than regular 720p30 (319 MHz)
// Totals including blanking: 1440 pixels x 741 lines.
const struct dvi_timing __dvi_const(dvi_timing_1280x720p_reduced_30hz) = {
.h_sync_polarity = true,
.h_front_porch = 48,
.h_sync_width = 32,
.h_back_porch = 80,
.h_active_pixels = 1280,
.v_sync_polarity = false,
.v_front_porch = 3,
.v_sync_width = 5,
.v_back_porch = 13,
.v_active_lines = 720,
.bit_clk_khz = 319200
};
// This requires a spicy 488 MHz system clock and is illegal in most countries
// (you need to have a very lucky piece of silicon to run this at 1.3 V, or
// connect an external supply and give it a bit more juice)
// Totals including blanking: 1760 pixels x 926 lines.
const struct dvi_timing __dvi_const(dvi_timing_1600x900p_reduced_30hz) = {
.h_sync_polarity = true,
.h_front_porch = 48,
.h_sync_width = 32,
.h_back_porch = 80,
.h_active_pixels = 1600,
.v_sync_polarity = false,
.v_front_porch = 3,
.v_sync_width = 5,
.v_back_porch = 18,
.v_active_lines = 900,
.bit_clk_khz = 488000
};
// ----------------------------------------------------------------------------
// The DMA scheme is:
//
// - One channel transferring data to each of the three PIO state machines
// performing TMDS serialisation
//
// - One channel programming the registers of each of these data channels,
// triggered (CHAIN_TO) each time the corresponding data channel completes
//
// - Lanes 1 and 2 have one block for blanking and one for video data
//
// - Lane 0 has one block for each horizontal region (front porch, hsync, back
// porch, active)
//
// - The IRQ_QUIET flag is used to select which data block on the sync lane is
// allowed to generate an IRQ upon completion. This is the block immediately
// before the horizontal active region. The IRQ is entered at ~the same time
// as the last data transfer starts
//
// - The IRQ points the control channels at new blocklists for next scanline.
// The DMA starts the new list automatically at end-of-scanline, via
// CHAIN_TO.
//
// The horizontal active region is the longest continuous transfer, so this
// gives the most time to handle the IRQ and load new blocklists.
//
// Note a null trigger IRQ is not suitable because we get that *after* the
// last data transfer finishes, and the FIFOs bottom out very shortly
// afterward. For pure DVI (four blocks per scanline), it works ok to take
// four regular IRQs per scanline and return early from 3 of them, but this
// breaks down when you have very short scanline sections like guard bands.
// Each symbol appears twice, concatenated in one word. Note these must be in
// RAM because they see a lot of DMA traffic
const uint32_t __dvi_const(dvi_ctrl_syms)[4] = {
0xd5354,
0x2acab,
0x55154,
0xaaeab
};
// Output solid red scanline if we are given NULL for tmdsbuf.
//
// One repeating unit per lane: a single word when two symbols are packed per
// word, or a pair of words otherwise. dvi_setup_scanline_for_active() indexes
// this table as [2 * lane / DVI_SYMBOLS_PER_WORD] and replays the unit with a
// 4-byte or 8-byte DMA read ring, which is why the one-symbol-per-word
// variant is 8-byte aligned.
#if DVI_SYMBOLS_PER_WORD == 2
static uint32_t __dvi_const(empty_scanline_tmds)[3] = {
0x7fd00u, // 0x00, 0x00
0x7fd00u, // 0x00, 0x00
0xbfa01u // 0xfc, 0xfc
};
#else
static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[6] = {
0x100u, 0x1ffu, // 0x00, 0x00
0x100u, 0x1ffu, // 0x00, 0x00
0x201u, 0x2feu // 0xfc, 0xfc
};
#endif
// Reset the vertical timing state machine: first line of the front porch.
void dvi_timing_state_init(struct dvi_timing_state *t) {
	t->v_state = DVI_STATE_FRONT_PORCH;
	t->v_ctr = 0;
}
// Step the vertical state machine by one scanline. When the line counter
// reaches the length of the current region (front porch, sync, back porch or
// active), move on to the next region -- wrapping from active back to front
// porch -- and restart the counter.
void __dvi_func(dvi_timing_state_advance)(const struct dvi_timing *t, struct dvi_timing_state *s) {
	s->v_ctr++;

	int region_done;
	switch (s->v_state) {
	case DVI_STATE_FRONT_PORCH:
		region_done = (s->v_ctr == t->v_front_porch);
		break;
	case DVI_STATE_SYNC:
		region_done = (s->v_ctr == t->v_sync_width);
		break;
	case DVI_STATE_BACK_PORCH:
		region_done = (s->v_ctr == t->v_back_porch);
		break;
	case DVI_STATE_ACTIVE:
		region_done = (s->v_ctr == t->v_active_lines);
		break;
	default:
		// Unknown state: keep counting, never transition
		region_done = 0;
		break;
	}
	if (!region_done)
		return;

	s->v_state = (s->v_state + 1) % DVI_STATE_COUNT;
	s->v_ctr = 0;
}
// Clear every control block in the list (all lanes, all chunks) to zero.
void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list) {
	// {0} rather than {}: an empty initialiser list is only valid from C23
	// onwards (GNU extension before that); {0} zero-initialises the whole
	// aggregate in every C standard.
	*dma_list = (struct dvi_scanline_dma_list){0};
}
// Return a pointer to the control-symbol word for the given sync line levels.
// The table index is vsync in bit 1 and hsync in bit 0.
static const uint32_t *get_ctrl_sym(bool vsync, bool hsync) {
	unsigned int index = 0;
	if (vsync)
		index |= 2u;
	if (hsync)
		index |= 1u;
	return &dvi_ctrl_syms[index];
}
// Make a sequence of paced transfers to the relevant FIFO.
//
// Fills in one control block for a lane's data channel:
//   cb             control block to populate
//   dma_cfg        channel numbers, DREQ and FIFO address for this lane
//   read_addr      where the transfer starts reading
//   transfer_count number of transfers
//   read_ring      read-address ring size, as passed to channel_config_set_ring
//                  (callers use 0 for no wrap, 2 or 3 for a 4 or 8 byte period)
//   irq_on_finish  true if completion of this block should raise an IRQ
static void _set_data_cb(dma_cb_t *cb, const struct dvi_lane_dma_cfg *dma_cfg,
		const void *read_addr, uint transfer_count, uint read_ring, bool irq_on_finish) {
	// Build the channel configuration word first
	cb->c = dma_channel_get_default_config(dma_cfg->chan_data);
	channel_config_set_ring(&cb->c, false, read_ring);
	channel_config_set_dreq(&cb->c, dma_cfg->dreq);
	// Call back to control channel for reconfiguration:
	channel_config_set_chain_to(&cb->c, dma_cfg->chan_ctrl);
	// Note we never send a null trigger, so IRQ_QUIET is an IRQ suppression flag
	channel_config_set_irq_quiet(&cb->c, !irq_on_finish);

	// Then the addresses and length
	cb->transfer_count = transfer_count;
	cb->write_addr = dma_cfg->tx_fifo;
	cb->read_addr = read_addr;
}
// Populate the DMA control blocks for a scanline in vertical blanking.
//
// The sync lane gets four blocks (front porch, hsync, back porch, active
// width); the other lanes get two (blanking, active width). Every block
// replays a single control-symbol word through a 4-byte read ring. Only the
// sync lane's back-porch block is allowed to raise an IRQ.
//
// vsync_asserted selects whether the vsync line is driven to its asserted
// level (per t->v_sync_polarity) on this scanline.
void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
		bool vsync_asserted, struct dvi_scanline_dma_list *l) {
	// Length of each horizontal region, in FIFO words
	const uint32_t words_front  = t->h_front_porch   / DVI_SYMBOLS_PER_WORD;
	const uint32_t words_sync   = t->h_sync_width    / DVI_SYMBOLS_PER_WORD;
	const uint32_t words_back   = t->h_back_porch    / DVI_SYMBOLS_PER_WORD;
	const uint32_t words_active = t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
	const uint32_t words_blank  = (t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD;

	// Actual level of the vsync line for this scanline
	bool vsync = t->v_sync_polarity == vsync_asserted;
	const uint32_t *sym_hsync_off = get_ctrl_sym(vsync, !t->h_sync_polarity);
	const uint32_t *sym_hsync_on  = get_ctrl_sym(vsync,  t->h_sync_polarity);
	const uint32_t *sym_no_sync   = get_ctrl_sym(false, false);

	// The symbol table contains each control symbol *twice*, concatenated into
	// 20 LSBs of table word, so we can always do word-repeat.
	const struct dvi_lane_dma_cfg *sync_cfg = &dma_cfg[TMDS_SYNC_LANE];
	dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
	_set_data_cb(&synclist[0], sync_cfg, sym_hsync_off, words_front,  2, false);
	_set_data_cb(&synclist[1], sync_cfg, sym_hsync_on,  words_sync,   2, false);
	_set_data_cb(&synclist[2], sync_cfg, sym_hsync_off, words_back,   2, true);
	_set_data_cb(&synclist[3], sync_cfg, sym_hsync_off, words_active, 2, false);

	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
		if (lane != TMDS_SYNC_LANE) {
			dma_cb_t *cblist = dvi_lane_from_list(l, lane);
			_set_data_cb(&cblist[0], &dma_cfg[lane], sym_no_sync, words_blank,  2, false);
			_set_data_cb(&cblist[1], &dma_cfg[lane], sym_no_sync, words_active, 2, false);
		}
	}
}
// Populate the DMA control blocks for a scanline in the active region.
//
// Blanking blocks are set up as for a vblank line with vsync deasserted. The
// last block of each lane carries pixel data: either lane i's slice of
// tmdsbuf (lanes are stored back to back, h_active_pixels /
// DVI_SYMBOLS_PER_WORD words each), or -- when tmdsbuf is NULL -- the
// repeating entry from empty_scanline_tmds.
void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
		uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
	const uint32_t words_active = t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
	const uint32_t words_blank  = (t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD;

	const uint32_t *sym_hsync_off = get_ctrl_sym(!t->v_sync_polarity, !t->h_sync_polarity);
	const uint32_t *sym_hsync_on  = get_ctrl_sym(!t->v_sync_polarity,  t->h_sync_polarity);
	const uint32_t *sym_no_sync   = get_ctrl_sym(false, false);

	// Horizontal blanking on the sync lane; the back-porch block raises the IRQ
	const struct dvi_lane_dma_cfg *sync_cfg = &dma_cfg[TMDS_SYNC_LANE];
	dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
	_set_data_cb(&synclist[0], sync_cfg, sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false);
	_set_data_cb(&synclist[1], sync_cfg, sym_hsync_on,  t->h_sync_width  / DVI_SYMBOLS_PER_WORD, 2, false);
	_set_data_cb(&synclist[2], sync_cfg, sym_hsync_off, t->h_back_porch  / DVI_SYMBOLS_PER_WORD, 2, true);

	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
		dma_cb_t *cblist = dvi_lane_from_list(l, lane);
		int data_block;

		if (lane == TMDS_SYNC_LANE) {
			data_block = DVI_SYNC_LANE_CHUNKS - 1;
		}
		else {
			data_block = DVI_NOSYNC_LANE_CHUNKS - 1;
			// Non-sync lanes idle through the whole blanking interval
			_set_data_cb(&cblist[0], &dma_cfg[lane], sym_no_sync, words_blank, 2, false);
		}

		if (!tmdsbuf) {
			// Use read ring to repeat the correct DC-balanced symbol pair on blank scanlines (4 or 8 byte period)
			_set_data_cb(&cblist[data_block], &dma_cfg[lane], &empty_scanline_tmds[2 * lane / DVI_SYMBOLS_PER_WORD],
				words_active, DVI_SYMBOLS_PER_WORD == 2 ? 2 : 3, false);
		}
		else {
			// Non-repeating DMA for the freshly-encoded TMDS buffer
			_set_data_cb(&cblist[data_block], &dma_cfg[lane], tmdsbuf + lane * words_active,
				words_active, 0, false);
		}
	}
}
// Repoint the pixel-data block of every lane at a new TMDS buffer, leaving the
// rest of the control blocks untouched. In monochrome mode all lanes read
// the same data; otherwise lane i reads its own slice of tmdsbuf.
void __dvi_func(dvi_update_scanline_data_dma)(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
		const uint32_t *lane_tmdsbuf = tmdsbuf;
		if (!dvi_monochrome_tmds)
			lane_tmdsbuf = tmdsbuf + lane * t->h_active_pixels / DVI_SYMBOLS_PER_WORD;

		// The data block is the last one in each list: index 3 on the sync
		// lane, index 1 on the other lanes
		const int data_block = (lane == TMDS_SYNC_LANE) ? 3 : 1;
		dvi_lane_from_list(l, lane)[data_block].read_addr = lane_tmdsbuf;
	}
}

99
src/libdvi/dvi_timing.h Normal file
View file

@ -0,0 +1,99 @@
#ifndef _DVI_TIMING_H
#define _DVI_TIMING_H
#include "hardware/dma.h"
#include "pico/util/queue.h"
#include "dvi.h"
// Timing parameters for one video mode.
// NOTE(review): going by the field names, h_* are in pixels and v_* in
// scanlines -- confirm against the callers.
struct dvi_timing {
// Level driven on the hsync line while hsync is asserted
bool h_sync_polarity;
uint h_front_porch;
uint h_sync_width;
uint h_back_porch;
uint h_active_pixels;
// Level driven on the vsync line while vsync is asserted
bool v_sync_polarity;
uint v_front_porch;
uint v_sync_width;
uint v_back_porch;
uint v_active_lines;
// Serial bit clock in kHz (per the field name)
uint bit_clk_khz;
};
// Vertical region a scanline belongs to. The order matters:
// dvi_timing_state_advance() steps to the next region with
// (state + 1) % DVI_STATE_COUNT.
enum dvi_line_state {
DVI_STATE_FRONT_PORCH = 0,
DVI_STATE_SYNC,
DVI_STATE_BACK_PORCH,
DVI_STATE_ACTIVE,
// Number of states, not a state itself
DVI_STATE_COUNT
};
// Vertical position within a frame, as tracked by dvi_timing_state_init() and
// dvi_timing_state_advance().
struct dvi_timing_state {
// Scanlines elapsed in the current region; reset to 0 on each transition
uint v_ctr;
// Region the current scanline belongs to
enum dvi_line_state v_state;
};
// This should map directly to DMA register layout, but more convenient types
// (also this really shouldn't be here... we don't have a dma_cb in the SDK
// because there are many valid formats due to aliases)
typedef struct dma_cb {
const void *read_addr;
void *write_addr;
uint32_t transfer_count;
// Channel configuration word; checked below to sit where ctrl_trig does
dma_channel_config c;
} dma_cb_t;
// Layout checks: exactly four words, with the config word at the same offset
// as ctrl_trig in dma_channel_hw_t.
static_assert(sizeof(dma_cb_t) == 4 * sizeof(uint32_t), "bad dma layout");
static_assert(__builtin_offsetof(dma_cb_t, c.ctrl) == __builtin_offsetof(dma_channel_hw_t, ctrl_trig), "bad dma layout");
// Control blocks per scanline: the sync lane has one per dvi_line_state value
// (four), the other lanes have two.
#define DVI_SYNC_LANE_CHUNKS DVI_STATE_COUNT
#define DVI_NOSYNC_LANE_CHUNKS 2
// All control blocks needed to output one scanline, grouped by lane. Use
// dvi_lane_from_list() to pick a lane by index.
struct dvi_scanline_dma_list {
dma_cb_t l0[DVI_SYNC_LANE_CHUNKS];
dma_cb_t l1[DVI_NOSYNC_LANE_CHUNKS];
dma_cb_t l2[DVI_NOSYNC_LANE_CHUNKS];
};
// Return the control-block array for lane i of a scanline list. Index 0 and 1
// select l0 and l1; any other index selects l2.
static inline dma_cb_t* dvi_lane_from_list(struct dvi_scanline_dma_list *l, int i) {
	switch (i) {
	case 0:
		return l->l0;
	case 1:
		return l->l1;
	default:
		return l->l2;
	}
}
// Each TMDS lane uses one DMA channel to transfer data to a PIO state
// machine, and another channel to load control blocks into this channel.
struct dvi_lane_dma_cfg {
// Channel that reloads the data channel; data blocks chain to it
uint chan_ctrl;
// Channel that moves symbols to the FIFO
uint chan_data;
// Destination address written by the data channel
void *tx_fifo;
// Data request signal that paces the data channel
uint dreq;
};
// Note these are already converted to pseudo-differential representation
extern const uint32_t dvi_ctrl_syms[4];

// Predefined video modes
extern const struct dvi_timing dvi_timing_640x480p_60hz;
extern const struct dvi_timing dvi_timing_800x480p_60hz;
extern const struct dvi_timing dvi_timing_800x600p_60hz;
extern const struct dvi_timing dvi_timing_960x540p_60hz;
extern const struct dvi_timing dvi_timing_1280x720p_30hz;
extern const struct dvi_timing dvi_timing_800x600p_reduced_60hz;
extern const struct dvi_timing dvi_timing_1280x720p_reduced_30hz;
// Defined alongside the modes above but was previously missing from this
// header, so it could not be referenced by library users
extern const struct dvi_timing dvi_timing_1600x900p_reduced_30hz;
// Reset the vertical state machine to line 0 of the front porch.
void dvi_timing_state_init(struct dvi_timing_state *t);
// Advance the vertical state machine by one scanline, moving to the next
// region (and wrapping after the active region) when the current one ends.
void dvi_timing_state_advance(const struct dvi_timing *t, struct dvi_timing_state *s);
// Zero every control block in the list.
void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list);
// Fill the list for a vertical-blanking scanline. vsync_asserted selects
// whether vsync is driven to its asserted level on this line.
void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
bool vsync_asserted, struct dvi_scanline_dma_list *l);
// Fill the list for an active scanline. If tmdsbuf is NULL the data blocks
// replay a built-in blank scanline instead.
void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
// Repoint only the pixel-data block of each lane at tmdsbuf.
void dvi_update_scanline_data_dma(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
#endif

623
src/libdvi/tmds_encode.S Normal file
View file

@ -0,0 +1,623 @@
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#include "dvi_config_defs.h"
// Register offsets relative to INTERP0 ACCUM0, so one base pointer
// (SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) reaches every register used below.
// Offsets suitable for ldr/str (must be <= 0x7c):
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK2_OFFS (SIO_INTERP0_PEEK_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// Add INTERP1 to any offset above to address the same register on interp1
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
// word-addressed space... almost as though it were intentional! :)
.syntax unified
.cpu cortex-m0plus
.thumb
// Open a global Thumb function called "name" in its own .scratch_x.<name>
// section. The function body follows the macro invocation.
.macro decl_func_x name
.section .scratch_x.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// Same as decl_func_x, but the function is placed in a .scratch_y.<name>
// section.
.macro decl_func_y name
.section .scratch_y.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// Functions declared with plain decl_func go in scratch X
#define decl_func decl_func_x
// ----------------------------------------------------------------------------
// Pixel-doubling encoders for RGB
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// Encode one input word (two pixels) through the interpolator at r_ibase:
// write the word to ACCUM0, then fetch the two addresses generated on PEEK0
// and PEEK1 and load the table word each one points to.
//   r_inout0: input word on entry, PEEK0 lookup result on exit
//   r_out1:   PEEK1 lookup result on exit
.macro do_channel_16bpp r_ibase r_inout0 r_out1
str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
ldr \r_inout0, [\r_inout0]
ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
ldr \r_out1, [\r_out1]
.endm
// Each unrolled step loads two input words and stores four output words.
decl_func tmds_encode_loop_16bpp
push {r4, r5, r6, r7, lr}
// ip = r1 + 4 * r2: end of output (4 output bytes per input pixel)
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4, r6}
do_channel_16bpp r2, r4, r5
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// The exit test is for equality and only runs once per unrolled pass, so the
// output size must be a multiple of 16 * TMDS_ENCODE_UNROLL bytes
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// Same as above, but scale data to make up for lack of left shift
// in interpolator (costs 1 cycle per 2 pixels)
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
decl_func tmds_encode_loop_16bpp_leftshift
push {r4, r5, r6, r7, lr}
// ip = r1 + 4 * r2: end of output (4 output bytes per input pixel)
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4, r6}
// Pre-shift each input word left by r3 before handing it to the interpolator
lsls r4, r3
do_channel_16bpp r2, r4, r5
lsls r6, r3
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Equality exit test: output size must be a multiple of
// 16 * TMDS_ENCODE_UNROLL bytes
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// Each unrolled step loads one input word (four 8-bit pixels) and stores four
// output words: two looked up via interp0, two via interp1.
decl_func tmds_encode_loop_8bpp
push {r4, r5, r6, r7, lr}
// ip = r1 + 4 * r2: end of output (4 output bytes per input pixel)
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
// Same input word goes to both interpolators
str r4, [r2, #ACCUM0_OFFS + INTERP1]
str r4, [r2, #ACCUM0_OFFS]
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r5, [r2, #PEEK1_OFFS]
ldr r5, [r5]
ldr r6, [r2, #PEEK0_OFFS + INTERP1]
ldr r6, [r6]
ldr r7, [r2, #PEEK1_OFFS + INTERP1]
ldr r7, [r7]
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Equality exit test: output size must be a multiple of
// 16 * TMDS_ENCODE_UNROLL bytes
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
// Each unrolled step loads one input word (four 8-bit pixels) and stores four
// output words: two looked up via interp0 (from the left-shifted word), two
// via interp1 (from the unshifted word).
decl_func tmds_encode_loop_8bpp_leftshift
push {r4, r5, r6, r7, lr}
// ip = r1 + 4 * r2: end of output. The loop body emits 16 bytes per 4 input
// pixels, i.e. 4 bytes per pixel, exactly like tmds_encode_loop_8bpp. This
// was previously a shift by 3, which doubled the range and made the loop
// read and write twice as much as the caller asked for.
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
// interp1 gets the unshifted word, interp0 the word shifted left by r3
str r4, [r2, #ACCUM0_OFFS + INTERP1]
lsls r4, r3
str r4, [r2, #ACCUM0_OFFS]
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r5, [r2, #PEEK1_OFFS]
ldr r5, [r5]
ldr r6, [r2, #PEEK0_OFFS + INTERP1]
ldr r6, [r6]
ldr r7, [r2, #PEEK1_OFFS + INTERP1]
ldr r7, [r7]
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Equality exit test: output size must be a multiple of
// 16 * TMDS_ENCODE_UNROLL bytes
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// ----------------------------------------------------------------------------
// Fast 1bpp black/white encoder (full res)
// Taking the encoder from DVI spec, with initial balance 0:
//
// - Encoding either 0x00 or 0xff will produce a running balance of -8, with
// output symbol of 0x100 or 0x200
//
// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
// output symbol of 0x1ff or 0x2ff
//
// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
// colour bit. If we process pixels in even-sized blocks, only the colour
// lookup is needed.
// Encode 8 pixels @ 1bpp (using two table lookups)
// r3 contains lookup mask (preshifted)
// r8 contains pointer to encode table
// 2.125 cyc/pix
// Each half shifts the input word in r2 so that one 4-bit group lands under
// the mask in r3, uses the result as a byte offset into the table at r8, and
// loads that two-word entry. The four words (r4-r7) are then stored to r1.
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
\shift_instr0 r4, r2, #\shamt0
ands r4, r3
add r4, r8
ldmia r4, {r4, r5}
\shift_instr1 r6, r2, #\shamt1
ands r6, r3
add r6, r8
ldmia r6, {r6, r7}
stmia r1!, {r4, r5, r6, r7}
.endm
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
// Each loop pass consumes one input word (32 pixels) and stores 16 output
// words.
decl_func tmds_encode_1bpp
push {r4-r7, lr}
// Preserve r8 for the caller (moved through r7 to push it)
mov r7, r8
push {r7}
// ip = r1 + 2 * r2: end of output (2 output bytes per pixel)
lsls r2, #1
add r2, r1
mov ip, r2
adr r4, tmds_1bpp_table
mov r8, r4
// Mask: 4 bit index, 8 bytes per entry
movs r3, #0x78
b 2f
1:
// r2 is reused to hold the current input word
ldmia r0!, {r2}
#if !DVI_1BPP_BIT_REVERSE
tmds_encode_1bpp_body lsls 3 lsrs 1
tmds_encode_1bpp_body lsrs 5 lsrs 9
tmds_encode_1bpp_body lsrs 13 lsrs 17
tmds_encode_1bpp_body lsrs 21 lsrs 25
#else
// Same 4-bit groups, but the two groups of each byte are taken in swapped
// order (and looked up in the bit-reversed table)
tmds_encode_1bpp_body lsrs 1 lsls 3
tmds_encode_1bpp_body lsrs 9 lsrs 5
tmds_encode_1bpp_body lsrs 17 lsrs 13
tmds_encode_1bpp_body lsrs 25 lsrs 21
#endif
2:
cmp r1, ip
blo 1b
// Restore r8
pop {r7}
mov r8, r7
pop {r4-r7, pc}
// Lookup table for tmds_encode_1bpp: 16 entries of two words, indexed by a
// 4-bit group of input pixels. The trailing comment on each entry is the
// pixel pattern it encodes. The second variant holds the same entries in
// bit-reversed index order.
.align 2
tmds_1bpp_table:
#if !DVI_1BPP_BIT_REVERSE
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fe00, 0x7fd00 // 0001
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfe00, 0x7fd00 // 0011
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fe00, 0x7fe00 // 0101
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfe00, 0x7fe00 // 0111
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fe00, 0xbfd00 // 1001
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfe00, 0xbfd00 // 1011
.word 0x7fd00, 0xbfe00 // 1100
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfd00, 0xbfe00 // 1110
.word 0xbfe00, 0xbfe00 // 1111
#else
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fd00, 0xbfe00 // 1100
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfd00, 0xbfe00 // 1110
.word 0x7fe00, 0x7fd00 // 0001
.word 0x7fe00, 0xbfd00 // 1001
.word 0x7fe00, 0x7fe00 // 0101
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfe00, 0x7fd00 // 0011
.word 0xbfe00, 0xbfd00 // 1011
.word 0xbfe00, 0x7fe00 // 0111
.word 0xbfe00, 0xbfe00 // 1111
#endif
// ----------------------------------------------------------------------------
// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
// Even-x-position pixels are encoded as symbols with imbalance -4, and odd
// pixels with +4, so that we can mix-and-match our even/odd codewords and
// always get a properly balanced sequence:
//
// level 0: (05 -> 103), then (04 -> 1fc) (decimal 5, 4)
// level 1: (50 -> 130), then (51 -> 1cf) (decimal 80, 81)
// level 2: (af -> 230), then (ae -> 2cf) (decimal 175, 174)
// level 3: (fa -> 203), then (fb -> 2fc) (decimal 250, 251)
//
// These correspond to roughly 255 times (0, 1/3, 2/3, 1).
//
// Alternatively we could use symbols with 0 balance, which results in lower
// contrast but avoids the LSB bobble:
//
// level 0: (10 -> 1f0) always
// level 1: (5a -> 263) always
// level 2: (a5 -> 163) always
// level 3: (ef -> 2f0) always
// Table base pointer in r0. Input pixels in r2.
// Look up one table word: shift the input word in r2 so that one 4-bit group
// (two 2bpp pixels) lands under the mask in r3, then load the word at that
// byte offset from the table base in r0 into rd.
.macro encode_2bpp_body shift_instr shamt rd
\shift_instr \rd, r2, #\shamt
ands \rd, r3
ldr \rd, [r0, \rd]
.endm
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
// Each loop pass consumes one input word (16 pixels) and stores 8 output
// words.
decl_func tmds_encode_2bpp
push {r4-r7, lr}
// Preserve r8 for the caller (moved through r7 to push it)
mov r7, r8
push {r7}
// Input pointer lives in r8 from here on, freeing r0 for the table base
mov r8, r0
adr r0, tmds_2bpp_table
// Mask: 4-bit index into 4-byte entries.
movs r3, #0x3c
// Limit pointer: 1 word per 2 pixels
lsls r2, #1
add r2, r1
mov ip, r2
b 2f
1:
// Fetch the next input word into r2 and write the advanced pointer back to r8
mov r4, r8
ldmia r4!, {r2}
mov r8, r4
encode_2bpp_body lsls 2 r4
encode_2bpp_body lsrs 2 r5
encode_2bpp_body lsrs 6 r6
encode_2bpp_body lsrs 10 r7
stmia r1!, {r4-r7}
encode_2bpp_body lsrs 14 r4
encode_2bpp_body lsrs 18 r5
encode_2bpp_body lsrs 22 r6
encode_2bpp_body lsrs 26 r7
stmia r1!, {r4-r7}
2:
cmp r1, ip
blo 1b
// Restore r8
pop {r7}
mov r8, r7
pop {r4-r7, pc}
// Lookup table for tmds_encode_2bpp: 16 one-word entries indexed by a 4-bit
// group (two 2bpp pixels). The trailing comment gives the two pixel levels
// as (bits 1:0, bits 3:2) of the index.
.align 2
tmds_2bpp_table:
.word 0x7f103 // 00, 00
.word 0x7f130 // 01, 00
.word 0x7f230 // 10, 00
.word 0x7f203 // 11, 00
.word 0x73d03 // 00, 01
.word 0x73d30 // 01, 01
.word 0x73e30 // 10, 01
.word 0x73e03 // 11, 01
.word 0xb3d03 // 00, 10
.word 0xb3d30 // 01, 10
.word 0xb3e30 // 10, 10
.word 0xb3e03 // 11, 10
.word 0xbf103 // 00, 11
.word 0xbf130 // 01, 11
.word 0xbf230 // 10, 11
.word 0xbf203 // 11, 11
// ----------------------------------------------------------------------------
// Full-resolution RGB encode (not very practical)
// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
// taking horizontal blanking (at VGA) and dual core into account, and
// assuming the 3 channels are encoded individually.)
//
// Here is an idea
// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
// ACCUM0), concatenated with the sign bit of our running disparity (from
// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
// with the symbol's disparity stored left-justified in the upper 12 bits, as
// e.g. a 6 bit signed integer.
//
// - Load pixel data. cyc: 0.75 (ldmia 2 words, every 4 pixels)
// - Write pixel to ACCUM0. cyc: 1
// - Read address from PEEK2. cyc: 1
// - Load encoded pixel from address. cyc: 2
// - Write disparity data to ACCUM1_ADD cyc: 1
// - Write encoded data to output buffer. cyc: 1.25 (stmia 4 words, every 4 pixels)
//
// With decent register allocation we may be able to load 4 pixels at
// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
//
// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
// overflow and affect the running disparity, but with 16 zeroes in between,
// this would take much longer than one scanline, so everything is fine if
// we clear the accumulator at the start of the scanline.
//
// Note that we need to use two interpolators to get the bits from both pixels
// -- we are not outputting a single DC-balanced stream, but rather two
// interleaved streams which are each DC-balanced. This is fine electrically,
// but our output here will *NOT* match the TMDS encoder given in the DVI
// spec.
// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
// feedback. With the feedback enabled (default), the output is DC balanced,
// but there are just barely enough CPU cycles to do all the encode, so it's
// essentially a party trick. If you disable DC balancing, the performance is
// much better, and many monitors will still accept the signals as long as you
// DC couple your DVI signals.
// Encode one input word held in ra. The word is written to ACCUM0 of both
// interpolators; ra then receives the table entry addressed by interp0 PEEK2
// and rb the entry addressed by interp1 PEEK2. Unless
// TMDS_FULLRES_NO_DC_BALANCE is set, each loaded entry is also written to
// ACCUM1_ADD of the interpolator it came from.
// r2 must hold the interpolator register base.
.macro tmds_fullres_encode_loop_body ra rb
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// Function body, emitted once per decl_func_x / decl_func_y instantiation.
// One pass of the unrolled loop consumes 32 input words and stores 64 output
// words.
.macro tmds_fullres_encode_loop_16bpp
push {r4-r7, lr}
// Preserve r8 for the caller (moved through r4 to push it)
mov r4, r8
push {r4}
// ip = r1 + 4 * r2: end of output (one output word per pixel)
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if no feedback
mvns r4, r4
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
// Keep loop start pointer in r8 so we can get a longer backward branch
adr r4, 1f
adds r4, #1 // god damn thumb bit why is this a thing
mov r8, r4
b 2f
.align 2
1:
.rept 16
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body r4 r5
tmds_fullres_encode_loop_body r6 r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Equality exit test, checked once per pass: the output size must be a
// multiple of 256 bytes. Otherwise jump back to the loop start held in r8.
cmp r1, ip
beq 1f
bx r8
1:
// Restore r8
pop {r4}
mov r8, r4
pop {r4-r7, pc}
.endm
// One copy each in X and Y, so the two cores don't step on each other
// (the _x copy is placed in a .scratch_x section, the _y copy in .scratch_y)
decl_func_x tmds_fullres_encode_loop_16bpp_x
tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
tmds_fullres_encode_loop_16bpp
// As tmds_fullres_encode_loop_body, except that the word written to interp0
// is first shifted left by r3. interp1 receives the unshifted word.
.macro tmds_fullres_encode_loop_body_leftshift ra rb
// Note we apply the leftshift for INTERP0 only
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
lsls \ra, r3
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// r3: Left shift amount
// Function body, emitted once per decl_func_x / decl_func_y instantiation.
// Same structure as tmds_fullres_encode_loop_16bpp, using the leftshift body.
.macro tmds_fullres_encode_loop_16bpp_leftshift
push {r4-r7, lr}
// Preserve r8 and r9 for the caller. Only r8 is modified by the code in this
// macro; r9 is saved and restored regardless.
mov r4, r8
mov r5, r9
push {r4-r5}
// ip = r1 + 4 * r2: end of output (one output word per pixel)
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if there's no balance feedback
mvns r4, r4
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
// Loop start address (with Thumb bit set) kept in r8 for the bx below
adr r4, 1f
adds r4, #1
mov r8, r4
b 2f
.align 2
1:
.rept 16 // 64 pixels per iteration
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body_leftshift r4 r5
tmds_fullres_encode_loop_body_leftshift r6 r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Equality exit test, checked once per pass: the output size must be a
// multiple of 256 bytes
cmp r1, ip
beq 1f
bx r8
1:
pop {r4-r5}
mov r8, r4
mov r9, r5
pop {r4-r7, pc}
.endm
// One copy of the left-shifting loop in a .scratch_x section and one in a
// .scratch_y section
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
tmds_fullres_encode_loop_16bpp_leftshift
// ----------------------------------------------------------------------------
// Full-resolution 8bpp paletted encode
// Variant of tmds_fullres_encode_loop_16bpp that reads
// 8-bit wide pixels packed 4 per word. The interpolator
// base is set to a reordered list of TMDS symbols based
// on a user colour palette.
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
// interp base pointer. r7 used as temporary.
// The input word is written to ACCUM0 of both interpolators. The entry
// addressed by interp0 PEEK2 ends up in the low bits of rd, and the entry
// addressed by interp1 PEEK2 is shifted up by 10 bits and ORed on top.
.macro tmds_palette_encode_loop_body rd
str \rd, [r2, #ACCUM0_OFFS]
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
ldr \rd, [r2, #PEEK2_OFFS]
ldr \rd, [\rd]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rd, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
// Pack the second symbol above the first
lsls r7, #10
orrs \rd, r7
.endm
// Function body, emitted once per decl_func_x / decl_func_y instantiation.
// Register use on entry follows the other encode loops in this file: r0 is
// read as the input pointer, r1 written as the output pointer, r2 is the
// pixel count. One pass of the unrolled loop consumes 20 input words
// (80 pixels) and stores 40 output words.
.macro tmds_palette_encode_loop
push {r4-r7, lr}
// Preserve r8 for the caller (moved through r4 to push it)
mov r4, r8
push {r4}
// ip = r1 + 2 * r2: end of output (two symbols packed per output word)
lsls r2, #1
add r2, r1
mov ip, r2
// r2 is free again: reuse it as the interpolator register base
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if there's no balance feedback
mvns r4, r4
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
// Keep loop start pointer in r8 so we can get a longer backward branch
adr r4, 1f
adds r4, #1 // god damn thumb bit why is this a thing
mov r8, r4
b 2f
.align 2
1:
.rept 10
ldmia r0!, {r3, r5}
// Split each input word into two pixel pairs, each positioned at bits 17:2:
// the upper pair by shifting right 14, the lower pair by shifting left 2
lsrs r4, r3, #14
lsls r3, #2
lsrs r6, r5, #14
lsls r5, #2
tmds_palette_encode_loop_body r3
tmds_palette_encode_loop_body r4
tmds_palette_encode_loop_body r5
tmds_palette_encode_loop_body r6
stmia r1!, {r3, r4, r5, r6}
.endr
2:
// Equality exit test, checked once per pass: the output size must be a
// multiple of 160 bytes
cmp r1, ip
beq 1f
bx r8
1:
// Restore r8
pop {r4}
mov r8, r4
pop {r4-r7, pc}
.endm
// One copy of the palette loop in a .scratch_x section and one in a
// .scratch_y section
decl_func_x tmds_palette_encode_loop_x
tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
tmds_palette_encode_loop

305
src/libdvi/tmds_encode.c Normal file
View file

@ -0,0 +1,305 @@
#include "hardware/interp.h"
#include "tmds_encode.h"
#include "hardware/gpio.h"
#include "hardware/sync.h"
// Lookup table used by the pixel-doubling encoders (16bpp and 8bpp); the
// contents come from tmds_table.h.
static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
#include "tmds_table.h"
};
// Fullres table is bandwidth-critical, so gets one copy for each scratch
// memory. There is a third copy which can go in flash, because it's just used
// to generate palette LUTs. The ones we don't use will get garbage collected
// during linking.
const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
#include "tmds_table_fullres.h"
};
const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
#include "tmds_table_fullres.h"
};
// Configure an interpolator to extract a single colour channel from each of a pair
// of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being
// pixel_width wide. Produce a LUT address for the first pixel's colour data on
// LANE0, and the second pixel's colour data on LANE1.
//
// Returns nonzero if the *_leftshift variant of the encoder loop must be used
// (needed for blue channel because I was a stubborn idiot and didn't put
// signed/bidirectional shift on interpolator, very slightly slower). The
// return value is the size of left shift required.
static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint pixel_lsb, uint pixel_width, uint lut_index_width, const uint32_t *lutbase) {
	const uint index_shift = 2; // scaled lookup for 4-byte LUT entries

	// Bit range of the generated address that carries the colour channel
	const uint index_msb = index_shift + lut_index_width - 1;
	const uint index_lsb = index_msb - (channel_msb - channel_lsb);

	// Right shift needed to bring the channel MSB down to index_msb. The
	// interpolator cannot shift left, so a negative amount is returned to
	// the caller to be applied in software instead.
	int right_shift = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
	int left_shift = 0;
	if (right_shift < 0) {
		// "It's ok we'll fix it in software"
		left_shift = -right_shift;
		right_shift = 0;
	}

	// Lane 0: first pixel of the pair
	interp_config lane0 = interp_default_config();
	interp_config_set_shift(&lane0, right_shift);
	interp_config_set_mask(&lane0, index_lsb, index_msb);
	interp_set_config(interp, 0, &lane0);

	// Lane 1: second pixel, pixel_width bits further up, taken from the
	// other lane's accumulator via cross input
	interp_config lane1 = interp_default_config();
	interp_config_set_shift(&lane1, pixel_width + right_shift);
	interp_config_set_mask(&lane1, index_lsb, index_msb);
	interp_config_set_cross_input(&lane1, true);
	interp_set_config(interp, 1, &lane1);

	// Both lanes index the same table
	interp->base[0] = (uint32_t)lutbase;
	interp->base[1] = (uint32_t)lutbase;

	return left_shift;
}
// Extract up to 6 bits from a buffer of 16 bit pixels, and produce a buffer
// of TMDS symbols from this colour channel. Number of pixels must be even,
// pixel buffer must be word-aligned.
void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
	// Borrow interp0 for the encode and put its previous state back afterwards
	interp_hw_save_t saved_interp0;
	interp_save(interp0_hw, &saved_interp0);

	// A nonzero result is the left shift the encoder loop must apply itself
	const int lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
	if (lshift == 0) {
		tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
	}
	else {
		tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, lshift);
	}

	interp_restore(interp0_hw, &saved_interp0);
}
// As above, but 8 bits per pixel, multiple of 4 pixels, and still word-aligned.
void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
	// Both interpolators are used; save them so the caller's state survives
	interp_hw_save_t saved_interp0;
	interp_hw_save_t saved_interp1;
	interp_save(interp0_hw, &saved_interp0);
	interp_save(interp1_hw, &saved_interp1);

	// Note that for 8bpp, some left shift is always required for pixel 0 (any
	// channel), which destroys some MSBs of pixel 3. To get around this, pixel
	// data sent to interp1 is *not left-shifted*
	const int lshift_lower = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
	const int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
	// interp1 handles the upper two pixels and must never need a left shift
	assert(!lshift_upper); (void)lshift_upper;

	if (lshift_lower == 0) {
		tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
	}
	else {
		tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, lshift_lower);
	}

	interp_restore(interp0_hw, &saved_interp0);
	interp_restore(interp1_hw, &saved_interp1);
}
// ----------------------------------------------------------------------------
// Code for full-resolution TMDS encode (barely possible, utterly impractical):
// Different scheme used for full res as the fun pixel-doubling DC balance
// trick doesn't work, so we need to actually do running disparity. ACCUM0 has
// pixel data, ACCUM1 has running disparity. INTERP0 is used to process even
// pixels, and INTERP1 for odd pixels. Note this means that even and odd
// symbols have their DC balance handled separately, which is not to spec.
// Configure one interpolator to build a full-resolution LUT address: lane 0
// contributes the colour channel bits, lane 1 contributes one extra index
// bit taken from ACCUM1, and base[2] holds the table address. Returns the
// left shift the caller must apply in software (0 if none is needed).
static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
	const uint index_shift = 2; // scaled lookup for 4-byte LUT entries
	const uint index_msb = index_shift + lut_index_width - 1;
	const uint index_lsb = index_msb - (channel_msb - channel_lsb);

	int right_shift = channel_msb - (lut_index_width - 1) - index_shift;
	int left_shift = 0;
	if (right_shift < 0) {
		// "It's ok we'll fix it in software"
		left_shift = -right_shift;
		right_shift = 0;
	}

	// Shift and mask colour channel to lower 6 bits of LUT index (note lut_index_width excludes disparity sign)
	interp_config channel_cfg = interp_default_config();
	interp_config_set_shift(&channel_cfg, right_shift);
	interp_config_set_mask(&channel_cfg, index_lsb, index_msb);
	interp_set_config(interp, 0, &channel_cfg);

	// Concatenate disparity (ACCUM1) sign onto the LUT index
	interp_config sign_cfg = interp_default_config();
	interp_config_set_shift(&sign_cfg, 30 - index_msb);
	interp_config_set_mask(&sign_cfg, index_msb + 1, index_msb + 1);
	interp_set_config(interp, 1, &sign_cfg);

	interp->base[2] = (uint32_t)lutbase;

	return left_shift;
}
// Full-resolution encode of one colour channel from 16bpp pixels. Both
// interpolators are used; their state is saved and restored around the call
// unless TMDS_FULLRES_NO_INTERP_SAVE is set.
void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
	const uint core = get_core_num();
#if !TMDS_FULLRES_NO_INTERP_SAVE
	interp_hw_save_t saved_interp0;
	interp_hw_save_t saved_interp1;
	interp_save(interp0_hw, &saved_interp0);
	interp_save(interp1_hw, &saved_interp1);
#endif
	// There is a copy of the inner loop and the LUT in both scratch X and
	// scratch Y memories. Use X on core 1 and Y on core 0 so the cores don't
	// tread on each other's toes too much.
	const uint32_t *lutbase;
	if (core)
		lutbase = tmds_table_fullres_x;
	else
		lutbase = tmds_table_fullres_y;

	// interp0 takes the pixel in the low halfword, interp1 the one 16 bits up
	const int lshift_lower = configure_interp_for_addrgen_fullres(interp0_hw, channel_msb, channel_lsb, 6, lutbase);
	const int lshift_upper = configure_interp_for_addrgen_fullres(interp1_hw, channel_msb + 16, channel_lsb + 16, 6, lutbase);
	assert(!lshift_upper); (void)lshift_upper;

	if (lshift_lower) {
		if (core)
			tmds_fullres_encode_loop_16bpp_leftshift_x(pixbuf, symbuf, n_pix, lshift_lower);
		else
			tmds_fullres_encode_loop_16bpp_leftshift_y(pixbuf, symbuf, n_pix, lshift_lower);
	}
	else {
		if (core)
			tmds_fullres_encode_loop_16bpp_x(pixbuf, symbuf, n_pix);
		else
			tmds_fullres_encode_loop_16bpp_y(pixbuf, symbuf, n_pix);
	}
#if !TMDS_FULLRES_NO_INTERP_SAVE
	interp_restore(interp0_hw, &saved_interp0);
	interp_restore(interp1_hw, &saved_interp1);
#endif
}
// Imbalance (number of 1 bits minus number of 0 bits) of each 4-bit value,
// indexed by that value.
static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };

// Returns the number of 1 bits minus the number of 0 bits in the low 8 bits
// of x. Bits above bit 7 are ignored so the table index can never go out of
// range.
static inline int byte_imbalance(uint32_t x)
{
    return imbalance_lookup[(x >> 4) & 0xF] + imbalance_lookup[x & 0xF];
}

// Packs a symbol disparity into the 6-bit two's-complement field held in bits
// 31:26 of a table entry. The value is converted to uint32_t *before* the
// shift: shifting a signed int holding 32..63 left by 26 is not representable
// in int and is undefined behaviour in C.
static inline uint32_t tmds_disparity_field(int disparity)
{
    return ((uint32_t)disparity & 0x3Fu) << 26;
}

// Encodes one 8-bit channel value into the two candidate TMDS table entries
// used by the full-resolution encoders.
//   pixel                - 8-bit channel value to encode
//   negative_balance_sym - receives the entry whose symbol disparity is <= 0
//   positive_balance_sym - receives the entry whose symbol disparity is >= 0
// Each entry holds the 10-bit symbol in bits 9:0 and the disparity of that
// symbol (ones minus zeroes over all 10 bits) as a 6-bit signed field in bits
// 31:26. When the 8 data bits are already balanced both outputs receive the
// same entry, with a zero disparity field.
static void tmds_encode_symbols(uint8_t pixel, uint32_t *negative_balance_sym, uint32_t *positive_balance_sym)
{
    // Stage 1: transition minimisation. Bit 0 is copied, each further bit is
    // the XNOR or XOR of the previous output bit with the next input bit.
    int pixel_imbalance = byte_imbalance(pixel);
    uint32_t sym = pixel & 1;
    if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {
        for (int i = 0; i < 7; ++i) {
            sym |= (~((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
        }
    }
    else {
        for (int i = 0; i < 7; ++i) {
            sym |= ( ((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
        }
        sym |= 0x100; // bit 8 flags the XOR encoding
    }

    // Stage 2: build the as-is and the inverted form of the symbol.
    // Inverting flips the 8 data bits and sets bit 9; bit 8 is kept.
    const int imbalance = byte_imbalance(sym & 0xFF);
    const uint32_t bit8 = sym >> 8; // 0 or 1: sym has no bits above bit 8 yet
    const uint32_t inverted = sym ^ 0x2ff;

    if (imbalance == 0) {
        // Balanced data bits: the choice depends only on bit 8, not on the
        // running disparity, so both outputs are the same.
        const uint32_t balanced = (sym & 0x100) ? sym : inverted;
        *positive_balance_sym = balanced;
        *negative_balance_sym = balanced;
        return;
    }

    // The two top bits are looked up through the 4-bit table, which counts two
    // extra zero bits for them; the +2 removes that bias.
    const int plain_disparity = imbalance + imbalance_lookup[bit8] + 2;
    const int inverted_disparity = -imbalance + imbalance_lookup[2 ^ bit8] + 2;
    const uint32_t plain_entry = sym | tmds_disparity_field(plain_disparity);
    const uint32_t inverted_entry = inverted | tmds_disparity_field(inverted_disparity);

    if (imbalance > 0) {
        *negative_balance_sym = inverted_entry;
        *positive_balance_sym = plain_entry;
    }
    else {
        *negative_balance_sym = plain_entry;
        *positive_balance_sym = inverted_entry;
    }
}
// This takes a 16-bit (RGB 565) colour palette and makes palettes of TMDS symbols suitable
// for performing fullres encode.
// The TMDS palette buffer should be 6 * n_palette words long.
// n_palette must be a power of 2 <= 256.
//
// Output layout, in words from tmds_palette:
//   [0,             2 * n_palette) blue
//   [2 * n_palette, 4 * n_palette) green
//   [4 * n_palette, 6 * n_palette) red
// Within each channel the first n_palette words hold the negative-balance
// entries and the next n_palette words the positive-balance entries.
void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette) {
    uint32_t *tmds_palette_blue = tmds_palette;
    uint32_t *tmds_palette_green = tmds_palette + 2 * n_palette;
    uint32_t *tmds_palette_red = tmds_palette + 4 * n_palette;
    // size_t index: matches the type of n_palette, so no signed/unsigned comparison.
    for (size_t i = 0; i < n_palette; ++i) {
        const uint16_t rgb565 = palette[i];
        // Move each field to the top of an 8-bit channel value; the low bits stay zero.
        const uint8_t blue = (rgb565 << 3) & 0xf8;
        const uint8_t green = (rgb565 >> 3) & 0xfc;
        const uint8_t red = (rgb565 >> 8) & 0xf8;
        tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
        tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
        tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
    }
}
// This takes a 24-bit (RGB 888) colour palette and makes palettes of TMDS symbols suitable
// for performing fullres encode.
// The TMDS palette buffer should be 6 * n_palette words long.
// n_palette must be a power of 2 <= 256.
//
// Each palette entry carries blue in bits 7:0, green in bits 15:8 and red in
// bits 23:16. Output layout, in words from tmds_palette:
//   [0,             2 * n_palette) blue
//   [2 * n_palette, 4 * n_palette) green
//   [4 * n_palette, 6 * n_palette) red
// Within each channel the first n_palette words hold the negative-balance
// entries and the next n_palette words the positive-balance entries.
void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *tmds_palette, size_t n_palette) {
    uint32_t *tmds_palette_blue = tmds_palette;
    uint32_t *tmds_palette_green = tmds_palette + 2 * n_palette;
    uint32_t *tmds_palette_red = tmds_palette + 4 * n_palette;
    // size_t index: matches the type of n_palette, so no signed/unsigned comparison.
    for (size_t i = 0; i < n_palette; ++i) {
        const uint32_t rgb888 = palette[i];
        const uint8_t blue = rgb888 & 0xff;
        const uint8_t green = (rgb888 >> 8) & 0xff;
        const uint8_t red = (rgb888 >> 16) & 0xff;
        tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
        tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
        tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
    }
}
// Encode palette data for all 3 channels.
// pixbuf is an array of n_pix 8-bit wide pixels containing palette values (32-bit word aligned)
// tmds_palette is a palette of TMDS symbols produced by tmds_setup_palette_symbols
// symbuf is 3*n_pix 32-bit words, this function writes the symbol values for each of the channels to it.
// palette_bits is the number of index bits per pixel; the channel offsets below
// assume the palette was built with n_palette == 1 << palette_bits.
// NOTE(review): the per-channel output offsets used below are n_pix / 2 and
// n_pix words, which suggests each channel occupies n_pix / 2 words rather
// than n_pix -- confirm the symbuf size stated above.
// Uses interp0 and interp1. Their state is saved on entry and restored on exit
// unless TMDS_FULLRES_NO_INTERP_SAVE is nonzero.
void __not_in_flash_func(tmds_encode_palette_data)(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits) {
uint core = get_core_num();
#if !TMDS_FULLRES_NO_INTERP_SAVE
interp_hw_save_t interp0_save, interp1_save;
interp_save(interp0_hw, &interp0_save);
interp_save(interp1_hw, &interp1_save);
#endif
// First pass uses the start of the palette, which is where
// tmds_setup_palette_symbols places the blue symbols.
interp0_hw->base[2] = (uint32_t)tmds_palette;
interp1_hw->base[2] = (uint32_t)tmds_palette;
// Lane 0 on both interpolators masks the palette bits, starting at bit 2,
// The second interpolator also shifts to read the 2nd or 4th byte of the word.
// NOTE(review): the SIO_INTERP0_CTRL_LANE0_* field positions are also applied
// to interp1 and to lane 1 below; presumably the register layouts match -- confirm.
interp0_hw->ctrl[0] =
(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
interp1_hw->ctrl[0] =
(8 << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
// Lane 1 shifts and masks the sign bit into the right position to add to the symbol
// table index to choose the negative disparity symbols if the sign is negative.
// The multiplication writes the same value (palette_bits + 2) into both the
// MASK_LSB and MASK_MSB fields, giving a one-bit mask.
const uint32_t ctrl_lane_1 =
((31 - (palette_bits + 2)) << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
(palette_bits + 2) * ((1 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | (1 << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB));
interp0_hw->ctrl[1] = ctrl_lane_1;
interp1_hw->ctrl[1] = ctrl_lane_1;
// Three passes over the same pixels, one per channel. Between passes base[2]
// is advanced by 2 << palette_bits words, i.e. to the green and then the red
// symbols in the layout written by tmds_setup_palette_symbols.
// Core 1 runs the _x copy of the loop, core 0 the _y copy.
if (core) {
tmds_palette_encode_loop_x(pixbuf, symbuf, n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
tmds_palette_encode_loop_x(pixbuf, symbuf + (n_pix >> 1), n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
tmds_palette_encode_loop_x(pixbuf, symbuf + n_pix, n_pix);
} else {
tmds_palette_encode_loop_y(pixbuf, symbuf, n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
tmds_palette_encode_loop_y(pixbuf, symbuf + (n_pix >> 1), n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
tmds_palette_encode_loop_y(pixbuf, symbuf + n_pix, n_pix);
}
#if !TMDS_FULLRES_NO_INTERP_SAVE
interp_restore(interp0_hw, &interp0_save);
interp_restore(interp1_hw, &interp1_save);
#endif
}

46
src/libdvi/tmds_encode.h Normal file
View file

@ -0,0 +1,46 @@
#ifndef _TMDS_ENCODE_H_
#define _TMDS_ENCODE_H_
// Declarations for the TMDS encoders implemented in tmds_encode.c and
// tmds_encode.S. Wrapped in extern "C" so the functions can be called from C++.
#include "hardware/interp.h"
#include "dvi_config_defs.h"
#if defined(__cplusplus)
extern "C"
{
#endif
// Functions from tmds_encode.c
void tmds_encode_data_channel_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
void tmds_encode_data_channel_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
// Full-resolution encode of one colour channel. Uses interp0 and interp1.
void tmds_encode_data_channel_fullres_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
// Build TMDS symbol palettes from an RGB 565 palette. The output buffer
// (symbuf here, named tmds_palette in the definition) should be 6 * n_palette
// words long; n_palette must be a power of 2 <= 256.
void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *symbuf, size_t n_palette);
// As above, for an RGB 888 palette held in 32-bit words.
void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *symbuf, size_t n_palette);
// Encode 8-bit palettised pixels for all 3 channels, using a palette produced
// by tmds_setup_palette_symbols. Uses interp0 and interp1.
void tmds_encode_palette_data(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits);
// Functions from tmds_encode.S
void tmds_encode_1bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_2bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
// Uses interp0:
void tmds_encode_loop_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_loop_16bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
// Uses interp0 and interp1:
void tmds_encode_loop_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_loop_8bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
// Uses interp0 and interp1:
// (Note a copy is provided in scratch memories X and Y)
void tmds_fullres_encode_loop_16bpp_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_fullres_encode_loop_16bpp_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_fullres_encode_loop_16bpp_leftshift_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
// Inner loops called by tmds_encode_palette_data (_x on core 1, _y on core 0):
void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,46 @@
.program tmds_encode_1bpp
; 1bpp black/white pixels go in, TMDS symbols come out.
; Each output word contains two output symbols, each 10 bits in size,
; right-justified. The least-significant symbol is displayed first.
;
; We can encode using the following LUT: (yes this is compliant)
;
; x % 2 | colour | symbol
; ------+--------+-------
; 0 | 0 | 0x100
; 0 | 1 | 0x200
; 1 | 0 | 0x1ff
; 1 | 1 | 0x2ff
;
; OSR: shift to right, autopull, threshold 32
; ISR: shift to right, autopush, threshold 24
;
; Note the ISR needs to be shifted to *right* so that we can get the first
; pixel in the less-significant position. Threshold 24 so we can get 8x 0-bits
; at the LSBs for free :)
;
; In both symbols of the LUT above, bit 8 is the inverted colour and bit 9 is
; the colour itself; only the low 8 bits differ (all 0 for even pixels, all 1
; for odd pixels). The code shifts in 2 + 8 + 1 + 13 = 24 bits per pixel pair.
even_pixel:
out x, 1 ; x = colour of the even pixel
mov y, ~x ; y = inverted colour
in y, 1 ; even symbol bit 8 = ~colour (bits 7:0 are the free zeroes)
in x, 1 ; even symbol bit 9 = colour
odd_pixel:
mov x, ~null ; x = all ones
in x, 8 ; odd symbol bits 7:0 = 0xff
out x, 1 ; x = colour of the odd pixel
mov y, ~x ; y = inverted colour
in y, 1 ; odd symbol bit 8 = ~colour
in x, 13 ; Bring total shift to 24, triggering push.
% c-sdk {
// Loads the tmds_encode_1bpp program into `pio`, configures state machine
// `sm` with the shift settings the program expects, and starts it.
static inline void tmds_encode_1bpp_init(PIO pio, uint sm) {
    const uint program_offset = pio_add_program(pio, &tmds_encode_1bpp_program);
    pio_sm_config sm_cfg = tmds_encode_1bpp_program_get_default_config(program_offset);

    // OSR: shift to right, autopull, threshold 32
    sm_config_set_out_shift(&sm_cfg, true, true, 32);
    // ISR: shift to right, autopush, threshold 24
    sm_config_set_in_shift(&sm_cfg, true, true, 24);

    pio_sm_init(pio, sm, program_offset, &sm_cfg);
    pio_sm_set_enabled(pio, sm, true);
}
%}

76
src/libdvi/tmds_table.h Normal file
View file

@ -0,0 +1,76 @@
// Generated from tmds_table_gen.py
//
// This table converts a 6 bit data input into a pair of TMDS data symbols
// with data content *almost* equal (1 LSB off) to input value left shifted by
// two. The pairs of symbols have a net DC balance of 0.
//
// The two symbols are concatenated in the 20 LSBs of a data word, with the
// first symbol in least-significant position.
//
// Note the declaration isn't included here, just the table body. This is in
// case you want multiple copies of the table in different SRAMs (particularly
// scratch X/Y).
0x7fd00u,
0x40dfcu,
0x41df8u,
0x7ed04u,
0x43df0u,
0x7cd0cu,
0x7dd08u,
0x42df4u,
0x47de0u,
0x78d1cu,
0x79d18u,
0x46de4u,
0x7bd10u,
0x44decu,
0x45de8u,
0xafa41u,
0x4fdc0u,
0x70d3cu,
0x71d38u,
0x4edc4u,
0x73d30u,
0x4cdccu,
0x4ddc8u,
0xa7a61u,
0x77d20u,
0x48ddcu,
0x49dd8u,
0xa3a71u,
0x4bdd0u,
0xa1a79u,
0xa0a7du,
0x9fa81u,
0x5fd80u,
0x60d7cu,
0x61d78u,
0x5ed84u,
0x63d70u,
0x5cd8cu,
0x5dd88u,
0xb7a21u,
0x67d60u,
0x58d9cu,
0x59d98u,
0xb3a31u,
0x5bd90u,
0xb1a39u,
0xb0a3du,
0x8fac1u,
0x6fd40u,
0x50dbcu,
0x51db8u,
0xbba11u,
0x53db0u,
0xb9a19u,
0xb8a1du,
0x87ae1u,
0x57da0u,
0xbda09u,
0xbca0du,
0x83af1u,
0xbea05u,
0x81af9u,
0x80afdu,
0xbfa01u,

View file

@ -0,0 +1,139 @@
// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
// (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6
// MSBs). There is a 16 bit gap in between them, which is actually vital for
// the way the TMDS encode works!
//
// There are 128 1-word entries. The lookup index should be the concatenation
// of the sign bit of current running disparity, with 6 bits of colour channel
// data.
// Non-negative running disparity:
0xe0000100,
0xf8000303,
0x00000307,
0xe8000104,
0x000001f0,
0xf000010c,
0xe8000108,
0x0000030b,
0xf80001e0,
0xf800011c,
0xf0000118,
0x000001e4,
0xe8000110,
0x00000313,
0x000001e8,
0xf0000241,
0xf00001c0,
0x0000013c,
0xf8000138,
0xf80001c4,
0xf0000130,
0x000001cc,
0xf80001c8,
0xf8000261,
0xe8000120,
0x00000323,
0x000001d8,
0x00000271,
0xf80001d0,
0xf0000086,
0xe8000082,
0xf0000281,
0xe8000180,
0x00000383,
0x00000178,
0xf0000184,
0xf8000170,
0xf800018c,
0xf0000188,
0xf0000221,
0xf0000160,
0x0000019c,
0xf8000198,
0xf8000231,
0xf0000190,
0x00000239,
0xf00000c2,
0xf80002c1,
0xe8000140,
0x00000343,
0x000001b8,
0xf0000211,
0xf80001b0,
0xf8000219,
0x0000021d,
0x000002e1,
0xf00001a0,
0xf0000209,
0xf800020d,
0xf000000e,
0xf0000205,
0xe8000006,
0xe0000002,
0xe8000201,
// Negative running disparity:
0x280003ff,
0x100001fc,
0x080001f8,
0x200003fb,
0x000001f0,
0x180003f3,
0x200003f7,
0x080001f4,
0x1000031f,
0x100003e3,
0x180003e7,
0x000001e4,
0x200003ef,
0x080001ec,
0x000001e8,
0x080000be,
0x1800033f,
0x0000013c,
0x100003c7,
0x1000033b,
0x180003cf,
0x000001cc,
0x10000337,
0x0000009e,
0x200003df,
0x080001dc,
0x000001d8,
0x00000271,
0x1000032f,
0x08000279,
0x1000027d,
0x0800007e,
0x2000037f,
0x0800017c,
0x00000178,
0x1800037b,
0x1000038f,
0x10000373,
0x18000377,
0x080000de,
0x1800039f,
0x0000019c,
0x10000367,
0x000000ce,
0x1800036f,
0x00000239,
0x0800023d,
0x0000003e,
0x200003bf,
0x080001bc,
0x000001b8,
0x080000ee,
0x1000034f,
0x000000e6,
0x0000021d,
0x000002e1,
0x1800035f,
0x080000f6,
0x000000f2,
0x080002f1,
0x080000fa,
0x100002f9,
0x180002fd,
0x100000fe,

150
src/libdvi/tmds_table_gen.py Executable file
View file

@ -0,0 +1,150 @@
#!/usr/bin/env python3
# The key fact is that, if x is even, and the encoder currently has a running
# imbalance of 0, encoding x followed by x + 1 produces a symbol pair with a
# net balance of 0.
#
# This is a reasonable constraint, because we only want RGB565 (so 6 valid
# channel data bits -> data is multiple of 4), and can probably tolerate
# 0.25LSB of noise :)
#
# This means that encoding a half-horizontal-resolution scanline buffer is a
# simple LUT operation for each colour channel, because we have made the
# encoding process stateless by guaranteeing 0 balance.
def popcount(x):
    """Return the number of 1 bits in the non-negative integer x."""
    ones = 0
    while x:
        # Count the lowest bit, then discard it.
        ones += x & 1
        x >>= 1
    return ones
# Equivalent to N1(q) - N0(q) in the DVI spec
def byteimbalance(x):
    """Return (count of 1 bits) minus (count of 0 bits) for the 8-bit value x."""
    ones = popcount(x)
    zeroes = 8 - ones
    return ones - zeroes
# This is a direct translation of "Figure 3-5. T.M.D.S. Encode Algorithm" on
# page 29 of DVI 1.0 spec
class TMDSEncode:
    """TMDS encoder that carries the running DC imbalance from symbol to symbol."""

    # 10-bit control symbols, keyed by the 2-bit control value.
    ctrl_syms = {
        0b00: 0b1101010100,
        0b01: 0b0010101011,
        0b10: 0b0101010100,
        0b11: 0b1010101011
    }

    def __init__(self):
        self.imbalance = 0

    def encode(self, d, c, de):
        """Encode one symbol and update the running imbalance.

        d  -- 8-bit data value (used when de is truthy)
        c  -- 2-bit control value (used when de is falsy)
        de -- data enable; when falsy a control symbol is returned and the
              running imbalance is reset to 0

        Returns the 10-bit symbol as an integer.
        """
        if not de:
            self.imbalance = 0
            return self.ctrl_syms[c]

        # Minimise transitions: bit 0 is copied, every later bit is the XNOR
        # or XOR of the previous output bit with the next input bit.
        ones_in_d = popcount(d)
        use_xnor = ones_in_d > 4 or (ones_in_d == 4 and not d & 0x1)
        q_m = d & 0x1
        for i in range(7):
            bit = ((q_m >> i) ^ (d >> (i + 1))) & 0x1
            if use_xnor:
                bit ^= 0x1
            q_m |= bit << (i + 1)
        if not use_xnor:
            q_m |= 0x100  # bit 8 flags the XOR encoding

        # Correct DC balance
        inversion_mask = 0x2ff
        data_imbalance = byteimbalance(q_m & 0xff)
        xor_flag = q_m & 0x100

        if self.imbalance == 0 or data_imbalance == 0:
            if xor_flag:
                q_out = q_m
                self.imbalance += data_imbalance
            else:
                q_out = q_m ^ inversion_mask
                self.imbalance -= data_imbalance
        elif (self.imbalance > 0) == (data_imbalance > 0):
            # Data would push the imbalance further the same way: invert it.
            q_out = q_m ^ inversion_mask
            self.imbalance += (xor_flag >> 7) - data_imbalance
        else:
            q_out = q_m
            self.imbalance += data_imbalance - ((~q_m & 0x100) >> 7)
        return q_out
# Turn a bitmap of width n into n pairs of pseudo-differential bits
def differentialise(x, n):
    """Expand the low n bits of x into n two-bit pairs, most-significant bit first.

    A 1 bit becomes the pair 0b01 and a 0 bit becomes the pair 0b10.
    """
    pairs = 0
    for bit_index in reversed(range(n)):
        pair = 0b01 if (x >> bit_index) & 1 else 0b10
        pairs = (pairs << 2) | pair
    return pairs
enc = TMDSEncode()
###
# Pixel-doubled table:
# for i in range(0, 256, 4):
# sym0 = enc.encode(i, 0, 1)
# sym1 = enc.encode(i ^ 1, 0, 1)
# assert(enc.imbalance == 0)
# print(f"0x{sym0 | (sym1 << 10):05x}u,")
###
# Fullres 1bpp table: (each entry is 2 words, 4 pixels)
# (Note: the trick here is that encoding 0x00 or 0xff sets imbalance to -8,
# and encoding 0x01 or 0xfe returns imbalance to 0, so we alternate between
# these two pairs of dark/light colours. Creates some fairly subtle vertical
# banding, but it's cheap.)
# for i in range(1 << 4):
# syms = list(enc.encode((0xff if i & 1 << j else 0) ^ j & 0x01, 0, 1) for j in range(4))
# print(f"0x{syms[0] | syms[1] << 10:05x}, 0x{syms[2] | syms[3] << 10:05x}")
# assert(enc.imbalance == 0)
###
# Fullres table stuff:
# def disptable_format(sym):
# return sym | ((popcount(sym) * 2 - 10 & 0x3f) << 26)
# print("// Non-negative running disparity:")
# for i in range(0, 256, 4):
# enc.imbalance = 1
# print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
# print("// Negative running disparity:")
# for i in range(0, 256, 4):
# enc.imbalance = -1
# print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
###
# Control symbols:
# for i in range(4):
# sym = enc.encode(0, i, 0)
# print(f"0x{sym << 10 | sym:05x},")
###
# Find zero-balance symbols:
# for i in range(256):
# enc.imbalance = 0
# sym = enc.encode(i, 0, 1)
# if enc.imbalance == 0:
# print(f"{i:02x}: {sym:03x}")
###
# Generate 2bpp table based on above experiment:
levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]
levels_2bpp_odd = [0x04, 0x51, 0xae, 0xfb]
# Every (even, odd) level combination, with the odd level varying slowest.
level_pairs = [
    (i0, p0, i1, p1)
    for i1, p1 in enumerate(levels_2bpp_odd)
    for i0, p0 in enumerate(levels_2bpp_even)
]
for i0, p0, i1, p1 in level_pairs:
    sym0 = enc.encode(p0, 0, 1)
    sym1 = enc.encode(p1, 0, 1)
    # Each even/odd pair must leave the encoder balanced.
    assert(enc.imbalance == 0)
    print(f".word 0x{sym1 << 10 | sym0:05x} // {i0:02b}, {i1:02b}")

View file

@ -0,0 +1,83 @@
#ifndef _UTIL_QUEUE_U32_INLINE_H
#define _UTIL_QUEUE_U32_INLINE_H
// Faster versions of the functions found in pico/util/queue.h, for the common
// case of 32-bit-sized elements. Can be used on the same queue data
// structure, and mixed freely with the generic access methods, as long as
// element_size == 4.
#include "pico/util/queue.h"
#include "hardware/sync.h"
// Returns the ring-buffer index that follows `index`, wrapping to 0 once it
// would exceed element_count (the buffer has element_count + 1 elements, so
// element_count itself is a valid index).
static inline uint16_t _queue_inc_index_u32(queue_t *q, uint16_t index) {
    const uint16_t next = (uint16_t)(index + 1);
    return (next > q->element_count) ? 0 : next;
}
// Non-blocking add of one 32-bit element.
//   q    - queue whose element_size is 4
//   data - pointer to the 32-bit value to copy in; only read, hence const
// Returns true if the value was stored, false if the queue was full.
// On success an event is signalled (__sev) after the lock is released.
static inline bool queue_try_add_u32(queue_t *q, const void *data) {
    uint32_t flags = spin_lock_blocking(q->core.spin_lock);
    const bool has_room = queue_get_level_unsafe(q) != q->element_count;
    if (has_room) {
        ((uint32_t *)q->data)[q->wptr] = *(const uint32_t *)data;
        q->wptr = _queue_inc_index_u32(q, q->wptr);
    }
    spin_unlock(q->core.spin_lock, flags);
    if (has_room) __sev();
    return has_room;
}
// Non-blocking removal of one 32-bit element.
//   q    - queue whose element_size is 4
//   data - receives the removed 32-bit value
// Returns true if a value was removed, false if the queue was empty.
// On success an event is signalled (__sev) after the lock is released.
static inline bool queue_try_remove_u32(queue_t *q, void *data) {
    uint32_t flags = spin_lock_blocking(q->core.spin_lock);
    const bool has_data = queue_get_level_unsafe(q) != 0;
    if (has_data) {
        uint32_t *slots = (uint32_t *)q->data;
        *(uint32_t *)data = slots[q->rptr];
        q->rptr = _queue_inc_index_u32(q, q->rptr);
    }
    spin_unlock(q->core.spin_lock, flags);
    if (has_data) __sev();
    return has_data;
}
// Non-blocking read of the oldest 32-bit element without removing it.
//   q    - queue whose element_size is 4
//   data - receives the 32-bit value at the read position
// Returns true if a value was read, false if the queue was empty.
// The queue is left unchanged, so no event is signalled.
static inline bool queue_try_peek_u32(queue_t *q, void *data) {
    uint32_t flags = spin_lock_blocking(q->core.spin_lock);
    const bool has_data = queue_get_level_unsafe(q) != 0;
    if (has_data) {
        const uint32_t *slots = (uint32_t *)q->data;
        *(uint32_t *)data = slots[q->rptr];
    }
    spin_unlock(q->core.spin_lock, flags);
    return has_data;
}
// Adds one 32-bit element, waiting for an event (__wfe) between attempts
// until queue_try_add_u32 succeeds.
static inline void queue_add_blocking_u32(queue_t *q, void *data) {
    while (!queue_try_add_u32(q, data)) {
        __wfe();
    }
}
// Removes one 32-bit element into *data, waiting for an event (__wfe) between
// attempts until queue_try_remove_u32 succeeds.
static inline void queue_remove_blocking_u32(queue_t *q, void *data) {
    while (!queue_try_remove_u32(q, data)) {
        __wfe();
    }
}
// Copies the oldest 32-bit element into *data without removing it, waiting
// for an event (__wfe) between attempts until queue_try_peek_u32 succeeds.
static inline void queue_peek_blocking_u32(queue_t *q, void *data) {
    while (!queue_try_peek_u32(q, data)) {
        __wfe();
    }
}
#endif