Remove soft link to libdvi (copy full directory instead)

For Arduino Library Manager compliance
This commit is contained in:
Phillip Burgess 2023-03-09 15:00:54 -08:00
parent 506fca674a
commit bb7dc7c20d
20 changed files with 2661 additions and 3 deletions

View file

@ -19,8 +19,7 @@ RP2040 core).
Changes vs main PicoDVI repo:
- Add library.properties file, src and examples directories per Arduino
requirements.
- software/libdvi is soft-linked into src so Arduino IDE can compile these
parts.
- A full copy of software/libdvi is kept in src. (It was originally soft-linked, but the Arduino Library Manager does not accept that.) If any updates are made in the original PicoDVI libdvi directory, copy them here!
- The file dvi_serialiser.pio.h, normally not part of the distribution and
generated during the Pico SDK build process, is provided here for Arduino
build to work. If any changes are made in dvi_serialiser.pio (either here

View file

@ -1 +0,0 @@
../software/libdvi

33
src/libdvi/CMakeLists.txt Normal file
View file

@ -0,0 +1,33 @@
# Note we are using INTERFACE so that the library can be configured per-app
# with compile-time defines
add_library(libdvi INTERFACE)
# Sources and headers are attached to the INTERFACE target, so they are
# compiled as part of whichever target links against libdvi.
target_sources(libdvi INTERFACE
${CMAKE_CURRENT_LIST_DIR}/dvi.c
${CMAKE_CURRENT_LIST_DIR}/dvi.h
${CMAKE_CURRENT_LIST_DIR}/dvi_config_defs.h
${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.c
${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.h
${CMAKE_CURRENT_LIST_DIR}/dvi_timing.c
${CMAKE_CURRENT_LIST_DIR}/dvi_timing.h
${CMAKE_CURRENT_LIST_DIR}/tmds_encode.S
${CMAKE_CURRENT_LIST_DIR}/tmds_encode.c
${CMAKE_CURRENT_LIST_DIR}/tmds_encode.h
${CMAKE_CURRENT_LIST_DIR}/tmds_table.h
${CMAKE_CURRENT_LIST_DIR}/tmds_table_fullres.h
${CMAKE_CURRENT_LIST_DIR}/util_queue_u32_inline.h
)
# Consumers include the libdvi headers directly from this directory.
target_include_directories(libdvi INTERFACE ${CMAKE_CURRENT_LIST_DIR})
# Pico SDK libraries used by the sources listed above.
target_link_libraries(libdvi INTERFACE
pico_base_headers
pico_util
hardware_dma
hardware_interp
hardware_pio
hardware_pwm
)
# Generate the <name>.pio.h headers from the PIO assembly sources at build
# time (the Arduino build instead uses a checked-in copy of
# dvi_serialiser.pio.h).
pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.pio)
pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/tmds_encode_1bpp.pio)

255
src/libdvi/dvi.c Normal file
View file

@ -0,0 +1,255 @@
#include <stdlib.h>
#include "hardware/dma.h"
#include "hardware/irq.h"
#include "dvi.h"
#include "dvi_timing.h"
#include "dvi_serialiser.h"
#include "tmds_encode.h"
// Adafruit PicoDVI fork requires a couple global items run-time configurable.
// Both start out at the compile-time defaults from dvi_config_defs.h and may
// be overwritten by the application before the DVI output is set up.
uint8_t dvi_vertical_repeat = DVI_VERTICAL_REPEAT;
bool dvi_monochrome_tmds = DVI_MONOCHROME_TMDS;
// Time-critical functions pulled into RAM but each in a unique section to
// allow garbage collection
#define __dvi_func(f) __not_in_flash_func(f)
#define __dvi_func_x(f) __scratch_x(__STRING(f)) f
// We require exclusive use of a DMA IRQ line. (you wouldn't want to share
// anyway). It's possible in theory to hook both IRQs and have two DVI outs.
// Index 0 holds the instance served by DMA_IRQ_0, index 1 by DMA_IRQ_1.
static struct dvi_inst *dma_irq_privdata[2];
// Declared with full prototypes so a call with arguments is diagnosed.
static void dvi_dma0_irq(void);
static void dvi_dma1_irq(void);
// Set up data structures and hardware for one DVI instance.
//
// inst->timing and inst->ser_cfg are read here, so the caller must have
// filled them in beforehand. inst->scanline_callback is not touched.
//
//   spinlock_tmds_queue   - spinlock number shared by the two TMDS queues
//   spinlock_colour_queue - spinlock number shared by the two colour queues
//
// Claims two DMA channels per TMDS lane, builds the four per-scanline DMA
// lists, and malloc()s DVI_N_TMDS_BUFFERS TMDS buffers onto q_tmds_free.
// Panics if an allocation fails.
void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue) {
	dvi_timing_state_init(&inst->timing_state);
	dvi_serialiser_init(&inst->ser_cfg);

	// One control + one data channel per lane, paced by that lane's PIO TX FIFO
	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
		struct dvi_lane_dma_cfg *lane_dma = &inst->dma_cfg[lane];
		lane_dma->chan_ctrl = dma_claim_unused_channel(true);
		lane_dma->chan_data = dma_claim_unused_channel(true);
		lane_dma->tx_fifo = (void*)&inst->ser_cfg.pio->txf[inst->ser_cfg.sm_tmds[lane]];
		lane_dma->dreq = pio_get_dreq(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[lane], true);
	}

	// Nothing late, nothing pending release
	inst->late_scanline_ctr = 0;
	inst->tmds_buf_release_next = NULL;
	inst->tmds_buf_release = NULL;

	// All four queues carry pointers, eight entries deep
	queue_init_with_spinlock(&inst->q_tmds_valid, sizeof(void*), 8, spinlock_tmds_queue);
	queue_init_with_spinlock(&inst->q_tmds_free, sizeof(void*), 8, spinlock_tmds_queue);
	queue_init_with_spinlock(&inst->q_colour_valid, sizeof(void*), 8, spinlock_colour_queue);
	queue_init_with_spinlock(&inst->q_colour_free, sizeof(void*), 8, spinlock_colour_queue);

	// Blanking lists (with and without vsync), then the active-line list
	// (its data pointer is patched per scanline in the IRQ) and the error
	// list used when no TMDS buffer is available.
	dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, true, &inst->dma_list_vblank_sync);
	dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, false, &inst->dma_list_vblank_nosync);
#if defined(ARDUINO)
	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (uint32_t*)SRAM_BASE, &inst->dma_list_active);
#else
	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (void*)SRAM_BASE, &inst->dma_list_active);
#endif
	dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, NULL, &inst->dma_list_error);

	for (int n = 0; n < DVI_N_TMDS_BUFFERS; ++n) {
		// Monochrome mode stores one lane's worth of symbols, colour stores three
		const int lanes_per_buf = dvi_monochrome_tmds ? 1 : 3;
		void *tmdsbuf = malloc(lanes_per_buf * inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t));
		if (tmdsbuf == NULL)
			panic("TMDS buffer allocation failed");
		queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
	}
}
// Route the DVI DMA interrupt to the calling core (which is why this is a
// separate call from dvi_init). irq_num selects the DMA IRQ line: DMA_IRQ_0
// uses INTE0 and slot 0 of dma_irq_privdata; any other value uses INTE1 and
// slot 1. Only the sync lane's data channel is left enabled as an interrupt
// source; the other channels owned by this instance are masked off on that
// line. An exclusive handler is installed and the IRQ is enabled.
void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num) {
	const uint32_t sync_chan_bit = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data;

	// Every control and data channel belonging to this instance
	uint32_t owned_chan_bits = 0;
	for (int lane = 0; lane < N_TMDS_LANES; ++lane) {
		owned_chan_bits |= 1u << inst->dma_cfg[lane].chan_ctrl;
		owned_chan_bits |= 1u << inst->dma_cfg[lane].chan_data;
	}

	// Drop any stale interrupt from the sync channel before hooking up
	dma_hw->ints0 = sync_chan_bit;

	if (irq_num != DMA_IRQ_0) {
		hw_write_masked(&dma_hw->inte1, sync_chan_bit, owned_chan_bits);
		dma_irq_privdata[1] = inst;
		irq_set_exclusive_handler(DMA_IRQ_1, dvi_dma1_irq);
	}
	else {
		hw_write_masked(&dma_hw->inte0, sync_chan_bit, owned_chan_bits);
		dma_irq_privdata[0] = inst;
		irq_set_exclusive_handler(DMA_IRQ_0, dvi_dma0_irq);
	}

	irq_set_enabled(irq_num, true);
}
// Set up control channels to make transfers to data channels' control
// registers (but don't trigger the control channels -- this is done either by
// data channel CHAIN_TO or an initial write to MULTI_CHAN_TRIGGER)
static inline void __attribute__((always_inline)) _dvi_load_dma_op(const struct dvi_lane_dma_cfg dma_cfg[], struct dvi_scanline_dma_list *l) {
for (int i = 0; i < N_TMDS_LANES; ++i) {
dma_channel_config cfg = dma_channel_get_default_config(dma_cfg[i].chan_ctrl);
channel_config_set_ring(&cfg, true, 4); // 16-byte write wrap
channel_config_set_read_increment(&cfg, true);
channel_config_set_write_increment(&cfg, true);
dma_channel_configure(
dma_cfg[i].chan_ctrl,
&cfg,
&dma_hw->ch[dma_cfg[i].chan_data],
dvi_lane_from_list(l, i),
4, // Configure all 4 registers then halt until next CHAIN_TO
false
);
}
}
// Setup first set of control block lists, configure the control channels, and
// trigger them. Control channels will subsequently be triggered only by DMA
// CHAIN_TO on data channel completion. IRQ handler *must* be prepared before
// calling this (see dvi_register_irqs_this_core; either DMA IRQ line works).
void dvi_start(struct dvi_inst *inst) {
	_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);

	// Start every lane's control channel with a single write so the lanes
	// begin together. Built from N_TMDS_LANES rather than hard-coded indices
	// so it stays in step with the other per-lane loops in this file.
	uint32_t ctrl_chan_mask = 0;
	for (int i = 0; i < N_TMDS_LANES; ++i)
		ctrl_chan_mask |= 1u << inst->dma_cfg[i].chan_ctrl;
	dma_start_channel_mask(ctrl_chan_mask);

	// We really don't want the FIFOs to bottom out, so wait for full before
	// starting the shift-out.
	for (int i = 0; i < N_TMDS_LANES; ++i) {
		while (!pio_sm_is_tx_fifo_full(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i]))
			tight_loop_contents();
	}

	dvi_serialiser_enable(&inst->ser_cfg, true);
}
static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *inst, uint32_t *scanbuf) {
uint32_t *tmdsbuf;
queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
uint pixwidth = inst->timing->h_active_pixels;
uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
// Scanline buffers are half-resolution; the functions take the number of *input* pixels as parameter.
tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_8BPP_BLUE_MSB, DVI_8BPP_BLUE_LSB );
tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_8BPP_GREEN_MSB, DVI_8BPP_GREEN_LSB);
tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_8BPP_RED_MSB, DVI_8BPP_RED_LSB );
queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
}
static inline void __dvi_func_x(_dvi_prepare_scanline_16bpp)(struct dvi_inst *inst, uint32_t *scanbuf) {
uint32_t *tmdsbuf;
queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
uint pixwidth = inst->timing->h_active_pixels;
uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD;
tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_16BPP_BLUE_MSB, DVI_16BPP_BLUE_LSB );
tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_16BPP_GREEN_MSB, DVI_16BPP_GREEN_LSB);
tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_16BPP_RED_MSB, DVI_16BPP_RED_LSB );
queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
}
// "Worker threads" for TMDS encoding (core enters and never returns, but still handles IRQs)
// Version where each record in q_colour_valid is one scanline:
// pop a scanline from q_colour_valid, TMDS-encode it as 8bpp, then hand the
// scanline buffer back on q_colour_free. All queue operations block.
void __dvi_func(dvi_scanbuf_main_8bpp)(struct dvi_inst *inst) {
	// (The previous line counter was never read, so it has been dropped.)
	while (1) {
		uint32_t *scanbuf;
		queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf);
		_dvi_prepare_scanline_8bpp(inst, scanbuf);
		queue_add_blocking_u32(&inst->q_colour_free, &scanbuf);
	}
	__builtin_unreachable();
}
// Ugh copy/paste but it lets us garbage collect the TMDS stuff that is not being used from .scratch_x
// 16bpp worker loop (never returns): pop a scanline from q_colour_valid,
// TMDS-encode it as 16bpp, then hand the scanline buffer back on
// q_colour_free. All queue operations block.
void __dvi_func(dvi_scanbuf_main_16bpp)(struct dvi_inst *inst) {
	// (The previous line counter was never read, so it has been dropped.)
	while (1) {
		uint32_t *scanbuf;
		queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf);
		_dvi_prepare_scanline_16bpp(inst, scanbuf);
		queue_add_blocking_u32(&inst->q_colour_free, &scanbuf);
	}
	__builtin_unreachable();
}
// Shared body of the two DMA IRQ entry points. Advances the vertical timing
// state, recycles TMDS buffers that are no longer in flight, picks the DMA
// list for the next scanline (active data, error line, or one of the two
// blanking lists) and loads it into the control channels. Also invokes
// inst->scanline_callback, if set, on the last repeat of each active line.
static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) {
// Every fourth interrupt marks the start of the horizontal active region. We
// now have until the end of this region to generate DMA blocklist for next
// scanline.
dvi_timing_state_advance(inst->timing, &inst->timing_state);
// Two-stage release: the buffer queued for release on the previous IRQ goes
// back to the free queue now; the one flagged on this pass moves up a stage.
if (inst->tmds_buf_release && !queue_try_add_u32(&inst->q_tmds_free, &inst->tmds_buf_release))
panic("TMDS free queue full in IRQ!");
inst->tmds_buf_release = inst->tmds_buf_release_next;
inst->tmds_buf_release_next = NULL;
// Make sure all three channels have definitely loaded their last block
// (should be within a few cycles of one another)
// The debug TCR value is compared against the active-region word count.
for (int i = 0; i < N_TMDS_LANES; ++i) {
while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD)
tight_loop_contents();
}
uint32_t *tmdsbuf;
// Catch up after missed lines: throw away one valid buffer per late line.
while (inst->late_scanline_ctr > 0 && queue_try_remove_u32(&inst->q_tmds_valid, &tmdsbuf)) {
// If we displayed this buffer then it would be in the wrong vertical
// position on-screen. Just pass it back.
queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf);
--inst->late_scanline_ctr;
}
// NOTE(review): dvi_vertical_repeat is a run-time variable and is used as a
// modulus below; a value of 0 would divide by zero -- confirm callers never
// set it to 0.
if (inst->timing_state.v_state != DVI_STATE_ACTIVE) {
// Don't care
tmdsbuf = NULL;
}
else if (queue_try_peek_u32(&inst->q_tmds_valid, &tmdsbuf)) {
// Buffer stays at the head of the queue while it is being repeated; it is
// only dequeued (and scheduled for release) on its final repeat.
if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) {
queue_remove_blocking_u32(&inst->q_tmds_valid, &tmdsbuf);
inst->tmds_buf_release_next = tmdsbuf;
}
}
else {
// No valid scanline was ready (generates solid red scanline)
tmdsbuf = NULL;
// Count the miss once per source line, not once per repeat.
if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1)
++inst->late_scanline_ctr;
}
switch (inst->timing_state.v_state) {
case DVI_STATE_ACTIVE:
if (tmdsbuf) {
// Patch the active list to read from this buffer, then load it.
dvi_update_scanline_data_dma(inst->timing, tmdsbuf, &inst->dma_list_active);
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_active);
}
else {
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_error);
}
// User hook, run in IRQ context on the last repeat of each active line.
if (inst->scanline_callback && inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) {
inst->scanline_callback();
}
break;
case DVI_STATE_SYNC:
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_sync);
break;
default:
// Front porch and back porch: blanking without vsync.
_dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync);
break;
}
}
// DMA_IRQ_0 entry point: acknowledge the sync lane's data-channel interrupt
// on INTS0, then run the shared handler for the instance registered in slot 0.
// Declared (void) so the definition carries a full prototype.
static void __dvi_func(dvi_dma0_irq)(void) {
	struct dvi_inst *inst = dma_irq_privdata[0];
	dma_hw->ints0 = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data;
	dvi_dma_irq_handler(inst);
}
// DMA_IRQ_1 entry point: acknowledge the sync lane's data-channel interrupt
// on INTS1, then run the shared handler for the instance registered in slot 1.
// Declared (void) so the definition carries a full prototype.
static void __dvi_func(dvi_dma1_irq)(void) {
	struct dvi_inst *inst = dma_irq_privdata[1];
	dma_hw->ints1 = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data;
	dvi_dma_irq_handler(inst);
}

81
src/libdvi/dvi.h Normal file
View file

@ -0,0 +1,81 @@
#ifndef _DVI_H
#define _DVI_H
#define N_TMDS_LANES 3
#define TMDS_SYNC_LANE 0 // blue!
#include "pico/util/queue.h"
#include "dvi_config_defs.h"
#include "dvi_timing.h"
#include "dvi_serialiser.h"
#include "util_queue_u32_inline.h"
// Signature of the optional per-scanline user hook (no arguments, no result).
typedef void (*dvi_callback_t)(void);
// All configuration and run-time state for one DVI output.
struct dvi_inst {
// Config ---
// timing and ser_cfg are read by dvi_init(), so set them before calling it.
const struct dvi_timing *timing;
// Per-lane DMA channel numbers, FIFO address and DREQ; filled in by dvi_init().
struct dvi_lane_dma_cfg dma_cfg[N_TMDS_LANES];
struct dvi_timing_state timing_state;
struct dvi_serialiser_cfg ser_cfg;
// Called in the DMA IRQ once per scanline -- careful with the run time!
// May be NULL (checked before the call). dvi_init() does not reset it.
dvi_callback_t scanline_callback;
// State ---
// Prebuilt DMA control-block lists, one per kind of scanline.
struct dvi_scanline_dma_list dma_list_vblank_sync;
struct dvi_scanline_dma_list dma_list_vblank_nosync;
struct dvi_scanline_dma_list dma_list_active;
struct dvi_scanline_dma_list dma_list_error;
// After a TMDS buffer has been enqueued via a control block for the last
// time, two IRQs must go by before freeing. The first indicates the control
// block for this buf has been loaded, and the second occurs some time after
// the actual data DMA transfer has completed.
uint32_t *tmds_buf_release_next;
uint32_t *tmds_buf_release;
// Remember how far behind the source is on TMDS scanlines, so we can output
// solid colour until they catch up (rather than dying spectacularly)
uint late_scanline_ctr;
// Encoded scanlines:
queue_t q_tmds_valid;
queue_t q_tmds_free;
// Either scanline buffers or frame buffers:
queue_t q_colour_valid;
queue_t q_colour_free;
};
#if defined(__cplusplus)
extern "C"
{
#endif
// Set up data structures and hardware for DVI.
// The two spinlock arguments are the spinlock numbers used for the TMDS
// queue pair and the colour queue pair respectively.
void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue);
// Call this after calling dvi_init(). DVI DMA interrupts will be routed to
// whichever core called this function. Registers an exclusive IRQ handler.
// irq_num is DMA_IRQ_0 or DMA_IRQ_1.
void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num);
// Start actually wiggling TMDS pairs. Call this once you have initialised the
// DVI, have registered the IRQs, and are producing rendered scanlines.
void dvi_start(struct dvi_inst *inst);
// TMDS encode worker function: core enters and doesn't leave, but still
// responds to IRQs. Repeatedly pop a scanline buffer from q_colour_valid,
// TMDS encode it, and pass it to the tmds valid queue.
void dvi_scanbuf_main_8bpp(struct dvi_inst *inst);
void dvi_scanbuf_main_16bpp(struct dvi_inst *inst);
// Same as above, but each q_colour_valid entry is a framebuffer
// NOTE(review): dvi.c in this directory does not appear to define the two
// framebuf functions below -- confirm they are implemented elsewhere before
// relying on them.
void dvi_framebuf_main_8bpp(struct dvi_inst *inst);
void dvi_framebuf_main_16bpp(struct dvi_inst *inst);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,151 @@
#ifndef _DVI_CONFIG_DEFS_H
#define _DVI_CONFIG_DEFS_H
// Compile-time configuration definitions for libdvi. This file provides
// defaults -- you can override using a board header, or setting compile
// definitions directly from the commandline (e.g. using CMake
// target_compile_definitions())
// Pull in base headers to make sure board definitions override the
// definitions provided here. Note this file is included in asm and C.
#include "hardware/platform_defs.h"
#include "pico/config.h"
// ----------------------------------------------------------------------------
// General DVI defines
// How many times to output the same TMDS buffer before recycling it onto the
// free queue. Pixels are repeated vertically if this is >1. In this fork the
// value is only the initial setting of the run-time variable
// dvi_vertical_repeat (see dvi.c).
#ifndef DVI_VERTICAL_REPEAT
#define DVI_VERTICAL_REPEAT 2
#endif
// Number of TMDS buffers to allocate (malloc()) in DVI init. You can set this
// to 0 if you want to allocate your own (e.g. if you want static buffers)
#ifndef DVI_N_TMDS_BUFFERS
#define DVI_N_TMDS_BUFFERS 3
#endif
// If 1, replace the DVI serialiser with a 10n1 UART (1 start bit, 10 data
// bits, 1 stop bit) so the stream can be dumped and analysed easily.
#ifndef DVI_SERIAL_DEBUG
#define DVI_SERIAL_DEBUG 0
#endif
// If 1, the same TMDS symbols are sent to all 3 lanes during the horizontal
// active period. This means only monochrome colour is available, but the TMDS
// buffers are 3 times smaller as a result, and the performance requirements
// for encode are also cut by 3.
#ifndef DVI_MONOCHROME_TMDS
#define DVI_MONOCHROME_TMDS 0
#endif
// By default, we assume each 32-bit word written to a PIO FIFO contains 2x
// 10-bit TMDS symbols, concatenated into the lower 20 bits, least-significant
// first. This is convenient if you are generating two or more pixels at once,
// e.g. using the pixel-doubling TMDS encode. You can change this value to 1
// (so each word contains 1 symbol) for e.g. full resolution RGB encode. Note
// that this value needs to divide the DVI horizontal timings, so is limited
// to 1 or 2.
#ifndef DVI_SYMBOLS_PER_WORD
#define DVI_SYMBOLS_PER_WORD 2
#endif
#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2
#error "Unsupported value for DVI_SYMBOLS_PER_WORD"
#endif
// ----------------------------------------------------------------------------
// Pixel component layout
// By default we go R, G, B from MSB -> LSB. Override to e.g. swap RGB <-> BGR
// Default 8bpp layout: RGB332, {r[2:0], g[2:0], b[1:0]}
// (red in bits 7..5, green in bits 4..2, blue in bits 1..0)
#ifndef DVI_8BPP_RED_MSB
#define DVI_8BPP_RED_MSB 7
#endif
#ifndef DVI_8BPP_RED_LSB
#define DVI_8BPP_RED_LSB 5
#endif
#ifndef DVI_8BPP_GREEN_MSB
#define DVI_8BPP_GREEN_MSB 4
#endif
#ifndef DVI_8BPP_GREEN_LSB
#define DVI_8BPP_GREEN_LSB 2
#endif
#ifndef DVI_8BPP_BLUE_MSB
#define DVI_8BPP_BLUE_MSB 1
#endif
#ifndef DVI_8BPP_BLUE_LSB
#define DVI_8BPP_BLUE_LSB 0
#endif
// Default 16bpp layout: RGB565, {r[4:0], g[5:0], b[4:0]}
#ifndef DVI_16BPP_RED_MSB
#define DVI_16BPP_RED_MSB 15
#endif
#ifndef DVI_16BPP_RED_LSB
#define DVI_16BPP_RED_LSB 11
#endif
#ifndef DVI_16BPP_GREEN_MSB
#define DVI_16BPP_GREEN_MSB 10
#endif
#ifndef DVI_16BPP_GREEN_LSB
#define DVI_16BPP_GREEN_LSB 5
#endif
#ifndef DVI_16BPP_BLUE_MSB
#define DVI_16BPP_BLUE_MSB 4
#endif
#ifndef DVI_16BPP_BLUE_LSB
#define DVI_16BPP_BLUE_LSB 0
#endif
// Default 1bpp layout: bitwise little-endian, i.e. least significant bit of
// each word is the first (leftmost) of a block of 32 pixels.
// If 1, reverse the order of pixels within each byte. Order of bytes within
// each word is still little-endian.
#ifndef DVI_1BPP_BIT_REVERSE
#define DVI_1BPP_BIT_REVERSE 1 // Adafruit_GFX GFXcanvas1 requires this 1
#endif
// ----------------------------------------------------------------------------
// TMDS encode controls
// Number of TMDS loop bodies between branches. cmp + branch costs 3 cycles,
// so you can easily save 10% of encode time by bumping this. Note that body
// will *already* produce multiple pixels, and total symbols per iteration
// must cleanly divide symbols per scanline, else the loop won't terminate.
// Point gun away from foot.
#ifndef TMDS_ENCODE_UNROLL
#define TMDS_ENCODE_UNROLL 1
#endif
// If 1, don't save/restore the interpolators on full-resolution TMDS encode.
// Speed hack. The TMDS code uses both interpolators, for each of the 3 data
// channels, so this define avoids 6 save/restores per scanline.
#ifndef TMDS_FULLRES_NO_INTERP_SAVE
#define TMDS_FULLRES_NO_INTERP_SAVE 0
#endif
// If 1, don't DC-balance the output of full resolution encode. Hilariously
// noncompliant, but Dell Ultrasharp -- the honey badger of computer monitors
// -- does not seem to mind (it helps that we DC-couple). Another speed hack,
// useful when you are trying to get everything else up to speed.
#ifndef TMDS_FULLRES_NO_DC_BALANCE
#define TMDS_FULLRES_NO_DC_BALANCE 0
#endif
#endif

View file

@ -0,0 +1,73 @@
#include "pico.h"
#include "hardware/pio.h"
#include "hardware/gpio.h"
#include "hardware/pwm.h"
#include "hardware/structs/padsbank0.h"
#include "dvi.h"
#include "dvi_serialiser.h"
#include "dvi_serialiser.pio.h"
// Configure the pad of one TMDS/clock GPIO and select whether its output is
// inverted (used to swap the two halves of a differential pair).
static void dvi_configure_pad(uint gpio, bool invert) {
	// 2 mA drive, enable slew rate limiting (this seems fine even at 720p30, and
	// the 3V3 LDO doesn't get warm like when turning all the GPIOs up to 11).
	// Also disable digital receiver.
	// All three fields are written as zero: lowest drive setting, SLEWFAST
	// clear, input enable clear.
	const uint32_t pad_fields =
		PADS_BANK0_GPIO0_DRIVE_BITS | PADS_BANK0_GPIO0_SLEWFAST_BITS | PADS_BANK0_GPIO0_IE_BITS;
	const uint32_t pad_value = (0 << PADS_BANK0_GPIO0_DRIVE_LSB);
	hw_write_masked(&padsbank0_hw->io[gpio], pad_value, pad_fields);

	if (invert)
		gpio_set_outover(gpio, GPIO_OVERRIDE_INVERT);
	else
		gpio_set_outover(gpio, GPIO_OVERRIDE_NORMAL);
}
void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg) {
#if DVI_SERIAL_DEBUG
uint offset = pio_add_program(cfg->pio, &dvi_serialiser_debug_program);
#else
uint offset = pio_add_program(cfg->pio, &dvi_serialiser_program);
#endif
cfg->prog_offs = offset;
for (int i = 0; i < N_TMDS_LANES; ++i) {
pio_sm_claim(cfg->pio, cfg->sm_tmds[i]);
dvi_serialiser_program_init(
cfg->pio,
cfg->sm_tmds[i],
offset,
cfg->pins_tmds[i],
DVI_SERIAL_DEBUG
);
dvi_configure_pad(cfg->pins_tmds[i], cfg->invert_diffpairs);
dvi_configure_pad(cfg->pins_tmds[i] + 1, cfg->invert_diffpairs);
}
// Use a PWM slice to drive the pixel clock. Both GPIOs must be on the same
// slice (lower-numbered GPIO must be even).
assert(cfg->pins_clk % 2 == 0);
uint slice = pwm_gpio_to_slice_num(cfg->pins_clk);
// 5 cycles high, 5 low. Invert one channel so that we get complementary outputs.
pwm_config pwm_cfg = pwm_get_default_config();
pwm_config_set_output_polarity(&pwm_cfg, true, false);
pwm_config_set_wrap(&pwm_cfg, 9);
pwm_init(slice, &pwm_cfg, false);
pwm_set_both_levels(slice, 5, 5);
for (uint i = cfg->pins_clk; i <= cfg->pins_clk + 1; ++i) {
gpio_set_function(i, GPIO_FUNC_PWM);
dvi_configure_pad(i, cfg->invert_diffpairs);
}
}
// Start or stop the serialiser: sets or clears the enable bits of all TMDS
// state machines with a single register write, then switches the pixel-clock
// PWM slice on or off to match.
void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable) {
	uint sm_enable_bits = 0;
	for (int lane = 0; lane < N_TMDS_LANES; ++lane)
		sm_enable_bits |= 1u << (cfg->sm_tmds[lane] + PIO_CTRL_SM_ENABLE_LSB);

	if (enable)
		hw_set_bits(&cfg->pio->ctrl, sm_enable_bits);
	else
		hw_clear_bits(&cfg->pio->ctrl, sm_enable_bits);

	pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), enable);
}

View file

@ -0,0 +1,22 @@
#ifndef _DVI_SERIALISER_H
#define _DVI_SERIALISER_H
#include "hardware/pio.h"
#include "dvi_config_defs.h"
#define N_TMDS_LANES 3
// Pin and PIO assignment for the TMDS serialiser. Everything except prog_offs
// is supplied by the caller before dvi_serialiser_init().
struct dvi_serialiser_cfg {
// PIO block that runs the serialiser program
PIO pio;
// State machine index used for each TMDS lane
uint sm_tmds[N_TMDS_LANES];
// Lower GPIO of each lane's pin pair (the lane also uses the next GPIO up)
uint pins_tmds[N_TMDS_LANES];
// Lower GPIO of the clock pin pair; must be even (asserted in init)
uint pins_clk;
// If true, the output of every data and clock pin is inverted
bool invert_diffpairs;
// Written by dvi_serialiser_init(): offset the PIO program was loaded at
uint prog_offs;
};
// These functions are implemented in C. dvi.h includes this header outside of
// its own extern "C" block, so give the declarations C linkage here to keep
// them callable from C++ (e.g. Arduino sketches).
#if defined(__cplusplus)
extern "C"
{
#endif
// Load the PIO program, claim/configure the state machines and pins described
// by cfg, and set up the pixel-clock PWM slice. Does not start output.
void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg);
// Enable or disable all TMDS state machines together with the pixel clock.
void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable);
// NOTE(review): no definition is visible in dvi_serialiser.c -- confirm this
// is implemented elsewhere before calling it.
uint32_t dvi_single_to_diff(uint32_t in);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,53 @@
.program dvi_serialiser
.side_set 2
.origin 0
; Single-ended -> differential serial
; Each instruction shifts one bit out of the OSR into the program counter, so
; the data bit itself selects which of the two instructions runs next:
; a 0 bit lands on address 0 (side-set 0b10), a 1 bit on address 1 (side-set
; 0b01). The two side-set pins therefore always carry complementary levels.
; This only works with the program at absolute address 0, hence .origin 0.
out pc, 1 side 0b10
out pc, 1 side 0b01
.program dvi_serialiser_debug
.side_set 1 opt
; The debug variant behaves as a UART with 1 start bit, 10 data bits, 1 stop
; bit, and 5/6ths the data throughput of the TMDS version.
; (12 instructions per 10 data bits: the pull drives the line high as the stop
; bit, the nop drives it low as the start bit, then ten single-bit outs.)
pull ifempty side 1 ; Extend stop bit with FIFO stall
nop side 0
out pins, 1 ; Unrolled because we require 1 bit / clk
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
out pins, 1
% c-sdk {
#include "dvi_config_defs.h"
// Configure state machine sm of pio to run the serialiser loaded at offset,
// driving GPIOs data_pins and data_pins + 1. With debug set, the UART-style
// debug program is configured instead. The state machine is left disabled.
// (This text is copied verbatim into the generated dvi_serialiser.pio.h; the
// checked-in copy of that header must be kept in step with any change here.)
static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
// Preset the pair to 0b10 (upper pin high, lower pin low) and make both outputs
pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
pio_gpio_init(pio, data_pins);
pio_gpio_init(pio, data_pins + 1);
pio_sm_config c;
if (debug) {
c = dvi_serialiser_debug_program_get_default_config(offset);
}
else {
c = dvi_serialiser_program_get_default_config(offset);
}
sm_config_set_sideset_pins(&c, data_pins);
// Only the debug program uses "out pins"
if (debug)
sm_config_set_out_pins(&c, data_pins, 1);
// Shift right; autopull (non-debug only, the debug program pulls explicitly)
// after 10 bits per symbol times the number of symbols packed in each word
sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
// RX FIFO is unused, so give its storage to TX
sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
pio_sm_init(pio, sm, offset, &c);
pio_sm_set_enabled(pio, sm, false);
}
%}

View file

@ -0,0 +1,101 @@
// -------------------------------------------------- //
// This file is autogenerated by pioasm; do not edit! //
// -------------------------------------------------- //
#pragma once
#if !PICO_NO_HARDWARE
#include "hardware/pio.h"
#endif
// -------------- //
// dvi_serialiser //
// -------------- //
#define dvi_serialiser_wrap_target 0
#define dvi_serialiser_wrap 1
static const uint16_t dvi_serialiser_program_instructions[] = {
// .wrap_target
0x70a1, // 0: out pc, 1 side 2
0x68a1, // 1: out pc, 1 side 1
// .wrap
};
#if !PICO_NO_HARDWARE
static const struct pio_program dvi_serialiser_program = {
.instructions = dvi_serialiser_program_instructions,
.length = 2,
.origin = 0,
};
static inline pio_sm_config dvi_serialiser_program_get_default_config(uint offset) {
pio_sm_config c = pio_get_default_sm_config();
sm_config_set_wrap(&c, offset + dvi_serialiser_wrap_target, offset + dvi_serialiser_wrap);
sm_config_set_sideset(&c, 2, false, false);
return c;
}
#endif
// -------------------- //
// dvi_serialiser_debug //
// -------------------- //
#define dvi_serialiser_debug_wrap_target 0
#define dvi_serialiser_debug_wrap 11
static const uint16_t dvi_serialiser_debug_program_instructions[] = {
// .wrap_target
0x98e0, // 0: pull ifempty block side 1
0xb042, // 1: nop side 0
0x6001, // 2: out pins, 1
0x6001, // 3: out pins, 1
0x6001, // 4: out pins, 1
0x6001, // 5: out pins, 1
0x6001, // 6: out pins, 1
0x6001, // 7: out pins, 1
0x6001, // 8: out pins, 1
0x6001, // 9: out pins, 1
0x6001, // 10: out pins, 1
0x6001, // 11: out pins, 1
// .wrap
};
#if !PICO_NO_HARDWARE
static const struct pio_program dvi_serialiser_debug_program = {
.instructions = dvi_serialiser_debug_program_instructions,
.length = 12,
.origin = -1,
};
static inline pio_sm_config dvi_serialiser_debug_program_get_default_config(uint offset) {
pio_sm_config c = pio_get_default_sm_config();
sm_config_set_wrap(&c, offset + dvi_serialiser_debug_wrap_target, offset + dvi_serialiser_debug_wrap);
sm_config_set_sideset(&c, 2, true, false);
return c;
}
#include "dvi_config_defs.h"
static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) {
pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins);
pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins);
pio_gpio_init(pio, data_pins);
pio_gpio_init(pio, data_pins + 1);
pio_sm_config c;
if (debug) {
c = dvi_serialiser_debug_program_get_default_config(offset);
}
else {
c = dvi_serialiser_program_get_default_config(offset);
}
sm_config_set_sideset_pins(&c, data_pins);
if (debug)
sm_config_set_out_pins(&c, data_pins, 1);
sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD);
sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX);
pio_sm_init(pio, sm, offset, &c);
pio_sm_set_enabled(pio, sm, false);
}
#endif

324
src/libdvi/dvi_timing.c Normal file
View file

@ -0,0 +1,324 @@
#include "dvi.h"
#include "dvi_timing.h"
#include "hardware/dma.h"
// This file contains:
// - Timing parameters for DVI modes (horizontal + vertical counts, best
// achievable bit clock from 12 MHz crystal)
// - Helper functions for generating DMA lists based on these timings
extern bool dvi_monochrome_tmds; // In dvi.c
// Pull into RAM but apply unique section suffix to allow linker GC
#define __dvi_func(x) __not_in_flash_func(x)
#define __dvi_const(x) __not_in_flash_func(x)
// VGA -- we do this mode properly, with a pretty comfortable clk_sys (252 MHz)
// Totals from the fields below: 800 pixels per line (16+96+48+640) and 525
// lines per frame (10+2+33+480). At 10 TMDS bits per pixel the 252 MHz bit
// clock is a 25.2 MHz pixel clock, i.e. 60.0 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_640x480p_60hz) = {
.h_sync_polarity = false,
.h_front_porch = 16,
.h_sync_width = 96,
.h_back_porch = 48,
.h_active_pixels = 640,
.v_sync_polarity = false,
.v_front_porch = 10,
.v_sync_width = 2,
.v_back_porch = 33,
.v_active_lines = 480,
.bit_clk_khz = 252000
};
// SVGA -- completely by-the-book but requires 400 MHz clk_sys
// Totals from the fields below: 1060 pixels per line (44+128+88+800) and 628
// lines per frame (1+4+23+600). At 10 TMDS bits per pixel the 400 MHz bit
// clock is a 40 MHz pixel clock, i.e. about 60.1 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_800x600p_60hz) = {
.h_sync_polarity = false,
.h_front_porch = 44,
.h_sync_width = 128,
.h_back_porch = 88,
.h_active_pixels = 800,
.v_sync_polarity = false,
.v_front_porch = 1,
.v_sync_width = 4,
.v_back_porch = 23,
.v_active_lines = 600,
.bit_clk_khz = 400000
};
// 800x480p 60 Hz (note this doesn't seem to be a CEA mode, I just used the
// output of `cvt 800 480 60`), 295 MHz bit clock
// Totals from the fields below: 992 pixels per line (24+72+96+800) and 500
// lines per frame (3+10+7+480). At 10 TMDS bits per pixel the 295.2 MHz bit
// clock is a 29.52 MHz pixel clock, i.e. about 59.5 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_800x480p_60hz) = {
.h_sync_polarity = false,
.h_front_porch = 24,
.h_sync_width = 72,
.h_back_porch = 96,
.h_active_pixels = 800,
.v_sync_polarity = true,
.v_front_porch = 3,
.v_sync_width = 10,
.v_back_porch = 7,
.v_active_lines = 480,
.bit_clk_khz = 295200
};
// SVGA reduced blanking (355 MHz bit clock) -- valid CVT mode, less common
// than fully-blanked SVGA, but doesn't require such a high system clock
// Note bit_clk_khz below is 354000 (354 MHz), slightly under the nominal
// figure quoted above.
// Totals from the fields below: 960 pixels per line (48+32+80+800) and 618
// lines per frame (3+4+11+600). At 10 TMDS bits per pixel that is a 35.4 MHz
// pixel clock, i.e. about 59.7 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_800x600p_reduced_60hz) = {
.h_sync_polarity = true,
.h_front_porch = 48,
.h_sync_width = 32,
.h_back_porch = 80,
.h_active_pixels = 800,
.v_sync_polarity = false,
.v_front_porch = 3,
.v_sync_width = 4,
.v_back_porch = 11,
.v_active_lines = 600,
.bit_clk_khz = 354000
};
// Also known as qHD, bit uncommon, but it's a nice modest-resolution 16:9
// aspect mode. Pixel clock 37.3 MHz
// Note bit_clk_khz below is 372000, which corresponds to a 37.2 MHz pixel
// clock at 10 TMDS bits per pixel.
// Totals from the fields below: 1104 pixels per line (16+32+96+960) and 563
// lines per frame (2+6+15+540), i.e. about 59.85 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_960x540p_60hz) = {
.h_sync_polarity = true,
.h_front_porch = 16,
.h_sync_width = 32,
.h_back_porch = 96,
.h_active_pixels = 960,
.v_sync_polarity = true,
.v_front_porch = 2,
.v_sync_width = 6,
.v_back_porch = 15,
.v_active_lines = 540,
.bit_clk_khz = 372000
};
// Note this is NOT the correct 720p30 CEA mode, but rather 720p60 run at half
// pixel clock. Seems to be commonly accepted (and is a valid CVT mode). The
// actual CEA mode is the same pixel clock as 720p60 but with >50% blanking,
// which would require a clk_sys of 742 MHz!
// Totals from the fields below: 1650 pixels per line (110+40+220+1280) and
// 750 lines per frame (5+5+20+720). At 10 TMDS bits per pixel the 372 MHz bit
// clock is a 37.2 MHz pixel clock, i.e. about 30.1 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_1280x720p_30hz) = {
.h_sync_polarity = true,
.h_front_porch = 110,
.h_sync_width = 40,
.h_back_porch = 220,
.h_active_pixels = 1280,
.v_sync_polarity = true,
.v_front_porch = 5,
.v_sync_width = 5,
.v_back_porch = 20,
.v_active_lines = 720,
.bit_clk_khz = 372000
};
// Reduced-blanking (CVT) 720p. You aren't supposed to use reduced blanking
// modes below 60 Hz, but I won't tell anyone (and it works on the monitors
// I've tried). This nets a lower system clock than regular 720p30 (319 MHz)
// Totals from the fields below: 1440 pixels per line (48+32+80+1280) and 741
// lines per frame (3+5+13+720). At 10 TMDS bits per pixel the 319.2 MHz bit
// clock is a 31.92 MHz pixel clock, i.e. about 29.9 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_1280x720p_reduced_30hz) = {
.h_sync_polarity = true,
.h_front_porch = 48,
.h_sync_width = 32,
.h_back_porch = 80,
.h_active_pixels = 1280,
.v_sync_polarity = false,
.v_front_porch = 3,
.v_sync_width = 5,
.v_back_porch = 13,
.v_active_lines = 720,
.bit_clk_khz = 319200
};
// This requires a spicy 488 MHz system clock and is illegal in most countries
// (you need to have a very lucky piece of silicon to run this at 1.3 V, or
// connect an external supply and give it a bit more juice)
// Totals from the fields below: 1760 pixels per line (48+32+80+1600) and 926
// lines per frame (3+5+18+900). At 10 TMDS bits per pixel the 488 MHz bit
// clock is a 48.8 MHz pixel clock, i.e. about 29.9 frames per second.
const struct dvi_timing __dvi_const(dvi_timing_1600x900p_reduced_30hz) = {
.h_sync_polarity = true,
.h_front_porch = 48,
.h_sync_width = 32,
.h_back_porch = 80,
.h_active_pixels = 1600,
.v_sync_polarity = false,
.v_front_porch = 3,
.v_sync_width = 5,
.v_back_porch = 18,
.v_active_lines = 900,
.bit_clk_khz = 488000
};
// ----------------------------------------------------------------------------
// The DMA scheme is:
//
// - One channel transferring data to each of the three PIO state machines
// performing TMDS serialisation
//
// - One channel programming the registers of each of these data channels,
// triggered (CHAIN_TO) each time the corresponding data channel completes
//
// - Lanes 1 and 2 have one block for blanking and one for video data
//
// - Lane 0 has one block for each horizontal region (front porch, hsync, back
// porch, active)
//
// - The IRQ_QUIET flag is used to select which data block on the sync lane is
// allowed to generate an IRQ upon completion. This is the block immediately
// before the horizontal active region. The IRQ is entered at ~the same time
// as the last data transfer starts
//
// - The IRQ points the control channels at new blocklists for next scanline.
// The DMA starts the new list automatically at end-of-scanline, via
// CHAIN_TO.
//
// The horizontal active region is the longest continuous transfer, so this
// gives the most time to handle the IRQ and load new blocklists.
//
// Note a null trigger IRQ is not suitable because we get that *after* the
// last data transfer finishes, and the FIFOs bottom out very shortly
// afterward. For pure DVI (four blocks per scanline), it works ok to take
// four regular IRQs per scanline and return early from 3 of them, but this
// breaks down when you have very short scanline sections like guard bands.
// Each symbol appears twice, concatenated in one word. Note these must be in
// RAM because they see a lot of DMA traffic
//
// The table is indexed by (vsync << 1) | hsync, see get_ctrl_sym(). Each entry
// holds the same 10-bit control symbol in bits 9:0 and again in bits 19:10.
const uint32_t __dvi_const(dvi_ctrl_syms)[4] = {
0xd5354, // vsync = 0, hsync = 0: 0x354 twice
0x2acab, // vsync = 0, hsync = 1: 0x0ab twice
0x55154, // vsync = 1, hsync = 0: 0x154 twice
0xaaeab  // vsync = 1, hsync = 1: 0x2ab twice
};
// Output solid red scanline if we are given NULL for tmdsbuf
//
// One symbol pair per TMDS lane, in lane order 0, 1, 2. The pair is repeated
// across the active region by a DMA read ring (see
// dvi_setup_scanline_for_active), so each lane's pair must sit on a boundary
// matching the ring size: 4 bytes when two symbols share a word, 8 bytes
// (hence the alignment attribute) when each symbol has its own word.
#if DVI_SYMBOLS_PER_WORD == 2
// Two 10-bit symbols per word: first symbol in bits 9:0, second in bits 19:10
static uint32_t __dvi_const(empty_scanline_tmds)[3] = {
0x7fd00u, // 0x00, 0x00
0x7fd00u, // 0x00, 0x00
0xbfa01u // 0xfc, 0xfc
};
#else
// One 10-bit symbol per word, two consecutive words per lane
static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[6] = {
0x100u, 0x1ffu, // 0x00, 0x00
0x100u, 0x1ffu, // 0x00, 0x00
0x201u, 0x2feu // 0xfc, 0xfc
};
#endif
// Reset the vertical timing state machine to the first line of the front
// porch. Call once before the first dvi_timing_state_advance().
//
// t: state to initialise (must not be NULL)
void dvi_timing_state_init(struct dvi_timing_state *t) {
	t->v_ctr = 0;
	t->v_state = DVI_STATE_FRONT_PORCH;
}
void __dvi_func(dvi_timing_state_advance)(const struct dvi_timing *t, struct dvi_timing_state *s) {
s->v_ctr++;
if ((s->v_state == DVI_STATE_FRONT_PORCH && s->v_ctr == t->v_front_porch) ||
(s->v_state == DVI_STATE_SYNC && s->v_ctr == t->v_sync_width) ||
(s->v_state == DVI_STATE_BACK_PORCH && s->v_ctr == t->v_back_porch) ||
(s->v_state == DVI_STATE_ACTIVE && s->v_ctr == t->v_active_lines)) {
s->v_state = (s->v_state + 1) % DVI_STATE_COUNT;
s->v_ctr = 0;
}
}
// Zero every control block in a scanline DMA list.
//
// dma_list: list to clear (must not be NULL)
void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list) {
	// {0} is the portable zero initialiser; the empty-brace form is only
	// accepted as a compiler extension before C23.
	*dma_list = (struct dvi_scanline_dma_list){0};
}
// Return a pointer to the control-symbol word for the given sync levels.
// dvi_ctrl_syms is indexed with vsync in bit 1 and hsync in bit 0.
static const uint32_t *get_ctrl_sym(bool vsync, bool hsync) {
	unsigned int index = 0;
	if (vsync)
		index |= 2u;
	if (hsync)
		index |= 1u;
	return dvi_ctrl_syms + index;
}
// Make a sequence of paced transfers to the relevant FIFO
//
// Fills in one DMA control block describing a transfer into a lane's TX FIFO.
//
// cb:             control block to fill in
// dma_cfg:        channels, FIFO address and DREQ of the target lane
// read_addr:      source address of the data
// transfer_count: number of transfers to perform
// read_ring:      size argument passed to channel_config_set_ring() for the
//                 read address; 0 is used by callers for non-repeating data
// irq_on_finish:  true if completion of this block may raise an IRQ
static void _set_data_cb(dma_cb_t *cb, const struct dvi_lane_dma_cfg *dma_cfg,
		const void *read_addr, uint transfer_count, uint read_ring, bool irq_on_finish) {
	cb->read_addr = read_addr;
	cb->write_addr = dma_cfg->tx_fifo;
	cb->transfer_count = transfer_count;
	cb->c = dma_channel_get_default_config(dma_cfg->chan_data);
	// Second argument false: the ring applies to the read address
	channel_config_set_ring(&cb->c, false, read_ring);
	channel_config_set_dreq(&cb->c, dma_cfg->dreq);
	// Call back to control channel for reconfiguration:
	channel_config_set_chain_to(&cb->c, dma_cfg->chan_ctrl);
	// Note we never send a null trigger, so IRQ_QUIET is an IRQ suppression flag
	channel_config_set_irq_quiet(&cb->c, !irq_on_finish);
}
void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
bool vsync_asserted, struct dvi_scanline_dma_list *l) {
bool vsync = t->v_sync_polarity == vsync_asserted;
const uint32_t *sym_hsync_off = get_ctrl_sym(vsync, !t->h_sync_polarity);
const uint32_t *sym_hsync_on = get_ctrl_sym(vsync, t->h_sync_polarity);
const uint32_t *sym_no_sync = get_ctrl_sym(false, false );
dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
// The symbol table contains each control symbol *twice*, concatenated into 20 LSBs of table word, so we can always do word-repeat.
_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false);
_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on, t->h_sync_width / DVI_SYMBOLS_PER_WORD, 2, false);
_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch / DVI_SYMBOLS_PER_WORD, 2, true);
_set_data_cb(&synclist[3], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false);
for (int i = 0; i < N_TMDS_LANES; ++i) {
if (i == TMDS_SYNC_LANE)
continue;
dma_cb_t *cblist = dvi_lane_from_list(l, i);
_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false);
_set_data_cb(&cblist[1], &dma_cfg[i], sym_no_sync, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false);
}
}
void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
const uint32_t *sym_hsync_off = get_ctrl_sym(!t->v_sync_polarity, !t->h_sync_polarity);
const uint32_t *sym_hsync_on = get_ctrl_sym(!t->v_sync_polarity, t->h_sync_polarity);
const uint32_t *sym_no_sync = get_ctrl_sym(false, false );
dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE);
_set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false);
_set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on, t->h_sync_width / DVI_SYMBOLS_PER_WORD, 2, false);
_set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch / DVI_SYMBOLS_PER_WORD, 2, true);
for (int i = 0; i < N_TMDS_LANES; ++i) {
dma_cb_t *cblist = dvi_lane_from_list(l, i);
if (i != TMDS_SYNC_LANE) {
_set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,
(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false);
}
int target_block = i == TMDS_SYNC_LANE ? DVI_SYNC_LANE_CHUNKS - 1 : DVI_NOSYNC_LANE_CHUNKS - 1;
if (tmdsbuf) {
// Non-repeating DMA for the freshly-encoded TMDS buffer
_set_data_cb(&cblist[target_block], &dma_cfg[i], tmdsbuf + i * (t->h_active_pixels / DVI_SYMBOLS_PER_WORD),
t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 0, false);
}
else {
// Use read ring to repeat the correct DC-balanced symbol pair on blank scanlines (4 or 8 byte period)
_set_data_cb(&cblist[target_block], &dma_cfg[i], &empty_scanline_tmds[2 * i / DVI_SYMBOLS_PER_WORD],
t->h_active_pixels / DVI_SYMBOLS_PER_WORD, DVI_SYMBOLS_PER_WORD == 2 ? 2 : 3, false);
}
}
}
void __dvi_func(dvi_update_scanline_data_dma)(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) {
for (int i = 0; i < N_TMDS_LANES; ++i) {
const uint32_t *lane_tmdsbuf = dvi_monochrome_tmds ? tmdsbuf : tmdsbuf + i * t->h_active_pixels / DVI_SYMBOLS_PER_WORD;
if (i == TMDS_SYNC_LANE)
dvi_lane_from_list(l, i)[3].read_addr = lane_tmdsbuf;
else
dvi_lane_from_list(l, i)[1].read_addr = lane_tmdsbuf;
}
}

99
src/libdvi/dvi_timing.h Normal file
View file

@ -0,0 +1,99 @@
#ifndef _DVI_TIMING_H
#define _DVI_TIMING_H
#include "hardware/dma.h"
#include "pico/util/queue.h"
#include "dvi.h"
// Description of one display mode.
struct dvi_timing {
// Value of the hsync control bit while the horizontal sync pulse is active
bool h_sync_polarity;
// Horizontal region lengths, in pixels
uint h_front_porch;
uint h_sync_width;
uint h_back_porch;
uint h_active_pixels;
// Value of the vsync control bit while vertical sync is asserted
bool v_sync_polarity;
// Vertical region lengths; dvi_timing_state_advance() counts these one per
// call (presumably one call per scanline -- confirm against the caller)
uint v_front_porch;
uint v_sync_width;
uint v_back_porch;
uint v_active_lines;
// Bit clock in kHz
uint bit_clk_khz;
};
// Vertical region the timing state machine is currently in. The order matters:
// dvi_timing_state_advance() steps to the next enumerator and wraps using
// DVI_STATE_COUNT, and DVI_SYNC_LANE_CHUNKS is defined as DVI_STATE_COUNT.
enum dvi_line_state {
DVI_STATE_FRONT_PORCH = 0,
DVI_STATE_SYNC,
DVI_STATE_BACK_PORCH,
DVI_STATE_ACTIVE,
// Number of states, not a state itself
DVI_STATE_COUNT
};
// Position of the vertical timing state machine.
struct dvi_timing_state {
// Steps taken so far within the current region; reset to 0 on each transition
uint v_ctr;
// Region currently being generated
enum dvi_line_state v_state;
};
// This should map directly to DMA register layout, but more convenient types
// (also this really shouldn't be here... we don't have a dma_cb in the SDK
// because there are many valid formats due to aliases)
//
// One control block is four words: read address, write address, transfer
// count, then the channel configuration word.
typedef struct dma_cb {
const void *read_addr;
void *write_addr;
uint32_t transfer_count;
dma_channel_config c;
} dma_cb_t;
// The block must be exactly four words, with the config word at the same
// offset as the channel's ctrl_trig register.
static_assert(sizeof(dma_cb_t) == 4 * sizeof(uint32_t), "bad dma layout");
static_assert(__builtin_offsetof(dma_cb_t, c.ctrl) == __builtin_offsetof(dma_channel_hw_t, ctrl_trig), "bad dma layout");
// Number of DMA control blocks per scanline on the lane that carries sync
// (one per horizontal region) and on each of the other lanes (blanking, then
// active).
#define DVI_SYNC_LANE_CHUNKS DVI_STATE_COUNT
#define DVI_NOSYNC_LANE_CHUNKS 2
// All control blocks needed to output one scanline, grouped by lane. Use
// dvi_lane_from_list() to pick a lane's array by index.
struct dvi_scanline_dma_list {
dma_cb_t l0[DVI_SYNC_LANE_CHUNKS];
dma_cb_t l1[DVI_NOSYNC_LANE_CHUNKS];
dma_cb_t l2[DVI_NOSYNC_LANE_CHUNKS];
};
// Return the control block array for lane i of a scanline list. Lane 0 and
// lane 1 select l0 and l1; any other index selects l2.
static inline dma_cb_t* dvi_lane_from_list(struct dvi_scanline_dma_list *l, int i) {
	if (i == 0)
		return l->l0;
	if (i == 1)
		return l->l1;
	return l->l2;
}
// Each TMDS lane uses one DMA channel to transfer data to a PIO state
// machine, and another channel to load control blocks into this channel.
struct dvi_lane_dma_cfg {
// Channel that reprograms chan_data; data blocks chain to it on completion
uint chan_ctrl;
// Channel that moves symbols into the FIFO
uint chan_data;
// Write address of every data block for this lane
void *tx_fifo;
// Data request signal used to pace the data channel
uint dreq;
};
// Note these are already converted to pseudo-differential representation
extern const uint32_t dvi_ctrl_syms[4];

// Display modes defined in dvi_timing.c
extern const struct dvi_timing dvi_timing_640x480p_60hz;
extern const struct dvi_timing dvi_timing_800x480p_60hz;
extern const struct dvi_timing dvi_timing_800x600p_60hz;
extern const struct dvi_timing dvi_timing_960x540p_60hz;
extern const struct dvi_timing dvi_timing_1280x720p_30hz;
extern const struct dvi_timing dvi_timing_800x600p_reduced_60hz;
extern const struct dvi_timing dvi_timing_1280x720p_reduced_30hz;
// Defined alongside the modes above but previously not declared here, so it
// could not be referenced from other translation units
extern const struct dvi_timing dvi_timing_1600x900p_reduced_30hz;
// Reset a timing state to the start of the vertical front porch.
void dvi_timing_state_init(struct dvi_timing_state *t);
// Advance the vertical state by one step, moving to the next region when the
// current one is complete.
void dvi_timing_state_advance(const struct dvi_timing *t, struct dvi_timing_state *s);
// Zero all control blocks in a scanline list.
void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list);
// Fill a scanline list for a vertical blanking line; vsync_asserted selects
// whether the line lies inside the vsync pulse.
void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
bool vsync_asserted, struct dvi_scanline_dma_list *l);
// Fill a scanline list for an active video line. A NULL tmdsbuf selects a
// built-in fixed-colour pattern.
void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[],
uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
// Repoint only the pixel-data blocks of an existing active-line list at tmdsbuf.
void dvi_update_scanline_data_dma(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l);
#endif

623
src/libdvi/tmds_encode.S Normal file
View file

@ -0,0 +1,623 @@
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#include "dvi_config_defs.h"
// Offsets suitable for ldr/str (must be <= 0x7c):
// All offsets are relative to INTERP0 ACCUM0, which is the address the encode
// loops keep in a base register.
#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// PEEK2 is the name used in this file for the PEEK_FULL register
#define PEEK2_OFFS (SIO_INTERP0_PEEK_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// Add this to any offset above to reach the same register of INTERP1
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
// word-addressed space... almost as though it were intentional! :)
.syntax unified
.cpu cortex-m0plus
.thumb
// Open a global Thumb function called name, placed in its own subsection of
// .scratch_x so each function can be kept or discarded independently.
.macro decl_func_x name
.section .scratch_x.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// Same as decl_func_x, but the function goes into .scratch_y.
.macro decl_func_y name
.section .scratch_y.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm
// Functions that do not need a per-core copy default to scratch X
#define decl_func decl_func_x
// ----------------------------------------------------------------------------
// Pixel-doubling encoders for RGB
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// Encode one input word through the interpolator at r_ibase.
//   r_ibase:  register holding the INTERP0 ACCUM0 address
//   r_inout0: in: raw input word. out: word loaded from the lane 0 address
//   r_out1:   out: word loaded from the lane 1 address
// The input word is written to ACCUM0; each PEEK register then yields an
// address, and the word stored at that address is the result.
.macro do_channel_16bpp r_ibase r_inout0 r_out1
str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
ldr \r_inout0, [\r_inout0]
ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
ldr \r_out1, [\r_out1]
.endm
decl_func tmds_encode_loop_16bpp
push {r4, r5, r6, r7, lr}
// ip = end of output = r1 + 4 bytes per input pixel
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
// Two input words (four pixels) in, four output words out
ldmia r0!, {r4, r6}
do_channel_16bpp r2, r4, r5
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Exact-equality test: the pixel count must be a multiple of
// 4 * TMDS_ENCODE_UNROLL or r1 steps past ip without terminating
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// Same as above, but scale data to make up for lack of left shift
// in interpolator (costs 1 cycle per 2 pixels)
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
decl_func tmds_encode_loop_16bpp_leftshift
push {r4, r5, r6, r7, lr}
// ip = end of output = r1 + 4 bytes per input pixel
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
// Two input words (four pixels) in, four output words out. Each input word
// is shifted left by r3 before it is handed to the interpolator.
ldmia r0!, {r4, r6}
lsls r4, r3
do_channel_16bpp r2, r4, r5
lsls r6, r3
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Exact-equality test: the pixel count must be a multiple of
// 4 * TMDS_ENCODE_UNROLL or r1 steps past ip without terminating
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
decl_func tmds_encode_loop_8bpp
push {r4, r5, r6, r7, lr}
// ip = end of output = r1 + 4 bytes per input pixel
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
// One input word (four 8-bit pixels) in, four output words out. The same
// word is given to both interpolators; each contributes two results.
ldmia r0!, {r4}
str r4, [r2, #ACCUM0_OFFS + INTERP1]
str r4, [r2, #ACCUM0_OFFS]
// INTERP0 lanes 0 and 1
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r5, [r2, #PEEK1_OFFS]
ldr r5, [r5]
// INTERP1 lanes 0 and 1
ldr r6, [r2, #PEEK0_OFFS + INTERP1]
ldr r6, [r6]
ldr r7, [r2, #PEEK1_OFFS + INTERP1]
ldr r7, [r7]
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Exact-equality test: the pixel count must be a multiple of
// 4 * TMDS_ENCODE_UNROLL or r1 steps past ip without terminating
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
decl_func tmds_encode_loop_8bpp_leftshift
push {r4, r5, r6, r7, lr}
// ip = end of output = r1 + 4 bytes per input pixel. Each pass below reads
// one input word (four pixels) and stores four output words, exactly like
// tmds_encode_loop_8bpp, so the scale factor must match it. This used to be
// "lsls r2, #3", which placed the end pointer twice as far away as the pixel
// count in r2 implies and made the loop read and write twice the intended
// amount of data.
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
// INTERP1 gets the unshifted word, INTERP0 the word shifted left by r3
str r4, [r2, #ACCUM0_OFFS + INTERP1]
lsls r4, r3
str r4, [r2, #ACCUM0_OFFS]
// INTERP0 lanes 0 and 1
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r5, [r2, #PEEK1_OFFS]
ldr r5, [r5]
// INTERP1 lanes 0 and 1
ldr r6, [r2, #PEEK0_OFFS + INTERP1]
ldr r6, [r6]
ldr r7, [r2, #PEEK1_OFFS + INTERP1]
ldr r7, [r7]
stmia r1!, {r4, r5, r6, r7}
.endr
2:
// Exact-equality test: the pixel count must be a multiple of
// 4 * TMDS_ENCODE_UNROLL or r1 steps past ip without terminating
cmp r1, ip
bne 1b
pop {r4, r5, r6, r7, pc}
// ----------------------------------------------------------------------------
// Fast 1bpp black/white encoder (full res)
// Taking the encoder from DVI spec, with initial balance 0:
//
// - Encoding either 0x00 or 0xff will produce a running balance of -8, with
// output symbol of 0x100 or 0x200
//
// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
// output symbol of 0x1ff or 0x2ff
//
// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
// colour bit. If we process pixels in even-sized blocks, only the colour
// lookup is needed.
// Encode 8 pixels @ 1bpp (using two table lookups)
// r3 contains lookup mask (preshifted)
// r8 contains pointer to encode table
// 2.125 cyc/pix
//
// r2 holds the input word. Each half takes a shift instruction and amount that
// move one 4-bit group of r2 into the position selected by the mask in r3,
// giving the byte offset of an 8-byte table entry. Two entries (four words)
// are stored to the output at r1. r4-r7 are clobbered.
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
\shift_instr0 r4, r2, #\shamt0
ands r4, r3
add r4, r8
ldmia r4, {r4, r5}
\shift_instr1 r6, r2, #\shamt1
ands r6, r3
add r6, r8
ldmia r6, {r6, r7}
stmia r1!, {r4, r5, r6, r7}
.endm
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
//
// Each pass consumes one input word (32 pixels) and stores 16 output words.
// The loop runs while r1 is below the end pointer, so a count that is not a
// multiple of 32 is rounded up and the output is overrun accordingly.
decl_func tmds_encode_1bpp
push {r4-r7, lr}
// r8 is callee-saved and cannot be pushed directly in Thumb-1, so go via r7
mov r7, r8
push {r7}
// ip = end of output = r1 + 2 bytes per output pixel
lsls r2, #1
add r2, r1
mov ip, r2
adr r4, tmds_1bpp_table
mov r8, r4
// Mask: 4 bit index, 8 bytes per entry
movs r3, #0x78
b 2f
1:
// r2 is reused for the input word now that the end pointer is in ip
ldmia r0!, {r2}
#if !DVI_1BPP_BIT_REVERSE
// Nibbles are taken from the least significant end upward
tmds_encode_1bpp_body lsls 3 lsrs 1
tmds_encode_1bpp_body lsrs 5 lsrs 9
tmds_encode_1bpp_body lsrs 13 lsrs 17
tmds_encode_1bpp_body lsrs 21 lsrs 25
#else
// Within each byte the upper nibble is taken before the lower one
tmds_encode_1bpp_body lsrs 1 lsls 3
tmds_encode_1bpp_body lsrs 9 lsrs 5
tmds_encode_1bpp_body lsrs 17 lsrs 13
tmds_encode_1bpp_body lsrs 25 lsrs 21
#endif
2:
cmp r1, ip
blo 1b
pop {r7}
mov r8, r7
pop {r4-r7, pc}
// Lookup table for tmds_encode_1bpp: 16 entries of two words, indexed by a
// 4-bit group of input pixels. Each word packs two 10-bit symbols (first in
// bits 9:0, second in bits 19:10). The words used are
//   0x7fd00 = (0x100, 0x1ff)   0x7fe00 = (0x200, 0x1ff)
//   0xbfd00 = (0x100, 0x2ff)   0xbfe00 = (0x200, 0x2ff)
// The trailing comment on each row is the 4-bit pattern it encodes.
.align 2
tmds_1bpp_table:
#if !DVI_1BPP_BIT_REVERSE
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fe00, 0x7fd00 // 0001
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfe00, 0x7fd00 // 0011
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fe00, 0x7fe00 // 0101
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfe00, 0x7fe00 // 0111
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fe00, 0xbfd00 // 1001
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfe00, 0xbfd00 // 1011
.word 0x7fd00, 0xbfe00 // 1100
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfd00, 0xbfe00 // 1110
.word 0xbfe00, 0xbfe00 // 1111
#else
// Same entries, stored at the bit-reversed index
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fd00, 0xbfe00 // 1100
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfd00, 0xbfe00 // 1110
.word 0x7fe00, 0x7fd00 // 0001
.word 0x7fe00, 0xbfd00 // 1001
.word 0x7fe00, 0x7fe00 // 0101
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfe00, 0x7fd00 // 0011
.word 0xbfe00, 0xbfd00 // 1011
.word 0xbfe00, 0x7fe00 // 0111
.word 0xbfe00, 0xbfe00 // 1111
#endif
// ----------------------------------------------------------------------------
// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
// Even-x-position pixels are encoded as symbols with imbalance -4, and odd
// pixels with +4, so that we can mix-and-match our even/odd codewords and
// always get a properly balanced sequence:
//
// level 0: (05 -> 103), then (04 -> 1fc) (decimal 5, 4)
// level 1: (50 -> 130), then (51 -> 1cf) (decimal 80, 81)
// level 2: (af -> 230), then (ae -> 2cf) (decimal 175, 174)
// level 3: (fa -> 203), then (fb -> 2fc) (decimal 250, 251)
//
// These correspond to roughly 255 times (0, 1/3, 2/3, 1).
//
// Alternatively we could use symbols with 0 balance, which results in lower
// contrast but avoids the LSB bobble:
//
// level 0: (10 -> 1f0) always
// level 1: (5a -> 263) always
// level 2: (a5 -> 163) always
// level 3: (ef -> 2f0) always
// Table base pointer in r0. Input pixels in r2.
// r3 holds the preshifted index mask. The shift instruction and amount move
// one 4-bit group (two pixels) of r2 into the masked position; the result is
// the byte offset of a table word, which is loaded into rd.
.macro encode_2bpp_body shift_instr shamt rd
\shift_instr \rd, r2, #\shamt
ands \rd, r3
ldr \rd, [r0, \rd]
.endm
// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
//
// Each pass consumes one input word (16 pixels) and stores 8 output words.
// The loop runs while r1 is below the end pointer, so a count that is not a
// multiple of 16 is rounded up and the output is overrun accordingly.
decl_func tmds_encode_2bpp
push {r4-r7, lr}
// r8 is callee-saved and cannot be pushed directly in Thumb-1, so go via r7
mov r7, r8
push {r7}
// The input pointer moves to r8 so that r0 can hold the table base, which
// the ldr in encode_2bpp_body needs in a low register
mov r8, r0
adr r0, tmds_2bpp_table
// Mask: 4-bit index into 4-byte entries.
movs r3, #0x3c
// Limit pointer: 1 word per 2 pixels
lsls r2, #1
add r2, r1
mov ip, r2
b 2f
1:
// Fetch the next input word into r2 and write the advanced pointer back to r8
mov r4, r8
ldmia r4!, {r2}
mov r8, r4
// Lowest four 4-bit groups
encode_2bpp_body lsls 2 r4
encode_2bpp_body lsrs 2 r5
encode_2bpp_body lsrs 6 r6
encode_2bpp_body lsrs 10 r7
stmia r1!, {r4-r7}
// Highest four 4-bit groups
encode_2bpp_body lsrs 14 r4
encode_2bpp_body lsrs 18 r5
encode_2bpp_body lsrs 22 r6
encode_2bpp_body lsrs 26 r7
stmia r1!, {r4-r7}
2:
cmp r1, ip
blo 1b
pop {r7}
mov r8, r7
pop {r4-r7, pc}
// Lookup table for tmds_encode_2bpp: one word per pair of 2-bit pixels. Bits
// 9:0 hold the symbol for the first (even) pixel and bits 19:10 the symbol for
// the second (odd) pixel. The trailing comment gives the two pixel levels,
// first pixel first.
.align 2
tmds_2bpp_table:
.word 0x7f103 // 00, 00
.word 0x7f130 // 01, 00
.word 0x7f230 // 10, 00
.word 0x7f203 // 11, 00
.word 0x73d03 // 00, 01
.word 0x73d30 // 01, 01
.word 0x73e30 // 10, 01
.word 0x73e03 // 11, 01
.word 0xb3d03 // 00, 10
.word 0xb3d30 // 01, 10
.word 0xb3e30 // 10, 10
.word 0xb3e03 // 11, 10
.word 0xbf103 // 00, 11
.word 0xbf130 // 01, 11
.word 0xbf230 // 10, 11
.word 0xbf203 // 11, 11
// ----------------------------------------------------------------------------
// Full-resolution RGB encode (not very practical)
// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
// taking horizontal blanking (at VGA) and dual core into account, and
// assuming the 3 channels are encoded individually.)
//
// Here is an idea
// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
// ACCUM0), concatenated with the sign bit of our running disparity (from
// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
// with the symbol's disparity stored left-justified in the upper 12 bits, as
// e.g. a 6 bit signed integer.
//
// - Load pixel data. cyc: 0.75 (ldmia 2 words, every 4 pixels)
// - Write pixel to ACCUM0. cyc: 1
// - Read address from PEEK2. cyc: 1
// - Load encoded pixel from address. cyc: 2
// - Write disparity data to ACCUM1_ADD cyc: 1
// - Write encoded data to output buffer. cyc: 1.25 (stmia 4 words, every 4 pixels)
//
// With decent register allocation we may be able to load 4 pixels at
// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
//
// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
// overflow and affect the running disparity, but with 16 zeroes in between,
// this would take much longer than one scanline, so everything is fine if
// we clear the accumulator at the start of the scanline.
//
// Note that we need to use two interpolators to get the bits from both pixels
// -- we are not outputting a single DC-balanced stream, but rather two
// interleaved streams which are each DC-balanced. This is fine electrically,
// but our output here will *NOT* match the TMDS encoder given in the DVI
// spec.
// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
// feedback. With the feedback enabled (default), the output is DC balanced,
// but there are just barely enough CPU cycles to do all the encode, so it's
// essentially a party trick. If you disable DC balancing, the performance is
// much better, and many monitors will still accept the signals as long as you
// DC couple your DVI signals.
// Encode the two pixels held in one input word.
//   ra: in: input word. out: table word fetched through INTERP0
//   rb: out: table word fetched through INTERP1
//   r2: INTERP0 ACCUM0 address
// With balance feedback enabled, each fetched table word is also added into
// ACCUM1 of the interpolator that produced it.
.macro tmds_fullres_encode_loop_body ra rb
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
//
// Function body, instantiated once per scratch bank below. One output word is
// stored per pixel. Each pass of the unrolled loop handles 64 pixels and the
// exit test is an exact pointer comparison, so the pixel count must be a
// multiple of 64.
.macro tmds_fullres_encode_loop_16bpp
push {r4-r7, lr}
// r8 is callee-saved and cannot be pushed directly in Thumb-1, so go via r4
mov r4, r8
push {r4}
// ip = end of output = r1 + 4 bytes per pixel
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if no feedback
mvns r4, r4
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
// Keep loop start pointer in r8 so we can get a longer backward branch
adr r4, 1f
adds r4, #1 // god damn thumb bit why is this a thing
mov r8, r4
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept 16
// Two input words (four pixels) in, four output words out
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body r4 r5
tmds_fullres_encode_loop_body r6 r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
cmp r1, ip
// Done: skip forward to the epilogue, otherwise jump back to the loop start
beq 1f
bx r8
1:
pop {r4}
mov r8, r4
pop {r4-r7, pc}
.endm
// One copy each in X and Y, so the two cores don't step on each other
// Both symbols share the macro body above; only the section differs.
decl_func_x tmds_fullres_encode_loop_16bpp_x
tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
tmds_fullres_encode_loop_16bpp
// As tmds_fullres_encode_loop_body, with the word given to INTERP0 shifted
// left by r3 first.
//   ra: in: input word. out: table word fetched through INTERP0
//   rb: out: table word fetched through INTERP1
//   r2: INTERP0 ACCUM0 address
//   r3: left shift amount
.macro tmds_fullres_encode_loop_body_leftshift ra rb
// Note we apply the leftshift for INTERP0 only
str \ra, [r2, #ACCUM0_OFFS + INTERP1]
lsls \ra, r3
str \ra, [r2, #ACCUM0_OFFS]
ldr \ra, [r2, #PEEK2_OFFS]
ldr \ra, [\ra]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \ra, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr \rb, [r2, #PEEK2_OFFS + INTERP1]
ldr \rb, [\rb]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// r3: Left shift amount
//
// Function body, instantiated once per scratch bank below. One output word is
// stored per pixel and the exit test is an exact pointer comparison, so the
// pixel count must be a multiple of 64.
.macro tmds_fullres_encode_loop_16bpp_leftshift
push {r4-r7, lr}
// Save r8 and r9 via low registers.
// NOTE(review): nothing in this macro or in the loop body writes r9, so
// saving and restoring it looks unnecessary -- confirm before removing.
mov r4, r8
mov r5, r9
push {r4-r5}
// ip = end of output = r1 + 4 bytes per pixel
lsls r2, #2
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if there's no balance feedback
mvns r4, r4
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
// Loop start address (with the Thumb bit set) is kept in r8 for the bx below
adr r4, 1f
adds r4, #1
mov r8, r4
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept 16 // 64 pixels per iteration
ldmia r0!, {r4, r6}
tmds_fullres_encode_loop_body_leftshift r4 r5
tmds_fullres_encode_loop_body_leftshift r6 r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
cmp r1, ip
// Done: skip forward to the epilogue, otherwise jump back to the loop start
beq 1f
bx r8
1:
pop {r4-r5}
mov r8, r4
mov r9, r5
pop {r4-r7, pc}
.endm
// One copy of the left-shifting variant in each of scratch X and scratch Y
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
tmds_fullres_encode_loop_16bpp_leftshift
// ----------------------------------------------------------------------------
// Full-resolution 8bpp paletted encode
// Variant of tmds_fullres_encode_loop_16bpp that reads
// 8-bit wide pixels packed 4 per word. The interpolator
// base is set to a reordered list of TMDS symbols based
// on a user colour palette.
// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
// interp base pointer. r7 used as temporary.
//
// The same word is given to both interpolators. The word fetched through
// INTERP0 forms the low part of the result; the word fetched through INTERP1
// is shifted up by 10 bits and ORed on top. With balance feedback enabled each
// fetched word is also added into ACCUM1 of its interpolator.
// NOTE(review): the ORed result keeps whatever the table words carry above
// bit 19 -- presumably ignored downstream; confirm against the serialiser.
.macro tmds_palette_encode_loop_body rd
str \rd, [r2, #ACCUM0_OFFS]
str \rd, [r2, #ACCUM0_OFFS + INTERP1]
ldr \rd, [r2, #PEEK2_OFFS]
ldr \rd, [\rd]
#if !TMDS_FULLRES_NO_DC_BALANCE
str \rd, [r2, #ACCUM1_ADD_OFFS]
#endif
ldr r7, [r2, #PEEK2_OFFS + INTERP1]
ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
lsls r7, #10
orrs \rd, r7
.endm
// Function body for the paletted encoder, instantiated once per scratch bank.
//   r0: input buffer, read one word (four 8-bit pixels) at a time
//   r1: output buffer
//   r2: count; the end pointer is r1 + 2 * r2 bytes, i.e. two pixels per
//       output word
// Each pass of the unrolled loop handles 80 pixels and the exit test is an
// exact pointer comparison, so the count must be a multiple of 80.
.macro tmds_palette_encode_loop
push {r4-r7, lr}
// r8 is callee-saved and cannot be pushed directly in Thumb-1, so go via r4
mov r4, r8
push {r4}
lsls r2, #1
add r2, r1
mov ip, r2
// r2 is now free and becomes the interpolator base pointer
ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
// DC balance defined to be 0 at start of scanline:
movs r4, #0
str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
// Alternate parity between odd/even symbols if there's no balance feedback
mvns r4, r4
#endif
str r4, [r2, #ACCUM1_OFFS + INTERP1]
// Keep loop start pointer in r8 so we can get a longer backward branch
adr r4, 1f
adds r4, #1 // god damn thumb bit why is this a thing
mov r8, r4
// Enter at the loop test so that a zero count stores nothing
b 2f
.align 2
1:
.rept 10
// Two input words (eight pixels) in, four output words out
ldmia r0!, {r3, r5}
// Split each word into two registers with one pixel pair at bits 17:2:
// r4 = upper pair of r3, r3 = lower pair; likewise r6 and r5
lsrs r4, r3, #14
lsls r3, #2
lsrs r6, r5, #14
lsls r5, #2
tmds_palette_encode_loop_body r3
tmds_palette_encode_loop_body r4
tmds_palette_encode_loop_body r5
tmds_palette_encode_loop_body r6
stmia r1!, {r3, r4, r5, r6}
.endr
2:
cmp r1, ip
// Done: skip forward to the epilogue, otherwise jump back to the loop start
beq 1f
bx r8
1:
pop {r4}
mov r8, r4
pop {r4-r7, pc}
.endm
// One copy of the paletted encoder in each of scratch X and scratch Y
decl_func_x tmds_palette_encode_loop_x
tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
tmds_palette_encode_loop

305
src/libdvi/tmds_encode.c Normal file
View file

@ -0,0 +1,305 @@
#include "hardware/interp.h"
#include "tmds_encode.h"
#include "hardware/gpio.h"
#include "hardware/sync.h"
// Lookup table used by the pixel-doubling encoders. Placed in scratch X; the
// contents come from the included header.
static const uint32_t __scratch_x("tmds_table") tmds_table[] = {
#include "tmds_table.h"
};
// Fullres table is bandwidth-critical, so gets one copy for each scratch
// memory. There is a third copy which can go in flash, because it's just used
// to generate palette LUTs. The ones we don't use will get garbage collected
// during linking.
// Both arrays below are built from the same header and differ only in the
// scratch bank they are placed in.
const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = {
#include "tmds_table_fullres.h"
};
const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = {
#include "tmds_table_fullres.h"
};
// Set up one interpolator so that a word holding a pair of pixels yields two
// lookup-table addresses for a single colour channel: lane 0 addresses the
// entry for the first pixel (whose LSB is at pixel_lsb) and lane 1 the entry
// for the second pixel, pixel_width bits further up.
//
// interp:          interpolator to program
// channel_msb/lsb: bit range of the colour channel within a pixel
// pixel_lsb:       position of the first pixel within the input word
// pixel_width:     distance in bits between the two pixels
// lut_index_width: number of index bits of the lookup table
// lutbase:         table base address, loaded into both lane bases
//
// The interpolator can only shift right. When the channel sits too low in the
// word to reach the index position that way, the shift is clamped to zero and
// the missing left shift is returned, so that the caller can use the
// *_leftshift variant of the encode loop (very slightly slower). A return
// value of zero means no software shift is required.
static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint pixel_lsb, uint pixel_width, uint lut_index_width, const uint32_t *lutbase) {
	const uint index_shift = 2; // scaled lookup for 4-byte LUT entries
	const uint index_msb = index_shift + lut_index_width - 1;
	const uint index_lsb = index_msb - (channel_msb - channel_lsb);

	int right_shift = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift;
	int software_left_shift = 0;
	if (right_shift < 0) {
		// The hardware cannot shift left; hand the remainder to the caller
		software_left_shift = -right_shift;
		right_shift = 0;
	}

	// Lane 0: first pixel
	interp_config lane0 = interp_default_config();
	interp_config_set_shift(&lane0, right_shift);
	interp_config_set_mask(&lane0, index_lsb, index_msb);
	interp_set_config(interp, 0, &lane0);

	// Lane 1: second pixel, taken from the other lane's accumulator
	interp_config lane1 = interp_default_config();
	interp_config_set_shift(&lane1, pixel_width + right_shift);
	interp_config_set_mask(&lane1, index_lsb, index_msb);
	interp_config_set_cross_input(&lane1, true);
	interp_set_config(interp, 1, &lane1);

	interp->base[0] = (uint32_t)lutbase;
	interp->base[1] = (uint32_t)lutbase;

	return software_left_shift;
}
// Encode one colour channel of a buffer of 16-bit pixels into TMDS symbols.
// At most 6 bits of the channel are used. The number of pixels must be even
// and the pixel buffer must be word-aligned.
//
// pixbuf:          input pixels
// symbuf:          output symbols
// n_pix:           number of input pixels
// channel_msb/lsb: bit range of the channel within a pixel
//
// INTERP0 is saved on entry and restored before returning.
void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
	interp_hw_save_t saved_interp0;
	interp_save(interp0_hw, &saved_interp0);

	const int left_shift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table);
	if (left_shift == 0) {
		tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix);
	}
	else {
		tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, left_shift);
	}

	interp_restore(interp0_hw, &saved_interp0);
}
// Encode one colour channel of a buffer of 8-bit pixels into TMDS symbols.
// The number of pixels must be a multiple of 4 and the pixel buffer must be
// word-aligned.
//
// pixbuf:          input pixels, four per word
// symbuf:          output symbols
// n_pix:           number of input pixels
// channel_msb/lsb: bit range of the channel within a pixel
//
// Both interpolators are saved on entry and restored before returning.
void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
	interp_hw_save_t saved_interp0;
	interp_hw_save_t saved_interp1;
	interp_save(interp0_hw, &saved_interp0);
	interp_save(interp1_hw, &saved_interp1);

	// Note that for 8bpp, some left shift is always required for pixel 0 (any
	// channel), which destroys some MSBs of pixel 3. To get around this, pixel
	// data sent to interp1 is *not left-shifted*
	const int left_shift_lower = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table);
	const int left_shift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table);
	assert(!left_shift_upper); (void)left_shift_upper;

	if (left_shift_lower == 0) {
		tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix);
	}
	else {
		tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, left_shift_lower);
	}

	interp_restore(interp0_hw, &saved_interp0);
	interp_restore(interp1_hw, &saved_interp1);
}
// ----------------------------------------------------------------------------
// Code for full-resolution TMDS encode (barely possible, utterly impractical):
// Different scheme used for full res as the fun pixel-doubling DC balance
// trick doesn't work, so we need to actually do running disparity. ACCUM0 has
// pixel data, ACCUM1 has running disparity. INTERP0 is used to process even
// pixels, and INTERP1 for odd pixels. Note this means that even and odd
// symbols have their DC balance handled separately, which is not to spec.
// Set up one interpolator for the full-resolution encoder. Lane 0 extracts
// the colour channel into the low bits of the table index; lane 1 places one
// bit taken from ACCUM1 (the running disparity) directly above it. The table
// base goes into base[2], so the combined result is a table address.
//
// interp:          interpolator to program
// channel_msb/lsb: bit range of the channel within the input word
// lut_index_width: index bits of the table, excluding the disparity sign bit
// lutbase:         table base address
//
// Returns the left shift the caller must apply in software when the channel
// sits too low to reach the index position with a right shift; zero if none.
static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) {
	const uint index_shift = 2; // scaled lookup for 4-byte LUT entries
	const uint index_msb = index_shift + lut_index_width - 1;
	const uint index_lsb = index_msb - (channel_msb - channel_lsb);

	int right_shift = channel_msb - (lut_index_width - 1) - index_shift;
	int software_left_shift = 0;
	if (right_shift < 0) {
		// The hardware cannot shift left; hand the remainder to the caller
		software_left_shift = -right_shift;
		right_shift = 0;
	}

	// Shift and mask colour channel to lower 6 bits of LUT index (note
	// lut_index_width excludes disparity sign)
	interp_config channel_lane = interp_default_config();
	interp_config_set_shift(&channel_lane, right_shift);
	interp_config_set_mask(&channel_lane, index_lsb, index_msb);
	interp_set_config(interp, 0, &channel_lane);

	// Concatenate disparity (ACCUM1) sign onto the LUT index
	interp_config sign_lane = interp_default_config();
	interp_config_set_shift(&sign_lane, 30 - index_msb);
	interp_config_set_mask(&sign_lane, index_msb + 1, index_msb + 1);
	interp_set_config(interp, 1, &sign_lane);

	interp->base[2] = (uint32_t)lutbase;

	return software_left_shift;
}
// Encode one colour channel of a buffer of 16-bit pixels at full resolution.
//
// pixbuf:          input pixels, two per word
// symbuf:          output symbols
// n_pix:           number of pixels
// channel_msb/lsb: bit range of the channel within a pixel
//
// Both interpolators are saved and restored around the encode unless
// TMDS_FULLRES_NO_INTERP_SAVE is set.
void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) {
	const uint core = get_core_num();
#if !TMDS_FULLRES_NO_INTERP_SAVE
	interp_hw_save_t saved_interp0;
	interp_hw_save_t saved_interp1;
	interp_save(interp0_hw, &saved_interp0);
	interp_save(interp1_hw, &saved_interp1);
#endif

	// There is a copy of the inner loop and the LUT in both scratch X and
	// scratch Y memories. Use X on core 1 and Y on core 0 so the cores don't
	// tread on each other's toes too much.
	const uint32_t *lutbase;
	if (core)
		lutbase = tmds_table_fullres_x;
	else
		lutbase = tmds_table_fullres_y;

	// INTERP0 handles the pixel in the low halfword, INTERP1 the one in the
	// high halfword (hence the +16 on the bit positions)
	const int left_shift_lower = configure_interp_for_addrgen_fullres(interp0_hw, channel_msb, channel_lsb, 6, lutbase);
	const int left_shift_upper = configure_interp_for_addrgen_fullres(interp1_hw, channel_msb + 16, channel_lsb + 16, 6, lutbase);
	assert(!left_shift_upper); (void)left_shift_upper;

	if (left_shift_lower) {
		if (core)
			tmds_fullres_encode_loop_16bpp_leftshift_x(pixbuf, symbuf, n_pix, left_shift_lower);
		else
			tmds_fullres_encode_loop_16bpp_leftshift_y(pixbuf, symbuf, n_pix, left_shift_lower);
	}
	else {
		if (core)
			tmds_fullres_encode_loop_16bpp_x(pixbuf, symbuf, n_pix);
		else
			tmds_fullres_encode_loop_16bpp_y(pixbuf, symbuf, n_pix);
	}

#if !TMDS_FULLRES_NO_INTERP_SAVE
	interp_restore(interp0_hw, &saved_interp0);
	interp_restore(interp1_hw, &saved_interp1);
#endif
}
// Imbalance (number of 1 bits minus number of 0 bits) of every 4-bit value.
// Because the two upper bits of entries 0..3 are zero, imbalance_lookup[n] + 2
// is also the imbalance of the 2-bit value n.
static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };

// Imbalance (ones minus zeros) of the low 8 bits of x, in the range -8..8.
// Bits above bit 7 are ignored, so the table is never indexed out of range.
static inline int byte_imbalance(uint32_t x)
{
	return imbalance_lookup[(x >> 4) & 0xF] + imbalance_lookup[x & 0xF];
}

// Pack a signed symbol disparity into the 6 MSBs (bits 31:26) of a table
// entry. The arithmetic is done on unsigned values: shifting a signed int
// left so that it overflows is undefined behaviour.
static inline uint32_t tmds_disparity_field(int disparity)
{
	return ((uint32_t)disparity & 0x3Fu) << 26;
}

// TMDS-encode one 8-bit channel value into two table entries.
//
// Each entry holds the 10-bit symbol in bits 9:0 and that symbol's disparity
// (ones minus zeros over all 10 bits) as a 6-bit signed value in bits 31:26.
//   *negative_balance_sym receives the encoding with disparity <= 0
//   *positive_balance_sym receives the encoding with disparity >= 0
// When the symbol is perfectly balanced both outputs are the same entry.
static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym)
{
	// Stage 1: transition minimisation. Build the 9-bit intermediate symbol
	// with an XNOR chain when the pixel is mostly ones (ties go to XNOR when
	// bit 0 is clear), otherwise with an XOR chain; bit 8 flags the XOR case.
	int pixel_imbalance = byte_imbalance(pixel);
	uint32_t sym = pixel & 1;
	if (pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0)) {
		for (int i = 0; i < 7; ++i) {
			sym |= (~((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
		}
	}
	else {
		for (int i = 0; i < 7; ++i) {
			sym |= ( ((sym >> i) ^ (pixel >> (i + 1))) & 1) << (i + 1);
		}
		sym |= 0x100;
	}

	// Stage 2: DC balance. XOR with 0x2ff inverts the 8 data bits and sets
	// bit 9 (the "inverted" flag) while leaving bit 8 untouched.
	int imbalance = byte_imbalance(sym & 0xFF);
	if (imbalance == 0) {
		// Balanced data byte: the choice of inversion depends only on bit 8,
		// and the resulting 10-bit symbol has zero disparity either way.
		if ((sym & 0x100) == 0) sym ^= 0x2ff;
		*positive_balance_sym = sym;
		*negative_balance_sym = sym;
		return;
	}

	// Disparity of the 10-bit symbol = data imbalance + imbalance of bits 9:8.
	const uint32_t bit8 = sym >> 8; // 0 or 1; bit 9 is still clear here
	const int plain_disparity = imbalance + imbalance_lookup[bit8] + 2;
	const int inverted_disparity = -imbalance + imbalance_lookup[2 ^ bit8] + 2;
	const uint32_t plain_entry = sym | tmds_disparity_field(plain_disparity);
	const uint32_t inverted_entry = (sym ^ 0x2ff) | tmds_disparity_field(inverted_disparity);

	if (imbalance > 0) {
		*negative_balance_sym = inverted_entry;
		*positive_balance_sym = plain_entry;
	}
	else {
		*negative_balance_sym = plain_entry;
		*positive_balance_sym = inverted_entry;
	}
}
// Build the TMDS symbol palettes used by the fullres palette encode from a
// 16-bit (RGB 565) colour palette.
// tmds_palette must hold 6 * n_palette words: a blue table, then a green
// table, then a red table, each 2 * n_palette words long. Within a table the
// first n_palette words receive the negative-balance symbols and the next
// n_palette words the positive-balance symbols.
// n_palette must be a power of 2 <= 256.
void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette) {
	uint32_t *const blue_table = tmds_palette;
	uint32_t *const green_table = blue_table + 2 * n_palette;
	uint32_t *const red_table = green_table + 2 * n_palette;

	for (int idx = 0; idx < n_palette; ++idx) {
		const uint16_t rgb565 = palette[idx];
		// Left-justify each channel within a byte; the unused LSBs are zero
		const uint8_t blue = (rgb565 << 3) & 0xf8;
		const uint8_t green = (rgb565 >> 3) & 0xfc;
		const uint8_t red = (rgb565 >> 8) & 0xf8;

		tmds_encode_symbols(blue, blue_table + idx, blue_table + idx + n_palette);
		tmds_encode_symbols(green, green_table + idx, green_table + idx + n_palette);
		tmds_encode_symbols(red, red_table + idx, red_table + idx + n_palette);
	}
}
// Build the TMDS symbol palettes used by the fullres palette encode from a
// 24-bit (RGB 888) colour palette: blue in bits 7:0, green in bits 15:8 and
// red in bits 23:16 of each entry.
// tmds_palette must hold 6 * n_palette words: a blue table, then a green
// table, then a red table, each 2 * n_palette words long. Within a table the
// first n_palette words receive the negative-balance symbols and the next
// n_palette words the positive-balance symbols.
// n_palette must be a power of 2 <= 256.
void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *tmds_palette, size_t n_palette) {
	uint32_t *const blue_table = tmds_palette;
	uint32_t *const green_table = blue_table + 2 * n_palette;
	uint32_t *const red_table = green_table + 2 * n_palette;

	for (int idx = 0; idx < n_palette; ++idx) {
		const uint32_t rgb888 = palette[idx];
		const uint8_t blue = rgb888 & 0xff;
		const uint8_t green = (rgb888 >> 8) & 0xff;
		const uint8_t red = (rgb888 >> 16) & 0xff;

		tmds_encode_symbols(blue, blue_table + idx, blue_table + idx + n_palette);
		tmds_encode_symbols(green, green_table + idx, green_table + idx + n_palette);
		tmds_encode_symbols(red, red_table + idx, red_table + idx + n_palette);
	}
}
// Encode palette data for all 3 channels.
// pixbuf is an array of n_pix 8-bit wide pixels containing palette values (32-bit word aligned)
// tmds_palette is a palette of TMDS symbols produced by tmds_setup_palette_symbols
// symbuf is 3*n_pix 32-bit words, this function writes the symbol values for each of the channels to it.
void __not_in_flash_func(tmds_encode_palette_data)(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits) {
uint core = get_core_num();
#if !TMDS_FULLRES_NO_INTERP_SAVE
interp_hw_save_t interp0_save, interp1_save;
interp_save(interp0_hw, &interp0_save);
interp_save(interp1_hw, &interp1_save);
#endif
interp0_hw->base[2] = (uint32_t)tmds_palette;
interp1_hw->base[2] = (uint32_t)tmds_palette;
// Lane 0 on both interpolators masks the palette bits, starting at bit 2,
// The second interpolator also shifts to read the 2nd or 4th byte of the word.
interp0_hw->ctrl[0] =
(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
interp1_hw->ctrl[0] =
(8 << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
// Lane 1 shifts and masks the sign bit into the right position to add to the symbol
// table index to choose the negative disparity symbols if the sign is negative.
const uint32_t ctrl_lane_1 =
((31 - (palette_bits + 2)) << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
(palette_bits + 2) * ((1 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | (1 << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB));
interp0_hw->ctrl[1] = ctrl_lane_1;
interp1_hw->ctrl[1] = ctrl_lane_1;
if (core) {
tmds_palette_encode_loop_x(pixbuf, symbuf, n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
tmds_palette_encode_loop_x(pixbuf, symbuf + (n_pix >> 1), n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
tmds_palette_encode_loop_x(pixbuf, symbuf + n_pix, n_pix);
} else {
tmds_palette_encode_loop_y(pixbuf, symbuf, n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
tmds_palette_encode_loop_y(pixbuf, symbuf + (n_pix >> 1), n_pix);
interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
tmds_palette_encode_loop_y(pixbuf, symbuf + n_pix, n_pix);
}
#if !TMDS_FULLRES_NO_INTERP_SAVE
interp_restore(interp0_hw, &interp0_save);
interp_restore(interp1_hw, &interp1_save);
#endif
}

46
src/libdvi/tmds_encode.h Normal file
View file

@ -0,0 +1,46 @@
#ifndef _TMDS_ENCODE_H_
#define _TMDS_ENCODE_H_
// TMDS encoders: turn scanlines of pixel data into buffers of TMDS symbols.
// In every function pixbuf is the source pixel data, symbuf is the output
// symbol buffer and n_pix is the number of pixels in the scanline.
#include "hardware/interp.h"
#include "dvi_config_defs.h"
#if defined(__cplusplus)
extern "C"
{
#endif
// Functions from tmds_encode.c

// Encode a single colour channel, occupying bits [channel_msb:channel_lsb] of
// each pixel.
void tmds_encode_data_channel_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
void tmds_encode_data_channel_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
void tmds_encode_data_channel_fullres_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);

// Convert a colour palette (RGB 565, or RGB 888 for the palette24 variant)
// into the TMDS symbol palette consumed by tmds_encode_palette_data.
// tmds_palette must be 6 * n_palette words long.
void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette);
void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *tmds_palette, size_t n_palette);

// Encode all 3 channels of a scanline of 8-bit palettised pixels, using a
// TMDS symbol palette built by one of the setup functions above.
void tmds_encode_palette_data(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits);

// Functions from tmds_encode.S
void tmds_encode_1bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_2bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
// Uses interp0:
void tmds_encode_loop_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_loop_16bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
// Uses interp0 and interp1:
void tmds_encode_loop_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_encode_loop_8bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
// Uses interp0 and interp1:
// (Note a copy is provided in scratch memories X and Y)
void tmds_fullres_encode_loop_16bpp_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_fullres_encode_loop_16bpp_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_fullres_encode_loop_16bpp_leftshift_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
#if defined(__cplusplus)
}
#endif
#endif

View file

@ -0,0 +1,46 @@
.program tmds_encode_1bpp
; 1bpp black/white pixels go in, TMDS symbols come out.
; Each output word contains two output symbols, each 10 bits in size,
; right-justified. The least-significant symbol is displayed first.
;
; We can encode using the following LUT: (yes this is compliant)
;
; x % 2 | colour | symbol
; ------+--------+-------
; 0 | 0 | 0x100
; 0 | 1 | 0x200
; 1 | 0 | 0x1ff
; 1 | 1 | 0x2ff
;
; OSR: shift to right, autopull, threshold 32
; ISR: shift to right, autopush, threshold 24
;
; Note the ISR needs to be shifted to *right* so that we can get the first
; pixel in the less-significant position. Threshold 24 so we can get 8x 0-bits
; at the LSBs for free :)
;
; Resulting layout of each pushed word (24 bits shifted in, so the first bit
; shifted in ends up at bit 8):
;   bits  7:0   zero                   (even symbol bits 7:0)
;   bit   8     ~even pixel            (even symbol bit 8)
;   bit   9      even pixel            (even symbol bit 9)
;   bits 17:10  all ones               (odd symbol bits 7:0)
;   bit  18     ~odd pixel             (odd symbol bit 8)
;   bit  19      odd pixel             (odd symbol bit 9)
;   bits 31:20  zero
even_pixel:
out x, 1      ; X = next pixel bit from the OSR
mov y, ~x     ; Y = inverted pixel (only its LSB is used)
in y, 1       ; even symbol bit 8 = ~pixel
in x, 1       ; even symbol bit 9 = pixel
odd_pixel:
mov x, ~null  ; X = all ones
in x, 8       ; odd symbol bits 7:0 = 0xff
out x, 1      ; X = next pixel bit from the OSR
mov y, ~x     ; Y = inverted pixel (only its LSB is used)
in y, 1       ; odd symbol bit 8 = ~pixel
in x, 13 ; Bring total shift to 24, triggering push.
% c-sdk {
// Load the tmds_encode_1bpp program into `pio` and start it on state machine
// `sm`, with the shift configuration the program relies on:
//   OSR shifts right, autopull at 32 bits
//   ISR shifts right, autopush at 24 bits
static inline void tmds_encode_1bpp_init(PIO pio, uint sm) {
	const uint program_offset = pio_add_program(pio, &tmds_encode_1bpp_program);
	pio_sm_config sm_cfg = tmds_encode_1bpp_program_get_default_config(program_offset);
	sm_config_set_out_shift(&sm_cfg, true, true, 32);
	sm_config_set_in_shift(&sm_cfg, true, true, 24);
	pio_sm_init(pio, sm, program_offset, &sm_cfg);
	pio_sm_set_enabled(pio, sm, true);
}
%}

76
src/libdvi/tmds_table.h Normal file
View file

@ -0,0 +1,76 @@
// Generated from tmds_table_gen.py
//
// This table converts a 6 bit data input into a pair of TMDS data symbols
// with data content *almost* equal (1 LSB off) to input value left shifted by
// two. The pairs of symbols have a net DC balance of 0.
//
// The two symbols are concatenated in the 20 LSBs of a data word, with the
// first symbol in least-significant position.
//
// Note the declaration isn't included here, just the table body. This is in
// case you want multiple copies of the table in different SRAMs (particularly
// scratch X/Y).
0x7fd00u,
0x40dfcu,
0x41df8u,
0x7ed04u,
0x43df0u,
0x7cd0cu,
0x7dd08u,
0x42df4u,
0x47de0u,
0x78d1cu,
0x79d18u,
0x46de4u,
0x7bd10u,
0x44decu,
0x45de8u,
0xafa41u,
0x4fdc0u,
0x70d3cu,
0x71d38u,
0x4edc4u,
0x73d30u,
0x4cdccu,
0x4ddc8u,
0xa7a61u,
0x77d20u,
0x48ddcu,
0x49dd8u,
0xa3a71u,
0x4bdd0u,
0xa1a79u,
0xa0a7du,
0x9fa81u,
0x5fd80u,
0x60d7cu,
0x61d78u,
0x5ed84u,
0x63d70u,
0x5cd8cu,
0x5dd88u,
0xb7a21u,
0x67d60u,
0x58d9cu,
0x59d98u,
0xb3a31u,
0x5bd90u,
0xb1a39u,
0xb0a3du,
0x8fac1u,
0x6fd40u,
0x50dbcu,
0x51db8u,
0xbba11u,
0x53db0u,
0xb9a19u,
0xb8a1du,
0x87ae1u,
0x57da0u,
0xbda09u,
0xbca0du,
0x83af1u,
0xbea05u,
0x81af9u,
0x80afdu,
0xbfa01u,

View file

@ -0,0 +1,139 @@
// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format
// (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6
// MSBs). There is a 16 bit gap in between them, which is actually vital for
// the way the TMDS encode works!
//
// There are 128 1-word entries. The lookup index should be the concatenation
// of the sign bit of current running disparity, with 6 bits of colour channel
// data.
// Non-negative running disparity:
0xe0000100,
0xf8000303,
0x00000307,
0xe8000104,
0x000001f0,
0xf000010c,
0xe8000108,
0x0000030b,
0xf80001e0,
0xf800011c,
0xf0000118,
0x000001e4,
0xe8000110,
0x00000313,
0x000001e8,
0xf0000241,
0xf00001c0,
0x0000013c,
0xf8000138,
0xf80001c4,
0xf0000130,
0x000001cc,
0xf80001c8,
0xf8000261,
0xe8000120,
0x00000323,
0x000001d8,
0x00000271,
0xf80001d0,
0xf0000086,
0xe8000082,
0xf0000281,
0xe8000180,
0x00000383,
0x00000178,
0xf0000184,
0xf8000170,
0xf800018c,
0xf0000188,
0xf0000221,
0xf0000160,
0x0000019c,
0xf8000198,
0xf8000231,
0xf0000190,
0x00000239,
0xf00000c2,
0xf80002c1,
0xe8000140,
0x00000343,
0x000001b8,
0xf0000211,
0xf80001b0,
0xf8000219,
0x0000021d,
0x000002e1,
0xf00001a0,
0xf0000209,
0xf800020d,
0xf000000e,
0xf0000205,
0xe8000006,
0xe0000002,
0xe8000201,
// Negative running disparity:
0x280003ff,
0x100001fc,
0x080001f8,
0x200003fb,
0x000001f0,
0x180003f3,
0x200003f7,
0x080001f4,
0x1000031f,
0x100003e3,
0x180003e7,
0x000001e4,
0x200003ef,
0x080001ec,
0x000001e8,
0x080000be,
0x1800033f,
0x0000013c,
0x100003c7,
0x1000033b,
0x180003cf,
0x000001cc,
0x10000337,
0x0000009e,
0x200003df,
0x080001dc,
0x000001d8,
0x00000271,
0x1000032f,
0x08000279,
0x1000027d,
0x0800007e,
0x2000037f,
0x0800017c,
0x00000178,
0x1800037b,
0x1000038f,
0x10000373,
0x18000377,
0x080000de,
0x1800039f,
0x0000019c,
0x10000367,
0x000000ce,
0x1800036f,
0x00000239,
0x0800023d,
0x0000003e,
0x200003bf,
0x080001bc,
0x000001b8,
0x080000ee,
0x1000034f,
0x000000e6,
0x0000021d,
0x000002e1,
0x1800035f,
0x080000f6,
0x000000f2,
0x080002f1,
0x080000fa,
0x100002f9,
0x180002fd,
0x100000fe,

150
src/libdvi/tmds_table_gen.py Executable file
View file

@ -0,0 +1,150 @@
#!/usr/bin/env python3
# The key fact is that, if x is even, and the encoder currently has a running
# imbalance of 0, encoding x followed by x + 1 produces a symbol pair with a
# net balance of 0.
#
# This is a reasonable constraint, because we only want RGB565 (so 6 valid
# channel data bits -> data is multiple of 4), and can probably tolerate
# 0.25LSB of noise :)
#
# This means that encoding a half-horizontal-resolution scanline buffer is a
# simple LUT operation for each colour channel, because we have made the
# encoding process stateless by guaranteeing 0 balance.
def popcount(x):
    """Return the number of set bits in the non-negative integer x."""
    count = 0
    while x:
        # Clearing the lowest set bit once per iteration counts the set bits
        x &= x - 1
        count += 1
    return count


# Equivalent to N1(q) - N0(q) in the DVI spec
def byteimbalance(x):
    """Return (number of 1 bits) - (number of 0 bits) for the 8-bit value x."""
    ones = popcount(x)
    zeros = 8 - ones
    return ones - zeros
# This is a direct translation of "Figure 3-5. T.M.D.S. Encode Algorithm" on
# page 29 of DVI 1.0 spec
class TMDSEncode:
    """Stateful TMDS encoder which tracks the running DC imbalance."""

    # 10-bit control symbols, indexed by the 2-bit control value
    ctrl_syms = {
        0b00: 0b1101010100,
        0b01: 0b0010101011,
        0b10: 0b0101010100,
        0b11: 0b1010101011
    }

    def __init__(self):
        self.imbalance = 0

    def encode(self, d, c, de):
        """Encode one symbol and update the running imbalance.

        d is the 8-bit data value, c the 2-bit control value and de the data
        enable. With de false the control symbol for c is returned and the
        running imbalance is reset to 0; otherwise the 10-bit data symbol for
        d is returned.
        """
        if not de:
            self.imbalance = 0
            return self.ctrl_syms[c]

        # Minimise transitions: chain XNOR (mostly-ones data) or XOR through
        # the data bits. Bit 8 of q_m records that XOR was used.
        ones = popcount(d)
        use_xnor = ones > 4 or (ones == 4 and not d & 0x1)
        q_m = d & 0x1
        for i in range(7):
            bit = ((q_m >> i) ^ (d >> (i + 1))) & 0x1
            if use_xnor:
                bit ^= 0x1
            q_m |= bit << (i + 1)
        if not use_xnor:
            q_m |= 0x100

        # Correct DC balance: XOR with the mask inverts the 8 data bits and
        # sets bit 9 to flag the inversion.
        inversion_mask = 0x2ff
        data_imbalance = byteimbalance(q_m & 0xff)
        xor_flag = q_m & 0x100
        if self.imbalance == 0 or data_imbalance == 0:
            if xor_flag:
                q_out = q_m
                self.imbalance += data_imbalance
            else:
                q_out = q_m ^ inversion_mask
                self.imbalance -= data_imbalance
        elif (self.imbalance > 0) == (data_imbalance > 0):
            # The data would push the imbalance further the same way: invert
            q_out = q_m ^ inversion_mask
            self.imbalance += (xor_flag >> 7) - data_imbalance
        else:
            q_out = q_m
            self.imbalance += data_imbalance - ((~q_m & 0x100) >> 7)
        return q_out
# Turn a bitmap of width n into n pairs of pseudo-differential bits
def differentialise(x, n):
    """Expand the low n bits of x, MSB first, into 2-bit pairs.

    A set bit becomes 0b01 and a clear bit becomes 0b10; the pair for the
    most significant bit ends up in the most significant position.
    """
    pairs = 0
    for bit in reversed(range(n)):
        pairs = (pairs << 2) | (0b01 if (x >> bit) & 1 else 0b10)
    return pairs
# Single encoder instance shared by all of the table generators below; its
# running imbalance carries over from one encode() call to the next.
enc = TMDSEncode()
###
# Pixel-doubled table:
# for i in range(0, 256, 4):
# sym0 = enc.encode(i, 0, 1)
# sym1 = enc.encode(i ^ 1, 0, 1)
# assert(enc.imbalance == 0)
# print(f"0x{sym0 | (sym1 << 10):05x}u,")
###
# Fullres 1bpp table: (each entry is 2 words, 4 pixels)
# (Note: the trick here is that encoding 0x00 or 0xff sets imbalance to -8, and
# encoding 0x01 or 0xfe returns imbalance to 0, so we alternate between these
# two pairs of dark/light colours. This creates some fairly subtle vertical
# banding, but it's cheap.)
# for i in range(1 << 4):
# syms = list(enc.encode((0xff if i & 1 << j else 0) ^ j & 0x01, 0, 1) for j in range(4))
# print(f"0x{syms[0] | syms[1] << 10:05x}, 0x{syms[2] | syms[3] << 10:05x}")
# assert(enc.imbalance == 0)
###
# Fullres table stuff:
# def disptable_format(sym):
# return sym | ((popcount(sym) * 2 - 10 & 0x3f) << 26)
# print("// Non-negative running disparity:")
# for i in range(0, 256, 4):
# enc.imbalance = 1
# print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
# print("// Negative running disparity:")
# for i in range(0, 256, 4):
# enc.imbalance = -1
# print("0x{:08x},".format(disptable_format(enc.encode(i, 0, 1))))
###
# Control symbols:
# for i in range(4):
# sym = enc.encode(0, i, 0)
# print(f"0x{sym << 10 | sym:05x},")
###
# Find zero-balance symbols:
# for i in range(256):
# enc.imbalance = 0
# sym = enc.encode(i, 0, 1)
# if enc.imbalance == 0:
# print(f"{i:02x}: {sym:03x}")
###
# Generate 2bpp table based on above experiment:
# Each table word packs the symbol for an even pixel (10 LSBs) with the symbol
# for the following odd pixel (next 10 bits). The even/odd levels are chosen
# so that every pair leaves the encoder's running imbalance at 0, which the
# assert below checks.
levels_2bpp_even = [0x05, 0x50, 0xaf, 0xfa]
levels_2bpp_odd = [0x04, 0x51, 0xae, 0xfb]
for odd_idx, odd_level in enumerate(levels_2bpp_odd):
    for even_idx, even_level in enumerate(levels_2bpp_even):
        even_sym = enc.encode(even_level, 0, 1)
        odd_sym = enc.encode(odd_level, 0, 1)
        assert enc.imbalance == 0
        word = odd_sym << 10 | even_sym
        print(f".word 0x{word:05x} // {even_idx:02b}, {odd_idx:02b}")

View file

@ -0,0 +1,83 @@
#ifndef _UTIL_QUEUE_U32_INLINE_H
#define _UTIL_QUEUE_U32_INLINE_H
// Faster versions of the functions found in pico/util/queue.h, for the common
// case of 32-bit-sized elements. Can be used on the same queue data
// structure, and mixed freely with the generic access methods, as long as
// element_size == 4.
#include "pico/util/queue.h"
#include "hardware/sync.h"
// Advance a read/write index by one slot, wrapping back to 0 once it passes
// element_count (the buffer has element_count + 1 slots, so element_count
// itself is a valid index).
static inline uint16_t _queue_inc_index_u32(queue_t *q, uint16_t index) {
	const uint16_t next = (uint16_t)(index + 1);
	return next > q->element_count ? 0 : next;
}
// Non-blocking add of one 32-bit element, read from *data.
// Returns true if the element was stored, false if the queue was full.
static inline bool queue_try_add_u32(queue_t *q, void *data) {
	uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);
	const bool has_space = queue_get_level_unsafe(q) != q->element_count;
	if (has_space) {
		uint32_t *slots = (uint32_t *)q->data;
		slots[q->wptr] = *(uint32_t *)data;
		q->wptr = _queue_inc_index_u32(q, q->wptr);
	}
	spin_unlock(q->core.spin_lock, irq_state);
	// Signal an event after a successful add, once the lock is released, so
	// that a caller waiting in __wfe() (see the blocking variants) retries
	if (has_space) {
		__sev();
	}
	return has_space;
}
// Non-blocking removal of one 32-bit element, written to *data.
// Returns true if an element was removed, false if the queue was empty.
static inline bool queue_try_remove_u32(queue_t *q, void *data) {
	uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);
	const bool has_data = queue_get_level_unsafe(q) != 0;
	if (has_data) {
		const uint32_t *slots = (uint32_t *)q->data;
		*(uint32_t *)data = slots[q->rptr];
		q->rptr = _queue_inc_index_u32(q, q->rptr);
	}
	spin_unlock(q->core.spin_lock, irq_state);
	// Signal an event after a successful removal, once the lock is released,
	// so that a caller waiting in __wfe() (see the blocking variants) retries
	if (has_data) {
		__sev();
	}
	return has_data;
}
// Non-blocking read of the element at the head of the queue, without
// removing it. Copies the element to *data and returns true, or returns
// false if the queue was empty. No event is signalled: the queue level does
// not change.
static inline bool queue_try_peek_u32(queue_t *q, void *data) {
	uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);
	const bool has_data = queue_get_level_unsafe(q) != 0;
	if (has_data) {
		const uint32_t *slots = (uint32_t *)q->data;
		*(uint32_t *)data = slots[q->rptr];
	}
	spin_unlock(q->core.spin_lock, irq_state);
	return has_data;
}
// Add one 32-bit element, read from *data, waiting until there is room.
// Between attempts the caller waits in __wfe() for an event.
static inline void queue_add_blocking_u32(queue_t *q, void *data) {
	while (!queue_try_add_u32(q, data)) {
		__wfe();
	}
}
// Remove one 32-bit element into *data, waiting until one is available.
// Between attempts the caller waits in __wfe() for an event.
static inline void queue_remove_blocking_u32(queue_t *q, void *data) {
	while (!queue_try_remove_u32(q, data)) {
		__wfe();
	}
}
// Copy the element at the head of the queue into *data without removing it,
// waiting until one is available. Between attempts the caller waits in
// __wfe() for an event.
static inline void queue_peek_blocking_u32(queue_t *q, void *data) {
	while (!queue_try_peek_u32(q, data)) {
		__wfe();
	}
}
#endif