From bb7dc7c20dbe52e56005ee2c71187974cd0fa284 Mon Sep 17 00:00:00 2001 From: Phillip Burgess Date: Thu, 9 Mar 2023 15:00:54 -0800 Subject: [PATCH] Remove soft link to libdvi (copy full directory instead) For Arduino Library Manager compliance --- Readme.md | 3 +- src/libdvi | 1 - src/libdvi/CMakeLists.txt | 33 ++ src/libdvi/dvi.c | 255 ++++++++++++ src/libdvi/dvi.h | 81 ++++ src/libdvi/dvi_config_defs.h | 151 +++++++ src/libdvi/dvi_serialiser.c | 73 ++++ src/libdvi/dvi_serialiser.h | 22 + src/libdvi/dvi_serialiser.pio | 53 +++ src/libdvi/dvi_serialiser.pio.h | 101 +++++ src/libdvi/dvi_timing.c | 324 +++++++++++++++ src/libdvi/dvi_timing.h | 99 +++++ src/libdvi/tmds_encode.S | 623 +++++++++++++++++++++++++++++ src/libdvi/tmds_encode.c | 305 ++++++++++++++ src/libdvi/tmds_encode.h | 46 +++ src/libdvi/tmds_encode_1bpp.pio | 46 +++ src/libdvi/tmds_table.h | 76 ++++ src/libdvi/tmds_table_fullres.h | 139 +++++++ src/libdvi/tmds_table_gen.py | 150 +++++++ src/libdvi/util_queue_u32_inline.h | 83 ++++ 20 files changed, 2661 insertions(+), 3 deletions(-) delete mode 120000 src/libdvi create mode 100644 src/libdvi/CMakeLists.txt create mode 100644 src/libdvi/dvi.c create mode 100644 src/libdvi/dvi.h create mode 100644 src/libdvi/dvi_config_defs.h create mode 100644 src/libdvi/dvi_serialiser.c create mode 100644 src/libdvi/dvi_serialiser.h create mode 100644 src/libdvi/dvi_serialiser.pio create mode 100644 src/libdvi/dvi_serialiser.pio.h create mode 100644 src/libdvi/dvi_timing.c create mode 100644 src/libdvi/dvi_timing.h create mode 100644 src/libdvi/tmds_encode.S create mode 100644 src/libdvi/tmds_encode.c create mode 100644 src/libdvi/tmds_encode.h create mode 100644 src/libdvi/tmds_encode_1bpp.pio create mode 100644 src/libdvi/tmds_table.h create mode 100644 src/libdvi/tmds_table_fullres.h create mode 100755 src/libdvi/tmds_table_gen.py create mode 100644 src/libdvi/util_queue_u32_inline.h diff --git a/Readme.md b/Readme.md index 4193f5b..b57676c 100644 --- a/Readme.md +++ 
b/Readme.md @@ -19,8 +19,7 @@ RP2040 core). Changes vs main PicoDVI repo: - Add library.properties file, src and examples directories per Arduino requirements. -- software/libdvi is soft-linked into src so Arduino IDE can compile these -parts. +- A full copy of software/libdvi is made in src (originally was soft-linked but Arduino Library Manager does not approve). If any updates are made in the original PicoDVI libdvi directory, copy them here! - The file dvi_serialiser.pio.h, normally not part of the distribution and generated during the Pico SDK build process, is provided here for Arduino build to work. If any changes are made in dvi_serialiser.pio (either here diff --git a/src/libdvi b/src/libdvi deleted file mode 120000 index b457413..0000000 --- a/src/libdvi +++ /dev/null @@ -1 +0,0 @@ -../software/libdvi \ No newline at end of file diff --git a/src/libdvi/CMakeLists.txt b/src/libdvi/CMakeLists.txt new file mode 100644 index 0000000..7c52661 --- /dev/null +++ b/src/libdvi/CMakeLists.txt @@ -0,0 +1,33 @@ +# Note we are using INTERFACE so that the library can be configured per-app +# with compile-time defines + +add_library(libdvi INTERFACE) + +target_sources(libdvi INTERFACE + ${CMAKE_CURRENT_LIST_DIR}/dvi.c + ${CMAKE_CURRENT_LIST_DIR}/dvi.h + ${CMAKE_CURRENT_LIST_DIR}/dvi_config_defs.h + ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.c + ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.h + ${CMAKE_CURRENT_LIST_DIR}/dvi_timing.c + ${CMAKE_CURRENT_LIST_DIR}/dvi_timing.h + ${CMAKE_CURRENT_LIST_DIR}/tmds_encode.S + ${CMAKE_CURRENT_LIST_DIR}/tmds_encode.c + ${CMAKE_CURRENT_LIST_DIR}/tmds_encode.h + ${CMAKE_CURRENT_LIST_DIR}/tmds_table.h + ${CMAKE_CURRENT_LIST_DIR}/tmds_table_fullres.h + ${CMAKE_CURRENT_LIST_DIR}/util_queue_u32_inline.h + ) + +target_include_directories(libdvi INTERFACE ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(libdvi INTERFACE + pico_base_headers + pico_util + hardware_dma + hardware_interp + hardware_pio + hardware_pwm + ) + 
+pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/dvi_serialiser.pio) +pico_generate_pio_header(libdvi ${CMAKE_CURRENT_LIST_DIR}/tmds_encode_1bpp.pio) diff --git a/src/libdvi/dvi.c b/src/libdvi/dvi.c new file mode 100644 index 0000000..07ff5b6 --- /dev/null +++ b/src/libdvi/dvi.c @@ -0,0 +1,255 @@ +#include <stdlib.h> +#include "hardware/dma.h" +#include "hardware/irq.h" + +#include "dvi.h" +#include "dvi_timing.h" +#include "dvi_serialiser.h" +#include "tmds_encode.h" + +// Adafruit PicoDVI fork requires a couple global items run-time configurable: +uint8_t dvi_vertical_repeat = DVI_VERTICAL_REPEAT; +bool dvi_monochrome_tmds = DVI_MONOCHROME_TMDS; + +// Time-critical functions pulled into RAM but each in a unique section to +// allow garbage collection +#define __dvi_func(f) __not_in_flash_func(f) +#define __dvi_func_x(f) __scratch_x(__STRING(f)) f + +// We require exclusive use of a DMA IRQ line. (you wouldn't want to share +// anyway). It's possible in theory to hook both IRQs and have two DVI outs. 
+static struct dvi_inst *dma_irq_privdata[2]; +static void dvi_dma0_irq(); +static void dvi_dma1_irq(); + +void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue) { + dvi_timing_state_init(&inst->timing_state); + dvi_serialiser_init(&inst->ser_cfg); + for (int i = 0; i < N_TMDS_LANES; ++i) { + inst->dma_cfg[i].chan_ctrl = dma_claim_unused_channel(true); + inst->dma_cfg[i].chan_data = dma_claim_unused_channel(true); + inst->dma_cfg[i].tx_fifo = (void*)&inst->ser_cfg.pio->txf[inst->ser_cfg.sm_tmds[i]]; + inst->dma_cfg[i].dreq = pio_get_dreq(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i], true); + } + inst->late_scanline_ctr = 0; + inst->tmds_buf_release_next = NULL; + inst->tmds_buf_release = NULL; + queue_init_with_spinlock(&inst->q_tmds_valid, sizeof(void*), 8, spinlock_tmds_queue); + queue_init_with_spinlock(&inst->q_tmds_free, sizeof(void*), 8, spinlock_tmds_queue); + queue_init_with_spinlock(&inst->q_colour_valid, sizeof(void*), 8, spinlock_colour_queue); + queue_init_with_spinlock(&inst->q_colour_free, sizeof(void*), 8, spinlock_colour_queue); + + dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, true, &inst->dma_list_vblank_sync); + dvi_setup_scanline_for_vblank(inst->timing, inst->dma_cfg, false, &inst->dma_list_vblank_nosync); +#if defined(ARDUINO) + dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (uint32_t*)SRAM_BASE, &inst->dma_list_active); +#else + dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, (void*)SRAM_BASE, &inst->dma_list_active); +#endif + dvi_setup_scanline_for_active(inst->timing, inst->dma_cfg, NULL, &inst->dma_list_error); + + for (int i = 0; i < DVI_N_TMDS_BUFFERS; ++i) { + void *tmdsbuf; + if (dvi_monochrome_tmds) + tmdsbuf = malloc(inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t)); + else + tmdsbuf = malloc(3 * inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD * sizeof(uint32_t)); + if (!tmdsbuf) + panic("TMDS buffer allocation failed"); + 
queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf); + } +} + +// The IRQs will run on whichever core calls this function (this is why it's +// called separately from dvi_init) +void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num) { + uint32_t mask_sync_channel = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data; + uint32_t mask_all_channels = 0; + for (int i = 0; i < N_TMDS_LANES; ++i) + mask_all_channels |= 1u << inst->dma_cfg[i].chan_ctrl | 1u << inst->dma_cfg[i].chan_data; + + dma_hw->ints0 = mask_sync_channel; + if (irq_num == DMA_IRQ_0) { + hw_write_masked(&dma_hw->inte0, mask_sync_channel, mask_all_channels); + dma_irq_privdata[0] = inst; + irq_set_exclusive_handler(DMA_IRQ_0, dvi_dma0_irq); + } + else { + hw_write_masked(&dma_hw->inte1, mask_sync_channel, mask_all_channels); + dma_irq_privdata[1] = inst; + irq_set_exclusive_handler(DMA_IRQ_1, dvi_dma1_irq); + } + irq_set_enabled(irq_num, true); +} + +// Set up control channels to make transfers to data channels' control +// registers (but don't trigger the control channels -- this is done either by +// data channel CHAIN_TO or an initial write to MULTI_CHAN_TRIGGER) +static inline void __attribute__((always_inline)) _dvi_load_dma_op(const struct dvi_lane_dma_cfg dma_cfg[], struct dvi_scanline_dma_list *l) { + for (int i = 0; i < N_TMDS_LANES; ++i) { + dma_channel_config cfg = dma_channel_get_default_config(dma_cfg[i].chan_ctrl); + channel_config_set_ring(&cfg, true, 4); // 16-byte write wrap + channel_config_set_read_increment(&cfg, true); + channel_config_set_write_increment(&cfg, true); + dma_channel_configure( + dma_cfg[i].chan_ctrl, + &cfg, + &dma_hw->ch[dma_cfg[i].chan_data], + dvi_lane_from_list(l, i), + 4, // Configure all 4 registers then halt until next CHAIN_TO + false + ); + } +} + +// Setup first set of control block lists, configure the control channels, and +// trigger them. Control channels will subsequently be triggered only by DMA +// CHAIN_TO on data channel completion. 
IRQ handler *must* be prepared before +// calling this. (Hooked to DMA IRQ0) +void dvi_start(struct dvi_inst *inst) { + _dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync); + dma_start_channel_mask( + (1u << inst->dma_cfg[0].chan_ctrl) | + (1u << inst->dma_cfg[1].chan_ctrl) | + (1u << inst->dma_cfg[2].chan_ctrl)); + + // We really don't want the FIFOs to bottom out, so wait for full before + // starting the shift-out. + for (int i = 0; i < N_TMDS_LANES; ++i) + while (!pio_sm_is_tx_fifo_full(inst->ser_cfg.pio, inst->ser_cfg.sm_tmds[i])) + tight_loop_contents(); + dvi_serialiser_enable(&inst->ser_cfg, true); +} + +static inline void __dvi_func_x(_dvi_prepare_scanline_8bpp)(struct dvi_inst *inst, uint32_t *scanbuf) { + uint32_t *tmdsbuf; + queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf); + uint pixwidth = inst->timing->h_active_pixels; + uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD; + // Scanline buffers are half-resolution; the functions take the number of *input* pixels as parameter. 
+ tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_8BPP_BLUE_MSB, DVI_8BPP_BLUE_LSB ); + tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_8BPP_GREEN_MSB, DVI_8BPP_GREEN_LSB); + tmds_encode_data_channel_8bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_8BPP_RED_MSB, DVI_8BPP_RED_LSB ); + queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf); +} + +static inline void __dvi_func_x(_dvi_prepare_scanline_16bpp)(struct dvi_inst *inst, uint32_t *scanbuf) { + uint32_t *tmdsbuf; + queue_remove_blocking_u32(&inst->q_tmds_free, &tmdsbuf); + uint pixwidth = inst->timing->h_active_pixels; + uint words_per_channel = pixwidth / DVI_SYMBOLS_PER_WORD; + tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 0 * words_per_channel, pixwidth / 2, DVI_16BPP_BLUE_MSB, DVI_16BPP_BLUE_LSB ); + tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 1 * words_per_channel, pixwidth / 2, DVI_16BPP_GREEN_MSB, DVI_16BPP_GREEN_LSB); + tmds_encode_data_channel_16bpp(scanbuf, tmdsbuf + 2 * words_per_channel, pixwidth / 2, DVI_16BPP_RED_MSB, DVI_16BPP_RED_LSB ); + queue_add_blocking_u32(&inst->q_tmds_valid, &tmdsbuf); +} + +// "Worker threads" for TMDS encoding (core enters and never returns, but still handles IRQs) + +// Version where each record in q_colour_valid is one scanline: +void __dvi_func(dvi_scanbuf_main_8bpp)(struct dvi_inst *inst) { + uint y = 0; + while (1) { + uint32_t *scanbuf; + queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf); + _dvi_prepare_scanline_8bpp(inst, scanbuf); + queue_add_blocking_u32(&inst->q_colour_free, &scanbuf); + ++y; + if (y == inst->timing->v_active_lines) { + y = 0; + } + } + __builtin_unreachable(); +} + +// Ugh copy/paste but it lets us garbage collect the TMDS stuff that is not being used from .scratch_x +void __dvi_func(dvi_scanbuf_main_16bpp)(struct dvi_inst *inst) { + uint y = 0; + while (1) { + uint32_t *scanbuf; + 
queue_remove_blocking_u32(&inst->q_colour_valid, &scanbuf); + _dvi_prepare_scanline_16bpp(inst, scanbuf); + queue_add_blocking_u32(&inst->q_colour_free, &scanbuf); + ++y; + if (y == inst->timing->v_active_lines) { + y = 0; + } + } + __builtin_unreachable(); +} + +static void __dvi_func(dvi_dma_irq_handler)(struct dvi_inst *inst) { + // Every fourth interrupt marks the start of the horizontal active region. We + // now have until the end of this region to generate DMA blocklist for next + // scanline. + dvi_timing_state_advance(inst->timing, &inst->timing_state); + if (inst->tmds_buf_release && !queue_try_add_u32(&inst->q_tmds_free, &inst->tmds_buf_release)) + panic("TMDS free queue full in IRQ!"); + inst->tmds_buf_release = inst->tmds_buf_release_next; + inst->tmds_buf_release_next = NULL; + + // Make sure all three channels have definitely loaded their last block + // (should be within a few cycles of one another) + for (int i = 0; i < N_TMDS_LANES; ++i) { + while (dma_debug_hw->ch[inst->dma_cfg[i].chan_data].tcr != inst->timing->h_active_pixels / DVI_SYMBOLS_PER_WORD) + tight_loop_contents(); + } + + uint32_t *tmdsbuf; + while (inst->late_scanline_ctr > 0 && queue_try_remove_u32(&inst->q_tmds_valid, &tmdsbuf)) { + // If we displayed this buffer then it would be in the wrong vertical + // position on-screen. Just pass it back. 
+ queue_add_blocking_u32(&inst->q_tmds_free, &tmdsbuf); + --inst->late_scanline_ctr; + } + + if (inst->timing_state.v_state != DVI_STATE_ACTIVE) { + // Don't care + tmdsbuf = NULL; + } + else if (queue_try_peek_u32(&inst->q_tmds_valid, &tmdsbuf)) { + if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) { + queue_remove_blocking_u32(&inst->q_tmds_valid, &tmdsbuf); + inst->tmds_buf_release_next = tmdsbuf; + } + } + else { + // No valid scanline was ready (generates solid red scanline) + tmdsbuf = NULL; + if (inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) + ++inst->late_scanline_ctr; + } + + switch (inst->timing_state.v_state) { + case DVI_STATE_ACTIVE: + if (tmdsbuf) { + dvi_update_scanline_data_dma(inst->timing, tmdsbuf, &inst->dma_list_active); + _dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_active); + } + else { + _dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_error); + } + if (inst->scanline_callback && inst->timing_state.v_ctr % dvi_vertical_repeat == dvi_vertical_repeat - 1) { + inst->scanline_callback(); + } + break; + case DVI_STATE_SYNC: + _dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_sync); + break; + default: + _dvi_load_dma_op(inst->dma_cfg, &inst->dma_list_vblank_nosync); + break; + } +} + +static void __dvi_func(dvi_dma0_irq)() { + struct dvi_inst *inst = dma_irq_privdata[0]; + dma_hw->ints0 = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data; + dvi_dma_irq_handler(inst); +} + +static void __dvi_func(dvi_dma1_irq)() { + struct dvi_inst *inst = dma_irq_privdata[1]; + dma_hw->ints1 = 1u << inst->dma_cfg[TMDS_SYNC_LANE].chan_data; + dvi_dma_irq_handler(inst); +} diff --git a/src/libdvi/dvi.h b/src/libdvi/dvi.h new file mode 100644 index 0000000..ee4a1a1 --- /dev/null +++ b/src/libdvi/dvi.h @@ -0,0 +1,81 @@ +#ifndef _DVI_H +#define _DVI_H + +#define N_TMDS_LANES 3 +#define TMDS_SYNC_LANE 0 // blue! 
+ +#include "pico/util/queue.h" + +#include "dvi_config_defs.h" +#include "dvi_timing.h" +#include "dvi_serialiser.h" +#include "util_queue_u32_inline.h" + +typedef void (*dvi_callback_t)(void); + +struct dvi_inst { + // Config --- + const struct dvi_timing *timing; + struct dvi_lane_dma_cfg dma_cfg[N_TMDS_LANES]; + struct dvi_timing_state timing_state; + struct dvi_serialiser_cfg ser_cfg; + // Called in the DMA IRQ once per scanline -- careful with the run time! + dvi_callback_t scanline_callback; + + // State --- + struct dvi_scanline_dma_list dma_list_vblank_sync; + struct dvi_scanline_dma_list dma_list_vblank_nosync; + struct dvi_scanline_dma_list dma_list_active; + struct dvi_scanline_dma_list dma_list_error; + + // After a TMDS buffer has been enqueued via a control block for the last + // time, two IRQs must go by before freeing. The first indicates the control + // block for this buf has been loaded, and the second occurs some time after + // the actual data DMA transfer has completed. + uint32_t *tmds_buf_release_next; + uint32_t *tmds_buf_release; + // Remember how far behind the source is on TMDS scanlines, so we can output + // solid colour until they catch up (rather than dying spectacularly) + uint late_scanline_ctr; + + // Encoded scanlines: + queue_t q_tmds_valid; + queue_t q_tmds_free; + + // Either scanline buffers or frame buffers: + queue_t q_colour_valid; + queue_t q_colour_free; + +}; + +#if defined(__cplusplus) +extern "C" +{ +#endif + +// Set up data structures and hardware for DVI. +void dvi_init(struct dvi_inst *inst, uint spinlock_tmds_queue, uint spinlock_colour_queue); + +// Call this after calling dvi_init(). DVI DMA interrupts will be routed to +// whichever core called this function. Registers an exclusive IRQ handler. +void dvi_register_irqs_this_core(struct dvi_inst *inst, uint irq_num); + +// Start actually wiggling TMDS pairs. 
Call this once you have initialised the +// DVI, have registered the IRQs, and are producing rendered scanlines. +void dvi_start(struct dvi_inst *inst); + +// TMDS encode worker function: core enters and doesn't leave, but still +// responds to IRQs. Repeatedly pop a scanline buffer from q_colour_valid, +// TMDS encode it, and pass it to the tmds valid queue. +void dvi_scanbuf_main_8bpp(struct dvi_inst *inst); +void dvi_scanbuf_main_16bpp(struct dvi_inst *inst); + +// Same as above, but each q_colour_valid entry is a framebuffer +void dvi_framebuf_main_8bpp(struct dvi_inst *inst); +void dvi_framebuf_main_16bpp(struct dvi_inst *inst); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/libdvi/dvi_config_defs.h b/src/libdvi/dvi_config_defs.h new file mode 100644 index 0000000..66c1e58 --- /dev/null +++ b/src/libdvi/dvi_config_defs.h @@ -0,0 +1,151 @@ +#ifndef _DVI_CONFIG_DEFS_H +#define _DVI_CONFIG_DEFS_H + +// Compile-time configuration definitions for libdvi. This file provides +// defaults -- you can override using a board header, or setting compile +// definitions directly from the commandline (e.g. using CMake +// target_compile_definitions()) + +// Pull in base headers to make sure board definitions override the +// definitions provided here. Note this file is included in asm and C. +#include "hardware/platform_defs.h" +#include "pico/config.h" + +// ---------------------------------------------------------------------------- +// General DVI defines + +// How many times to output the same TMDS buffer before recycling it onto the +// free queue. Pixels are repeated vertically if this is >1. +#ifndef DVI_VERTICAL_REPEAT +#define DVI_VERTICAL_REPEAT 2 +#endif + +// Number of TMDS buffers to allocate (malloc()) in DVI init. You can set this +// to 0 if you want to allocate your own (e.g. 
if you want static buffers) +#ifndef DVI_N_TMDS_BUFFERS +#define DVI_N_TMDS_BUFFERS 3 +#endif + +// If 1, replace the DVI serialiser with a 10n1 UART (1 start bit, 10 data +// bits, 1 stop bit) so the stream can be dumped and analysed easily. +#ifndef DVI_SERIAL_DEBUG +#define DVI_SERIAL_DEBUG 0 +#endif + +// If 1, the same TMDS symbols are sent to all 3 lanes during the horizontal +// active period. This means only monochrome colour is available, but the TMDS +// buffers are 3 times smaller as a result, and the performance requirements +// for encode are also cut by 3. +#ifndef DVI_MONOCHROME_TMDS +#define DVI_MONOCHROME_TMDS 0 +#endif + +// By default, we assume each 32-bit word written to a PIO FIFO contains 2x +// 10-bit TMDS symbols, concatenated into the lower 20 bits, least-significant +// first. This is convenient if you are generating two or more pixels at once, +// e.g. using the pixel-doubling TMDS encode. You can change this value to 1 +// (so each word contains 1 symbol) for e.g. full resolution RGB encode. Note +// that this value needs to divide the DVI horizontal timings, so is limited +// to 1 or 2. +#ifndef DVI_SYMBOLS_PER_WORD +#define DVI_SYMBOLS_PER_WORD 2 +#endif + +#if DVI_SYMBOLS_PER_WORD != 1 && DVI_SYMBOLS_PER_WORD !=2 +#error "Unsupported value for DVI_SYMBOLS_PER_WORD" +#endif + +// ---------------------------------------------------------------------------- +// Pixel component layout + +// By default we go R, G, B from MSB -> LSB. Override to e.g. 
swap RGB <-> BGR + +// Default 8bpp layout: RGB332, {r[2:0], g[2:0], b[1:0]} + +#ifndef DVI_8BPP_RED_MSB +#define DVI_8BPP_RED_MSB 7 +#endif + +#ifndef DVI_8BPP_RED_LSB +#define DVI_8BPP_RED_LSB 5 +#endif + +#ifndef DVI_8BPP_GREEN_MSB +#define DVI_8BPP_GREEN_MSB 4 +#endif + +#ifndef DVI_8BPP_GREEN_LSB +#define DVI_8BPP_GREEN_LSB 2 +#endif + +#ifndef DVI_8BPP_BLUE_MSB +#define DVI_8BPP_BLUE_MSB 1 +#endif + +#ifndef DVI_8BPP_BLUE_LSB +#define DVI_8BPP_BLUE_LSB 0 +#endif + +// Default 16bpp layout: RGB565, {r[4:0], g[5:0], b[4:0]} + +#ifndef DVI_16BPP_RED_MSB +#define DVI_16BPP_RED_MSB 15 +#endif + +#ifndef DVI_16BPP_RED_LSB +#define DVI_16BPP_RED_LSB 11 +#endif + +#ifndef DVI_16BPP_GREEN_MSB +#define DVI_16BPP_GREEN_MSB 10 +#endif + +#ifndef DVI_16BPP_GREEN_LSB +#define DVI_16BPP_GREEN_LSB 5 +#endif + +#ifndef DVI_16BPP_BLUE_MSB +#define DVI_16BPP_BLUE_MSB 4 +#endif + +#ifndef DVI_16BPP_BLUE_LSB +#define DVI_16BPP_BLUE_LSB 0 +#endif + +// Default 1bpp layout: bitwise little-endian, i.e. least significant bit of +// each word is the first (leftmost) of a block of 32 pixels. + +// If 1, reverse the order of pixels within each byte. Order of bytes within +// each word is still little-endian. +#ifndef DVI_1BPP_BIT_REVERSE +#define DVI_1BPP_BIT_REVERSE 1 // Adafruit_GFX GFXcanvas1 requires this to be 1 +#endif + +// ---------------------------------------------------------------------------- +// TMDS encode controls + +// Number of TMDS loop bodies between branches. cmp + branch costs 3 cycles, +// so you can easily save 10% of encode time by bumping this. Note that body +// will *already* produce multiple pixels, and total symbols per iteration +// must cleanly divide symbols per scanline, else the loop won't terminate. +// Point gun away from foot. +#ifndef TMDS_ENCODE_UNROLL +#define TMDS_ENCODE_UNROLL 1 +#endif + +// If 1, don't save/restore the interpolators on full-resolution TMDS encode. +// Speed hack. 
The TMDS code uses both interpolators, for each of the 3 data +// channels, so this define avoids 6 save/restores per scanline. +#ifndef TMDS_FULLRES_NO_INTERP_SAVE +#define TMDS_FULLRES_NO_INTERP_SAVE 0 +#endif + +// If 1, don't DC-balance the output of full resolution encode. Hilariously +// noncompliant, but Dell Ultrasharp -- the honey badger of computer monitors +// -- does not seem to mind (it helps that we DC-couple). Another speed hack, +// useful when you are trying to get everything else up to speed. +#ifndef TMDS_FULLRES_NO_DC_BALANCE +#define TMDS_FULLRES_NO_DC_BALANCE 0 +#endif + +#endif diff --git a/src/libdvi/dvi_serialiser.c b/src/libdvi/dvi_serialiser.c new file mode 100644 index 0000000..308f23f --- /dev/null +++ b/src/libdvi/dvi_serialiser.c @@ -0,0 +1,73 @@ +#include "pico.h" +#include "hardware/pio.h" +#include "hardware/gpio.h" +#include "hardware/pwm.h" +#include "hardware/structs/padsbank0.h" + +#include "dvi.h" +#include "dvi_serialiser.h" +#include "dvi_serialiser.pio.h" + +static void dvi_configure_pad(uint gpio, bool invert) { + // 2 mA drive, enable slew rate limiting (this seems fine even at 720p30, and + // the 3V3 LDO doesn't get warm like when turning all the GPIOs up to 11). + // Also disable digital receiver. + hw_write_masked( + &padsbank0_hw->io[gpio], + (0 << PADS_BANK0_GPIO0_DRIVE_LSB), + PADS_BANK0_GPIO0_DRIVE_BITS | PADS_BANK0_GPIO0_SLEWFAST_BITS | PADS_BANK0_GPIO0_IE_BITS + ); + gpio_set_outover(gpio, invert ? 
GPIO_OVERRIDE_INVERT : GPIO_OVERRIDE_NORMAL); +} + +void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg) { +#if DVI_SERIAL_DEBUG + uint offset = pio_add_program(cfg->pio, &dvi_serialiser_debug_program); +#else + uint offset = pio_add_program(cfg->pio, &dvi_serialiser_program); +#endif + cfg->prog_offs = offset; + + for (int i = 0; i < N_TMDS_LANES; ++i) { + pio_sm_claim(cfg->pio, cfg->sm_tmds[i]); + dvi_serialiser_program_init( + cfg->pio, + cfg->sm_tmds[i], + offset, + cfg->pins_tmds[i], + DVI_SERIAL_DEBUG + ); + dvi_configure_pad(cfg->pins_tmds[i], cfg->invert_diffpairs); + dvi_configure_pad(cfg->pins_tmds[i] + 1, cfg->invert_diffpairs); + } + + // Use a PWM slice to drive the pixel clock. Both GPIOs must be on the same + // slice (lower-numbered GPIO must be even). + assert(cfg->pins_clk % 2 == 0); + uint slice = pwm_gpio_to_slice_num(cfg->pins_clk); + // 5 cycles high, 5 low. Invert one channel so that we get complementary outputs. + pwm_config pwm_cfg = pwm_get_default_config(); + pwm_config_set_output_polarity(&pwm_cfg, true, false); + pwm_config_set_wrap(&pwm_cfg, 9); + pwm_init(slice, &pwm_cfg, false); + pwm_set_both_levels(slice, 5, 5); + + for (uint i = cfg->pins_clk; i <= cfg->pins_clk + 1; ++i) { + gpio_set_function(i, GPIO_FUNC_PWM); + dvi_configure_pad(i, cfg->invert_diffpairs); + } +} + +void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable) { + uint mask = 0; + for (int i = 0; i < N_TMDS_LANES; ++i) + mask |= 1u << (cfg->sm_tmds[i] + PIO_CTRL_SM_ENABLE_LSB); + if (enable) { + hw_set_bits(&cfg->pio->ctrl, mask); + pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), true); + } + else { + hw_clear_bits(&cfg->pio->ctrl, mask); + pwm_set_enabled(pwm_gpio_to_slice_num(cfg->pins_clk), false); + } +} diff --git a/src/libdvi/dvi_serialiser.h b/src/libdvi/dvi_serialiser.h new file mode 100644 index 0000000..d978f60 --- /dev/null +++ b/src/libdvi/dvi_serialiser.h @@ -0,0 +1,22 @@ +#ifndef _DVI_SERIALISER_H +#define _DVI_SERIALISER_H + 
+#include "hardware/pio.h" +#include "dvi_config_defs.h" + +#define N_TMDS_LANES 3 + +struct dvi_serialiser_cfg { + PIO pio; + uint sm_tmds[N_TMDS_LANES]; + uint pins_tmds[N_TMDS_LANES]; + uint pins_clk; + bool invert_diffpairs; + uint prog_offs; +}; + +void dvi_serialiser_init(struct dvi_serialiser_cfg *cfg); +void dvi_serialiser_enable(struct dvi_serialiser_cfg *cfg, bool enable); +uint32_t dvi_single_to_diff(uint32_t in); + +#endif diff --git a/src/libdvi/dvi_serialiser.pio b/src/libdvi/dvi_serialiser.pio new file mode 100644 index 0000000..520c8e0 --- /dev/null +++ b/src/libdvi/dvi_serialiser.pio @@ -0,0 +1,53 @@ +.program dvi_serialiser +.side_set 2 +.origin 0 + +; Single-ended -> differential serial + + out pc, 1 side 0b10 + out pc, 1 side 0b01 + +.program dvi_serialiser_debug +.side_set 1 opt + +; The debug variant behaves as a UART with 1 start bit, 10 data bits, 1 stop +; bit, and 5/6ths the data throughput of the TMDS version. + + pull ifempty side 1 ; Extend stop bit with FIFO stall + nop side 0 + out pins, 1 ; Unrolled because we require 1 bit / clk + out pins, 1 + out pins, 1 + out pins, 1 + out pins, 1 + out pins, 1 + out pins, 1 + out pins, 1 + out pins, 1 + out pins, 1 + +% c-sdk { +#include "dvi_config_defs.h" + +static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) { + pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins); + pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins); + pio_gpio_init(pio, data_pins); + pio_gpio_init(pio, data_pins + 1); + + pio_sm_config c; + if (debug) { + c = dvi_serialiser_debug_program_get_default_config(offset); + } + else { + c = dvi_serialiser_program_get_default_config(offset); + } + sm_config_set_sideset_pins(&c, data_pins); + if (debug) + sm_config_set_out_pins(&c, data_pins, 1); + sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD); + sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX); + pio_sm_init(pio, sm, offset, &c); + 
pio_sm_set_enabled(pio, sm, false); +} +%} diff --git a/src/libdvi/dvi_serialiser.pio.h b/src/libdvi/dvi_serialiser.pio.h new file mode 100644 index 0000000..d1275fe --- /dev/null +++ b/src/libdvi/dvi_serialiser.pio.h @@ -0,0 +1,101 @@ +// -------------------------------------------------- // +// This file is autogenerated by pioasm; do not edit! // +// -------------------------------------------------- // + +#pragma once + +#if !PICO_NO_HARDWARE +#include "hardware/pio.h" +#endif + +// -------------- // +// dvi_serialiser // +// -------------- // + +#define dvi_serialiser_wrap_target 0 +#define dvi_serialiser_wrap 1 + +static const uint16_t dvi_serialiser_program_instructions[] = { + // .wrap_target + 0x70a1, // 0: out pc, 1 side 2 + 0x68a1, // 1: out pc, 1 side 1 + // .wrap +}; + +#if !PICO_NO_HARDWARE +static const struct pio_program dvi_serialiser_program = { + .instructions = dvi_serialiser_program_instructions, + .length = 2, + .origin = 0, +}; + +static inline pio_sm_config dvi_serialiser_program_get_default_config(uint offset) { + pio_sm_config c = pio_get_default_sm_config(); + sm_config_set_wrap(&c, offset + dvi_serialiser_wrap_target, offset + dvi_serialiser_wrap); + sm_config_set_sideset(&c, 2, false, false); + return c; +} +#endif + +// -------------------- // +// dvi_serialiser_debug // +// -------------------- // + +#define dvi_serialiser_debug_wrap_target 0 +#define dvi_serialiser_debug_wrap 11 + +static const uint16_t dvi_serialiser_debug_program_instructions[] = { + // .wrap_target + 0x98e0, // 0: pull ifempty block side 1 + 0xb042, // 1: nop side 0 + 0x6001, // 2: out pins, 1 + 0x6001, // 3: out pins, 1 + 0x6001, // 4: out pins, 1 + 0x6001, // 5: out pins, 1 + 0x6001, // 6: out pins, 1 + 0x6001, // 7: out pins, 1 + 0x6001, // 8: out pins, 1 + 0x6001, // 9: out pins, 1 + 0x6001, // 10: out pins, 1 + 0x6001, // 11: out pins, 1 + // .wrap +}; + +#if !PICO_NO_HARDWARE +static const struct pio_program dvi_serialiser_debug_program = { + .instructions = 
dvi_serialiser_debug_program_instructions, + .length = 12, + .origin = -1, +}; + +static inline pio_sm_config dvi_serialiser_debug_program_get_default_config(uint offset) { + pio_sm_config c = pio_get_default_sm_config(); + sm_config_set_wrap(&c, offset + dvi_serialiser_debug_wrap_target, offset + dvi_serialiser_debug_wrap); + sm_config_set_sideset(&c, 2, true, false); + return c; +} + +#include "dvi_config_defs.h" +static inline void dvi_serialiser_program_init(PIO pio, uint sm, uint offset, uint data_pins, bool debug) { + pio_sm_set_pins_with_mask(pio, sm, 2u << data_pins, 3u << data_pins); + pio_sm_set_pindirs_with_mask(pio, sm, ~0u, 3u << data_pins); + pio_gpio_init(pio, data_pins); + pio_gpio_init(pio, data_pins + 1); + pio_sm_config c; + if (debug) { + c = dvi_serialiser_debug_program_get_default_config(offset); + } + else { + c = dvi_serialiser_program_get_default_config(offset); + } + sm_config_set_sideset_pins(&c, data_pins); + if (debug) + sm_config_set_out_pins(&c, data_pins, 1); + sm_config_set_out_shift(&c, true, !debug, 10 * DVI_SYMBOLS_PER_WORD); + sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_TX); + pio_sm_init(pio, sm, offset, &c); + pio_sm_set_enabled(pio, sm, false); +} + +#endif + diff --git a/src/libdvi/dvi_timing.c b/src/libdvi/dvi_timing.c new file mode 100644 index 0000000..54ba8e1 --- /dev/null +++ b/src/libdvi/dvi_timing.c @@ -0,0 +1,324 @@ +#include "dvi.h" +#include "dvi_timing.h" +#include "hardware/dma.h" + +// This file contains: +// - Timing parameters for DVI modes (horizontal + vertical counts, best +// achievable bit clock from 12 MHz crystal) +// - Helper functions for generating DMA lists based on these timings + +extern bool dvi_monochrome_tmds; // In dvi.c + +// Pull into RAM but apply unique section suffix to allow linker GC +#define __dvi_func(x) __not_in_flash_func(x) +#define __dvi_const(x) __not_in_flash_func(x) + +// VGA -- we do this mode properly, with a pretty comfortable clk_sys (252 MHz) +const struct dvi_timing 
__dvi_const(dvi_timing_640x480p_60hz) = { + .h_sync_polarity = false, + .h_front_porch = 16, + .h_sync_width = 96, + .h_back_porch = 48, + .h_active_pixels = 640, + + .v_sync_polarity = false, + .v_front_porch = 10, + .v_sync_width = 2, + .v_back_porch = 33, + .v_active_lines = 480, + + .bit_clk_khz = 252000 +}; + +// SVGA -- completely by-the-book but requires 400 MHz clk_sys +const struct dvi_timing __dvi_const(dvi_timing_800x600p_60hz) = { + .h_sync_polarity = false, + .h_front_porch = 44, + .h_sync_width = 128, + .h_back_porch = 88, + .h_active_pixels = 800, + + .v_sync_polarity = false, + .v_front_porch = 1, + .v_sync_width = 4, + .v_back_porch = 23, + .v_active_lines = 600, + + .bit_clk_khz = 400000 +}; + +// 800x480p 60 Hz (note this doesn't seem to be a CEA mode, I just used the +// output of `cvt 800 480 60`), 295 MHz bit clock +const struct dvi_timing __dvi_const(dvi_timing_800x480p_60hz) = { + .h_sync_polarity = false, + .h_front_porch = 24, + .h_sync_width = 72, + .h_back_porch = 96, + .h_active_pixels = 800, + + .v_sync_polarity = true, + .v_front_porch = 3, + .v_sync_width = 10, + .v_back_porch = 7, + .v_active_lines = 480, + + .bit_clk_khz = 295200 +}; + +// SVGA reduced blanking (355 MHz bit clock) -- valid CVT mode, less common +// than fully-blanked SVGA, but doesn't require such a high system clock +const struct dvi_timing __dvi_const(dvi_timing_800x600p_reduced_60hz) = { + .h_sync_polarity = true, + .h_front_porch = 48, + .h_sync_width = 32, + .h_back_porch = 80, + .h_active_pixels = 800, + + .v_sync_polarity = false, + .v_front_porch = 3, + .v_sync_width = 4, + .v_back_porch = 11, + .v_active_lines = 600, + + .bit_clk_khz = 354000 +}; + +// Also known as qHD, bit uncommon, but it's a nice modest-resolution 16:9 +// aspect mode. 
Pixel clock 37.3 MHz +const struct dvi_timing __dvi_const(dvi_timing_960x540p_60hz) = { + .h_sync_polarity = true, + .h_front_porch = 16, + .h_sync_width = 32, + .h_back_porch = 96, + .h_active_pixels = 960, + + .v_sync_polarity = true, + .v_front_porch = 2, + .v_sync_width = 6, + .v_back_porch = 15, + .v_active_lines = 540, + + .bit_clk_khz = 372000 +}; + +// Note this is NOT the correct 720p30 CEA mode, but rather 720p60 run at half +// pixel clock. Seems to be commonly accepted (and is a valid CVT mode). The +// actual CEA mode is the same pixel clock as 720p60 but with >50% blanking, +// which would require a clk_sys of 742 MHz! +const struct dvi_timing __dvi_const(dvi_timing_1280x720p_30hz) = { + .h_sync_polarity = true, + .h_front_porch = 110, + .h_sync_width = 40, + .h_back_porch = 220, + .h_active_pixels = 1280, + + .v_sync_polarity = true, + .v_front_porch = 5, + .v_sync_width = 5, + .v_back_porch = 20, + .v_active_lines = 720, + + .bit_clk_khz = 372000 +}; + +// Reduced-blanking (CVT) 720p. You aren't supposed to use reduced blanking +// modes below 60 Hz, but I won't tell anyone (and it works on the monitors +// I've tried). 
This nets a lower system clock than regular 720p30 (319 MHz) +const struct dvi_timing __dvi_const(dvi_timing_1280x720p_reduced_30hz) = { + .h_sync_polarity = true, + .h_front_porch = 48, + .h_sync_width = 32, + .h_back_porch = 80, + .h_active_pixels = 1280, + + .v_sync_polarity = false, + .v_front_porch = 3, + .v_sync_width = 5, + .v_back_porch = 13, + .v_active_lines = 720, + + .bit_clk_khz = 319200 +}; + +// This requires a spicy 488 MHz system clock and is illegal in most countries +// (you need to have a very lucky piece of silicon to run this at 1.3 V, or +// connect an external supply and give it a bit more juice) +const struct dvi_timing __dvi_const(dvi_timing_1600x900p_reduced_30hz) = { + .h_sync_polarity = true, + .h_front_porch = 48, + .h_sync_width = 32, + .h_back_porch = 80, + .h_active_pixels = 1600, + + .v_sync_polarity = false, + .v_front_porch = 3, + .v_sync_width = 5, + .v_back_porch = 18, + .v_active_lines = 900, + + .bit_clk_khz = 488000 +}; + +// ---------------------------------------------------------------------------- + +// The DMA scheme is: +// +// - One channel transferring data to each of the three PIO state machines +// performing TMDS serialisation +// +// - One channel programming the registers of each of these data channels, +// triggered (CHAIN_TO) each time the corresponding data channel completes +// +// - Lanes 1 and 2 have one block for blanking and one for video data +// +// - Lane 0 has one block for each horizontal region (front porch, hsync, back +// porch, active) +// +// - The IRQ_QUIET flag is used to select which data block on the sync lane is +// allowed to generate an IRQ upon completion. This is the block immediately +// before the horizontal active region. The IRQ is entered at ~the same time +// as the last data transfer starts +// +// - The IRQ points the control channels at new blocklists for next scanline. +// The DMA starts the new list automatically at end-of-scanline, via +// CHAIN_TO. 
+// +// The horizontal active region is the longest continuous transfer, so this +// gives the most time to handle the IRQ and load new blocklists. +// +// Note a null trigger IRQ is not suitable because we get that *after* the +// last data transfer finishes, and the FIFOs bottom out very shortly +// afterward. For pure DVI (four blocks per scanline), it works ok to take +// four regular IRQs per scanline and return early from 3 of them, but this +// breaks down when you have very short scanline sections like guard bands. + +// Each symbol appears twice, concatenated in one word. Note these must be in +// RAM because they see a lot of DMA traffic +const uint32_t __dvi_const(dvi_ctrl_syms)[4] = { + 0xd5354, + 0x2acab, + 0x55154, + 0xaaeab +}; + +// Output solid red scanline if we are given NULL for tmdsbuff +#if DVI_SYMBOLS_PER_WORD == 2 +static uint32_t __dvi_const(empty_scanline_tmds)[3] = { + 0x7fd00u, // 0x00, 0x00 + 0x7fd00u, // 0x00, 0x00 + 0xbfa01u // 0xfc, 0xfc +}; +#else +static uint32_t __attribute__((aligned(8))) __dvi_const(empty_scanline_tmds)[6] = { + 0x100u, 0x1ffu, // 0x00, 0x00 + 0x100u, 0x1ffu, // 0x00, 0x00 + 0x201u, 0x2feu // 0xfc, 0xfc +}; +#endif + +void dvi_timing_state_init(struct dvi_timing_state *t) { + t->v_ctr = 0; + t->v_state = DVI_STATE_FRONT_PORCH; +}; + +void __dvi_func(dvi_timing_state_advance)(const struct dvi_timing *t, struct dvi_timing_state *s) { + s->v_ctr++; + if ((s->v_state == DVI_STATE_FRONT_PORCH && s->v_ctr == t->v_front_porch) || + (s->v_state == DVI_STATE_SYNC && s->v_ctr == t->v_sync_width) || + (s->v_state == DVI_STATE_BACK_PORCH && s->v_ctr == t->v_back_porch) || + (s->v_state == DVI_STATE_ACTIVE && s->v_ctr == t->v_active_lines)) { + + s->v_state = (s->v_state + 1) % DVI_STATE_COUNT; + s->v_ctr = 0; + } +} + +void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list) { + *dma_list = (struct dvi_scanline_dma_list){}; +} + +static const uint32_t *get_ctrl_sym(bool vsync, bool hsync) { + return 
&dvi_ctrl_syms[!!vsync << 1 | !!hsync]; +} + +// Make a sequence of paced transfers to the relevant FIFO +static void _set_data_cb(dma_cb_t *cb, const struct dvi_lane_dma_cfg *dma_cfg, + const void *read_addr, uint transfer_count, uint read_ring, bool irq_on_finish) { + cb->read_addr = read_addr; + cb->write_addr = dma_cfg->tx_fifo; + cb->transfer_count = transfer_count; + cb->c = dma_channel_get_default_config(dma_cfg->chan_data); + channel_config_set_ring(&cb->c, false, read_ring); + channel_config_set_dreq(&cb->c, dma_cfg->dreq); + // Call back to control channel for reconfiguration: + channel_config_set_chain_to(&cb->c, dma_cfg->chan_ctrl); + // Note we never send a null trigger, so IRQ_QUIET is an IRQ suppression flag + channel_config_set_irq_quiet(&cb->c, !irq_on_finish); +}; + +void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[], + bool vsync_asserted, struct dvi_scanline_dma_list *l) { + + bool vsync = t->v_sync_polarity == vsync_asserted; + const uint32_t *sym_hsync_off = get_ctrl_sym(vsync, !t->h_sync_polarity); + const uint32_t *sym_hsync_on = get_ctrl_sym(vsync, t->h_sync_polarity); + const uint32_t *sym_no_sync = get_ctrl_sym(false, false ); + + dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE); + // The symbol table contains each control symbol *twice*, concatenated into 20 LSBs of table word, so we can always do word-repeat. 
+ _set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false); + _set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on, t->h_sync_width / DVI_SYMBOLS_PER_WORD, 2, false); + _set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch / DVI_SYMBOLS_PER_WORD, 2, true); + _set_data_cb(&synclist[3], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false); + + for (int i = 0; i < N_TMDS_LANES; ++i) { + if (i == TMDS_SYNC_LANE) + continue; + dma_cb_t *cblist = dvi_lane_from_list(l, i); + _set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync,(t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false); + _set_data_cb(&cblist[1], &dma_cfg[i], sym_no_sync, t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 2, false); + } +} + +void dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[], + uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) { + + const uint32_t *sym_hsync_off = get_ctrl_sym(!t->v_sync_polarity, !t->h_sync_polarity); + const uint32_t *sym_hsync_on = get_ctrl_sym(!t->v_sync_polarity, t->h_sync_polarity); + const uint32_t *sym_no_sync = get_ctrl_sym(false, false ); + + dma_cb_t *synclist = dvi_lane_from_list(l, TMDS_SYNC_LANE); + _set_data_cb(&synclist[0], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_front_porch / DVI_SYMBOLS_PER_WORD, 2, false); + _set_data_cb(&synclist[1], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_on, t->h_sync_width / DVI_SYMBOLS_PER_WORD, 2, false); + _set_data_cb(&synclist[2], &dma_cfg[TMDS_SYNC_LANE], sym_hsync_off, t->h_back_porch / DVI_SYMBOLS_PER_WORD, 2, true); + + for (int i = 0; i < N_TMDS_LANES; ++i) { + dma_cb_t *cblist = dvi_lane_from_list(l, i); + if (i != TMDS_SYNC_LANE) { + _set_data_cb(&cblist[0], &dma_cfg[i], sym_no_sync, + (t->h_front_porch + t->h_sync_width + t->h_back_porch) / DVI_SYMBOLS_PER_WORD, 2, false); + } + int target_block = i == 
TMDS_SYNC_LANE ? DVI_SYNC_LANE_CHUNKS - 1 : DVI_NOSYNC_LANE_CHUNKS - 1; + if (tmdsbuf) { + // Non-repeating DMA for the freshly-encoded TMDS buffer + _set_data_cb(&cblist[target_block], &dma_cfg[i], tmdsbuf + i * (t->h_active_pixels / DVI_SYMBOLS_PER_WORD), + t->h_active_pixels / DVI_SYMBOLS_PER_WORD, 0, false); + } + else { + // Use read ring to repeat the correct DC-balanced symbol pair on blank scanlines (4 or 8 byte period) + _set_data_cb(&cblist[target_block], &dma_cfg[i], &empty_scanline_tmds[2 * i / DVI_SYMBOLS_PER_WORD], + t->h_active_pixels / DVI_SYMBOLS_PER_WORD, DVI_SYMBOLS_PER_WORD == 2 ? 2 : 3, false); + } + } +} + +void __dvi_func(dvi_update_scanline_data_dma)(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l) { + for (int i = 0; i < N_TMDS_LANES; ++i) { + const uint32_t *lane_tmdsbuf = dvi_monochrome_tmds ? tmdsbuf : tmdsbuf + i * t->h_active_pixels / DVI_SYMBOLS_PER_WORD; + if (i == TMDS_SYNC_LANE) + dvi_lane_from_list(l, i)[3].read_addr = lane_tmdsbuf; + else + dvi_lane_from_list(l, i)[1].read_addr = lane_tmdsbuf; + } +} + diff --git a/src/libdvi/dvi_timing.h b/src/libdvi/dvi_timing.h new file mode 100644 index 0000000..bf34937 --- /dev/null +++ b/src/libdvi/dvi_timing.h @@ -0,0 +1,99 @@ +#ifndef _DVI_TIMING_H +#define _DVI_TIMING_H + +#include "hardware/dma.h" +#include "pico/util/queue.h" + +#include "dvi.h" + +struct dvi_timing { + bool h_sync_polarity; + uint h_front_porch; + uint h_sync_width; + uint h_back_porch; + uint h_active_pixels; + + bool v_sync_polarity; + uint v_front_porch; + uint v_sync_width; + uint v_back_porch; + uint v_active_lines; + + uint bit_clk_khz; +}; + +enum dvi_line_state { + DVI_STATE_FRONT_PORCH = 0, + DVI_STATE_SYNC, + DVI_STATE_BACK_PORCH, + DVI_STATE_ACTIVE, + DVI_STATE_COUNT +}; + +struct dvi_timing_state { + uint v_ctr; + enum dvi_line_state v_state; +}; + +// This should map directly to DMA register layout, but more convenient types +// (also this really shouldn't be here... 
we don't have a dma_cb in the SDK +// because there are many valid formats due to aliases) +typedef struct dma_cb { + const void *read_addr; + void *write_addr; + uint32_t transfer_count; + dma_channel_config c; +} dma_cb_t; + +static_assert(sizeof(dma_cb_t) == 4 * sizeof(uint32_t), "bad dma layout"); +static_assert(__builtin_offsetof(dma_cb_t, c.ctrl) == __builtin_offsetof(dma_channel_hw_t, ctrl_trig), "bad dma layout"); + +#define DVI_SYNC_LANE_CHUNKS DVI_STATE_COUNT +#define DVI_NOSYNC_LANE_CHUNKS 2 + +struct dvi_scanline_dma_list { + dma_cb_t l0[DVI_SYNC_LANE_CHUNKS]; + dma_cb_t l1[DVI_NOSYNC_LANE_CHUNKS]; + dma_cb_t l2[DVI_NOSYNC_LANE_CHUNKS]; +}; + +static inline dma_cb_t* dvi_lane_from_list(struct dvi_scanline_dma_list *l, int i) { + return i == 0 ? l->l0 : i == 1 ? l->l1 : l->l2; +} + +// Each TMDS lane uses one DMA channel to transfer data to a PIO state +// machine, and another channel to load control blocks into this channel. +struct dvi_lane_dma_cfg { + uint chan_ctrl; + uint chan_data; + void *tx_fifo; + uint dreq; +}; + +// Note these are already converted to pseudo-differential representation +extern const uint32_t dvi_ctrl_syms[4]; + +extern const struct dvi_timing dvi_timing_640x480p_60hz; +extern const struct dvi_timing dvi_timing_800x480p_60hz; +extern const struct dvi_timing dvi_timing_800x600p_60hz; +extern const struct dvi_timing dvi_timing_960x540p_60hz; +extern const struct dvi_timing dvi_timing_1280x720p_30hz; + +extern const struct dvi_timing dvi_timing_800x600p_reduced_60hz; +extern const struct dvi_timing dvi_timing_1280x720p_reduced_30hz; + +void dvi_timing_state_init(struct dvi_timing_state *t); + +void dvi_timing_state_advance(const struct dvi_timing *t, struct dvi_timing_state *s); + +void dvi_scanline_dma_list_init(struct dvi_scanline_dma_list *dma_list); + +void dvi_setup_scanline_for_vblank(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[], + bool vsync_asserted, struct dvi_scanline_dma_list *l); + +void 
dvi_setup_scanline_for_active(const struct dvi_timing *t, const struct dvi_lane_dma_cfg dma_cfg[], + uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l); + +void dvi_update_scanline_data_dma(const struct dvi_timing *t, const uint32_t *tmdsbuf, struct dvi_scanline_dma_list *l); + +#endif diff --git a/src/libdvi/tmds_encode.S b/src/libdvi/tmds_encode.S new file mode 100644 index 0000000..065061d --- /dev/null +++ b/src/libdvi/tmds_encode.S @@ -0,0 +1,623 @@ +#include "hardware/regs/addressmap.h" +#include "hardware/regs/sio.h" +#include "dvi_config_defs.h" + +// Offsets suitable for ldr/str (must be <= 0x7c): +#define ACCUM0_OFFS (SIO_INTERP0_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define ACCUM1_OFFS (SIO_INTERP0_ACCUM1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define PEEK0_OFFS (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define PEEK1_OFFS (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define PEEK2_OFFS (SIO_INTERP0_PEEK_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET) +// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit +// word-addressed space... almost as though it were intentional! 
:) + +.syntax unified +.cpu cortex-m0plus +.thumb + +.macro decl_func_x name +.section .scratch_x.\name, "ax" +.global \name +.type \name,%function +.thumb_func +\name: +.endm + +.macro decl_func_y name +.section .scratch_y.\name, "ax" +.global \name +.type \name,%function +.thumb_func +\name: +.endm + +#define decl_func decl_func_x + +// ---------------------------------------------------------------------------- +// Pixel-doubling encoders for RGB + +// r0: Input buffer (word-aligned) +// r1: Output buffer (word-aligned) +// r2: Input size (pixels) + +.macro do_channel_16bpp r_ibase r_inout0 r_out1 + str \r_inout0, [\r_ibase, #ACCUM0_OFFS] + ldr \r_inout0, [\r_ibase, #PEEK0_OFFS] + ldr \r_inout0, [\r_inout0] + ldr \r_out1, [\r_ibase, #PEEK1_OFFS] + ldr \r_out1, [\r_out1] +.endm + +decl_func tmds_encode_loop_16bpp + push {r4, r5, r6, r7, lr} + lsls r2, #2 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + b 2f +.align 2 +1: +.rept TMDS_ENCODE_UNROLL + ldmia r0!, {r4, r6} + do_channel_16bpp r2, r4, r5 + do_channel_16bpp r2, r6, r7 + stmia r1!, {r4, r5, r6, r7} +.endr +2: + cmp r1, ip + bne 1b + pop {r4, r5, r6, r7, pc} + +// Same as above, but scale data to make up for lack of left shift +// in interpolator (costs 1 cycle per 2 pixels) +// +// r0: Input buffer (word-aligned) +// r1: Output buffer (word-aligned) +// r2: Input size (pixels) +// r3: Left shift amount + +decl_func tmds_encode_loop_16bpp_leftshift + push {r4, r5, r6, r7, lr} + lsls r2, #2 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + b 2f +.align 2 +1: +.rept TMDS_ENCODE_UNROLL + ldmia r0!, {r4, r6} + lsls r4, r3 + do_channel_16bpp r2, r4, r5 + lsls r6, r3 + do_channel_16bpp r2, r6, r7 + stmia r1!, {r4, r5, r6, r7} +.endr +2: + cmp r1, ip + bne 1b + pop {r4, r5, r6, r7, pc} + +// r0: Input buffer (word-aligned) +// r1: Output buffer (word-aligned) +// r2: Input size (pixels) + +decl_func tmds_encode_loop_8bpp + push {r4, r5, r6, r7, lr} + lsls 
r2, #2 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + b 2f +.align 2 +1: +.rept TMDS_ENCODE_UNROLL + ldmia r0!, {r4} + str r4, [r2, #ACCUM0_OFFS + INTERP1] + str r4, [r2, #ACCUM0_OFFS] + ldr r4, [r2, #PEEK0_OFFS] + ldr r4, [r4] + ldr r5, [r2, #PEEK1_OFFS] + ldr r5, [r5] + ldr r6, [r2, #PEEK0_OFFS + INTERP1] + ldr r6, [r6] + ldr r7, [r2, #PEEK1_OFFS + INTERP1] + ldr r7, [r7] + stmia r1!, {r4, r5, r6, r7} +.endr +2: + cmp r1, ip + bne 1b + pop {r4, r5, r6, r7, pc} + +// r0: Input buffer (word-aligned) +// r1: Output buffer (word-aligned) +// r2: Input size (pixels) +// r3: Left shift amount +// +// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not +// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as +// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift, +// since its channel MSBs are no greater than 7. + +decl_func tmds_encode_loop_8bpp_leftshift + push {r4, r5, r6, r7, lr} + lsls r2, #3 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + b 2f +.align 2 +1: +.rept TMDS_ENCODE_UNROLL + ldmia r0!, {r4} + str r4, [r2, #ACCUM0_OFFS + INTERP1] + lsls r4, r3 + str r4, [r2, #ACCUM0_OFFS] + ldr r4, [r2, #PEEK0_OFFS] + ldr r4, [r4] + ldr r5, [r2, #PEEK1_OFFS] + ldr r5, [r5] + ldr r6, [r2, #PEEK0_OFFS + INTERP1] + ldr r6, [r6] + ldr r7, [r2, #PEEK1_OFFS + INTERP1] + ldr r7, [r7] + stmia r1!, {r4, r5, r6, r7} +.endr +2: + cmp r1, ip + bne 1b + pop {r4, r5, r6, r7, pc} + +// ---------------------------------------------------------------------------- +// Fast 1bpp black/white encoder (full res) + +// Taking the encoder from DVI spec, with initial balance 0: +// +// - Encoding either 0x00 or 0xff will produce a running balance of -8, with +// output symbol of 0x100 or 0x200 +// +// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with +// output symbol of 0x1ff or 0x2ff +// +// So we can do 1bpp encode with a lookup of x 
coordinate LSB, and input +// colour bit. If we process pixels in even-sized blocks, only the colour +// lookup is needed. + +// Encode 8 pixels @ 1bpp (using two table lookups) +// r3 contains lookup mask (preshifted) +// r8 contains pointer to encode table +// 2.125 cyc/pix +.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1 + \shift_instr0 r4, r2, #\shamt0 + ands r4, r3 + add r4, r8 + ldmia r4, {r4, r5} + \shift_instr1 r6, r2, #\shamt1 + ands r6, r3 + add r6, r8 + ldmia r6, {r6, r7} + stmia r1!, {r4, r5, r6, r7} +.endm + +// r0: input buffer (word-aligned) +// r1: output buffer (word-aligned) +// r2: output pixel count +decl_func tmds_encode_1bpp + push {r4-r7, lr} + mov r7, r8 + push {r7} + lsls r2, #1 + add r2, r1 + mov ip, r2 + adr r4, tmds_1bpp_table + mov r8, r4 + // Mask: 4 bit index, 8 bytes per entry + movs r3, #0x78 + b 2f +1: + ldmia r0!, {r2} +#if !DVI_1BPP_BIT_REVERSE + tmds_encode_1bpp_body lsls 3 lsrs 1 + tmds_encode_1bpp_body lsrs 5 lsrs 9 + tmds_encode_1bpp_body lsrs 13 lsrs 17 + tmds_encode_1bpp_body lsrs 21 lsrs 25 +#else + tmds_encode_1bpp_body lsrs 1 lsls 3 + tmds_encode_1bpp_body lsrs 9 lsrs 5 + tmds_encode_1bpp_body lsrs 17 lsrs 13 + tmds_encode_1bpp_body lsrs 25 lsrs 21 +#endif +2: + cmp r1, ip + blo 1b + + pop {r7} + mov r8, r7 + pop {r4-r7, pc} + +.align 2 +tmds_1bpp_table: +#if !DVI_1BPP_BIT_REVERSE + .word 0x7fd00, 0x7fd00 // 0000 + .word 0x7fe00, 0x7fd00 // 0001 + .word 0xbfd00, 0x7fd00 // 0010 + .word 0xbfe00, 0x7fd00 // 0011 + .word 0x7fd00, 0x7fe00 // 0100 + .word 0x7fe00, 0x7fe00 // 0101 + .word 0xbfd00, 0x7fe00 // 0110 + .word 0xbfe00, 0x7fe00 // 0111 + .word 0x7fd00, 0xbfd00 // 1000 + .word 0x7fe00, 0xbfd00 // 1001 + .word 0xbfd00, 0xbfd00 // 1010 + .word 0xbfe00, 0xbfd00 // 1011 + .word 0x7fd00, 0xbfe00 // 1100 + .word 0x7fe00, 0xbfe00 // 1101 + .word 0xbfd00, 0xbfe00 // 1110 + .word 0xbfe00, 0xbfe00 // 1111 +#else + .word 0x7fd00, 0x7fd00 // 0000 + .word 0x7fd00, 0xbfd00 // 1000 + .word 0x7fd00, 0x7fe00 // 
0100 + .word 0x7fd00, 0xbfe00 // 1100 + .word 0xbfd00, 0x7fd00 // 0010 + .word 0xbfd00, 0xbfd00 // 1010 + .word 0xbfd00, 0x7fe00 // 0110 + .word 0xbfd00, 0xbfe00 // 1110 + .word 0x7fe00, 0x7fd00 // 0001 + .word 0x7fe00, 0xbfd00 // 1001 + .word 0x7fe00, 0x7fe00 // 0101 + .word 0x7fe00, 0xbfe00 // 1101 + .word 0xbfe00, 0x7fd00 // 0011 + .word 0xbfe00, 0xbfd00 // 1011 + .word 0xbfe00, 0x7fe00 // 0111 + .word 0xbfe00, 0xbfe00 // 1111 +#endif + + +// ---------------------------------------------------------------------------- +// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222) + +// Even-x-position pixels are encoded as symbols with imbalance -4, and odd +// pixels with +4, so that we can mix-and-match our even/odd codewords and +// always get a properly balanced sequence: +// +// level 0: (05 -> 103), then (04 -> 1fc) (decimal 5, 4) +// level 1: (50 -> 130), then (51 -> 1cf) (decimal 80, 81) +// level 2: (af -> 230), then (ae -> 2cf) (decimal 175, 174) +// level 3: (fa -> 203), then (fb -> 2fc) (decimal 250, 251) +// +// These correspond to roughly 255 times (0, 1/3, 2/3, 1). +// +// Alternatively we could use symbols with 0 balance, which results in lower +// contrast but avoids the LSB bobble: +// +// level 0: (10 -> 1f0) always +// level 1: (5a -> 263) always +// level 2: (a5 -> 163) always +// level 3: (ef -> 2f0) always + +// Table base pointer in r0. Input pixels in r2. +.macro encode_2bpp_body shift_instr shamt rd + \shift_instr \rd, r2, #\shamt + ands \rd, r3 + ldr \rd, [r0, \rd] +.endm + +// r0: input buffer (word-aligned) +// r1: output buffer (word-aligned) +// r2: output pixel count +decl_func tmds_encode_2bpp + push {r4-r7, lr} + mov r7, r8 + push {r7} + mov r8, r0 + adr r0, tmds_2bpp_table + // Mask: 4-bit index into 4-byte entries. 
+ movs r3, #0x3c + // Limit pointer: 1 word per 2 pixels + lsls r2, #1 + add r2, r1 + mov ip, r2 + b 2f +1: + mov r4, r8 + ldmia r4!, {r2} + mov r8, r4 + encode_2bpp_body lsls 2 r4 + encode_2bpp_body lsrs 2 r5 + encode_2bpp_body lsrs 6 r6 + encode_2bpp_body lsrs 10 r7 + stmia r1!, {r4-r7} + encode_2bpp_body lsrs 14 r4 + encode_2bpp_body lsrs 18 r5 + encode_2bpp_body lsrs 22 r6 + encode_2bpp_body lsrs 26 r7 + stmia r1!, {r4-r7} +2: + cmp r1, ip + blo 1b + pop {r7} + mov r8, r7 + pop {r4-r7, pc} + +.align 2 +tmds_2bpp_table: + .word 0x7f103 // 00, 00 + .word 0x7f130 // 01, 00 + .word 0x7f230 // 10, 00 + .word 0x7f203 // 11, 00 + .word 0x73d03 // 00, 01 + .word 0x73d30 // 01, 01 + .word 0x73e30 // 10, 01 + .word 0x73e03 // 11, 01 + .word 0xb3d03 // 00, 10 + .word 0xb3d30 // 01, 10 + .word 0xb3e30 // 10, 10 + .word 0xb3e03 // 11, 10 + .word 0xbf103 // 00, 11 + .word 0xbf130 // 01, 11 + .word 0xbf230 // 10, 11 + .word 0xbf203 // 11, 11 + +// ---------------------------------------------------------------------------- +// Full-resolution RGB encode (not very practical) + +// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is +// taking horizontal blanking (at VGA) and dual core into account, and +// assuming the 3 channels are encoded individually.) +// +// Here is an idea +// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in +// ACCUM0), concatenated with the sign bit of our running disparity (from +// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential), +// with the symbol's disparity stored left-justified in the upper 12 bits, as +// e.g. a 6 bit signed integer. +// +// - Load pixel data. cyc: 0.75 (ldmia 2 words, every 4 pixels) +// - Write pixel to ACCUM0. cyc: 1 +// - Read address from PEEK2. cyc: 1 +// - Load encoded pixel from address. cyc: 2 +// - Write disparity data to ACCUM1_ADD cyc: 1 +// - Write encoded data to output buffer. 
cyc: 1.25 (stmia 4 words, every 4 pixels) +// +// With decent register allocation we may be able to load 4 pixels at +// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix. +// +// One issue is that the TMDS data in the bottom of ACCUM1 will eventually +// overflow and affect the running disparity, but with 16 zeroes in between, +// this would take much longer than one scanline, so everything is fine if +// we clear the accumulator at the start of the scanline. +// +// Note that we need to use two interpolators to get the bits from both pixels +// -- we are not outputting a single DC-balanced stream, but rather two +// interleaved streams which are each DC-balanced. This is fine electrically, +// but our output here will *NOT* match the TMDS encoder given in the DVI +// spec. + +// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance +// feedback. With the feedback enabled (default), the output is DC balanced, +// but there are just barely enough CPU cycles to do all the encode, so it's +// essentially a party trick. If you disable DC balancing, the performance is +// much better, and many monitors will still accept the signals as long as you +// DC couple your DVI signals. 
+ +.macro tmds_fullres_encode_loop_body ra rb + str \ra, [r2, #ACCUM0_OFFS + INTERP1] + str \ra, [r2, #ACCUM0_OFFS] + ldr \ra, [r2, #PEEK2_OFFS] + ldr \ra, [\ra] +#if !TMDS_FULLRES_NO_DC_BALANCE + str \ra, [r2, #ACCUM1_ADD_OFFS] +#endif + ldr \rb, [r2, #PEEK2_OFFS + INTERP1] + ldr \rb, [\rb] +#if !TMDS_FULLRES_NO_DC_BALANCE + str \rb, [r2, #ACCUM1_ADD_OFFS + INTERP1] +#endif +.endm + +// r0: Input buffer (word-aligned) +// r1: Output buffer (word-aligned) +// r2: Pixel count + +.macro tmds_fullres_encode_loop_16bpp + push {r4-r7, lr} + mov r4, r8 + push {r4} + + + lsls r2, #2 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + // DC balance defined to be 0 at start of scanline: + movs r4, #0 + str r4, [r2, #ACCUM1_OFFS] +#if TMDS_FULLRES_NO_DC_BALANCE + // Alternate parity between odd/even symbols if no feedback + mvns r4, r4 +#endif + str r4, [r2, #ACCUM1_OFFS + INTERP1] + + // Keep loop start pointer in r8 so we can get a longer backward branch + adr r4, 1f + adds r4, #1 // god damn thumb bit why is this a thing + mov r8, r4 + b 2f + .align 2 +1: +.rept 16 + ldmia r0!, {r4, r6} + tmds_fullres_encode_loop_body r4 r5 + tmds_fullres_encode_loop_body r6 r7 + stmia r1!, {r4, r5, r6, r7} +.endr +2: + cmp r1, ip + beq 1f + bx r8 +1: + pop {r4} + mov r8, r4 + pop {r4-r7, pc} +.endm + +// One copy each in X and Y, so the two cores don't step on each other +decl_func_x tmds_fullres_encode_loop_16bpp_x + tmds_fullres_encode_loop_16bpp +decl_func_y tmds_fullres_encode_loop_16bpp_y + tmds_fullres_encode_loop_16bpp + + +.macro tmds_fullres_encode_loop_body_leftshift ra rb + // Note we apply the leftshift for INTERP0 only + str \ra, [r2, #ACCUM0_OFFS + INTERP1] + lsls \ra, r3 + str \ra, [r2, #ACCUM0_OFFS] + ldr \ra, [r2, #PEEK2_OFFS] + ldr \ra, [\ra] +#if !TMDS_FULLRES_NO_DC_BALANCE + str \ra, [r2, #ACCUM1_ADD_OFFS] +#endif + ldr \rb, [r2, #PEEK2_OFFS + INTERP1] + ldr \rb, [\rb] +#if !TMDS_FULLRES_NO_DC_BALANCE + str \rb, [r2, #ACCUM1_ADD_OFFS + 
INTERP1] +#endif +.endm + +// r0: Input buffer (word-aligned) +// r1: Output buffer (word-aligned) +// r2: Pixel count +// r3: Left shift amount + +.macro tmds_fullres_encode_loop_16bpp_leftshift + push {r4-r7, lr} + mov r4, r8 + mov r5, r9 + push {r4-r5} + + lsls r2, #2 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + // DC balance defined to be 0 at start of scanline: + movs r4, #0 + str r4, [r2, #ACCUM1_OFFS] +#if TMDS_FULLRES_NO_DC_BALANCE + // Alternate parity between odd/even symbols if there's no balance feedback + mvns r4, r4 +#endif + str r4, [r2, #ACCUM1_OFFS + INTERP1] + + adr r4, 1f + adds r4, #1 + mov r8, r4 + b 2f + .align 2 +1: +.rept 16 // 64 pixels per iteration + ldmia r0!, {r4, r6} + tmds_fullres_encode_loop_body_leftshift r4 r5 + tmds_fullres_encode_loop_body_leftshift r6 r7 + stmia r1!, {r4, r5, r6, r7} +.endr +2: + cmp r1, ip + beq 1f + bx r8 +1: + pop {r4-r5} + mov r8, r4 + mov r9, r5 + pop {r4-r7, pc} +.endm + +decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x + tmds_fullres_encode_loop_16bpp_leftshift +decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y + tmds_fullres_encode_loop_16bpp_leftshift + + +// ---------------------------------------------------------------------------- +// Full-resolution 8bpp paletted encode + +// Variant of tmds_fullres_encode_loop_16bpp that reads +// 8-bit wide pixels packed 4 per word. The interpolator +// base is set to a reordered list of TMDS symbols based +// on a user colour palette. + +// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains +// interp base pointer. r7 used as temporary. 
+.macro tmds_palette_encode_loop_body rd + str \rd, [r2, #ACCUM0_OFFS] + str \rd, [r2, #ACCUM0_OFFS + INTERP1] + ldr \rd, [r2, #PEEK2_OFFS] + ldr \rd, [\rd] +#if !TMDS_FULLRES_NO_DC_BALANCE + str \rd, [r2, #ACCUM1_ADD_OFFS] +#endif + ldr r7, [r2, #PEEK2_OFFS + INTERP1] + ldr r7, [r7] +#if !TMDS_FULLRES_NO_DC_BALANCE + str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1] +#endif + lsls r7, #10 + orrs \rd, r7 +.endm + +.macro tmds_palette_encode_loop + push {r4-r7, lr} + mov r4, r8 + push {r4} + + + lsls r2, #1 + add r2, r1 + mov ip, r2 + ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) + // DC balance defined to be 0 at start of scanline: + movs r4, #0 + str r4, [r2, #ACCUM1_OFFS] +#if TMDS_FULLRES_NO_DC_BALANCE + // Alternate parity between odd/even symbols if there's no balance feedback + mvns r4, r4 +#endif + str r4, [r2, #ACCUM1_OFFS + INTERP1] + + // Keep loop start pointer in r8 so we can get a longer backward branch + adr r4, 1f + adds r4, #1 // god damn thumb bit why is this a thing + mov r8, r4 + b 2f + .align 2 +1: +.rept 10 + ldmia r0!, {r3, r5} + lsrs r4, r3, #14 + lsls r3, #2 + lsrs r6, r5, #14 + lsls r5, #2 + tmds_palette_encode_loop_body r3 + tmds_palette_encode_loop_body r4 + tmds_palette_encode_loop_body r5 + tmds_palette_encode_loop_body r6 + stmia r1!, {r3, r4, r5, r6} +.endr +2: + cmp r1, ip + beq 1f + bx r8 +1: + pop {r4} + mov r8, r4 + pop {r4-r7, pc} +.endm + +decl_func_x tmds_palette_encode_loop_x + tmds_palette_encode_loop +decl_func_y tmds_palette_encode_loop_y + tmds_palette_encode_loop diff --git a/src/libdvi/tmds_encode.c b/src/libdvi/tmds_encode.c new file mode 100644 index 0000000..472b1a9 --- /dev/null +++ b/src/libdvi/tmds_encode.c @@ -0,0 +1,305 @@ +#include "hardware/interp.h" +#include "tmds_encode.h" +#include "hardware/gpio.h" +#include "hardware/sync.h" + +static const uint32_t __scratch_x("tmds_table") tmds_table[] = { +#include "tmds_table.h" +}; + +// Fullres table is bandwidth-critical, so gets one copy for each scratch +// memory. 
There is a third copy which can go in flash, because it's just used +// to generate palette LUTs. The ones we don't use will get garbage collected +// during linking. +const uint32_t __scratch_x("tmds_table_fullres_x") tmds_table_fullres_x[] = { +#include "tmds_table_fullres.h" +}; + +const uint32_t __scratch_y("tmds_table_fullres_y") tmds_table_fullres_y[] = { +#include "tmds_table_fullres.h" +}; + +// Configure an interpolator to extract a single colour channel from each of a pair +// of pixels, with the first pixel's lsb at pixel_lsb, and the pixels being +// pixel_width wide. Produce a LUT address for the first pixel's colour data on +// LANE0, and the second pixel's colour data on LANE1. +// +// Returns nonzero if the *_leftshift variant of the encoder loop must be used +// (needed for blue channel because I was a stubborn idiot and didn't put +// signed/bidirectional shift on interpolator, very slightly slower). The +// return value is the size of left shift required. + +static int __not_in_flash_func(configure_interp_for_addrgen)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint pixel_lsb, uint pixel_width, uint lut_index_width, const uint32_t *lutbase) { + interp_config c; + const uint index_shift = 2; // scaled lookup for 4-byte LUT entries + + int shift_channel_to_index = pixel_lsb + channel_msb - (lut_index_width - 1) - index_shift; + int oops = 0; + if (shift_channel_to_index < 0) { + // "It's ok we'll fix it in software" + oops = -shift_channel_to_index; + shift_channel_to_index = 0; + } + + uint index_msb = index_shift + lut_index_width - 1; + + c = interp_default_config(); + interp_config_set_shift(&c, shift_channel_to_index); + interp_config_set_mask(&c, index_msb - (channel_msb - channel_lsb), index_msb); + interp_set_config(interp, 0, &c); + + c = interp_default_config(); + interp_config_set_shift(&c, pixel_width + shift_channel_to_index); + interp_config_set_mask(&c, index_msb - (channel_msb - channel_lsb), index_msb); + 
interp_config_set_cross_input(&c, true); + interp_set_config(interp, 1, &c); + + interp->base[0] = (uint32_t)lutbase; + interp->base[1] = (uint32_t)lutbase; + + return oops; +} + +// Extract up to 6 bits from a buffer of 16 bit pixels, and produce a buffer +// of TMDS symbols from this colour channel. Number of pixels must be even, +// pixel buffer must be word-aligned. + +void __not_in_flash_func(tmds_encode_data_channel_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) { + interp_hw_save_t interp0_save; + interp_save(interp0_hw, &interp0_save); + int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 16, 6, tmds_table); + if (require_lshift) + tmds_encode_loop_16bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift); + else + tmds_encode_loop_16bpp(pixbuf, symbuf, n_pix); + interp_restore(interp0_hw, &interp0_save); +} + +// As above, but 8 bits per pixel, multiple of 4 pixels, and still word-aligned. +void __not_in_flash_func(tmds_encode_data_channel_8bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) { + interp_hw_save_t interp0_save, interp1_save; + interp_save(interp0_hw, &interp0_save); + interp_save(interp1_hw, &interp1_save); + // Note that for 8bpp, some left shift is always required for pixel 0 (any + // channel), which destroys some MSBs of pixel 3. 
To get around this, pixel + // data sent to interp1 is *not left-shifted* + int require_lshift = configure_interp_for_addrgen(interp0_hw, channel_msb, channel_lsb, 0, 8, 6, tmds_table); + int lshift_upper = configure_interp_for_addrgen(interp1_hw, channel_msb, channel_lsb, 16, 8, 6, tmds_table); + assert(!lshift_upper); (void)lshift_upper; + if (require_lshift) + tmds_encode_loop_8bpp_leftshift(pixbuf, symbuf, n_pix, require_lshift); + else + tmds_encode_loop_8bpp(pixbuf, symbuf, n_pix); + interp_restore(interp0_hw, &interp0_save); + interp_restore(interp1_hw, &interp1_save); +} + +// ---------------------------------------------------------------------------- +// Code for full-resolution TMDS encode (barely possible, utterly impractical): + +// Different scheme used for full res as the fun pixel-doubling DC balance +// trick doesn't work, so we need to actually do running disparity. ACCUM0 has +// pixel data, ACCUM1 has running disparity. INTERP0 is used to process even +// pixels, and INTERP1 for odd pixels. Note this means that even and odd +// symbols have their DC balance handled separately, which is not to spec. 
+ +static int __not_in_flash_func(configure_interp_for_addrgen_fullres)(interp_hw_t *interp, uint channel_msb, uint channel_lsb, uint lut_index_width, const uint32_t *lutbase) { + const uint index_shift = 2; // scaled lookup for 4-byte LUT entries + + int shift_channel_to_index = channel_msb - (lut_index_width - 1) - index_shift; + int oops = 0; + if (shift_channel_to_index < 0) { + // "It's ok we'll fix it in software" + oops = -shift_channel_to_index; + shift_channel_to_index = 0; + } + + uint index_msb = index_shift + lut_index_width - 1; + + interp_config c; + // Shift and mask colour channel to lower 6 bits of LUT index (note lut_index_width excludes disparity sign) + c = interp_default_config(); + interp_config_set_shift(&c, shift_channel_to_index); + interp_config_set_mask(&c, index_msb - (channel_msb - channel_lsb), index_msb); + interp_set_config(interp, 0, &c); + + // Concatenate disparity (ACCUM1) sign onto the LUT index + c = interp_default_config(); + interp_config_set_shift(&c, 30 - index_msb); + interp_config_set_mask(&c, index_msb + 1, index_msb + 1); + interp_set_config(interp, 1, &c); + + interp->base[2] = (uint32_t)lutbase; + + return oops; +} + +void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb) { + uint core = get_core_num(); +#if !TMDS_FULLRES_NO_INTERP_SAVE + interp_hw_save_t interp0_save, interp1_save; + interp_save(interp0_hw, &interp0_save); + interp_save(interp1_hw, &interp1_save); +#endif + + // There is a copy of the inner loop and the LUT in both scratch X and + // scratch Y memories. Use X on core 1 and Y on core 0 so the cores don't + // tread on each other's toes too much. + const uint32_t *lutbase = core ? 
// Bit imbalance (number of ones minus number of zeros) of each 4-bit value.
static const int8_t imbalance_lookup[16] = { -4, -2, -2, 0, -2, 0, 0, 2, -2, 0, 0, 2, 0, 2, 2, 4 };

// Ones minus zeros over the low 8 bits of x (-8..+8). Bits above bit 7 are
// ignored, so an out-of-range argument can no longer index past the table.
static inline int byte_imbalance(uint32_t x)
{
	return imbalance_lookup[(x >> 4) & 0xF] + imbalance_lookup[x & 0xF];
}

// Place a signed disparity in the 6 MSBs of a palette word (two's complement).
// The conversion to unsigned happens *before* the shift: shifting a signed
// int such as 62 left by 26 is not representable in int and is undefined
// behaviour in C.
static inline uint32_t tmds_disparity_field(int disparity)
{
	return ((uint32_t)disparity & 0x3Fu) << 26;
}

// TMDS-encode one 8-bit channel value into the two alternatives needed for
// running-disparity selection.
//
// Each output word holds the 10-bit symbol in bits 9:0 and that symbol's own
// disparity, as a 6-bit signed value, in bits 31:26:
//   *negative_balance_sym - the variant whose disparity is <= 0
//   *positive_balance_sym - the variant whose disparity is >= 0
// When the low 8 bits of the symbol are already balanced both outputs receive
// the same word, with an empty disparity field.
static void tmds_encode_symbols(uint8_t pixel, uint32_t* negative_balance_sym, uint32_t* positive_balance_sym)
{
	// Stage 1: transition minimisation. XNOR chain when the input has more
	// ones than zeros (or is balanced with bit 0 clear), XOR chain otherwise;
	// bit 8 records that XOR was used.
	const int pixel_imbalance = byte_imbalance(pixel);
	uint32_t sym = pixel & 1;
	const int use_xnor = pixel_imbalance > 0 || (pixel_imbalance == 0 && sym == 0);
	for (int i = 0; i < 7; ++i) {
		uint32_t bit = ((sym >> i) ^ ((uint32_t)pixel >> (i + 1))) & 1u;
		if (use_xnor) {
			bit ^= 1u;
		}
		sym |= bit << (i + 1);
	}
	if (!use_xnor) {
		sym |= 0x100;
	}

	// Stage 2: DC balance.
	const int imbalance = byte_imbalance(sym & 0xFF);
	if (imbalance == 0) {
		// Balanced data byte: the choice does not depend on running disparity
		if ((sym & 0x100) == 0) {
			sym ^= 0x2ff;
		}
		*positive_balance_sym = sym;
		*negative_balance_sym = sym;
		return;
	}

	// Two candidates: the symbol as is (bit 9 clear) and the symbol with its
	// data bits inverted (bit 9 set). Each disparity is the data byte's
	// imbalance plus the contribution of bits 9:8.
	const uint32_t xor_flag = sym >> 8; // 0 or 1
	const uint32_t plain = sym | tmds_disparity_field(imbalance + imbalance_lookup[xor_flag] + 2);
	const uint32_t inverted = (sym ^ 0x2ff) | tmds_disparity_field(-imbalance + imbalance_lookup[2 ^ xor_flag] + 2);

	if (imbalance > 0) {
		*negative_balance_sym = inverted;
		*positive_balance_sym = plain;
	}
	else {
		*negative_balance_sym = plain;
		*positive_balance_sym = inverted;
	}
}

// This takes a 16-bit (RGB 565) colour palette and makes palettes of TMDS symbols suitable
// for performing fullres encode.
// The TMDS palette buffer should be 6 * n_palette words long: for each of
// blue, green and red (in that order) n_palette negative-balance words
// followed by n_palette positive-balance words.
// n_palette must be a power of 2 <= 256.
void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette) {
	uint32_t *tmds_palette_blue = tmds_palette;
	uint32_t *tmds_palette_green = tmds_palette + 2 * n_palette;
	uint32_t *tmds_palette_red = tmds_palette + 4 * n_palette;
	for (size_t i = 0; i < n_palette; ++i) {
		// Expand each 5/6/5 field to 8 bits, left-justified (low bits zero)
		const uint16_t colour = palette[i];
		const uint8_t blue = (colour << 3) & 0xf8;
		const uint8_t green = (colour >> 3) & 0xfc;
		const uint8_t red = (colour >> 8) & 0xf8;
		tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]);
		tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]);
		tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]);
	}
}
+void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *tmds_palette, size_t n_palette) { + uint32_t* tmds_palette_blue = tmds_palette; + uint32_t* tmds_palette_green = tmds_palette + 2 * n_palette; + uint32_t* tmds_palette_red = tmds_palette + 4 * n_palette; + for (int i = 0; i < n_palette; ++i) { + uint16_t blue = palette[i] & 0xff; + uint16_t green = (palette[i] >> 8) & 0xff; + uint16_t red = (palette[i] >> 16) & 0xff; + tmds_encode_symbols(blue, &tmds_palette_blue[i], &tmds_palette_blue[i + n_palette]); + tmds_encode_symbols(green, &tmds_palette_green[i], &tmds_palette_green[i + n_palette]); + tmds_encode_symbols(red, &tmds_palette_red[i], &tmds_palette_red[i + n_palette]); + } +} + +// Encode palette data for all 3 channels. +// pixbuf is an array of n_pix 8-bit wide pixels containing palette values (32-bit word aligned) +// tmds_palette is a palette of TMDS symbols produced by tmds_setup_palette_symbols +// symbuf is 3*n_pix 32-bit words, this function writes the symbol values for each of the channels to it. +void __not_in_flash_func(tmds_encode_palette_data)(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits) { + uint core = get_core_num(); +#if !TMDS_FULLRES_NO_INTERP_SAVE + interp_hw_save_t interp0_save, interp1_save; + interp_save(interp0_hw, &interp0_save); + interp_save(interp1_hw, &interp1_save); +#endif + + interp0_hw->base[2] = (uint32_t)tmds_palette; + interp1_hw->base[2] = (uint32_t)tmds_palette; + + // Lane 0 on both interpolators masks the palette bits, starting at bit 2, + // The second interpolator also shifts to read the 2nd or 4th byte of the word. 
+ interp0_hw->ctrl[0] = + (2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | + ((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB); + interp1_hw->ctrl[0] = + (8 << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) | + (2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | + ((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB); + + // Lane 1 shifts and masks the sign bit into the right position to add to the symbol + // table index to choose the negative disparity symbols if the sign is negative. + const uint32_t ctrl_lane_1 = + ((31 - (palette_bits + 2)) << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) | + (palette_bits + 2) * ((1 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | (1 << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB)); + interp0_hw->ctrl[1] = ctrl_lane_1; + interp1_hw->ctrl[1] = ctrl_lane_1; + + if (core) { + tmds_palette_encode_loop_x(pixbuf, symbuf, n_pix); + + interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits)); + interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits)); + tmds_palette_encode_loop_x(pixbuf, symbuf + (n_pix >> 1), n_pix); + + interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits)); + interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits)); + tmds_palette_encode_loop_x(pixbuf, symbuf + n_pix, n_pix); + } else { + tmds_palette_encode_loop_y(pixbuf, symbuf, n_pix); + + interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits)); + interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits)); + tmds_palette_encode_loop_y(pixbuf, symbuf + (n_pix >> 1), n_pix); + + interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits)); + interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits)); + tmds_palette_encode_loop_y(pixbuf, symbuf + n_pix, n_pix); + } + +#if !TMDS_FULLRES_NO_INTERP_SAVE + interp_restore(interp0_hw, &interp0_save); + interp_restore(interp1_hw, &interp1_save); +#endif +} diff --git a/src/libdvi/tmds_encode.h b/src/libdvi/tmds_encode.h new file mode 100644 index 0000000..633d630 --- 
/dev/null +++ b/src/libdvi/tmds_encode.h @@ -0,0 +1,46 @@ +#ifndef _TMDS_ENCODE_H_ +#define _TMDS_ENCODE_H_ + +#include "hardware/interp.h" +#include "dvi_config_defs.h" + +#if defined(__cplusplus) +extern "C" +{ +#endif + +// Functions from tmds_encode.c +void tmds_encode_data_channel_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb); +void tmds_encode_data_channel_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb); +void tmds_encode_data_channel_fullres_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb); +void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *symbuf, size_t n_palette); +void tmds_setup_palette24_symbols(const uint32_t *palette, uint32_t *symbuf, size_t n_palette); +void tmds_encode_palette_data(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits); + +// Functions from tmds_encode.S + +void tmds_encode_1bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_2bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); + +// Uses interp0: +void tmds_encode_loop_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_loop_16bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift); + +// Uses interp0 and interp1: +void tmds_encode_loop_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_encode_loop_8bpp_leftshift(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift); + +// Uses interp0 and interp1: +// (Note a copy is provided in scratch memories X and Y) +void tmds_fullres_encode_loop_16bpp_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_fullres_encode_loop_16bpp_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix); +void tmds_fullres_encode_loop_16bpp_leftshift_x(const uint32_t *pixbuf, uint32_t 
.program tmds_encode_1bpp

; 1bpp black/white pixels go in, TMDS symbols come out.
; Each output word contains two output symbols, each 10 bits in size,
; right-justified. The least-significant symbol is displayed first.
;
; We can encode using the following LUT: (yes this is compliant)
;
; x % 2 | colour | symbol
; ------+--------+-------
;   0   |   0    | 0x100
;   0   |   1    | 0x200
;   1   |   0    | 0x1ff
;   1   |   1    | 0x2ff
;
; OSR: shift to right, autopull, threshold 32
; ISR: shift to right, autopush, threshold 24
;
; Note the ISR needs to be shifted to *right* so that we can get the first
; pixel in the less-significant position. Threshold 24 so we can get 8x 0-bits
; at the LSBs for free :)
;
; Bits are shifted into the ISR in this order (first bit ends up at bit 8 of
; the pushed word, the 8 bits below it stay zero):
;   ~p0, p0          -> bits 9:8 of the even symbol (bits 7:0 are the free zeros)
;   1 x 8            -> bits 7:0 of the odd symbol
;   ~p1, p1, 0 x 12  -> bits 9:8 of the odd symbol, then zero padding

even_pixel:
	out x, 1      ; x = next pixel bit from the OSR
	mov y, ~x     ; y = inverted pixel
	in y, 1       ; even symbol bit 8 = ~pixel
	in x, 1       ; even symbol bit 9 = pixel

odd_pixel:
	mov x, ~null  ; x = all-ones
	in x, 8       ; odd symbol bits 7:0 = 0xff
	out x, 1      ; x = next pixel bit from the OSR
	mov y, ~x     ; y = inverted pixel
	in y, 1       ; odd symbol bit 8 = ~pixel
	in x, 13 ; Odd symbol bit 9 = pixel, plus padding. Bring total shift to 24, triggering push.

% c-sdk {
// Load the program into `pio`, configure state machine `sm` with the shift
// settings listed in the program header, and start it.
static inline void tmds_encode_1bpp_init(PIO pio, uint sm) {
	uint offset = pio_add_program(pio, &tmds_encode_1bpp_program);
	pio_sm_config c = tmds_encode_1bpp_program_get_default_config(offset);
	// OSR: shift to right, autopull, threshold 32
	sm_config_set_out_shift(&c, true, true, 32);
	// ISR: shift to right, autopush, threshold 24
	sm_config_set_in_shift(&c, true, true, 24);
	pio_sm_init(pio, sm, offset, &c);
	pio_sm_set_enabled(pio, sm, true);
}
%}
+0x7fd00u, +0x40dfcu, +0x41df8u, +0x7ed04u, +0x43df0u, +0x7cd0cu, +0x7dd08u, +0x42df4u, +0x47de0u, +0x78d1cu, +0x79d18u, +0x46de4u, +0x7bd10u, +0x44decu, +0x45de8u, +0xafa41u, +0x4fdc0u, +0x70d3cu, +0x71d38u, +0x4edc4u, +0x73d30u, +0x4cdccu, +0x4ddc8u, +0xa7a61u, +0x77d20u, +0x48ddcu, +0x49dd8u, +0xa3a71u, +0x4bdd0u, +0xa1a79u, +0xa0a7du, +0x9fa81u, +0x5fd80u, +0x60d7cu, +0x61d78u, +0x5ed84u, +0x63d70u, +0x5cd8cu, +0x5dd88u, +0xb7a21u, +0x67d60u, +0x58d9cu, +0x59d98u, +0xb3a31u, +0x5bd90u, +0xb1a39u, +0xb0a3du, +0x8fac1u, +0x6fd40u, +0x50dbcu, +0x51db8u, +0xbba11u, +0x53db0u, +0xb9a19u, +0xb8a1du, +0x87ae1u, +0x57da0u, +0xbda09u, +0xbca0du, +0x83af1u, +0xbea05u, +0x81af9u, +0x80afdu, +0xbfa01u, diff --git a/src/libdvi/tmds_table_fullres.h b/src/libdvi/tmds_table_fullres.h new file mode 100644 index 0000000..872d7ff --- /dev/null +++ b/src/libdvi/tmds_table_fullres.h @@ -0,0 +1,139 @@ +// Each entry consists of a 10 bit TMDS symbol in pseudo-differential format +// (10 LSBs) and the symbol's disparity as a 6 bit signed integer (the 6 +// MSBs). There is a 16 bit gap in between them, which is actually vital for +// the way the TMDS encode works! +// +// There are 128 1-word entries. The lookup index should be the concatenation +// of the sign bit of current running disparity, with 6 bits of colour channel +// data. 
+ +// Non-negative running disparity: +0xe0000100, +0xf8000303, +0x00000307, +0xe8000104, +0x000001f0, +0xf000010c, +0xe8000108, +0x0000030b, +0xf80001e0, +0xf800011c, +0xf0000118, +0x000001e4, +0xe8000110, +0x00000313, +0x000001e8, +0xf0000241, +0xf00001c0, +0x0000013c, +0xf8000138, +0xf80001c4, +0xf0000130, +0x000001cc, +0xf80001c8, +0xf8000261, +0xe8000120, +0x00000323, +0x000001d8, +0x00000271, +0xf80001d0, +0xf0000086, +0xe8000082, +0xf0000281, +0xe8000180, +0x00000383, +0x00000178, +0xf0000184, +0xf8000170, +0xf800018c, +0xf0000188, +0xf0000221, +0xf0000160, +0x0000019c, +0xf8000198, +0xf8000231, +0xf0000190, +0x00000239, +0xf00000c2, +0xf80002c1, +0xe8000140, +0x00000343, +0x000001b8, +0xf0000211, +0xf80001b0, +0xf8000219, +0x0000021d, +0x000002e1, +0xf00001a0, +0xf0000209, +0xf800020d, +0xf000000e, +0xf0000205, +0xe8000006, +0xe0000002, +0xe8000201, +// Negative running disparity: +0x280003ff, +0x100001fc, +0x080001f8, +0x200003fb, +0x000001f0, +0x180003f3, +0x200003f7, +0x080001f4, +0x1000031f, +0x100003e3, +0x180003e7, +0x000001e4, +0x200003ef, +0x080001ec, +0x000001e8, +0x080000be, +0x1800033f, +0x0000013c, +0x100003c7, +0x1000033b, +0x180003cf, +0x000001cc, +0x10000337, +0x0000009e, +0x200003df, +0x080001dc, +0x000001d8, +0x00000271, +0x1000032f, +0x08000279, +0x1000027d, +0x0800007e, +0x2000037f, +0x0800017c, +0x00000178, +0x1800037b, +0x1000038f, +0x10000373, +0x18000377, +0x080000de, +0x1800039f, +0x0000019c, +0x10000367, +0x000000ce, +0x1800036f, +0x00000239, +0x0800023d, +0x0000003e, +0x200003bf, +0x080001bc, +0x000001b8, +0x080000ee, +0x1000034f, +0x000000e6, +0x0000021d, +0x000002e1, +0x1800035f, +0x080000f6, +0x000000f2, +0x080002f1, +0x080000fa, +0x100002f9, +0x180002fd, +0x100000fe, diff --git a/src/libdvi/tmds_table_gen.py b/src/libdvi/tmds_table_gen.py new file mode 100755 index 0000000..0ad554e --- /dev/null +++ b/src/libdvi/tmds_table_gen.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 + +# The key fact is that, if x is even, and the encoder 
def popcount(x):
    """Return the number of set bits in the non-negative integer x."""
    count = 0
    while x:
        # Clear the lowest set bit; one iteration per set bit
        x &= x - 1
        count += 1
    return count

# Equivalent to N1(q) - N0(q) in the DVI spec
def byteimbalance(x):
    """Ones minus zeros for an 8-bit value."""
    ones = popcount(x)
    return 2 * ones - 8

# This is a direct translation of "Figure 3-5. T.M.D.S. Encode Algorithm" on
# page 29 of DVI 1.0 spec

class TMDSEncode:
    """Stateful TMDS encoder; `imbalance` is the running DC disparity."""

    # Control symbols, indexed by the 2-bit control value
    ctrl_syms = {
        0b00: 0b1101010100,
        0b01: 0b0010101011,
        0b10: 0b0101010100,
        0b11: 0b1010101011
    }

    def __init__(self):
        self.imbalance = 0

    def encode(self, d, c, de):
        """Encode data byte d (when de is true) or control value c (when not).

        Returns the 10-bit symbol and updates the running imbalance; a
        control symbol resets the imbalance to 0.
        """
        if not de:
            self.imbalance = 0
            return self.ctrl_syms[c]

        # Stage 1: minimise transitions. Chain each new bit off the previous
        # output bit with XNOR or XOR; bit 8 is set when XOR was used.
        ones = popcount(d)
        use_xnor = ones > 4 or (ones == 4 and not d & 0x1)
        q_m = d & 0x1
        for bit in range(1, 8):
            chained = ((q_m >> (bit - 1)) ^ (d >> bit)) & 0x1
            if use_xnor:
                chained ^= 0x1
            q_m |= chained << bit
        if not use_xnor:
            q_m |= 0x100

        # Stage 2: correct DC balance, optionally inverting the data bits
        # (and setting bit 9) to pull the running imbalance towards zero.
        inversion_mask = 0x2ff
        xor_flag = q_m & 0x100
        data_bias = byteimbalance(q_m & 0xff)

        if self.imbalance == 0 or data_bias == 0:
            if xor_flag:
                self.imbalance += data_bias
                return q_m
            self.imbalance -= data_bias
            return q_m ^ inversion_mask

        if (self.imbalance > 0) == (data_bias > 0):
            self.imbalance += (xor_flag >> 7) - data_bias
            return q_m ^ inversion_mask

        self.imbalance += data_bias - ((~q_m & 0x100) >> 7)
        return q_m

# Turn a bitmap of width n into n pairs of pseudo-differential bits
def differentialise(x, n):
    """Map each bit of x, msb first, to 0b01 (bit set) or 0b10 (bit clear)."""
    pairs = 0
    for _ in range(n):
        # x is shifted up each round, so bit n - 1 always holds the next bit
        pair = 0b01 if x & (1 << (n - 1)) else 0b10
        pairs = (pairs << 2) | pair
        x <<= 1
    return pairs
== 0) + print(f".word 0x{sym1 << 10 | sym0:05x} // {i0:02b}, {i1:02b}") diff --git a/src/libdvi/util_queue_u32_inline.h b/src/libdvi/util_queue_u32_inline.h new file mode 100644 index 0000000..32a1413 --- /dev/null +++ b/src/libdvi/util_queue_u32_inline.h @@ -0,0 +1,83 @@ +#ifndef _UTIL_QUEUE_U32_INLINE_H +#define _UTIL_QUEUE_U32_INLINE_H + +// Faster versions of the functions found in pico/util/queue.h, for the common +// case of 32-bit-sized elements. Can be used on the same queue data +// structure, and mixed freely with the generic access methods, as long as +// element_size == 4. + +#include "pico/util/queue.h" +#include "hardware/sync.h" + +static inline uint16_t _queue_inc_index_u32(queue_t *q, uint16_t index) { + if (++index > q->element_count) { // > because we have element_count + 1 elements + index = 0; + } + return index; +} + +static inline bool queue_try_add_u32(queue_t *q, void *data) { + bool success = false; + uint32_t flags = spin_lock_blocking(q->core.spin_lock); + if (queue_get_level_unsafe(q) != q->element_count) { + ((uint32_t*)q->data)[q->wptr] = *(uint32_t*)data; + q->wptr = _queue_inc_index_u32(q, q->wptr); + success = true; + } + spin_unlock(q->core.spin_lock, flags); + if (success) __sev(); + return success; +} + +static inline bool queue_try_remove_u32(queue_t *q, void *data) { + bool success = false; + uint32_t flags = spin_lock_blocking(q->core.spin_lock); + if (queue_get_level_unsafe(q) != 0) { + *(uint32_t*)data = ((uint32_t*)q->data)[q->rptr]; + q->rptr = _queue_inc_index_u32(q, q->rptr); + success = true; + } + spin_unlock(q->core.spin_lock, flags); + if (success) __sev(); + return success; +} + +static inline bool queue_try_peek_u32(queue_t *q, void *data) { + bool success = false; + uint32_t flags = spin_lock_blocking(q->core.spin_lock); + if (queue_get_level_unsafe(q) != 0) { + *(uint32_t*)data = ((uint32_t*)q->data)[q->rptr]; + success = true; + } + spin_unlock(q->core.spin_lock, flags); + return success; +} + +static inline 
void queue_add_blocking_u32(queue_t *q, void *data) { + bool done; + do { + done = queue_try_add_u32(q, data); + if (done) break; + __wfe(); + } while (true); +} + +static inline void queue_remove_blocking_u32(queue_t *q, void *data) { + bool done; + do { + done = queue_try_remove_u32(q, data); + if (done) break; + __wfe(); + } while (true); +} + +static inline void queue_peek_blocking_u32(queue_t *q, void *data) { + bool done; + do { + done = queue_try_peek_u32(q, data); + if (done) break; + __wfe(); + } while (true); +} + +#endif