674 lines
29 KiB
C
674 lines
29 KiB
C
// Device- and environment-neutral core matrix-driving functionality.
|
||
// See notes near top of arch.h regarding assumptions of hardware
|
||
// "common ground." If you find yourself doing an "#ifdef ARDUINO" or
|
||
// "#ifdef _SAMD21_" in this file, STOP. Idea is that the code in this
|
||
// file is neutral and portable (within aforementioned assumptions).
|
||
// Nonportable elements should appear in arch.h. If arch.h functionality
|
||
// is lacking, extend it there, do not go making device- or environment-
|
||
// specific cases within this file.
|
||
|
||
// Function names are intentionally a little obtuse, idea is that one writes
|
||
// a more sensible wrapper around this for specific environments (e.g. the
|
||
// Arduino stuff in Adafruit_Protomatter.cpp). The "_PM_" prefix on most
|
||
// things hopefully makes function and variable name collisions much less
|
||
// likely with one's own code.
|
||
|
||
#include <stddef.h>
|
||
#include <string.h>
|
||
#include "core.h" // enums and structs
|
||
#include "arch.h" // Do NOT include this in any other source files
|
||
|
||
// Overall matrix refresh rate (frames/second) is a function of matrix width
|
||
// and chain length, number of address lines, number of bit planes, CPU speed
|
||
// and whether or not a GPIO toggle register is available. There is no "this
|
||
// will run at X-frames-per-second" constant figure. You typically just have
|
||
// to try it out and perhaps trade off some bit planes for refresh rate until
|
||
// the image looks good and stable. Anything over 100 Hz is usually passable,
|
||
// around 250 Hz is where things firm up. And while this could proceed higher
|
||
// in some situations, the tradeoff is that faster rates use progressively
|
||
// more CPU time (because it's timer interrupt based and not using DMA or
|
||
// special peripherals). So a throttle is set here, an approximate maximum
|
||
// frame rate which the software will attempt to avoid exceeding (but may
|
||
// refresh slower than this, and in many cases will...just need to set an
|
||
// upper limit to avoid excessive CPU load). An incredibly long comment block
|
||
// for a single constant, thank you for coming to my TED talk!
|
||
#define _PM_MAX_REFRESH_HZ 250
|
||
|
||
// Time (in microseconds) to pause following any change in address lines
|
||
// (individually or collectively). Some matrices respond slowly there...
|
||
// must pause on change for matrix to catch up. Defined here (rather than
|
||
// arch.h) because it's not architecture-specific.
|
||
#define _PM_ROW_DELAY 8
|
||
|
||
// These are the lowest-level functions for issing data to matrices.
|
||
// There are three versions because it depends on how the six RGB data bits
|
||
// (and clock bit) are arranged within a 32-bit PORT register. If all six
|
||
// (seven) fit within one byte or word of the PORT, the library's memory
|
||
// use (and corresponding data-issuing function) change. This will also have
|
||
// an impact on parallel chains in the future, where the number of concurrent
|
||
// RGB data bits isn't always six, but some multiple thereof (i.e. up to five
|
||
// parallel outputs -- 30 RGB bits + clock -- on a 32-bit PORT, though that's
|
||
// largely hypothetical as the chance of finding a PORT with that many bits
|
||
// exposed and NOT interfering with other peripherals on a board is highly
|
||
// improbable. But I could see four happening, maybe on a Grand Central or
|
||
// other kitchen-sink board.
|
||
static void blast_byte(Protomatter_core *core, uint8_t *data);
|
||
static void blast_word(Protomatter_core *core, uint16_t *data);
|
||
static void blast_long(Protomatter_core *core, uint32_t *data);
|
||
|
||
// Validate and populate vital elements of core structure.
|
||
// Does NOT allocate core struct -- calling function must provide that.
|
||
// (In the Arduino C++ library, it’s part of the Protomatter class.)
|
||
ProtomatterStatus _PM_init(Protomatter_core *core,
|
||
uint16_t bitWidth, uint8_t bitDepth,
|
||
uint8_t rgbCount, uint8_t *rgbList,
|
||
uint8_t addrCount, uint8_t *addrList,
|
||
uint8_t clockPin, uint8_t latchPin, uint8_t oePin,
|
||
bool doubleBuffer, void *timer) {
|
||
if(!core) return PROTOMATTER_ERR_ARG;
|
||
|
||
if(rgbCount > 5) rgbCount = 5; // Max 5 in parallel (32-bit PORT)
|
||
if(addrCount > 5) addrCount = 5; // Max 5 address lines (A-E)
|
||
// bitDepth is NOT constrained here, handle in calling function
|
||
// (varies with implementation, e.g. GFX lib is max 6 bitplanes,
|
||
// but might be more or less elsewhere)
|
||
|
||
#if defined(_PM_TIMER_DEFAULT)
|
||
// If NULL timer was passed in (the default case for the constructor),
|
||
// use default value from arch.h. For example, in the Arduino case it's
|
||
// tied to TC4 specifically.
|
||
if(timer == NULL) timer = _PM_TIMER_DEFAULT;
|
||
#else
|
||
if(timer == NULL) return PROTOMATTER_ERR_ARG;
|
||
#endif
|
||
|
||
core->timer = timer;
|
||
core->width = bitWidth; // Total matrix chain length in bits
|
||
core->numPlanes = bitDepth;
|
||
core->parallel = rgbCount;
|
||
core->numAddressLines = addrCount;
|
||
core->clockPin = clockPin;
|
||
core->latch.pin = latchPin;
|
||
core->oe.pin = oePin;
|
||
core->doubleBuffer = doubleBuffer;
|
||
core->addr = NULL;
|
||
core->screenData = NULL;
|
||
|
||
// Make a copy of the rgbList and addrList tables in case they're
|
||
// passed from local vars on the stack or some other non-persistent
|
||
// source. screenData is NOT allocated here because data size (byte,
|
||
// word, long) is not known until the begin function evaluates all
|
||
// the pin bitmasks.
|
||
|
||
rgbCount *= 6; // Convert parallel count to pin count
|
||
if((core->rgbPins = (uint8_t *)_PM_ALLOCATOR(rgbCount * sizeof(uint8_t)))) {
|
||
if((core->addr = (_PM_pin *)_PM_ALLOCATOR(addrCount * sizeof(_PM_pin)))) {
|
||
memcpy(core->rgbPins, rgbList, rgbCount * sizeof(uint8_t));
|
||
for(uint8_t i=0; i<addrCount; i++) {
|
||
core->addr[i].pin = addrList[i];
|
||
}
|
||
return PROTOMATTER_OK;
|
||
}
|
||
_PM_FREE(core->rgbPins);
|
||
core->rgbPins = NULL;
|
||
}
|
||
return PROTOMATTER_ERR_MALLOC;
|
||
}
|
||
|
||
// Allocate display buffers and populate additional elements.
|
||
ProtomatterStatus _PM_begin(Protomatter_core *core) {
|
||
if(!core) return PROTOMATTER_ERR_ARG;
|
||
|
||
if(!core->rgbPins) { // NULL if copy failed to allocate
|
||
return PROTOMATTER_ERR_MALLOC;
|
||
}
|
||
|
||
// Verify that rgbPins and clockPin are all on the same PORT. If not,
|
||
// return an error. Pin list is not freed; please call dealloc function.
|
||
// Also get bitmask of which bits within 32-bit PORT register are
|
||
// referenced.
|
||
uint8_t *port = (uint8_t *)_PM_portOutRegister(core->clockPin);
|
||
#if defined(_PM_portToggleRegister)
|
||
// If a bit-toggle register is present, the clock pin is included
|
||
// in determining which bytes of the PORT register are used (and thus
|
||
// the data storage efficiency).
|
||
uint32_t bitMask = _PM_portBitMask(core->clockPin);
|
||
#else
|
||
// If no bit-toggle register, clock pin can be on any bit, doesn't
|
||
// affect storage efficiency.
|
||
uint32_t bitMask = 0;
|
||
#endif
|
||
|
||
for(uint8_t i=0; i<core->parallel * 6; i++) {
|
||
uint8_t *p2 = (uint8_t *)_PM_portOutRegister(core->rgbPins[i]);
|
||
if(p2 != port) {
|
||
return PROTOMATTER_ERR_PINS;
|
||
}
|
||
bitMask |= _PM_portBitMask(core->rgbPins[i]);
|
||
}
|
||
|
||
// RGB + clock are on same port, we can proceed...
|
||
|
||
// Determine data type for internal representation. If all the data
|
||
// bitmasks (and possibly clock bitmask, depending whether toggle-bits
|
||
// register is present) are in the same byte, this can be stored more
|
||
// compact than if they're spread across a word or long.
|
||
uint8_t byteMask = 0;
|
||
if(bitMask & 0xFF000000) byteMask |= 0b1000;
|
||
if(bitMask & 0x00FF0000) byteMask |= 0b0100;
|
||
if(bitMask & 0x0000FF00) byteMask |= 0b0010;
|
||
if(bitMask & 0x000000FF) byteMask |= 0b0001;
|
||
switch(byteMask) {
|
||
case 0b0001: // If all PORT bits are in the same byte...
|
||
case 0b0010:
|
||
case 0b0100:
|
||
case 0b1000:
|
||
core->bytesPerElement = 1; // Use 8-bit PORT accesses.
|
||
break;
|
||
case 0b0011: // If all PORT bits in upper/lower word...
|
||
case 0b1100:
|
||
core->bytesPerElement = 2; // Use 16-bit PORT accesses.
|
||
// Although some devices might tolerate unaligned 16-bit accesses
|
||
// ('middle' word of 32-bit PORT), that is NOT handled here.
|
||
// It's a portability liability.
|
||
break;
|
||
default: // Any other situation...
|
||
core->bytesPerElement = 4; // Use 32-bit PORT accesses.
|
||
break;
|
||
}
|
||
|
||
// Planning for screen data allocation...
|
||
core->numRowPairs = 1 << core->numAddressLines;
|
||
uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
|
||
uint16_t columns = chunks * _PM_chunkSize; // Padded matrix width
|
||
uint32_t screenBytes = columns * core->numRowPairs * core->numPlanes *
|
||
core->bytesPerElement;
|
||
|
||
core->bufferSize = screenBytes; // Bytes per matrix buffer (1 or 2)
|
||
if(core->doubleBuffer) screenBytes *= 2; // Total for matrix buffer(s)
|
||
uint32_t rgbMaskBytes = core->parallel * 6 * core->bytesPerElement;
|
||
|
||
// Allocate matrix buffer(s). Don't worry about the return type...
|
||
// though we might be using words or longs for certain pin configs,
|
||
// _PM_ALLOCATOR() by definition always aligns to the longest type.
|
||
if(!(core->screenData = (uint8_t *)_PM_ALLOCATOR(screenBytes + rgbMaskBytes))) {
|
||
return PROTOMATTER_ERR_MALLOC;
|
||
}
|
||
|
||
// rgbMask data follows the matrix buffer(s)
|
||
core->rgbMask = core->screenData + screenBytes;
|
||
|
||
#if !defined(_PM_portToggleRegister)
|
||
// Clear entire screenData buffer so there's no cruft in any pad bytes
|
||
// (if using toggle register, each is set to clockMask below instead).
|
||
memset(core->screenData, 0, screenBytes);
|
||
#endif
|
||
|
||
// Figure out clockMask and rgbAndClockMask, clear matrix buffers
|
||
if(core->bytesPerElement == 1) {
|
||
core->portOffset = _PM_byteOffset(core->rgbPins[0]);
|
||
#if defined(_PM_portToggleRegister)
|
||
// Clock and rgbAndClockMask are 8-bit values
|
||
core->clockMask = _PM_portBitMask(core->clockPin) >>
|
||
(core->portOffset * 8);
|
||
core->rgbAndClockMask = (bitMask >> (core->portOffset * 8)) |
|
||
core->clockMask;
|
||
memset(core->screenData, core->clockMask, screenBytes);
|
||
#else
|
||
// Clock and rgbAndClockMask are 32-bit values
|
||
core->clockMask = _PM_portBitMask(core->clockPin);
|
||
core->rgbAndClockMask = bitMask | core->clockMask;
|
||
#endif
|
||
for(uint8_t i=0; i<core->parallel * 6; i++) {
|
||
((uint8_t *)core->rgbMask)[i] = // Pin bitmasks are 8-bit
|
||
_PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 8);
|
||
}
|
||
} else if(core->bytesPerElement == 2) {
|
||
core->portOffset = _PM_wordOffset(core->rgbPins[0]);
|
||
#if defined(_PM_portToggleRegister)
|
||
// Clock and rgbAndClockMask are 16-bit values
|
||
core->clockMask = _PM_portBitMask(core->clockPin) >>
|
||
(core->portOffset * 16);
|
||
core->rgbAndClockMask = (bitMask >> (core->portOffset * 16)) |
|
||
core->clockMask;
|
||
uint32_t elements = screenBytes / 2;
|
||
for(uint32_t i=0; i<elements; i++) {
|
||
((uint16_t *)core->screenData)[i] = core->clockMask;
|
||
}
|
||
#else
|
||
// Clock and rgbAndClockMask are 32-bit values
|
||
core->clockMask = _PM_portBitMask(core->clockPin);
|
||
core->rgbAndClockMask = bitMask | core->clockMask;
|
||
#endif
|
||
for(uint8_t i=0; i<core->parallel * 6; i++) {
|
||
((uint16_t *)core->rgbMask)[i] = // Pin bitmasks are 16-bit
|
||
_PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 16);
|
||
}
|
||
} else {
|
||
core->portOffset = 0;
|
||
core->clockMask = _PM_portBitMask(core->clockPin);
|
||
core->rgbAndClockMask = bitMask | core->clockMask;
|
||
#if defined(_PM_portToggleRegister)
|
||
uint32_t elements = screenBytes / 4;
|
||
for(uint32_t i=0; i<elements; i++) {
|
||
((uint32_t *)core->screenData)[i] = core->clockMask;
|
||
}
|
||
#endif
|
||
for(uint8_t i=0; i<core->parallel * 6; i++) {
|
||
((uint32_t *)core->rgbMask)[i] = // Pin bitmasks are 32-bit
|
||
_PM_portBitMask(core->rgbPins[i]);
|
||
}
|
||
}
|
||
|
||
// Estimate minimum bitplane #0 period for _PM_MAX_REFRESH_HZ rate.
|
||
uint32_t minPeriodPerFrame = _PM_timerFreq / _PM_MAX_REFRESH_HZ;
|
||
uint32_t minPeriodPerLine = minPeriodPerFrame / core->numRowPairs;
|
||
core->minPeriod = minPeriodPerLine / ((1 << core->numPlanes) - 1);
|
||
if(core->minPeriod < _PM_minMinPeriod) {
|
||
core->minPeriod = _PM_minMinPeriod;
|
||
}
|
||
// Actual frame rate may be lower than this...it's only an estimate
|
||
// and does not factor in things like address line selection delays
|
||
// or interrupt overhead. That's OK, just don't want to exceed this
|
||
// rate, as it'll eat all the CPU cycles.
|
||
// Make a wild guess for the initial bit-zero interval. It's okay
|
||
// that this is off, code adapts to actual timer results pretty quick.
|
||
|
||
core->bitZeroPeriod = core->width * 5; // Initial guesstimate
|
||
|
||
core->activeBuffer = 0;
|
||
|
||
// Configure pins as outputs and initialize their states.
|
||
|
||
core->latch.setReg = _PM_portSetRegister(core->latch.pin);
|
||
core->latch.clearReg = _PM_portClearRegister(core->latch.pin);
|
||
core->latch.bit = _PM_portBitMask(core->latch.pin);
|
||
core->oe.setReg = _PM_portSetRegister(core->oe.pin);
|
||
core->oe.clearReg = _PM_portClearRegister(core->oe.pin);
|
||
core->oe.bit = _PM_portBitMask(core->oe.pin);
|
||
|
||
_PM_pinOutput(core->clockPin);
|
||
_PM_pinLow(core->clockPin); // Init clock LOW
|
||
_PM_pinOutput(core->latch.pin);
|
||
_PM_pinLow(core->latch.pin); // Init latch LOW
|
||
_PM_pinOutput(core->oe.pin);
|
||
_PM_pinHigh(core->oe.pin); // Init OE HIGH (disable output)
|
||
|
||
for(uint8_t i=0; i<core->parallel * 6; i++) {
|
||
_PM_pinOutput(core->rgbPins[i]);
|
||
_PM_pinLow(core->rgbPins[i]);
|
||
}
|
||
#if defined(_PM_portToggleRegister)
|
||
core->addrPortToggle = _PM_portToggleRegister(core->addr[0].pin);
|
||
core->singleAddrPort = 1;
|
||
#endif
|
||
for(uint8_t line=0,bit=1; line<core->numAddressLines; line++, bit<<=1) {
|
||
core->addr[line].setReg =
|
||
_PM_portSetRegister(core->addr[line].pin);
|
||
core->addr[line].clearReg =
|
||
_PM_portClearRegister(core->addr[line].pin);
|
||
core->addr[line].bit =
|
||
_PM_portBitMask(core->addr[line].pin);
|
||
_PM_pinOutput(core->addr[line].pin);
|
||
if(core->prevRow & bit) {
|
||
_PM_pinHigh(core->addr[line].pin);
|
||
} else {
|
||
_PM_pinLow(core->addr[line].pin);
|
||
}
|
||
#if defined(_PM_portToggleRegister)
|
||
// If address pin on different port than addr 0, no singleAddrPort.
|
||
if(_PM_portToggleRegister(core->addr[line].pin) !=
|
||
core->addrPortToggle) {
|
||
core->singleAddrPort = 0;
|
||
}
|
||
#endif
|
||
}
|
||
|
||
// Get pointers to bit set and clear registers (and toggle, if present)
|
||
core->setReg = (uint8_t *)_PM_portSetRegister(core->clockPin);
|
||
core->clearReg = (uint8_t *)_PM_portClearRegister(core->clockPin);
|
||
#if defined(_PM_portToggleRegister)
|
||
core->toggleReg = (uint8_t *)_PM_portToggleRegister(core->clockPin);
|
||
#endif
|
||
|
||
// Reset plane/row counters, config and start timer
|
||
_PM_resume(core);
|
||
|
||
return PROTOMATTER_OK;
|
||
}
|
||
|
||
// Disable (but do not deallocate) a Protomatter matrix. Disables matrix by
|
||
// setting OE pin HIGH and writing all-zero data to matrix shift registers,
|
||
// so it won't halt with lit LEDs.
|
||
void _PM_stop(Protomatter_core *core) {
|
||
if((core)) {
|
||
while(core->swapBuffers); // Wait for any pending buffer swap
|
||
_PM_timerStop(core->timer); // Halt timer
|
||
*core->oe.setReg = core->oe.bit; // Set OE HIGH (disable output)
|
||
// So, in PRINCIPLE, setting OE high would be sufficient...
|
||
// but in case that pin is shared with another function such
|
||
// as the onloard LED (which pulses during bootloading) let's
|
||
// also clear out the matrix shift registers for good measure.
|
||
// Set all RGB pins LOW...
|
||
for(uint8_t i=0; i<core->parallel * 6; i++) {
|
||
_PM_pinLow(core->rgbPins[i]);
|
||
}
|
||
// Clock out bits (just need to toggle clock with RGBs held low)
|
||
for(uint32_t i=0; i<core->width; i++) {
|
||
_PM_pinHigh(core->clockPin);
|
||
_PM_clockHoldHigh;
|
||
_PM_pinLow(core->clockPin);
|
||
_PM_clockHoldLow;
|
||
}
|
||
// Latch data
|
||
*core->latch.setReg = core->latch.bit;
|
||
*core->latch.clearReg = core->latch.bit;
|
||
}
|
||
}
|
||
|
||
void _PM_resume(Protomatter_core *core) {
|
||
if((core)) {
|
||
// Init plane & row to max values so they roll over on 1st interrupt
|
||
core->plane = core->numPlanes - 1;
|
||
core->row = core->numRowPairs - 1;
|
||
core->prevRow = (core->numRowPairs > 1) ? (core->row - 1) : 1;
|
||
core->swapBuffers = 0;
|
||
core->frameCount = 0;
|
||
|
||
_PM_timerInit(core->timer); // Configure timer
|
||
_PM_timerStart(core->timer, 1000); // Start timer
|
||
}
|
||
}
|
||
|
||
// Free memory associated with core structure. Does NOT dealloc struct.
|
||
void _PM_free(Protomatter_core *core) {
|
||
if((core)) {
|
||
_PM_stop(core);
|
||
// TO DO: Set all pins back to inputs here?
|
||
if(core->screenData) _PM_FREE(core->screenData);
|
||
if(core->addr) _PM_FREE(core->addr);
|
||
if(core->rgbPins) {
|
||
_PM_FREE(core->rgbPins);
|
||
core->rgbPins = NULL;
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
// ISR function (in arch.h) calls this function which it extern'd.
|
||
void _PM_row_handler(Protomatter_core *core) {
|
||
|
||
*core->oe.setReg = core->oe.bit; // Disable LED output
|
||
|
||
*core->latch.setReg = core->latch.bit; // Latch data from PRIOR pass
|
||
// Stop timer, save count value at stop
|
||
uint32_t elapsed = _PM_timerStop(core->timer);
|
||
uint8_t prevPlane = core->plane; // Save that plane # for later timing
|
||
*core->latch.clearReg = core->latch.bit; // (split to add a few cycles)
|
||
|
||
// If plane 0 just finished being displayed (plane 1 was loaded on prior
|
||
// pass, or there's only one plane...I know, it's confusing), take note
|
||
// of the elapsed timer value, for subsequent bitplane timing (each
|
||
// plane period is double the previous). Value is filtered slightly to
|
||
// avoid jitter.
|
||
if((prevPlane == 1) || (core->numPlanes == 1)) {
|
||
core->bitZeroPeriod = ((core->bitZeroPeriod * 7) + elapsed) / 8;
|
||
if(core->bitZeroPeriod < core->minPeriod) {
|
||
core->bitZeroPeriod = core->minPeriod;
|
||
}
|
||
}
|
||
|
||
if(prevPlane == 0) { // Plane 0 just finished loading
|
||
#if defined(_PM_portToggleRegister)
|
||
// If all address lines are on a single PORT (and bit toggle is
|
||
// available), do address line change all at once. Even doing all
|
||
// this math takes MUCH less time than the delays required when
|
||
// doing line-by-line changes.
|
||
if(core->singleAddrPort) {
|
||
// Make bitmasks of prior and new row bits
|
||
uint32_t priorBits = 0, newBits = 0;
|
||
for(uint8_t line=0,bit=1; line<core->numAddressLines;
|
||
line++, bit<<=1) {
|
||
if(core->row & bit) {
|
||
newBits |= core->addr[line].bit;
|
||
}
|
||
if(core->prevRow & bit) {
|
||
priorBits |= core->addr[line].bit;
|
||
}
|
||
}
|
||
*core->addrPortToggle = newBits ^ priorBits;
|
||
_PM_delayMicroseconds(_PM_ROW_DELAY);
|
||
} else {
|
||
#endif
|
||
// Configure row address lines individually, making changes
|
||
// (with delays) only where necessary.
|
||
for(uint8_t line=0,bit=1; line<core->numAddressLines;
|
||
line++, bit<<=1) {
|
||
if((core->row & bit) != (core->prevRow & bit)) {
|
||
if(core->row & bit) { // Set addr line high
|
||
*core->addr[line].setReg = core->addr[line].bit;
|
||
} else { // Set addr line low
|
||
*core->addr[line].clearReg = core->addr[line].bit;
|
||
}
|
||
_PM_delayMicroseconds(_PM_ROW_DELAY);
|
||
}
|
||
}
|
||
#if defined(_PM_portToggleRegister)
|
||
}
|
||
#endif
|
||
core->prevRow = core->row;
|
||
}
|
||
|
||
// Advance bitplane index and/or row as necessary
|
||
if(++core->plane >= core->numPlanes) { // Next data bitplane, or
|
||
core->plane = 0; // roll over bitplane to start
|
||
if(++core->row >= core->numRowPairs) { // Next row, or
|
||
core->row = 0; // roll over row to start
|
||
// Switch matrix buffers if due (only if double-buffered)
|
||
if(core->swapBuffers) {
|
||
core->activeBuffer = 1 - core->activeBuffer;
|
||
core->swapBuffers = 0; // Swapped!
|
||
}
|
||
core->frameCount++;
|
||
}
|
||
}
|
||
|
||
// 'plane' now is index of data to issue, NOT data to display.
|
||
// 'prevPlane' is the previously-loaded data, which gets displayed
|
||
// now while the next plane data is loaded.
|
||
|
||
// Set timer and enable LED output for data loaded on PRIOR pass:
|
||
_PM_timerStart(core->timer, core->bitZeroPeriod << prevPlane);
|
||
*core->oe.clearReg = core->oe.bit; // Enable LED output
|
||
|
||
uint32_t elementsPerLine = _PM_chunkSize *
|
||
((core->width + (_PM_chunkSize - 1)) / _PM_chunkSize);
|
||
uint32_t srcOffset = elementsPerLine *
|
||
(core->numPlanes * core->row + core->plane) * core->bytesPerElement;
|
||
if(core->doubleBuffer) {
|
||
srcOffset += core->bufferSize * core->activeBuffer;
|
||
}
|
||
|
||
if(core->bytesPerElement == 1) {
|
||
blast_byte(core, (uint8_t *)(core->screenData + srcOffset));
|
||
} else if(core->bytesPerElement == 2) {
|
||
blast_word(core, (uint16_t *)(core->screenData + srcOffset));
|
||
} else {
|
||
blast_long(core, (uint32_t *)(core->screenData + srcOffset));
|
||
}
|
||
|
||
// 'plane' data is now loaded, will be shown on NEXT pass
|
||
}
|
||
|
||
// Innermost data-stuffing loop functions
|
||
|
||
// The presence of a bit-toggle register can make the data-stuffing loop a
|
||
// fair bit faster (2 PORT accesses per column vs 3). But ironically, some
|
||
// devices (e.g. SAMD51) can outpace the matrix max CLK speed, so we slow
|
||
// them down with a few NOPs. These are defined in arch.h as needed.
|
||
// _PM_clockHoldLow is whatever code necessary to delay the clock rise
|
||
// after data is placed on the PORT. _PM_clockHoldHigh is code for delay
|
||
// before setting the clock back low. If undefined, nothing goes there.
|
||
|
||
#if defined(_PM_portToggleRegister)
|
||
#define PEW \
|
||
*toggle = *data++; /* Toggle in new data + toggle clock low */ \
|
||
_PM_clockHoldLow; \
|
||
*toggle = clock; /* Toggle clock high */ \
|
||
_PM_clockHoldHigh;
|
||
#else
|
||
#define PEW \
|
||
*set = *data++; /* Set RGB data high */ \
|
||
_PM_clockHoldLow; \
|
||
*set32 = clock; /* Set clock high */ \
|
||
_PM_clockHoldHigh; \
|
||
*clear32 = rgbclock; /* Clear RGB data + clock */
|
||
#endif
|
||
|
||
#if _PM_chunkSize == 1
|
||
#define PEW_UNROLL PEW
|
||
#elif _PM_chunkSize == 8
|
||
#define PEW_UNROLL PEW PEW PEW PEW PEW PEW PEW PEW
|
||
#elif _PM_chunkSize == 16
|
||
#define PEW_UNROLL \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW
|
||
#elif _PM_chunkSize == 32
|
||
#define PEW_UNROLL \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW
|
||
#elif _PM_chunkSize == 64
|
||
#define PEW_UNROLL \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
|
||
PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW
|
||
#else
|
||
#error "Unimplemented _PM_chunkSize value"
|
||
#endif
|
||
|
||
// There are THREE COPIES of the following function -- one each for byte,
|
||
// word and long. If changes are made in any one of them, the others MUST
|
||
// be updated to match! (Decided against using macro tricks for the
|
||
// function, too often ends in disaster...but must be vigilant in the
|
||
// three-function maintenance then.)
|
||
|
||
static void blast_byte(Protomatter_core *core, uint8_t *data) {
|
||
#if defined(_PM_portToggleRegister)
|
||
// If here, it was established in begin() that the RGB data bits and
|
||
// clock are all within the same byte of a PORT register, else we'd be
|
||
// in the word- or long-blasting functions now. So we just need an
|
||
// 8-bit pointer to the PORT.
|
||
volatile uint8_t *toggle = (volatile uint8_t *)core->toggleReg +
|
||
core->portOffset;
|
||
#else
|
||
// No-toggle version is a little different. If here, RGB data is all
|
||
// in one byte of PORT register, clock can be any bit in 32-bit PORT.
|
||
volatile uint8_t *set; // For RGB data set
|
||
volatile uint32_t *set32; // For clock set
|
||
volatile uint32_t *clear32; // For RGB data + clock clear
|
||
set = (volatile uint8_t *)core->setReg + core->portOffset;
|
||
set32 = (volatile uint32_t *)core->setReg;
|
||
clear32 = (volatile uint32_t *)core->clearReg;
|
||
uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit
|
||
#endif
|
||
uint32_t clock = core->clockMask; // Clock bit
|
||
uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
|
||
|
||
// PORT has already been initialized with RGB data + clock bits
|
||
// all LOW, so we don't need to initialize that state here.
|
||
|
||
while(chunks--) {
|
||
PEW_UNROLL // _PM_chunkSize RGB+clock writes
|
||
}
|
||
|
||
#if defined(_PM_portToggleRegister)
|
||
// Want the PORT left with RGB data and clock LOW on function exit
|
||
// (so it's easier to see on 'scope, and to prime it for the next call).
|
||
// This is implicit in the no-toggle case (due to how the PEW macro
|
||
// works), but toggle case requires explicitly clearing those bits.
|
||
// rgbAndClockMask is an 8-bit value when toggling, hence offset here.
|
||
*((volatile uint8_t *)core->clearReg + core->portOffset) =
|
||
core->rgbAndClockMask;
|
||
#endif
|
||
}
|
||
|
||
static void blast_word(Protomatter_core *core, uint16_t *data) {
|
||
#if defined(_PM_portToggleRegister)
|
||
// See notes above -- except now 16-bit word in PORT.
|
||
volatile uint16_t *toggle = (volatile uint16_t *)core->toggleReg +
|
||
core->portOffset;
|
||
#else
|
||
volatile uint16_t *set; // For RGB data set
|
||
volatile uint32_t *set32; // For clock set
|
||
volatile uint32_t *clear32; // For RGB data + clock clear
|
||
set = (volatile uint16_t *)core->setReg + core->portOffset;
|
||
set32 = (volatile uint32_t *)core->setReg;
|
||
clear32 = (volatile uint32_t *)core->clearReg;
|
||
uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit
|
||
#endif
|
||
uint32_t clock = core->clockMask; // Clock bit
|
||
uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
|
||
while(chunks--) {
|
||
PEW_UNROLL // _PM_chunkSize RGB+clock writes
|
||
}
|
||
#if defined(_PM_portToggleRegister)
|
||
// rgbAndClockMask is a 16-bit value when toggling, hence offset here.
|
||
*((volatile uint16_t *)core->clearReg + core->portOffset) =
|
||
core->rgbAndClockMask;
|
||
#endif
|
||
}
|
||
|
||
static void blast_long(Protomatter_core *core, uint32_t *data) {
|
||
#if defined(_PM_portToggleRegister)
|
||
// See notes above -- except now full 32-bit PORT.
|
||
volatile uint32_t *toggle = (volatile uint32_t *)core->toggleReg;
|
||
#else
|
||
// Note in this case two copies exist of the PORT set register.
|
||
// The optimizer will most likely simplify this; leaving as-is, not
|
||
// wanting a special case of the PEW macro due to divergence risk.
|
||
volatile uint32_t *set; // For RGB data set
|
||
volatile uint32_t *set32; // For clock set
|
||
volatile uint32_t *clear32; // For RGB data + clock clear
|
||
set = (volatile uint32_t *)core->setReg;
|
||
set32 = (volatile uint32_t *)core->setReg;
|
||
clear32 = (volatile uint32_t *)core->clearReg;
|
||
uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit
|
||
#endif
|
||
uint32_t clock = core->clockMask; // Clock bit
|
||
uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
|
||
while(chunks--) {
|
||
PEW_UNROLL // _PM_chunkSize RGB+clock writes
|
||
}
|
||
#if defined(_PM_portToggleRegister)
|
||
*(volatile uint32_t *)core->clearReg = core->rgbAndClockMask;
|
||
#endif
|
||
}
|
||
|
||
// Returns current value of frame counter and resets its value to zero.
|
||
// Two calls to this, timed one second apart (or use math with other
|
||
// intervals), can be used to get a rough frames-per-second value for
|
||
// the matrix (since this is difficult to estimate beforehand).
|
||
uint32_t _PM_getFrameCount(Protomatter_core *core) {
|
||
uint32_t count = 0;
|
||
if((core)) {
|
||
count = core->frameCount;
|
||
core->frameCount = 0;
|
||
}
|
||
return count;
|
||
}
|
||
|
||
// Note to future self: I've gone back and forth between implementing all
|
||
// this either as it currently is (with byte, word and long cases for various
|
||
// steps), or using a uint32_t[64] table for expanding RGB bit combos to PORT
|
||
// bit combos. The latter would certainly simplify the code a ton, and the
|
||
// additional table lookup step wouldn't significantly impact performance,
|
||
// especially going forward with faster processors (the SAMD51 code already
|
||
// requires a few NOPs in the innermost loop to avoid outpacing the matrix).
|
||
// BUT, the reason this is NOT currently done is that it only allows for a
|
||
// single matrix chain (doing parallel chains would require either an
|
||
// impractically large lookup table, or adding together multiple tables'
|
||
// worth of bitmasks, which would slow things down in the vital inner loop).
|
||
// Although parallel matrix chains aren't yet 100% implemented in this code
|
||
// right now, I wanted to leave that possibility for the future, as a way to
|
||
// handle larger matrix combos, because long chains will slow down the
|
||
// refresh rate.
|