MCUME/MCUME_pico/picovga_t4/render/vga_tile2.S
2021-11-14 21:50:46 +01:00

376 lines
12 KiB
ArmAsm
Executable file

// ****************************************************************************
//
// VGA render GF_TILE2
//
// ****************************************************************************
// u16 par3; // SSEGM_PAR3 tile width (must be multiple of 4)
// u32 par; // SSEGM_PAR tile table with all tiles stored side by side in one row
// u32 par2; // SSEGM_PAR2 LOW tile height, HIGH tile width bytes (byte pitch between consecutive lines of the tile table)
#include "../define.h" // common definitions of C and ASM
#include "hardware/regs/sio.h" // registers of hardware divider
#include "hardware/regs/addressmap.h" // SIO base address
.syntax unified
.section .time_critical.Render, "ax"
.cpu cortex-m0plus
.thumb // use 16-bit instructions
// extern "C" u32* RenderTile2(u32* cbuf, int x, int y, int w, sSegm* segm);
// render tiles GF_TILE2
// cbuf ... destination control buffer
// x ... start X coordinate (must be multiple of 4)
// y ... start Y coordinate
// w ... width of this segment (must be multiple of 4)
// segm ... video segment
// Returns the new cbuf pointer.
// Rendering 320 pixels at 151 MHz takes: 8x8 tiles 3.5 us, 16x16 tiles 2 us, 32x32 tiles 1.3 us, 64x64 tiles 0.9 us.
.thumb_func
.global RenderTile2
// RenderTile2: fill the control buffer with (count, pointer) word pairs for one
// scanline of a GF_TILE2 segment. Each pair is "number of 4-pixel groups" followed
// by "address of the source pixels inside the tile table". The tile map holds one
// byte (tile index) per tile. R0 is advanced past every pair written and is the
// return value.
// Numeric local labels used below: "2:" appears twice (the forward references
// "2f" always bind to the nearest following one), "3:" is the common exit,
// "5:" is the entry of the whole-tile path.
RenderTile2:
// push registers
// (R4-R7 and LR are saved for the caller; the R2/R3 slots at SP+0/SP+4 are
// reused below as local variables, so their original contents are not kept)
push {r2-r7,lr}
// Input registers and stack content:
// R0 ... destination control buffer
// R1 ... X coordinate
// SP+0: R2 ... Y coordinate
// SP+4: R3 ... width to display
// SP+8: R4
// SP+12: R5
// SP+16: R6
// SP+20: R7
// SP+24: LR
// SP+28: video segment
// get pointer to video segment -> R4
// (7 registers pushed = 28 bytes, so the 5th argument sits at SP+28)
ldr r4,[sp,#28] // load video segment -> R4
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R2 ... Y coordinate
// R3 ... remaining width
// R4 ... sSegm*
// start divide Y/tile_height
ldr r5,RenderTile_pSioBase // get address of SIO base -> R5
str r2,[r5,#SIO_DIV_UDIVIDEND_OFFSET] // store dividend, Y coordinate
ldrh r2,[r4,#SSEGM_PAR2] // tile height -> R2 (low half-word of par2)
str r2,[r5,#SIO_DIV_UDIVISOR_OFFSET] // store divisor, tile height
// - now we must wait at least 8 clock cycles to get result of division
// (the instructions up to the REMAINDER read below fill that delay;
// their cycle counts are given in square brackets)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... remaining width
// R4 ... sSegm*
// R5 ... SIO_BASE
// [6] get wrap width -> [SP+0]
ldrh r7,[r4,#SSEGM_WRAPX] // [2] get wrap width
movs r6,#3 // [1] mask to align to 32-bit
bics r7,r6 // [1] align wrap (clear low 2 bits -> multiple of 4 pixels)
str r7,[sp,#0] // [2] save wrap width (overwrites the saved Y slot)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... remaining width
// R4 ... sSegm*
// R5 ... SIO_BASE
// R6 ... align mask #3
// [SP+0] ... wrap width
// [1] align X coordinate to 32-bit -> R1
bics r1,r6 // [1] align X (round down to multiple of 4)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... remaining width
// R4 ... sSegm*
// R5 ... SIO_BASE
// R6 ... align mask #3
// [SP+0] ... wrap width
// [3] align remaining width -> [SP+4]
bics r3,r6 // [1] align width (round down to multiple of 4)
str r3,[sp,#4] // [2] store aligned width to [SP+4]
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R4 ... sSegm*
// R5 ... SIO_BASE
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// [2] prepare tile width -> R3
ldrh r3,[r4,#SSEGM_PAR3] // [2] get tile width -> R3
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... sSegm*
// R5 ... SIO_BASE
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// load result of division Y/tile_height -> R6 Y relative at row, R7 Y row
// Note: QUOTIENT must be read last
ldr r6,[r5,#SIO_DIV_REMAINDER_OFFSET] // get remainder of result -> R6, Y coordinate relative to current row
ldr r7,[r5,#SIO_DIV_QUOTIENT_OFFSET] // get quotient-> R7, index of row
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... sSegm*
// R5 ... SIO_BASE
// R6 ... Y relative at row
// R7 ... Y row index
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// start divide X/tile_width
str r1,[r5,#SIO_DIV_UDIVIDEND_OFFSET] // store dividend, X coordinate
str r3,[r5,#SIO_DIV_UDIVISOR_OFFSET] // store divisor, tile width
// - now we must wait at least 8 clock cycles to get result of division
// (the two address computations below fill that delay)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... sSegm*
// R5 ... SIO_BASE
// R6 ... Y relative at row
// R7 ... Y row index
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// [7] base pointer to source data buffer (without X) -> LR, R7
ldrh r2,[r4,#SSEGM_WB] // [2] get pitch of rows -> R2
muls r7,r2 // [1] pitch * row (Y * WB) -> offset of row in data buffer
ldr r2,[r4,#SSEGM_DATA] // [2] pointer to data -> R2
adds r7,r2 // [1] base address of data buffer
mov lr,r7 // [1] save base address (LR is free: the return address is on the stack)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... sSegm*
// R5 ... SIO_BASE
// R6 ... Y relative at row
// R7 ... base address of data buffer (without X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// [6] tile base address -> R4
// (address of the current line inside the tile table; the tile index is
// later scaled by the tile width and added to it)
ldrh r2,[r4,#SSEGM_PAR2+2] // [2] tile width bytes -> R2 (high half-word of par2)
muls r6,r2 // [1] tile width bytes * Y relative to row -> tile line offset R6
ldr r4,[r4,#SSEGM_PAR] // [2] pointer to tiles (sSegm* in R4 is no longer needed)
adds r4,r6 // [1] tile base address -> R4
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... tile base address
// R5 ... SIO_BASE
// R7 ... base address of data buffer (without X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// load result of division X/tile_width -> R6 X pixel relative, R5 tile position
// Note: QUOTIENT must be read last
ldr r6,[r5,#SIO_DIV_REMAINDER_OFFSET] // get remainder of result -> R6, X pixel relative in tile
ldr r5,[r5,#SIO_DIV_QUOTIENT_OFFSET] // get quotient-> R5, tile position (SIO base in R5 is no longer needed)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... tile base address
// R5 ... tile position
// R6 ... X pixel relative in tile
// R7 ... base address of data buffer (without X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// prepare current pointer to source data buffer with X -> R7
adds r7,r5 // tile source address -> R7 (one byte per tile index)
// R0 ... pointer to destination control buffer
// R1 ... X coordinate
// R3 ... tile width
// R4 ... tile base address
// R6 ... X pixel relative in tile
// R7 ... pointer to source data buffer (with X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
// [SP+4] ... remaining width
// ---- render rest of first tile
// check if X is tile-aligned
tst r6,r6 // check tile align
beq 2f // X is tile aligned
// shift X coordinate
subs r5,r3,r6 // pixels remain in current tile -> R5
adds r1,r5 // shift X coordinate (align to next tile)
// shift remaining width
// NOTE(review): there is no check that the remaining width is at least R5;
// if it is smaller, this unsigned subtraction wraps around. Presumably the
// caller guarantees a large enough width - confirm.
ldr r2,[sp,#4] // get remaining width
subs r2,r5 // shift width
str r2,[sp,#4] // store remaining width
// write number of 4-pixels
lsrs r5,#2 // number of 4-pixels
stmia r0!,{r5} // save width
// load tile index -> R2
ldrb r2,[r7,#0] // [2] load tile index
adds r7,#1 // [1] advance source pointer to the next tile index
// write tile address
muls r2,r3 // tile index * tile width = tile offset
add r2,r4 // [1] add tile base address
add r2,r6 // [1] skip the pixels of the tile that lie left of X
stmia r0!,{r2} // [3] save pointer
// check end of segment
ldr r2,[sp,#0] // get wrap width
cmp r1,r2 // check end of segment
blo 2f // not end of segment
movs r1,#0 // reset X coordinate
mov r7,lr // get base pointer to tile data
// prepare wrap width - start X -> R5
2: ldr r2,[sp,#0] // get wrap width
subs r5,r2,r1 // pixels remaining to end of segment
ldr r2,[sp,#4] // total remaining width -> R2
// ---- start outer loop, render one part of segment
// R0 ... pointer to destination control buffer
// R2 ... total remaining width
// R3 ... tile width
// R4 ... tile base address
// R5 ... wrap width of this segment (pixels up to the wrap point)
// R7 ... pointer to source data buffer
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
RenderTile_OutLoop:
// limit wrap width by total width -> R5
cmp r5,r2 // compare wrap width with total width
bls 2f // width is OK
mov r5,r2 // limit wrap width
// check if remain whole tile
2: cmp r5,r3 // check number of remaining pixels
bhs 5f // remain whole tiles
// check if start of last tile remains
cmp r5,#4 // check start of last tile (at least one 4-pixel group)
blo 3f // all done
mov r1,r5 // width to render
// NOTE(review): when falling through to RenderTile_Last from here, R5 < R3,
// so the function returns right after this partial tile even if R2 is still
// larger than R5 (part clipped by the wrap point, not by the total width).
// This looks reachable only when the wrap width is not a multiple of the
// tile width - confirm whether that configuration is supported.
// ---- render start of last tile
// R0 ... pointer to destination control buffer
// R1 ... width to render in this segment
// R2 ... total remaining width
// R3 ... tile width
// R4 ... tile base address
// R5 ... wrap width of this segment
// R7 ... pointer to source data buffer (with X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
RenderTile_Last:
// save width
lsrs r6,r1,#2 // number of 4-pixels
stmia r0!,{r6} // save width
// load tile index -> R6
ldrb r6,[r7,#0] // [2] load tile index
adds r7,#1 // [1] advance source pointer to the next tile index
// save tile address
muls r6,r3 // multiply tile index * tile width
add r6,r4 // [1] add tile base address
stmia r0!,{r6} // [3] save pointer
// check if continue with next segment
mov r7,lr // get base pointer to tile data (restart at the beginning of the row)
cmp r5,r3 // whole tile remains?
bhs RenderTile_OutLoop // render next segment
// pop registers and return
// (R2/R3 receive the two local slots and are discarded; R0 = new cbuf pointer)
3: pop {r2-r7,pc}
// ---- prepare to render whole tiles
// R0 ... pointer to destination control buffer
// R2 ... total remaining width
// R3 ... tile width
// R4 ... tile base address
// R5 ... width of this segment
// R7 ... pointer to source data buffer (with X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
// prepare number of 4-pixels to render -> R1
5: lsrs r1,r5,#2 // shift to get number of tiles in multiply of 4-pixels -> R1
lsls r5,r1,#2 // shift back to get number of pixels, rounded down -> R5
subs r2,r5 // update remaining width -> R2 (the whole part, including a trailing partial tile)
lsrs r5,r3,#2 // tile width/4 -> R5
subs r1,r5 // number of 4-pixels - width/4
adds r1,#1 // number of 4-pixels - (width/4-1)
// ---- [11*N-1] start inner loop, render in one part of segment
// R0 ... pointer to destination control buffer
// R1 ... biased loop counter: (number of 4-pixels to generate) - (tile width/4) + 1
// R2 ... total remaining width
// R3 ... tile width
// R4 ... tile base address
// R5 ... tile width/4
// R7 ... pointer to source data buffer (with X)
// LR ... base address of data buffer (without X)
// [SP+0] ... wrap width
RenderTile_InLoop:
// [3] load tile index -> R6
ldrb r6,[r7,#0] // [2] load tile index
adds r7,#1 // [1] advance source pointer to the next tile index
// [2] get tile address
muls r6,r3 // [1] multiply tile index * tile width
add r6,r4 // [1] add tile base address
// [3] save control block
stmia r0!,{r5,r6} // [3] save width and pointer
// [2,3] loop
// (bhi is taken only when the subtraction did not borrow and the result is
// non-zero, i.e. while at least one more whole tile fits)
subs r1,r5 // [1] shift loop counter, subtract tile width/4
bhi RenderTile_InLoop // [1,2] > 0, render next whole tile
// ---- end inner loop, continue with last tile, or start new part
// continue to outer loop
adds r1,r5 // return size of last tile (undo the final subtraction)
subs r1,#1 // remove the +1 bias -> number of 4-pixels left for a partial tile
ldr r5,[sp,#0] // load wrap width -> R5
lsls r1,#2 // convert back to pixels (also sets Z when nothing is left)
bne RenderTile_Last // render start of last tile
mov r7,lr // get base pointer to tile data -> R7
b RenderTile_OutLoop // go back to outer loop
.align 2
// pointer to SIO base
// (literal word read by the PC-relative ldr at the start of the function)
RenderTile_pSioBase:
.word SIO_BASE // address of SIO base