[fceu.git] / drivers / gp2x / asmutils.s

@ vim:filetype=armasm

@ test
@ flushcache(beginning_addr, end_addr, flags)
@
@ Thin wrapper around software interrupt 0x9f0002.  The three arguments are
@ not touched here, so they reach the handler in r0-r2 exactly as the caller
@ passed them, and whatever the handler leaves in r0 goes back to the caller.
@ NOTE(review): 0x9f0002 looks like the Linux/ARM OABI private "cacheflush"
@ call (0x900000 + 0x0f0002) -- confirm against the kernel headers in use.
.global flushcache @ beginning_addr, end_addr, flags

flushcache:
    swi     #0x9f0002
    bx      lr                      @ was "mov pc, lr"; bx matches the other routines in
                                    @ this file and also returns correctly to Thumb callers


@ void block_or(void *src, size_t n, int pat)
@
@ ORs a byte pattern into a buffer in place, 16 bytes (four words) per pass.
@   r0 = src  buffer; read with ldmia and written back with stmia
@             (assumes a word-aligned pointer -- TODO confirm against callers)
@   r1 = n    length in bytes; only the n >> 4 whole 16-byte chunks are
@             processed, a remainder of up to 15 bytes is left untouched
@   r2 = pat  pattern; pat | pat<<8 | pat<<16 | pat<<24 is formed, so a byte
@             value ends up in every byte lane (bits above bit 7 would be
@             smeared into the neighbouring lanes as well)
@ Returns immediately when n < 16.  Without that check the count-down loop
@ would start from zero, wrap around and run for 2^32 chunks.
@ Clobbers r0-r3, r12 and the flags; r4-r5 are saved and restored.
.global block_or @ void *src, size_t n, int pat

block_or:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte chunks
    bxeq    lr                      @ nothing to do (checked before anything is pushed)
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8      @ replicate the pattern into ...
    orr     r2, r2, r2, lsl #16     @ ... all four byte lanes
block_loop_or:
    ldmia   r0, {r3-r5,r12}         @ fetch 4 words, pointer not advanced yet
    subs    r1, r1, #1              @ flags survive until the bne (nothing below sets them)
    orr     r3, r3, r2
    orr     r4, r4, r2
    orr     r5, r5, r2
    orr     r12,r12,r2
    stmia   r0!, {r3-r5,r12}        @ write back and step to the next chunk
    bne     block_loop_or
    ldmfd   sp!, {r4-r5}
    bx      lr


@ void block_andor(void *src, size_t n, int andpat, int orpat)
@
@ In-place "(word & andpat) | orpat" over a buffer, 16 bytes (four words)
@ per pass.
@   r0 = src     buffer; read with ldmia and written back with stmia
@                (assumes a word-aligned pointer -- TODO confirm against callers)
@   r1 = n       length in bytes; only the n >> 4 whole 16-byte chunks are
@                processed, a remainder of up to 15 bytes is left untouched
@   r2 = andpat  AND mask  } each is expanded to pat | pat<<8 | pat<<16 | pat<<24,
@   r3 = orpat   OR  mask  } i.e. a byte value is copied into every byte lane
@ Returns immediately when n < 16.  Without that check the count-down loop
@ would start from zero, wrap around and run for 2^32 chunks.
@ Clobbers r0-r3, r12 and the flags; r4-r6 are saved and restored.
.global block_andor @ void *src, size_t n, int andpat, int orpat

block_andor:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte chunks
    bxeq    lr                      @ nothing to do (checked before anything is pushed)
    stmfd   sp!, {r4-r6}
    orr     r2, r2, r2, lsl #8      @ widen the AND mask to 32 bits
    orr     r2, r2, r2, lsl #16
    orr     r3, r3, r3, lsl #8      @ widen the OR mask to 32 bits
    orr     r3, r3, r3, lsl #16
block_loop_andor:
    ldmia   r0, {r4-r6,r12}         @ fetch 4 words, pointer not advanced yet
    subs    r1, r1, #1              @ flags survive until the bne (nothing below sets them)
    and     r4, r4, r2
    orr     r4, r4, r3
    and     r5, r5, r2
    orr     r5, r5, r3
    and     r6, r6, r2
    orr     r6, r6, r3
    and     r12,r12,r2
    orr     r12,r12,r3
    stmia   r0!, {r4-r6,r12}        @ write back and step to the next chunk
    bne     block_loop_andor
    ldmfd   sp!, {r4-r6}
    bx      lr


@ spend_cycles(c)
@
@ Busy-wait: spins in a two-instruction count-down loop.
@   r0 = c   requested delay; converted to a pass count as (c >> 2) - 2
@ The loop body always executes at least once and stops as soon as the
@ counter turns negative, so small values of c simply fall straight through.
@ Clobbers r0 and the flags.
.global spend_cycles @ c

spend_cycles:
    mov     r0, r0, lsr #2          @ one pass is budgeted at 4 cycles
    sub     r0, r0, #2              @ allowance for call, return and this setup
0:
    subs    r0, r0, #1
    bpl     0b                      @ spin while the counter is still >= 0

    bx      lr


@ void soft_scale(void *dst, unsigned short *pal, int offs, int lines)
@
@ In-place horizontal 256 -> 320 scaler with palette lookup.
@   r0 = dst    base of the frame buffer; used for BOTH source and output
@   r1 = pal    table of 16-bit entries indexed by source byte (ldrh [pal + 2*index])
@   r2 = offs   first line to process
@   r3 = lines  number of lines to process; <= 0 returns immediately (the
@               line loop counts down with subs/bne and would otherwise wrap
@               around and scribble far outside the buffer)
@
@ Layout as used by the code:
@   source : 1 byte per pixel, 320 bytes per line, starting at dst + offs*320;
@            the first 256 bytes of each line are read, the last 64 are skipped
@   output : 2 bytes per pixel, 320 pixels (640 bytes) per line, starting at
@            dst + offs*640
@ Both pointers start at the END of their region and walk backwards
@ (presumably so the wider output never overwrites source bytes that have
@ not been read yet -- verify if the layout ever changes).
@
@ Every pass of the inner loop turns 8 source pixels s0..s7 into 10 output
@ pixels d0..d9 (32 passes per line):
@   d0 = s0                 d5 = s4
@   d1 = 1/4 s0 + 3/4 s1    d6 = 1/4 s4 + 3/4 s5
@   d2 = 1/2 s1 + 1/2 s2    d7 = 1/2 s5 + 1/2 s6
@   d3 = 3/4 s2 + 1/4 s3    d8 = 3/4 s6 + 1/4 s7
@   d4 = s3                 d9 = s7
@ Each palette entry is ANDed with 0xe79c first (low two bits of every colour
@ field cleared, see the mask comment below), so shifted values cannot leak
@ into the neighbouring field; this applies to the unblended pixels too.
@ NOTE(review): the halfword order inside the stored words matches d0..d9 only
@ on a little-endian target, and the ldr/stmdb accesses assume dst is
@ word-aligned -- confirm both for the platform.
@
@ Clobbers r0-r3, r12 and the flags; r4-r11 and lr are saved and restored.
.global soft_scale @ void *dst, unsigned short *pal, int offs, int lines

soft_scale:
    cmp     r3, #0
    bxle    lr                 @ no lines requested: leave before touching anything
    stmfd   sp!,{r4-r11,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1     @ lr = 0x1fe, mask for (palette index * 2)
    mov     r9, #0x3900        @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
    orr     r9, r9, #0x00e7    @ r9 = 0x39e7, always used as r9 lsl #2 = 0xe79c

    mov     r11,r3             @ r11= line counter
    mov     r3, r1             @ r3 = pal base

    mov     r12,#320
    mul     r2, r12,r2         @ r2 = offs * 320
    add     r4, r0, r2, lsl #1 @ r4 = dst_start
    add     r5, r0, r2         @ r5 = src_start
    mul     r12,r11,r12        @ r12 = lines * 320
    add     r0, r4, r12,lsl #1 @ r0 = dst_end
    add     r1, r5, r12        @ r1 = src_end

soft_scale_loop:
    sub     r1, r1, #64        @ skip borders
    mov     r2, #256/8         @ 32 groups of 8 source pixels per line

soft_scale_loop_line:
    ldr     r12, [r1, #-8]!    @ r12 = s0..s3 (step source back by 8 bytes)
    ldr     r7,  [r1, #4]      @ r7  = s4..s7

    and     r4, lr, r12,lsl #1
    ldrh    r4, [r3, r4]              @ r4 = pal[s0]
    and     r5, lr, r12,lsr #7
    ldrh    r5, [r3, r5]              @ r5 = pal[s1]
    and     r4, r4, r9, lsl #2
    orr     r4, r4, r4, lsl #14       @ r4[31:16] = 1/4 pix_s 0
    and     r5, r5, r9, lsl #2
    sub     r6, r5, r5, lsr #2        @ r6 = 3/4 pix_s 1
    add     r4, r4, r6, lsl #16       @ pix_d 0, 1
    and     r6, lr, r12,lsr #15
    ldrh    r6, [r3, r6]              @ r6 = pal[s2]
    and     r12,lr, r12,lsr #23
    ldrh    r12,[r3, r12]             @ r12 = pal[s3]
    and     r6, r6, r9, lsl #2
    add     r5, r5, r6
    mov     r5, r5, lsr #1            @ r5 = (pix_s 1 + pix_s 2) / 2
    sub     r6, r6, r6, lsr #2        @ r6 = 3/4 pix_s 2
    orr     r5, r5, r6, lsl #16

    and     r6, lr, r7, lsl #1
    ldrh    r6, [r3, r6]              @ r6 = pal[s4]
    and     r12,r12,r9, lsl #2
    add     r5, r5, r12,lsl #14       @ pix_d 2, 3
    and     r6, r6, r9, lsl #2
    orr     r6, r12,r6, lsl #16       @ pix_d 4, 5

    and     r12,lr, r7, lsr #7
    ldrh    r12,[r3, r12]             @ r12 = pal[s5]
    and     r10,lr, r7, lsr #15
    ldrh    r10,[r3, r10]             @ r10 = pal[s6]
    and     r12,r12,r9, lsl #2
    sub     r8, r12,r12,lsr #2        @ r8 = 3/4 pix_s 5
    add     r8, r8, r6, lsr #18       @ += 1/4 pix_s 4 (upper half of r6, shifted down)
    and     r7, lr, r7, lsr #23
    ldrh    r7, [r3, r7]              @ r7 = pal[s7]
    and     r10,r10,r9, lsl #2
    orr     r8, r8, r10,lsl #15       @ r8[31:16] = 1/2 pix_s 6
    add     r8, r8, r12,lsl #15       @ pix_d 6, 7
    sub     r10,r10,r10,lsr #2        @ r10= 3/4 pix_s 6
    and     r7, r7, r9, lsl #2
    add     r10,r10,r7, lsr #2        @ += 1/4 pix_s 7
    orr     r10,r10,r7, lsl #16       @ pix_d 8, 9

    subs    r2, r2, #1                @ flags stay valid across the stmdb below

    stmdb   r0!, {r4,r5,r6,r8,r10}    @ write 10 output pixels, stepping dst back 20 bytes
    bne     soft_scale_loop_line

    subs    r11,r11,#1
    bne     soft_scale_loop

    ldmfd   sp!,{r4-r11,lr}
    bx      lr


/* buggy and slow, probably because of function call overhead
@ renderer helper, based on bitbank's method
.global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal

draw8pix:
    stmfd sp!, {r4,r5}

    ldrb  r3, [r1]            @ get bit 0 pixels
    mov   r12,#1
    orr   r12,r12,r12,lsl #8
    orr   r12,r12,r12,lsl #16
    ldrb  r1, [r1, #8]        @ get bit 1 pixels
    orr   r3, r3, r3, lsl #9  @ shift them over 1 byte + 1 bit
    orr   r3, r3, r3, lsl #18 @ now 4 pixels take up 4 bytes
    and   r4, r12,r3, lsr #7  @ mask off the upper nibble pixels we want
    and   r5, r12,r3, lsr #3  @ mask off the lower nibble pixels we want
    ldr   r2, [r2]

    orr   r1, r1, r1, lsl #9  @ process the bit 1 pixels
    orr   r1, r1, r1, lsl #18
    and   r3, r12,r1, lsr #7  @ mask off the upper nibble pixels we want
    and   r1, r12,r1, lsr #3  @ mask off the lower nibble
    orr   r4, r4, r3, lsl #1
    orr   r5, r5, r1, lsl #5

    @ can this be avoided?
    mov   r4, r4, lsl #3      @ *8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1
    mov   r4, r4, lsr #8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1
    mov   r4, r4, lsr #8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1
    mov   r4, r4, lsr #8
    mov   r3, r2, ror r4
    strb  r3, [r0], #1

    mov   r5, r5, lsl #3      @ *8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1
    mov   r5, r5, lsr #8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1
    mov   r5, r5, lsr #8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1
    mov   r5, r5, lsr #8
    mov   r3, r2, ror r5
    strb  r3, [r0], #1

    ldmfd sp!, {r4,r5}
    bx    lr
*/
Commit	Line	Data
	1	@ vim:filetype=armasm
	2
	3	@ test
	4	.global flushcache @ beginning_addr, end_addr, flags
	5
	6	flushcache:
	7	swi #0x9f0002
	8	mov pc, lr
	9
	10
	11	.global block_or @ void *src, size_t n, int pat
	12
	13	block_or:
	14	stmfd sp!, {r4-r5}
	15	orr r2, r2, r2, lsl #8
	16	orr r2, r2, r2, lsl #16
	17	mov r1, r1, lsr #4
	18	block_loop_or:
	19	ldmia r0, {r3-r5,r12}
	20	subs r1, r1, #1
	21	orr r3, r3, r2
	22	orr r4, r4, r2
	23	orr r5, r5, r2
	24	orr r12,r12,r2
	25	stmia r0!, {r3-r5,r12}
	26	bne block_loop_or
	27	ldmfd sp!, {r4-r5}
	28	bx lr
	29
	30
	31	.global block_andor @ void *src, size_t n, int andpat, int orpat
	32
	33	block_andor:
	34	stmfd sp!, {r4-r6}
	35	orr r2, r2, r2, lsl #8
	36	orr r2, r2, r2, lsl #16
	37	orr r3, r3, r3, lsl #8
	38	orr r3, r3, r3, lsl #16
	39	mov r1, r1, lsr #4
	40	block_loop_andor:
	41	ldmia r0, {r4-r6,r12}
	42	subs r1, r1, #1
	43	and r4, r4, r2
	44	orr r4, r4, r3
	45	and r5, r5, r2
	46	orr r5, r5, r3
	47	and r6, r6, r2
	48	orr r6, r6, r3
	49	and r12,r12,r2
	50	orr r12,r12,r3
	51	stmia r0!, {r4-r6,r12}
	52	bne block_loop_andor
	53	ldmfd sp!, {r4-r6}
	54	bx lr
	55
	56
	57	.global spend_cycles @ c
	58
	59	spend_cycles:
	60	mov r0, r0, lsr #2 @ 4 cycles/iteration
	61	sub r0, r0, #2 @ entry/exit/init
	62	.sc_loop:
	63	subs r0, r0, #1
	64	bpl .sc_loop
	65
	66	bx lr
	67
	68
	69	.global soft_scale @ void *dst, unsigned short *pal, int offs, int lines
	70
	71	soft_scale:
	72	stmfd sp!,{r4-r11,lr}
	73	mov lr, #0xff
	74	mov lr, lr, lsl #1
	75	mov r9, #0x3900 @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
	76	orr r9, r9, #0x00e7
	77
	78	mov r11,r3 @ r11= line counter
	79	mov r3, r1 @ r3 = pal base
	80
	81	mov r12,#320
	82	mul r2, r12,r2
	83	add r4, r0, r2, lsl #1 @ r4 = dst_start
	84	add r5, r0, r2 @ r5 = src_start
	85	mul r12,r11,r12
	86	add r0, r4, r12,lsl #1 @ r0 = dst_end
	87	add r1, r5, r12 @ r1 = src_end
	88
	89	soft_scale_loop:
	90	sub r1, r1, #64 @ skip borders
	91	mov r2, #256/8
	92
	93	soft_scale_loop_line:
	94	ldr r12, [r1, #-8]!
	95	ldr r7, [r1, #4]
	96
	97	and r4, lr, r12,lsl #1
	98	ldrh r4, [r3, r4]
	99	and r5, lr, r12,lsr #7
	100	ldrh r5, [r3, r5]
	101	and r4, r4, r9, lsl #2
	102	orr r4, r4, r4, lsl #14 @ r4[31:16] = 1/4 pix_s 0
	103	and r5, r5, r9, lsl #2
	104	sub r6, r5, r5, lsr #2 @ r6 = 3/4 pix_s 1
	105	add r4, r4, r6, lsl #16 @ pix_d 0, 1
	106	and r6, lr, r12,lsr #15
	107	ldrh r6, [r3, r6]
	108	and r12,lr, r12,lsr #23
	109	ldrh r12,[r3, r12]
	110	and r6, r6, r9, lsl #2
	111	add r5, r5, r6
	112	mov r5, r5, lsr #1
	113	sub r6, r6, r6, lsr #2 @ r6 = 3/4 pix_s 2
	114	orr r5, r5, r6, lsl #16
	115
	116	and r6, lr, r7, lsl #1
	117	ldrh r6, [r3, r6]
	118	and r12,r12,r9, lsl #2
	119	add r5, r5, r12,lsl #14 @ pix_d 2, 3
	120	and r6, r6, r9, lsl #2
	121	orr r6, r12,r6, lsl #16 @ pix_d 4, 5
	122
	123	and r12,lr, r7, lsr #7
	124	ldrh r12,[r3, r12]
	125	and r10,lr, r7, lsr #15
	126	ldrh r10,[r3, r10]
	127	and r12,r12,r9, lsl #2
	128	sub r8, r12,r12,lsr #2 @ r8 = 3/4 pix_s 1
	129	add r8, r8, r6, lsr #18
	130	and r7, lr, r7, lsr #23
	131	ldrh r7, [r3, r7]
	132	and r10,r10,r9, lsl #2
	133	orr r8, r8, r10,lsl #15
	134	add r8, r8, r12,lsl #15 @ pix_d 6, 7
	135	sub r10,r10,r10,lsr #2 @ r10= 3/4 pix_s 2
	136	and r7, r7, r9, lsl #2
	137	add r10,r10,r7, lsr #2 @ += 1/4 pix_s 3
	138	orr r10,r10,r7, lsl #16 @ pix_d 8, 9
	139
	140	subs r2, r2, #1
	141
	142	stmdb r0!, {r4,r5,r6,r8,r10}
	143	bne soft_scale_loop_line
	144
	145	subs r11,r11,#1
	146	bne soft_scale_loop
	147
	148	ldmfd sp!,{r4-r11,lr}
	149	bx lr
	150
	151
	152	/* buggy and slow, probably because function call overhead
	153	@ renderer helper, based on bitbank's method
	154	.global draw8pix @ uint8 *P, uint8 *C, uint8 *PALRAM @ dest, src, pal
	155
	156	draw8pix:
	157	stmfd sp!, {r4,r5}
	158
	159	ldrb r3, [r1] @ get bit 0 pixels
	160	mov r12,#1
	161	orr r12,r12,r12,lsl #8
	162	orr r12,r12,r12,lsl #16
	163	ldrb r1, [r1, #8] @ get bit 1 pixels
	164	orr r3, r3, r3, lsl #9 @ shift them over 1 byte + 1 bit
	165	orr r3, r3, r3, lsl #18 @ now 4 pixels take up 4 bytes
	166	and r4, r12,r3, lsr #7 @ mask off the upper nibble pixels we want
	167	and r5, r12,r3, lsr #3 @ mask off the lower nibble pixels we want
	168	ldr r2, [r2]
	169
	170	orr r1, r1, r1, lsl #9 @ process the bit 1 pixels
	171	orr r1, r1, r1, lsl #18
	172	and r3, r12,r1, lsr #7 @ mask off the upper nibble pixels we want
	173	and r1, r12,r1, lsr #3 @ mask off the lower nibble
	174	orr r4, r4, r3, lsl #1
	175	orr r5, r5, r1, lsl #5
	176
	177	@ can this be avoided?
	178	mov r4, r4, lsl #3 @ *8
	179	mov r3, r2, ror r4
	180	strb r3, [r0], #1
	181	mov r4, r4, lsr #8
	182	mov r3, r2, ror r4
	183	strb r3, [r0], #1
	184	mov r4, r4, lsr #8
	185	mov r3, r2, ror r4
	186	strb r3, [r0], #1
	187	mov r4, r4, lsr #8
	188	mov r3, r2, ror r4
	189	strb r3, [r0], #1
	190
	191	mov r5, r5, lsl #3 @ *8
	192	mov r3, r2, ror r5
	193	strb r3, [r0], #1
	194	mov r5, r5, lsr #8
	195	mov r3, r2, ror r5
	196	strb r3, [r0], #1
	197	mov r5, r5, lsr #8
	198	mov r3, r2, ror r5
	199	strb r3, [r0], #1
	200	mov r5, r5, lsr #8
	201	mov r3, r2, ror r5
	202	strb r3, [r0], #1
	203
	204	ldmfd sp!, {r4,r5}
	205	bx lr
	206	*/
	207