[fceu.git] / drivers / gp2x / asmutils.s

@ vim:filetype=armasm

@ Assembly optimized routines for gpfce - FCE Ultra port 
@ (c) Copyright 2007, Grazvydas "notaz" Ignotas

@ test
.global flushcache @ beginning_addr, end_addr, flags

@ flushcache(beginning_addr, end_addr, flags)
@
@ Raises software interrupt 0x9f0002 with the three arguments still in
@ r0-r2, then returns to the caller. Whatever the handler leaves in r0 is
@ passed back unchanged.
@ NOTE(review): 0x9f0002 looks like the Linux/ARM OABI "cacheflush" private
@ syscall (0x900000 + 0x0f0002) - confirm against the target kernel headers.
flushcache:
    swi #0x9f0002
    bx lr           @ same return form as the other routines in this file


.global block_or @ void *src, size_t n, int pat

@ void block_or(void *src, size_t n, int pat)
@
@ ORs a byte pattern into a buffer in place, 16 bytes per iteration.
@   r0 = src  buffer; accessed with ldm/stm, so it has to be word aligned
@   r1 = n    size in bytes; only the n / 16 whole blocks are processed,
@             a trailing remainder is left untouched
@   r2 = pat  pattern; it is OR-ed with itself shifted left by 8 and then
@             by 16 bits, so a value in 0..255 ends up in all four bytes
@ Returns nothing. Clobbers r0-r3, r12 and the flags; r4-r5 are saved on
@ the stack and restored.
block_or:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte blocks
    bxeq    lr                      @ n < 16: nothing to do (entering the
                                    @ loop with 0 would wrap the counter)
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8      @ replicate pattern into bits 15:8
    orr     r2, r2, r2, lsl #16     @ ... and into the upper halfword
block_loop_or:
    ldmia   r0, {r3-r5,r12}         @ fetch 4 words, pointer not advanced yet
    subs    r1, r1, #1              @ sets Z for the bne below
    orr     r3, r3, r2
    orr     r4, r4, r2
    orr     r5, r5, r2
    orr     r12,r12,r2
    stmia   r0!, {r3-r5,r12}        @ write back and step to the next block
    bne     block_loop_or
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_and @ void *src, size_t n, int andpat

@ void block_and(void *src, size_t n, int andpat)
@
@ ANDs a byte pattern into a buffer in place, 16 bytes per iteration.
@   r0 = src     buffer; accessed with ldm/stm, so it has to be word aligned
@   r1 = n       size in bytes; only the n / 16 whole blocks are processed,
@                a trailing remainder is left untouched
@   r2 = andpat  mask; it is OR-ed with itself shifted left by 8 and then
@                by 16 bits, so a value in 0..255 ends up in all four bytes
@ Returns nothing. Clobbers r0-r3, r12 and the flags; r4-r5 are saved on
@ the stack and restored.
block_and:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte blocks
    bxeq    lr                      @ n < 16: nothing to do (entering the
                                    @ loop with 0 would wrap the counter)
    stmfd   sp!, {r4-r5}
    orr     r2, r2, r2, lsl #8      @ replicate mask into bits 15:8
    orr     r2, r2, r2, lsl #16     @ ... and into the upper halfword
block_loop_and:
    ldmia   r0, {r3-r5,r12}         @ fetch 4 words, pointer not advanced yet
    subs    r1, r1, #1              @ sets Z for the bne below
    and     r3, r3, r2
    and     r4, r4, r2
    and     r5, r5, r2
    and     r12,r12,r2
    stmia   r0!, {r3-r5,r12}        @ write back and step to the next block
    bne     block_loop_and
    ldmfd   sp!, {r4-r5}
    bx      lr


.global block_andor @ void *src, size_t n, int andpat, int orpat

@ void block_andor(void *src, size_t n, int andpat, int orpat)
@
@ Applies (word & andpat) | orpat to a buffer in place, 16 bytes per
@ iteration.
@   r0 = src     buffer; accessed with ldm/stm, so it has to be word aligned
@   r1 = n       size in bytes; only the n / 16 whole blocks are processed,
@                a trailing remainder is left untouched
@   r2 = andpat  AND mask   \ each is OR-ed with itself shifted left by 8
@   r3 = orpat   OR pattern / and then 16 bits, so a value in 0..255 ends
@                             up in all four bytes
@ Returns nothing. Clobbers r0-r3, r12 and the flags; r4-r6 are saved on
@ the stack and restored.
block_andor:
    movs    r1, r1, lsr #4          @ r1 = number of 16-byte blocks
    bxeq    lr                      @ n < 16: nothing to do (entering the
                                    @ loop with 0 would wrap the counter)
    stmfd   sp!, {r4-r6}
    orr     r2, r2, r2, lsl #8      @ replicate AND mask across the word
    orr     r2, r2, r2, lsl #16
    orr     r3, r3, r3, lsl #8      @ replicate OR pattern across the word
    orr     r3, r3, r3, lsl #16
block_loop_andor:
    ldmia   r0, {r4-r6,r12}         @ fetch 4 words, pointer not advanced yet
    subs    r1, r1, #1              @ sets Z for the bne below
    and     r4, r4, r2
    orr     r4, r4, r3
    and     r5, r5, r2
    orr     r5, r5, r3
    and     r6, r6, r2
    orr     r6, r6, r3
    and     r12,r12,r2
    orr     r12,r12,r3
    stmia   r0!, {r4-r6,r12}        @ write back and step to the next block
    bne     block_loop_andor
    ldmfd   sp!, {r4-r6}
    bx      lr


.global spend_cycles @ c

@ spend_cycles(c)
@
@ Busy-wait: r0 (c) is divided by 4, reduced by 2, and then counted down
@ until it becomes negative. The loop body always executes at least once.
@ The inline comments give the intended calibration (4 cycles per
@ iteration, 2 iterations' worth of fixed overhead); the real cost
@ depends on the core - NOTE(review): confirm for the target CPU.
@ Clobbers r0 and the flags.
spend_cycles:
    mov     r0, r0, lsr #2  @ 4 cycles/iteration
    sub     r0, r0, #2      @ entry/exit/init
.sc_loop:
    subs    r0, r0, #1      @ count down; N flag set once r0 drops below 0
    bpl     .sc_loop        @ keep spinning while the result is >= 0

    bx      lr


.global memset32 @ int *dest, int c, int count

@ void memset32(int *dest, int c, int count)
@
@ Stores the 32-bit value c into count consecutive words starting at dest.
@   r0 = dest   destination; written with str/stm, so it has to be word
@               aligned
@   r1 = c      fill value
@   r2 = count  number of words (not bytes); count <= 0 stores nothing
@ Returns nothing. Clobbers r0-r3, r12 and the flags; lr is used as a
@ fourth data register and is saved on the stack and restored.
memset32:
    cmp     r2, #0
    bxle    lr                  @ count <= 0: without this the tail code
                                @ below would still act on the low two
                                @ bits of a negative count

    stmfd   sp!, {lr}

    mov     r3, r1
    subs    r2, r2, #4
    bmi     mst32_fin           @ fewer than 4 words: tail only

    mov     r12,r1
    mov     lr, r1

mst32_loop:
    subs    r2, r2, #4          @ flags for the bpl below
    stmia   r0!, {r1,r3,r12,lr} @ 4 words per iteration
    bpl     mst32_loop

mst32_fin:
    @ r2 is now (remaining words - 4); subtracting 4 leaves bits 1:0
    @ unchanged, so they still hold the 0..3 leftover words.
    tst     r2, #1
    strne   r1, [r0], #4

    tst     r2, #2
    stmneia r0!, {r1,r3}

    ldmfd   sp!, {lr}
    bx      lr


.global soft_scale @ void *dst, unsigned short *pal, int line_offs, int lines

@ void soft_scale(void *dst, unsigned short *pal, int line_offs, int lines)
@
@ In-place horizontal stretch with palette lookup. Both the source and the
@ destination are addressed relative to dst:
@   source:      8-bit palette indices, 320 bytes per line, of which the
@                first 256 are read (the last 64 are skipped as border)
@   destination: 16-bit pixels taken from pal, 320 per line (640 bytes)
@ Every 8 source pixels produce 10 destination pixels; neighbouring pixels
@ are blended at the 1/4, 1/2 and 3/4 points.
@   r0 = dst        base of the shared buffer
@   r1 = pal        table of 16-bit entries indexed by the source byte
@   r2 = line_offs  first line to process
@   r3 = lines      number of lines; lines <= 0 returns without touching
@                   anything
@ Both images are walked from their end towards their start (pre-decrement
@ loads, stmdb stores). NOTE(review): presumably so that the wider output
@ never overwrites source bytes that have not been read yet - confirm.
@ NOTE(review): the pixel order within each loaded word assumes a
@ little-endian memory system - confirm for the target.
@ Returns nothing. Clobbers r0-r3, r12 and the flags; r4-r11 and lr are
@ saved on the stack and restored.
soft_scale:
    cmp     r3, #0
    bxle    lr                 @ no lines: the outer loop's subs/bne below
                               @ would otherwise never see zero

    stmfd   sp!,{r4-r11,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1     @ lr = 0x1fe: mask for (index byte * 2)
    mov     r9, #0x3900        @ f800 07e0 001f | e000 0780 001c | 3800 01e0 0007
    orr     r9, r9, #0x00e7    @ r9 = 0x39e7; (r9 << 2) = 0xe79c selects the
                               @ bits of each field that survive a >> 2

    mov     r11,r3             @ r11= line counter
    mov     r3, r1             @ r3 = pal base

    mov     r12,#320
    mul     r2, r12,r2         @ r2 = line_offs * 320
    add     r4, r0, r2, lsl #1 @ r4 = dst_start
    add     r5, r0, r2         @ r5 = src_start
    mul     r12,r11,r12        @ r12 = lines * 320
    add     r0, r4, r12,lsl #1 @ r0 = dst_end
    add     r1, r5, r12        @ r1 = src_end

    mov     r2, r11            @ r2[23:0] = lines left

soft_scale_loop:
    sub     r1, r1, #64        @ skip borders
    orr     r2, r2, #(256/8-1)<<24  @ r2[31:24] = 31: 32 blocks of 8 pixels

soft_scale_loop_line:
    ldr     r12, [r1, #-8]!    @ r12 = source pixels 0..3, r1 steps back 8
    ldr     r7,  [r1, #4]      @ r7  = source pixels 4..7

    and     r4, lr, r12,lsl #1
    ldrh    r4, [r3, r4]       @ r4 = pal[pix_s 0]
    and     r5, lr, r12,lsr #7
    ldrh    r5, [r3, r5]       @ r5 = pal[pix_s 1]
    and     r11,r4, r9, lsl #2
    orr     r4, r4, r11,lsl #14       @ r4[31:16] = 1/4 pix_s 0
    and     r11,r5, r9, lsl #2
    sub     r6, r5, r11,lsr #2        @ r6 = 3/4 pix_s 1
    add     r4, r4, r6, lsl #16       @ pix_d 0, 1
    and     r6, lr, r12,lsr #15
    ldrh    r6, [r3, r6]       @ r6 = pal[pix_s 2]
    and     r12,lr, r12,lsr #23
    ldrh    r12,[r3, r12]      @ r12 = pal[pix_s 3]
    bic     r11,r6, #0x0820    @ drop bits 11 and 5 so the sum shifted
    bic     r5, r5, #0x0820    @ right below cannot leak across fields
    add     r5, r5, r11
    mov     r5, r5, lsr #1     @ r5[15:0] = average of pix_s 1 and 2
    and     r11,r6, r9, lsl #2
    sub     r6, r6, r11,lsr #2        @ r6 = 3/4 pix_s 2
    orr     r5, r5, r6, lsl #16

    and     r6, lr, r7, lsl #1
    ldrh    r6, [r3, r6]       @ r6 = pal[pix_s 4]
    and     r11,r12,r9, lsl #2
    add     r5, r5, r11,lsl #14       @ pix_d 2, 3
    orr     r6, r12,r6, lsl #16       @ pix_d 4, 5

    and     r12,lr, r7, lsr #7
    ldrh    r12,[r3, r12]      @ r12 = pal[pix_s 5]
    and     r10,lr, r7, lsr #15
    ldrh    r10,[r3, r10]      @ r10 = pal[pix_s 6]
    and     r11,r12,r9, lsl #2
    sub     r8, r12,r11,lsr #2        @ r8 = 3/4 pix_s 1
    and     r11,r6, r9, lsl #18       @ top bits of pix_s 4 (held in r6[31:16])
    add     r8, r8, r11,lsr #18       @ r8 += 1/4 pix_s 4
    mov     r8, r8, lsl #16
    and     r7, lr, r7, lsr #23
    ldrh    r7, [r3, r7]       @ r7 = pal[pix_s 7]
    bic     r11,r10,#0x0820
    bic     r12,r12,#0x0820
    add     r12,r12,r11
    add     r8, r8, r12,lsr #1        @ pix_d 6, 7
    mov     r8, r8, ror #16           @ swap halves into storage order
    and     r11,r10,r9, lsl #2
    sub     r10,r10,r11,lsr #2        @ r10= 3/4 pix_s 2
    and     r11,r7, r9, lsl #2
    add     r10,r10,r11,lsr #2        @ += 1/4 pix_s 3
    orr     r10,r10,r7, lsl #16       @ pix_d 8, 9

    subs    r2, r2, #1<<24     @ block counter; flags survive the stmdb

    stmdb   r0!, {r4,r5,r6,r8,r10}    @ 10 output pixels, r0 steps back 20
    bpl     soft_scale_loop_line

    add     r2, r2, #1<<24     @ undo the final block-counter underflow
    subs    r2, r2, #1         @ one line done
    bne     soft_scale_loop

    ldmfd   sp!,{r4-r11,lr}
    bx      lr


@ void convert2RGB555(unsigned short *dst, unsigned char *src, unsigned short *pal, int count);
@
@ Palette lookup: for each source byte, writes the 16-bit entry pal[byte]
@ to dst. Works on 8 pixels per iteration.
@   r0 = dst    output, 16 bits per pixel; written with stm, so it has to
@               be word aligned
@   r1 = src    input, one index byte per pixel; read with ldm, so it has
@               to be word aligned
@   r2 = pal    table of 16-bit entries indexed by the source byte
@   r3 = count  number of pixels; only the count / 8 whole groups are
@               converted, a trailing remainder is left untouched
@ NOTE(review): the pixel order within each loaded word assumes a
@ little-endian memory system - confirm for the target.
@ Returns nothing. Clobbers r0-r3, r12 and the flags; r4-r8 and lr are
@ saved on the stack and restored.

.global convert2RGB555

convert2RGB555:
    movs    r3, r3, lsr #3     @ r3 = number of 8-pixel groups
    bxeq    lr                 @ count < 8: nothing to do (entering the
                               @ loop with 0 would wrap the counter)

    stmfd   sp!,{r4-r8,lr}
    mov     lr, #0xff
    mov     lr, lr, lsl #1     @ lr = 0x1fe: mask for (index byte * 2)

convert2RGB555_loop:
    ldmia   r1!,{r4,r5}        @ r4 = pixels 0..3, r5 = pixels 4..7

    and     r6, lr, r4, lsl #1
    ldrh    r6, [r2, r6]       @ pal[pixel 0]
    and     r7, lr, r4, lsr #7
    ldrh    r7, [r2, r7]       @ pal[pixel 1]
    and     r8, lr, r4, lsr #15
    ldrh    r8, [r2, r8]       @ pal[pixel 2]
    and     r4, lr, r4, lsr #23
    ldrh    r4, [r2, r4]       @ pal[pixel 3]

    orr     r6, r6, r7, lsl #16       @ r6 = output pixels 0, 1
    and     r12,lr, r5, lsl #1
    ldrh    r12, [r2, r12]     @ pal[pixel 4]
    orr     r7, r8, r4, lsl #16       @ r7 = output pixels 2, 3
    and     r8, lr, r5, lsr #7
    ldrh    r8, [r2, r8]       @ pal[pixel 5]
    and     r4, lr, r5, lsr #15
    ldrh    r4, [r2, r4]       @ pal[pixel 6]
    and     r5, lr, r5, lsr #23
    ldrh    r5, [r2, r5]       @ pal[pixel 7]
    orr     r8, r12,r8, lsl #16       @ r8 = output pixels 4, 5
    orr     r12,r4, r5, lsl #16       @ r12 = output pixels 6, 7

    stmia   r0!,{r6,r7,r8,r12}
    subs    r3, r3, #1
    bne     convert2RGB555_loop

    ldmfd   sp!,{r4-r8,lr}
    bx      lr
Commit	Line	Data
937bf65b	1	@ vim:filetype=armasm
937bf65b	2
f5eb372f	3	@ Assembly optimized routines for gpfce - FCE Ultra port
	4	@ (c) Copyright 2007, Grazvydas "notaz" Ignotas
	5
937bf65b	6	@ test
	7	.global flushcache @ beginning_addr, end_addr, flags
	8
	9	flushcache:
	10	swi #0x9f0002
	11	mov pc, lr
	12
c0bf6f9f	13
6587f346	14	.global block_or @ void *src, size_t n, int pat
	15
	16	block_or:
	17	stmfd sp!, {r4-r5}
	18	orr r2, r2, r2, lsl #8
	19	orr r2, r2, r2, lsl #16
	20	mov r1, r1, lsr #4
	21	block_loop_or:
	22	ldmia r0, {r3-r5,r12}
	23	subs r1, r1, #1
	24	orr r3, r3, r2
	25	orr r4, r4, r2
	26	orr r5, r5, r2
	27	orr r12,r12,r2
	28	stmia r0!, {r3-r5,r12}
	29	bne block_loop_or
	30	ldmfd sp!, {r4-r5}
	31	bx lr
	32
	33
e328100e	34	.global block_and @ void *src, size_t n, int andpat
	35
	36	block_and:
	37	stmfd sp!, {r4-r5}
	38	orr r2, r2, r2, lsl #8
	39	orr r2, r2, r2, lsl #16
	40	mov r1, r1, lsr #4
	41	block_loop_and:
	42	ldmia r0, {r3-r5,r12}
	43	subs r1, r1, #1
	44	and r3, r3, r2
	45	and r4, r4, r2
	46	and r5, r5, r2
	47	and r12,r12,r2
	48	stmia r0!, {r3-r5,r12}
	49	bne block_loop_and
	50	ldmfd sp!, {r4-r5}
	51	bx lr
	52
	53
6587f346	54	.global block_andor @ void *src, size_t n, int andpat, int orpat
	55
	56	block_andor:
	57	stmfd sp!, {r4-r6}
	58	orr r2, r2, r2, lsl #8
	59	orr r2, r2, r2, lsl #16
	60	orr r3, r3, r3, lsl #8
	61	orr r3, r3, r3, lsl #16
	62	mov r1, r1, lsr #4
	63	block_loop_andor:
	64	ldmia r0, {r4-r6,r12}
	65	subs r1, r1, #1
	66	and r4, r4, r2
	67	orr r4, r4, r3
	68	and r5, r5, r2
	69	orr r5, r5, r3
	70	and r6, r6, r2
	71	orr r6, r6, r3
	72	and r12,r12,r2
	73	orr r12,r12,r3
	74	stmia r0!, {r4-r6,r12}
	75	bne block_loop_andor
	76	ldmfd sp!, {r4-r6}
	77	bx lr
	78
	79
b2b95d2e	80	.global spend_cycles @ c
	81
	82	spend_cycles:
	83	mov r0, r0, lsr #2 @ 4 cycles/iteration
	84	sub r0, r0, #2 @ entry/exit/init
	85	.sc_loop:
	86	subs r0, r0, #1
	87	bpl .sc_loop
	88
	89	bx lr
	90
	91
21afaa36	92	.global memset32 @ int *dest, int c, int count
	93
	94	memset32:
	95	stmfd sp!, {lr}
	96
	97	mov r3, r1
	98	subs r2, r2, #4
	99	bmi mst32_fin
	100
	101	mov r12,r1
	102	mov lr, r1
	103
	104	mst32_loop:
	105	subs r2, r2, #4
	106	stmia r0!, {r1,r3,r12,lr}
	107	bpl mst32_loop
	108
	109	mst32_fin:
	110	tst r2, #1
	111	strne r1, [r0], #4
	112
	113	tst r2, #2
	114	stmneia r0!, {r1,r3}
	115
	116	ldmfd sp!, {lr}
	117	bx lr
	118
	119
	120
	121	.global soft_scale @ void dst, unsigned short pal, int line_offs, int lines
989672f4	122
	123	soft_scale:
	124	stmfd sp!,{r4-r11,lr}
	125	mov lr, #0xff
	126	mov lr, lr, lsl #1
	127	mov r9, #0x3900 @ f800 07e0 001f \| e000 0780 001c \| 3800 01e0 0007
	128	orr r9, r9, #0x00e7
	129
	130	mov r11,r3 @ r11= line counter
	131	mov r3, r1 @ r3 = pal base
	132
	133	mov r12,#320
	134	mul r2, r12,r2
	135	add r4, r0, r2, lsl #1 @ r4 = dst_start
	136	add r5, r0, r2 @ r5 = src_start
	137	mul r12,r11,r12
	138	add r0, r4, r12,lsl #1 @ r0 = dst_end
	139	add r1, r5, r12 @ r1 = src_end
	140
f5eb372f	141	mov r2, r11
f5eb372f	142
989672f4	143	soft_scale_loop:
989672f4	144	sub r1, r1, #64 @ skip borders
f5eb372f	145	orr r2, r2, #(256/8-1)<<24
989672f4	146
	147	soft_scale_loop_line:
	148	ldr r12, [r1, #-8]!
	149	ldr r7, [r1, #4]
	150
	151	and r4, lr, r12,lsl #1
	152	ldrh r4, [r3, r4]
	153	and r5, lr, r12,lsr #7
	154	ldrh r5, [r3, r5]
f5eb372f	155	and r11,r4, r9, lsl #2
	156	orr r4, r4, r11,lsl #14 @ r4[31:16] = 1/4 pix_s 0
	157	and r11,r5, r9, lsl #2
	158	sub r6, r5, r11,lsr #2 @ r6 = 3/4 pix_s 1
989672f4	159	add r4, r4, r6, lsl #16 @ pix_d 0, 1
	160	and r6, lr, r12,lsr #15
	161	ldrh r6, [r3, r6]
	162	and r12,lr, r12,lsr #23
	163	ldrh r12,[r3, r12]
f5eb372f	164	bic r11,r6, #0x0820
	165	bic r5, r5, #0x0820
	166	add r5, r5, r11
989672f4	167	mov r5, r5, lsr #1
f5eb372f	168	and r11,r6, r9, lsl #2
f5eb372f	169	sub r6, r6, r11,lsr #2 @ r6 = 3/4 pix_s 2
989672f4	170	orr r5, r5, r6, lsl #16
	171
	172	and r6, lr, r7, lsl #1
	173	ldrh r6, [r3, r6]
f5eb372f	174	and r11,r12,r9, lsl #2
f5eb372f	175	add r5, r5, r11,lsl #14 @ pix_d 2, 3
989672f4	176	orr r6, r12,r6, lsl #16 @ pix_d 4, 5
	177
	178	and r12,lr, r7, lsr #7
	179	ldrh r12,[r3, r12]
	180	and r10,lr, r7, lsr #15
	181	ldrh r10,[r3, r10]
f5eb372f	182	and r11,r12,r9, lsl #2
	183	sub r8, r12,r11,lsr #2 @ r8 = 3/4 pix_s 1
	184	and r11,r6, r9, lsl #18
	185	add r8, r8, r11,lsr #18
	186	mov r8, r8, lsl #16
989672f4	187	and r7, lr, r7, lsr #23
989672f4	188	ldrh r7, [r3, r7]
f5eb372f	189	bic r11,r10,#0x0820
	190	bic r12,r12,#0x0820
	191	add r12,r12,r11
	192	add r8, r8, r12,lsr #1 @ pix_d 6, 7
	193	mov r8, r8, ror #16
	194	and r11,r10,r9, lsl #2
	195	sub r10,r10,r11,lsr #2 @ r10= 3/4 pix_s 2
	196	and r11,r7, r9, lsl #2
	197	add r10,r10,r11,lsr #2 @ += 1/4 pix_s 3
989672f4	198	orr r10,r10,r7, lsl #16 @ pix_d 8, 9
989672f4	199
f5eb372f	200	subs r2, r2, #1<<24
989672f4	201
989672f4	202	stmdb r0!, {r4,r5,r6,r8,r10}
f5eb372f	203	bpl soft_scale_loop_line
989672f4	204
f5eb372f	205	add r2, r2, #1<<24
f5eb372f	206	subs r2, r2, #1
989672f4	207	bne soft_scale_loop
	208
	209	ldmfd sp!,{r4-r11,lr}
	210	bx lr
	211
6587f346	212
f5eb372f	213	@ void convert2RGB555(unsigned short dst, unsigned char src, unsigned short *pal, int count);
	214
	215	.global convert2RGB555
	216
	217	convert2RGB555:
	218	stmfd sp!,{r4-r8,lr}
	219	mov lr, #0xff
	220	mov lr, lr, lsl #1
	221
	222	mov r3, r3, lsr #3
	223
	224	convert2RGB555_loop:
	225	ldmia r1!,{r4,r5}
	226
	227	and r6, lr, r4, lsl #1
	228	ldrh r6, [r2, r6]
	229	and r7, lr, r4, lsr #7
	230	ldrh r7, [r2, r7]
	231	and r8, lr, r4, lsr #15
	232	ldrh r8, [r2, r8]
	233	and r4, lr, r4, lsr #23
	234	ldrh r4, [r2, r4]
	235
	236	orr r6, r6, r7, lsl #16
	237	and r12,lr, r5, lsl #1
	238	ldrh r12, [r2, r12]
	239	orr r7, r8, r4, lsl #16
	240	and r8, lr, r5, lsr #7
	241	ldrh r8, [r2, r8]
	242	and r4, lr, r5, lsr #15
	243	ldrh r4, [r2, r4]
	244	and r5, lr, r5, lsr #23
	245	ldrh r5, [r2, r5]
	246	orr r8, r12,r8, lsl #16
	247	orr r12,r4, r5, lsl #16
	248
	249	stmia r0!,{r6,r7,r8,r12}
	250	subs r3, r3, #1
	251	bne convert2RGB555_loop
	252
	253	ldmfd sp!,{r4-r8,lr}
	254	bx lr
c0bf6f9f	255