#include "vector_types.h"
#include "psx_gpu.h"
-#define unlikely(x) __builtin_expect((x), 0)
-
#endif
y##set##_b.e[1] = vertex->b \
-void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
- vertex_struct *b, vertex_struct *c)
+void compute_all_gradients(psx_gpu_struct * __restrict__ psx_gpu,
+ const vertex_struct * __restrict__ a, const vertex_struct * __restrict__ b,
+ const vertex_struct * __restrict__ c)
{
u32 triangle_area = psx_gpu->triangle_area;
u32 winding_mask_scalar;
setup_spans_set_x4(alternate, down, alternate_active); \
height -= 4; \
} while(height > 0); \
+ if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) \
+ span_uvrg_offset[height - 1].low = span_uvrg_offset[height - 2].low; \
} \
setup_spans_set_x4(alternate, up, alternate_active); \
height -= 4; \
} \
+ if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V) \
+ psx_gpu->span_uvrg_offset[0].low = psx_gpu->span_uvrg_offset[1].low; \
} \
#define index_left 0
setup_spans_set_x4(none, down, no);
height_minor_b -= 4;
}
+ if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V))
+ {
+ span_uvrg_offset[height_minor_b - 1].low =
+ span_uvrg_offset[height_minor_b - 2].low;
+ }
}
left_split_triangles++;
#endif
+// this is some hacky mess, can this be improved somehow?
+// ideally change things to not have to do this hack at all
+void __attribute__((noinline))
+setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block,
+ edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset)
+{
+ size_t span_i = span_uvrg_offset - psx_gpu->span_uvrg_offset;
+ if (span_i != 0 && span_i != psx_gpu->num_spans - 1
+ && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U))
+ return;
+ u32 num_blocks = span_edge_data->num_blocks - 1;
+ s32 offset = __builtin_ctz(span_edge_data->right_mask | 0x100) - 1;
+ s32 toffset = 8 * num_blocks + offset - 1;
+ if (toffset < 0 && !(psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_U))
+ return;
+
+ toffset += span_edge_data->left_x;
+ s32 u_dx = psx_gpu->uvrg_dx.low.e[0];
+ s32 v_dx = psx_gpu->uvrg_dx.low.e[1];
+ u32 u = span_uvrg_offset->low.e[0];
+ u32 v = span_uvrg_offset->low.e[1];
+ u += u_dx * toffset;
+ v += v_dx * toffset;
+ u = (u >> 16) & psx_gpu->texture_mask_width;
+ v = (v >> 16) & psx_gpu->texture_mask_height;
+ if (!(psx_gpu->render_state_base & (TEXTURE_MODE_16BPP << 8))) {
+ // 4bpp 8bpp are swizzled
+ u32 u_ = u;
+ u = (u & 0x0f) | ((v & 0x0f) << 4);
+ v = (v & 0xf0) | (u_ >> 4);
+ }
+ assert(offset >= 0);
+ //assert(block->uv.e[offset] == ((v << 8) | u));
+ block->uv.e[offset] = (v << 8) | u;
+}
#define dither_table_entry_normal(value) \
(value) \
#define setup_blocks_store_draw_mask_untextured_direct(_block, bits) \
+#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset) \
+
+#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset) \
+{ \
+ u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V; \
+ if (unlikely(psx_gpu->hacks_active & m_)) \
+ setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, uvrg_offset); \
+} \
#define setup_blocks_add_blocks_indirect() \
num_blocks += span_num_blocks; \
setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \
setup_blocks_store_draw_mask_##texturing##_##target(block, \
span_edge_data->right_mask); \
+ setup_blocks_uv_adj_hack_##texturing(block, span_edge_data, \
+ span_uvrg_offset); \
\
block++; \
} \
psx_gpu->primitive_type = PRIMITIVE_TYPE_UNKNOWN;
psx_gpu->saved_hres = 256;
+ psx_gpu->hacks_active = 0;
- // check some offset
+ // check some offsets, asm relies on these
+ psx_gpu->reserved_a[(offsetof(psx_gpu_struct, test_mask) == 0) - 1] = 0;
psx_gpu->reserved_a[(offsetof(psx_gpu_struct, blocks) == psx_gpu_blocks_offset) - 1] = 0;
}
#define SPAN_DATA_BLOCKS_SIZE 32
+#define AHACK_TEXTURE_ADJ_U (1 << 0)
+#define AHACK_TEXTURE_ADJ_V (1 << 1)
+
#ifndef __ASSEMBLER__
#include "vector_types.h"
+#ifndef unlikely
+#define unlikely(x) __builtin_expect((x), 0)
+#endif
+
typedef enum
{
PRIMITIVE_TYPE_TRIANGLE = 0,
// enhancement stuff
u16 *enhancement_buf_ptr; // main alloc
u16 *enhancement_current_buf_ptr; // offset into above, 4 bufs
+ u32 hacks_active; // AHACK_TEXTURE_ADJ_U ...
u32 saved_hres;
s16 saved_viewport_start_x;
s16 saved_viewport_start_y;
// Align up to 64 byte boundary to keep the upcoming buffers cache line
// aligned, also make reachable with single immediate addition
- u8 reserved_a[184 + 9*4 - 9*sizeof(void *)];
+ u8 reserved_a[184 + 8*4 - 9*sizeof(void *)];
// 8KB
block_struct blocks[MAX_BLOCKS_PER_ROW];
void update_texture_8bpp_cache(psx_gpu_struct *psx_gpu);
void flush_render_block_buffer(psx_gpu_struct *psx_gpu);
+void setup_blocks_uv_adj_hack(psx_gpu_struct *psx_gpu, block_struct *block,
+ edge_data_struct *span_edge_data, vec_4x32u *span_uvrg_offset);
+
void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram);
u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
s32 *cpu_cycles_sum_out, s32 *cpu_cycles_last, u32 *last_command);
#ifdef __MACH__
#define flush_render_block_buffer _flush_render_block_buffer
#define update_texture_8bpp_cache _update_texture_8bpp_cache
+#define setup_blocks_uv_adj_hack _setup_blocks_uv_adj_hack
#endif
@ r0: psx_gpu
#define uvrg q14
#define uvrg_dy q15
+#define uv d28
#define alternate_x_16 d4
subs height, height, #4; \
bhi 2b; \
\
+ nop; \
+ ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \
+ tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \
+ beq 1f; \
+ add temp, span_uvrg_offset, height, lsl #4; \
+ vldr uv, [temp, #(-16*2)]; \
+ vstr uv, [temp, #(-16)]; \
+ \
1: \
subs height, height, #4; \
bhi 2b; \
\
+ nop; \
+ ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]; \
+ tst temp, #AHACK_TEXTURE_ADJ_V; \
+ beq 1f; \
+ add temp, psx_gpu, #psx_gpu_span_uvrg_offset_offset; \
+ vldr uv, [temp, #16]; \
+ vstr uv, [temp, #0]; \
+ \
1: \
subs height_minor_b, height_minor_b, #4
bhi 2b
+ nop
+ ldr temp, [psx_gpu, #psx_gpu_hacks_active_offset]
+ tst temp, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)
+ beq 1f
+ add temp, span_uvrg_offset, height, lsl #4
+ vldr uv, [temp, #(-16*2)]
+ vstr uv, [temp, #(-16)]
+
1:
setup_spans_epilogue()
#define uvrg_dx_ptr r2
#define texture_mask_ptr r3
+#define hacks_active r6
#define dither_shift r8
#define dither_row r10
#define color_b r5
#undef uvrg
+#undef uv
#define u_block q0
#define v_block q1
#define setup_blocks_texture_unswizzled() \
+#define setup_blocks_uv_adj_hack_textured(hacks_active) \
+ tst hacks_active, #(AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V); \
+ beq 91f; \
+ /* see flush_render_block_buffer below for a reg saving note */ \
+ vpush { texture_mask }; \
+ vpush { uvrg_dx4 }; \
+ \
+ stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
+ mov r12, span_uvrg_offset; \
+ sub r1, block_ptr_a, #64; \
+ mov r2, span_edge_data; \
+ mov r3, r12; \
+ bl setup_blocks_uv_adj_hack; /* psx_gpu=r0 */ \
+ ldmia sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
+ \
+ vpop { uvrg_dx4 }; \
+ vpop { texture_mask }; \
+ vadd.u32 uvrg_dx8, uvrg_dx4, uvrg_dx4; \
+91: \
+
#define setup_blocks_shaded_textured_builder(swizzling) \
.align 3; \
vld1.u32 { test_mask }, [psx_gpu, :128]; \
vdup.u8 draw_mask, right_mask; \
\
+ ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \
vmov.u32 fb_mask_ptrs[0], right_mask; \
vtst.u16 draw_mask, draw_mask, test_mask; \
vzip.u8 u_whole_8, v_whole_8; \
vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
\
+ setup_blocks_uv_adj_hack_textured(hacks_active); \
+ \
1: \
add span_uvrg_offset, span_uvrg_offset, #16; \
add span_b_offset, span_b_offset, #4; \
ldmia sp!, { r4 - r11, pc }; \
\
2: \
- /* TODO: Load from psx_gpu instead of saving/restoring these */\
+ /* this callee-save reg saving may look unnecessary but it actually is */ \
+ /* because the callee violates the ABI */ \
vpush { texture_mask }; \
vpush { uvrg_dx4 }; \
\
vld1.u32 { test_mask }, [psx_gpu, :128]; \
vdup.u8 draw_mask, right_mask; \
\
+ ldr hacks_active, [psx_gpu, #psx_gpu_hacks_active_offset]; \
vmov.u32 fb_mask_ptrs[0], right_mask; \
vtst.u16 draw_mask, draw_mask, test_mask; \
vzip.u8 u_whole_8, v_whole_8; \
vst1.u32 { dither_offsets }, [block_ptr_b, :128], c_32; \
vst1.u32 { b_whole_8, fb_mask_ptrs }, [block_ptr_a, :128], c_32; \
\
+ setup_blocks_uv_adj_hack_textured(hacks_active); \
+ \
1: \
add span_uvrg_offset, span_uvrg_offset, #16; \
add span_edge_data, span_edge_data, #8; \
ldmia sp!, { r4 - r11, pc }; \
\
2: \
- /* TODO: Load from psx_gpu instead of saving/restoring these */\
vpush { texture_mask }; \
vpush { uvrg_dx4 }; \
\
ldmia sp!, { r4 - r11, pc }; \
\
2: \
- /* TODO: Load from psx_gpu instead of saving/restoring these */\
vpush { rg_dx4 }; \
\
stmdb sp!, { r0 - r3, EXTRA_UNSAVED_REGS r12, r14 }; \
#define psx_gpu_texture_mask_width_offset 0xfa
#define psx_gpu_texture_mask_height_offset 0xfb
#define psx_gpu_reciprocal_table_ptr_offset 0x108
+#define psx_gpu_hacks_active_offset 0x114
#define psx_gpu_blocks_offset 0x200
#define psx_gpu_span_uvrg_offset_offset 0x2200
#define psx_gpu_span_edge_data_offset 0x4200
//WRITE_OFFSET(f, clut_settings);
//WRITE_OFFSET(f, texture_settings);
WRITE_OFFSET(f, reciprocal_table_ptr);
+ WRITE_OFFSET(f, hacks_active);
WRITE_OFFSET(f, blocks);
WRITE_OFFSET(f, span_uvrg_offset);
WRITE_OFFSET(f, span_edge_data);
psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \
psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \
psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \
+ psx_gpu->hacks_active = 0; \
psx_gpu->uvrgb_phase = 0x8000; \
}
psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1;
if (psx_gpu->viewport_end_x - psx_gpu->viewport_start_x + 1 > 1024)
psx_gpu->viewport_end_x = psx_gpu->viewport_start_x + 1023;
- psx_gpu->uvrgb_phase = 0x7fff;
+ //psx_gpu->uvrgb_phase = 0x7fff;
return 1;
}
return 1;
}
-static int is_in_array(int val, int array[], int len)
+static u32 uv_hack(psx_gpu_struct *psx_gpu, const vertex_struct *vertex_ptrs)
{
- int i;
- for (i = 0; i < len; i++)
- if (array[i] == val)
- return 1;
- return 0;
-}
-
-static int make_members_unique(int array[], int len)
-{
- int i, j;
- for (i = j = 1; i < len; i++)
- if (!is_in_array(array[i], array, j))
- array[j++] = array[i];
-
- if (array[0] > array[1]) {
- i = array[0]; array[0] = array[1]; array[1] = i;
- }
- return j;
-}
-
-static void patch_u(vertex_struct *vertex_ptrs, int count, int old, int new)
-{
- int i;
- for (i = 0; i < count; i++)
- if (vertex_ptrs[i].u == old)
- vertex_ptrs[i].u = new;
-}
-
-static void patch_v(vertex_struct *vertex_ptrs, int count, int old, int new)
-{
- int i;
- for (i = 0; i < count; i++)
- if (vertex_ptrs[i].v == old)
- vertex_ptrs[i].v = new;
-}
-
-// this sometimes does more harm than good, like in PE2
-static void uv_hack(vertex_struct *vertex_ptrs, int vertex_count)
-{
- int i, u[4], v[4];
-
- for (i = 0; i < vertex_count; i++) {
- u[i] = vertex_ptrs[i].u;
- v[i] = vertex_ptrs[i].v;
- }
- if (make_members_unique(u, vertex_count) == 2 && u[1] - u[0] >= 8) {
- if ((u[0] & 7) == 7) {
- patch_u(vertex_ptrs, vertex_count, u[0], u[0] + 1);
- //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0]+1, u[1]);
- }
- else if ((u[1] & 7) == 0 || u[1] - u[0] > 128) {
- patch_u(vertex_ptrs, vertex_count, u[1], u[1] - 1);
- //printf("u hack: %3u-%3u -> %3u-%3u\n", u[0], u[1], u[0], u[1]-1);
- }
- }
- if (make_members_unique(v, vertex_count) == 2 && ((v[0] - v[1]) & 7) == 0) {
- if ((v[0] & 7) == 7) {
- patch_v(vertex_ptrs, vertex_count, v[0], v[0] + 1);
- //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0]+1, v[1]);
- }
- else if ((v[1] & 7) == 0) {
- patch_v(vertex_ptrs, vertex_count, v[1], v[1] - 1);
- //printf("v hack: %3u-%3u -> %3u-%3u\n", v[0], v[1], v[0], v[1]-1);
- }
+ int i, have_right_edge = 0, have_bottom_edge = 0, bad_u = 0, bad_v = 0;
+ u32 hacks = 0;
+
+ for (i = 0; i < 3; i++) {
+ int j = (i + 1) % 3, k = (i + 2) % 3;
+ int du = abs((int)vertex_ptrs[i].u - (int)vertex_ptrs[j].u);
+ int dv = abs((int)vertex_ptrs[i].v - (int)vertex_ptrs[j].v);
+ if (du && (du & 7) != 7)
+ bad_u = 1;
+ if (dv && (dv & 7) != 7)
+ bad_v = 1;
+ if (vertex_ptrs[i].x == vertex_ptrs[j].x && vertex_ptrs[k].x < vertex_ptrs[j].x)
+ have_right_edge = 1;
+ if (vertex_ptrs[i].y == vertex_ptrs[j].y)// && vertex_ptrs[k].y < vertex_ptrs[j].y)
+ have_bottom_edge = 1;
}
+ if (have_right_edge && bad_u)
+ hacks |= AHACK_TEXTURE_ADJ_U;
+ if (have_bottom_edge && bad_v)
+ hacks |= AHACK_TEXTURE_ADJ_V;
+ return hacks;
}
static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
if (!enhancement_enable(psx_gpu))
return;
+ if ((current_command & RENDER_FLAGS_TEXTURE_MAP) && psx_gpu->hack_texture_adj)
+ psx_gpu->hacks_active |= uv_hack(psx_gpu, vertexes);
shift_vertices3(vertex_ptrs);
shift_triangle_area();
render_triangle_p(psx_gpu, vertex_ptrs, current_command);
get_vertex_data_xy_uv(2, 10);
get_vertex_data_xy_uv(3, 14);
- if (psx_gpu->hack_texture_adj)
- uv_hack(vertexes, 4);
do_quad_enhanced(psx_gpu, vertexes, current_command);
gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_t());
break;
get_vertex_data_xy_uv_rgb(2, 12);
get_vertex_data_xy_uv_rgb(3, 18);
- if (psx_gpu->hack_texture_adj)
- uv_hack(vertexes, 4);
do_quad_enhanced(psx_gpu, vertexes, current_command);
gput_sum(cpu_cycles_sum, cpu_cycles, gput_quad_base_gt());
break;
#define gvld1_u8(d, s) d.u8 = vld1_u8(s)
#define gvld1_u32(d, s) d.u32 = vld1_u32((const u32 *)(s))
+#define gvld1_u64(d, s) d.u64 = vld1_u64((const u64 *)(s))
#define gvld1q_u8(d, s) d.u8 = vld1q_u8(s)
#define gvld1q_u16(d, s) d.u16 = vld1q_u16(s)
#define gvld1q_u32(d, s) d.u32 = vld1q_u32((const u32 *)(s))
#define gvst1_u8(v, p) \
vst1_u8(p, v.u8)
+#define gvst1_u64(v, p) \
+ vst1_u64((u64 *)(p), v.u64)
#define gvst1q_u16(v, p) \
vst1q_u16(p, v.u16)
#define gvst1q_inc_u32(v, p, i) { \
#define gvld1_u8(d, s) d.m = _mm_loadu_si64(s)
#define gvld1_u32 gvld1_u8
+#define gvld1_u64 gvld1_u8
#define gvld1q_u8(d, s) d.m = _mm_loadu_si128((__m128i *)(s))
#define gvld1q_u16 gvld1q_u8
#define gvld1q_u32 gvld1q_u8
+#define gvst1_u8(v, p) _mm_storeu_si64(p, v.m)
+#define gvst1_u64 gvst1_u8
+
#define gvst4_4_inc_u32(v0, v1, v2, v3, p, i) { \
__m128i t0 = _mm_unpacklo_epi32(v0.m, v1.m); \
__m128i t1 = _mm_unpacklo_epi32(v2.m, v3.m); \
setup_spans_set_x4(alternate, down, alternate_active); \
height -= 4; \
} while(height > 0); \
+ if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V)) \
+ { \
+ vec_2x32u tmp; \
+ gvld1_u64(tmp, &span_uvrg_offset[height - 2]); \
+ gvst1_u64(tmp, &span_uvrg_offset[height - 1]); \
+ } \
} \
setup_spans_set_x4(alternate, up, alternate_active); \
height -= 4; \
} \
+ if (psx_gpu->hacks_active & AHACK_TEXTURE_ADJ_V) \
+ { \
+ vec_2x32u tmp; \
+ gvld1_u64(tmp, &psx_gpu->span_uvrg_offset[1]); \
+ gvst1_u64(tmp, &psx_gpu->span_uvrg_offset[0]); \
+ } \
} \
#define half_left lo
setup_spans_set_x4(none, down, no);
height_minor_b -= 4;
}
+ if (psx_gpu->hacks_active & (AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V))
+ {
+ vec_2x32u tmp;
+ gvld1_u64(tmp, &span_uvrg_offset[height_minor_b - 2]);
+ gvst1_u64(tmp, &span_uvrg_offset[height_minor_b - 1]);
+ }
}
}
#define setup_blocks_store_draw_mask_untextured_direct(_block, bits) \
+#define setup_blocks_uv_adj_hack_untextured(_block, edge_data, uvrg_offset) \
+
+#define setup_blocks_uv_adj_hack_textured(_block, edge_data, uvrg_offset) \
+{ \
+ u32 m_ = AHACK_TEXTURE_ADJ_U | AHACK_TEXTURE_ADJ_V; \
+ if (unlikely(psx_gpu->hacks_active & m_)) \
+ setup_blocks_uv_adj_hack(psx_gpu, _block, edge_data, (void *)uvrg_offset); \
+} \
+
#define setup_blocks_add_blocks_indirect() \
num_blocks += span_num_blocks; \
\
setup_blocks_store_##shading##_##texturing(sw, dithering, target, edge); \
setup_blocks_store_draw_mask_##texturing##_##target(block, \
span_edge_data->right_mask); \
+ setup_blocks_uv_adj_hack_##texturing(block, span_edge_data, \
+ span_uvrg_offset); \
\
block++; \
} \