Merge branch 'enhancement'
author notaz <notasas@gmail.com>
Fri, 2 Nov 2012 01:07:05 +0000 (03:07 +0200)
committer notaz <notasas@gmail.com>
Fri, 2 Nov 2012 01:07:05 +0000 (03:07 +0200)
Conflicts:
frontend/libretro.c

38 files changed:
.gitmodules
Makefile
frontend/common/plat.h
frontend/libpicofe [new submodule]
frontend/libretro.c
frontend/linux/plat.c
frontend/main.c
frontend/main.h
frontend/menu.c
frontend/menu.h
frontend/pandora/pcsx.sh
frontend/plat_omap.c
frontend/plat_pandora.c
frontend/plat_pollux.c
frontend/plugin_lib.c
frontend/plugin_lib.h
libpcsxcore/psxmem.c
plugins/dfxvideo/draw_pl.c
plugins/dfxvideo/gpu.c
plugins/dfxvideo/gpulib_if.c
plugins/gpu-gles/gpulib_if.c
plugins/gpu_neon/Makefile
plugins/gpu_neon/psx_gpu/common.h
plugins/gpu_neon/psx_gpu/psx_gpu.c
plugins/gpu_neon/psx_gpu/psx_gpu.h
plugins/gpu_neon/psx_gpu/psx_gpu_4x.c [new file with mode: 0644]
plugins/gpu_neon/psx_gpu/psx_gpu_arm_neon.S
plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h [new file with mode: 0644]
plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c [new file with mode: 0644]
plugins/gpu_neon/psx_gpu/psx_gpu_parse.c
plugins/gpu_neon/psx_gpu/tests/Makefile
plugins/gpu_neon/psx_gpu/vector_ops.h
plugins/gpu_neon/psx_gpu_if.c
plugins/gpu_unai/gpu.cpp
plugins/gpu_unai/gpulib_if.cpp
plugins/gpulib/gpu.c
plugins/gpulib/gpu.h
plugins/gpulib/vout_pl.c

index 650250d..f93599e 100644 (file)
@@ -1,3 +1,6 @@
-[submodule "frontend/warm"]
+[submodule "libpicofe"]
+       path = frontend/libpicofe
+       url = git://notaz.gp2x.de/~notaz/libpicofe.git
+[submodule "warm"]
        path = frontend/warm
        url = git://notaz.gp2x.de/~notaz/warm.git
index c10f739..1f3e736 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -127,6 +127,14 @@ OBJS += plugins/cdrcimg/cdrcimg.o
 # dfinput
 OBJS += plugins/dfinput/main.o plugins/dfinput/pad.o plugins/dfinput/guncon.o
 
+# misc
+ifeq "$(HAVE_NEON)" "1"
+OBJS += frontend/libpicofe/arm/neon_scale2x.o
+OBJS += frontend/libpicofe/arm/neon_eagle2x.o
+frontend/libpicofe/arm/neon_scale2x.o: CFLAGS += -DDO_BGR_TO_RGB
+frontend/libpicofe/arm/neon_eagle2x.o: CFLAGS += -DDO_BGR_TO_RGB
+endif
+
 # gui
 OBJS += frontend/main.o frontend/plugin.o
 OBJS += frontend/common/readpng.o frontend/common/fonts.o
@@ -178,6 +186,12 @@ endif
 frontend/%.o: CFLAGS += -DIN_EVDEV
 frontend/menu.o frontend/main.o frontend/plat_sdl.o: frontend/revision.h
 
+frontend/libpicofe/arm/neon_scale2x.S frontend/libpicofe/menu.c:
+       @echo "libpicofe module is missing, please run:"
+       @echo "git submodule init && git submodule update"
+       @exit 1
+
+
 libpcsxcore/gte_nf.o: libpcsxcore/gte.c
        $(CC) -c -o $@ $^ $(CFLAGS) -DFLAGLESS
 
@@ -185,7 +199,6 @@ frontend/revision.h: FORCE
        @(git describe || echo) | sed -e 's/.*/#define REV "\0"/' > $@_
        @diff -q $@_ $@ > /dev/null 2>&1 || cp $@_ $@
        @rm $@_
-.PHONY: FORCE
 
 %.o: %.S
        $(CC) $(CFLAGS) -c $^ -o $@
@@ -213,9 +226,11 @@ plugins_:
 clean_plugins:
 endif
 
+.PHONY: all clean target_ plugins_ clean_plugins FORCE
+
 # ----------- release -----------
 
-VER ?= $(shell git describe master)
+VER ?= $(shell git describe HEAD)
 
 ifeq "$(PLATFORM)" "generic"
 OUT = pcsx_rearmed_$(VER)
index 0a9fc0b..1fb8767 100644 (file)
@@ -45,6 +45,10 @@ int  plat_is_dir(const char *path);
 int  plat_wait_event(int *fds_hnds, int count, int timeout_ms);
 void plat_sleep_ms(int ms);
 
+void *plat_mmap(unsigned long addr, size_t size, int need_exec, int is_fixed);
+void *plat_mremap(void *ptr, size_t oldsize, size_t newsize);
+void  plat_munmap(void *ptr, size_t size);
+
 /* timers, to be used for time diff and must refer to the same clock */
 unsigned int plat_get_ticks_ms(void);
 unsigned int plat_get_ticks_us(void);
diff --git a/frontend/libpicofe b/frontend/libpicofe
new file mode 160000 (submodule)
index 0000000..6ce097b
--- /dev/null
@@ -0,0 +1 @@
+Subproject commit 6ce097ba2f3cd1c269bacd032b775b6d296433fc
index 4305aa7..1eb2147 100644 (file)
@@ -13,6 +13,7 @@
 #include "../libpcsxcore/psxcounters.h"
 #include "../libpcsxcore/new_dynarec/new_dynarec.h"
 #include "../plugins/dfsound/out.h"
+#include "../plugins/gpulib/cspace.h"
 #include "main.h"
 #include "plugin.h"
 #include "plugin_lib.h"
@@ -26,7 +27,6 @@ static retro_environment_t environ_cb;
 static retro_audio_sample_batch_t audio_batch_cb;
 
 static void *vout_buf;
-static int vout_width, vout_height;
 static int samples_sent, samples_to_send;
 static int plugins_opened;
 static int native_rgb565;
@@ -42,14 +42,10 @@ static int vout_open(void)
        return 0;
 }
 
-static void *vout_set_mode(int w, int h, int bpp)
+static void vout_set_mode(int w, int h, int bpp)
 {
-       vout_width = w;
-       vout_height = h;
-       return vout_buf;
 }
 
-/* FIXME: either teach PCSX to blit to RGB1555 or RetroArch to support RGB565 */
 static void convert(void *buf, size_t bytes)
 {
        unsigned int i, v, *p = buf;
@@ -60,14 +56,39 @@ static void convert(void *buf, size_t bytes)
        }
 }
 
-static void *vout_flip(void)
+static void vout_flip(const void *vram, int stride, int bgr24, int w, int h)
 {
-       pl_rearmed_cbs.flip_cnt++;
-       if (!native_rgb565)
-               convert(vout_buf,  vout_width * vout_height * 2);
-       video_cb(vout_buf, vout_width, vout_height, vout_width * 2);
+       unsigned short *dest = vout_buf;
+       const unsigned short *src = vram;
+       int dstride = w, h1 = h;
+
+       if (vram == NULL) {
+               // blanking: no frame given, so clear the buffer that is passed to video_cb below
+               memset(vout_buf, 0, dstride * h * 2);
+               goto out;
+       }
 
-       return vout_buf;
+       if (bgr24)
+       {
+               // XXX: could we switch to RETRO_PIXEL_FORMAT_XRGB8888 here?
+               for (; h1-- > 0; dest += dstride, src += stride)
+               {
+                       bgr888_to_rgb565(dest, src, w * 3);
+               }
+       }
+       else
+       {
+               for (; h1-- > 0; dest += dstride, src += stride)
+               {
+                       bgr555_to_rgb565(dest, src, w * 2);
+               }
+       }
+
+out:
+       if (!native_rgb565)
+               convert(vout_buf, w * h * 2);
+       video_cb(vout_buf, w, h, w * 2);
+       pl_rearmed_cbs.flip_cnt++;
 }
 
 static void vout_close(void)
index b7152b5..4ed1e65 100644 (file)
 #include <time.h>
 #include <unistd.h>
 #include <sys/mman.h>
+#include <errno.h>
 
 #include "../common/plat.h"
 
+/* XXX: maybe unhardcode pagesize? */
+#define HUGETLB_PAGESIZE (2 * 1024 * 1024)
+#define HUGETLB_THRESHOLD (HUGETLB_PAGESIZE / 2)
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
 
 int plat_is_dir(const char *path)
 {
@@ -126,16 +134,36 @@ int plat_wait_event(int *fds_hnds, int count, int timeout_ms)
        return ret;
 }
 
-void *plat_mmap(unsigned long addr, size_t size)
+void *plat_mmap(unsigned long addr, size_t size, int need_exec, int is_fixed)
 {
+       static int hugetlb_disabled;
+       int prot = PROT_READ | PROT_WRITE;
+       int flags = MAP_PRIVATE | MAP_ANONYMOUS;
        void *req, *ret;
 
        req = (void *)addr;
-       ret = mmap(req, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+       if (need_exec)
+               prot |= PROT_EXEC;
+       if (is_fixed)
+               flags |= MAP_FIXED;
+       if (size >= HUGETLB_THRESHOLD && !hugetlb_disabled)
+               flags |= MAP_HUGETLB;
+
+       ret = mmap(req, size, prot, flags, -1, 0);
+       if (ret == MAP_FAILED && (flags & MAP_HUGETLB)) {
+               fprintf(stderr,
+                       "warning: failed to do hugetlb mmap (%p, %zu): %d\n",
+                       req, size, errno);
+               hugetlb_disabled = 1;
+               flags &= ~MAP_HUGETLB;
+               ret = mmap(req, size, prot, flags, -1, 0);
+       }
        if (ret == MAP_FAILED)
                return NULL;
-       if (ret != req)
-               printf("warning: mmaped to %p, requested %p\n", ret, req);
+
+       if (req != NULL && ret != req)
+               fprintf(stderr,
+                       "warning: mmaped to %p, requested %p\n", ret, req);
 
        return ret;
 }
@@ -155,7 +183,18 @@ void *plat_mremap(void *ptr, size_t oldsize, size_t newsize)
 
 void plat_munmap(void *ptr, size_t size)
 {
-       munmap(ptr, size);
+       int ret;
+
+       ret = munmap(ptr, size);
+       if (ret != 0 && (size & (HUGETLB_PAGESIZE - 1))) {
+               // perhaps an autorounded hugetlb mapping?
+               size = (size + HUGETLB_PAGESIZE - 1) & ~(HUGETLB_PAGESIZE - 1);
+               ret = munmap(ptr, size);
+       }
+       if (ret != 0) {
+               fprintf(stderr,
+                       "munmap(%p, %zu) failed: %d\n", ptr, size, errno);
+       }
 }
 
 /* lprintf */
index 19e8319..56b5cb7 100644 (file)
@@ -143,6 +143,8 @@ void emu_set_default_config(void)
        Config.PsxAuto = 1;
 
        pl_rearmed_cbs.gpu_neon.allow_interlace = 2; // auto
+       pl_rearmed_cbs.gpu_neon.enhancement_enable =
+       pl_rearmed_cbs.gpu_neon.enhancement_no_main = 0;
        pl_rearmed_cbs.gpu_peops.iUseDither = 0;
        pl_rearmed_cbs.gpu_peops.dwActFixes = 1<<7;
        pl_rearmed_cbs.gpu_unai.abe_hack =
@@ -230,6 +232,14 @@ do_state_slot:
                        pl_rearmed_cbs.frameskip == 0 ? "OFF" : "1" );
                plugin_call_rearmed_cbs();
                break;
+       case SACTION_SWITCH_DISPMODE:
+               pl_switch_dispmode();
+               plugin_call_rearmed_cbs();
+               if (GPU_open != NULL && GPU_close != NULL) {
+                       GPU_close();
+                       GPU_open(&gpuDisp, "PCSX", NULL);
+               }
+               break;
        case SACTION_SCREENSHOT:
                {
                        char buf[MAXPATHLEN];
index bdb4870..a03db8b 100644 (file)
@@ -65,6 +65,7 @@ enum sched_action {
        SACTION_NEXT_SSLOT,
        SACTION_PREV_SSLOT,
        SACTION_TOGGLE_FSKIP,
+       SACTION_SWITCH_DISPMODE,
        SACTION_SCREENSHOT,
        SACTION_VOLUME_UP,
        SACTION_VOLUME_DOWN,
index 42a53e1..d3ce06c 100644 (file)
@@ -75,6 +75,7 @@ typedef enum
        MA_OPT_SCALER,
        MA_OPT_SCALER2,
        MA_OPT_FILTERING,
+       MA_OPT_FILTERING2,
        MA_OPT_SCALER_C,
 } menu_id;
 
@@ -87,7 +88,7 @@ static int psx_clock;
 static int memcard1_sel, memcard2_sel;
 int g_opts, g_scaler;
 int soft_scaling, analog_deadzone; // for Caanoo
-int filter;
+int filter, soft_filter;
 
 #ifdef __ARM_ARCH_7A__
 #define DEFAULT_PSX_CLOCK 57
@@ -213,6 +214,7 @@ static void menu_set_defconfig(void)
        frameskip = 0;
        analog_deadzone = 50;
        soft_scaling = 1;
+       soft_filter = 0;
        psx_clock = DEFAULT_PSX_CLOCK;
 
        region = 0;
@@ -274,6 +276,7 @@ static const struct {
        CE_INTVAL(g_layer_w),
        CE_INTVAL(g_layer_h),
        CE_INTVAL(filter),
+       CE_INTVAL(soft_filter),
        CE_INTVAL(state_slot),
        CE_INTVAL(cpu_clock),
        CE_INTVAL(g_opts),
@@ -288,6 +291,8 @@ static const struct {
        CE_INTVAL_P(gpu_unai.no_light),
        CE_INTVAL_P(gpu_unai.no_blend),
        CE_INTVAL_P(gpu_neon.allow_interlace),
+       CE_INTVAL_P(gpu_neon.enhancement_enable),
+       CE_INTVAL_P(gpu_neon.enhancement_no_main),
        CE_INTVAL_P(gpu_peopsgl.bDrawDither),
        CE_INTVAL_P(gpu_peopsgl.iFilterType),
        CE_INTVAL_P(gpu_peopsgl.iFrameTexType),
@@ -661,6 +666,7 @@ me_bind_action emuctrl_actions[] =
        { "Next Save Slot   ", 1 << SACTION_NEXT_SSLOT },
        { "Toggle Frameskip ", 1 << SACTION_TOGGLE_FSKIP },
        { "Take Screenshot  ", 1 << SACTION_SCREENSHOT },
+       { "Switch Renderer  ", 1 << SACTION_SWITCH_DISPMODE },
        { "Enter Menu       ", 1 << SACTION_ENTER_MENU },
 #ifdef __ARM_ARCH_7A__ /* XXX */
        { "Minimize         ", 1 << SACTION_MINIMIZE },
@@ -1029,9 +1035,15 @@ static int menu_loop_keyconfig(int id, int keys)
 // ------------ gfx options menu ------------
 
 static const char *men_scaler[] = { "1x1", "scaled 4:3", "integer scaled 4:3", "fullscreen", "custom", NULL };
+static const char *men_soft_filter[] = { "None",
+#ifdef __ARM_NEON__
+       "scale2x", "eagle2x",
+#endif
+       NULL };
+static const char *men_dummy[] = { NULL };
 static const char h_cscaler[]   = "Displays the scaler layer, you can resize it\n"
                                  "using d-pad or move it using R+d-pad";
-static const char *men_dummy[] = { NULL };
+static const char h_soft_filter[] = "Works only if game uses low resolution modes";
 
 static int menu_loop_cscaler(int id, int keys)
 {
@@ -1090,6 +1102,7 @@ static menu_entry e_menu_gfx_options[] =
        mee_enum      ("Scaler",                   MA_OPT_SCALER, g_scaler, men_scaler),
        mee_onoff     ("Software Scaling",         MA_OPT_SCALER2, soft_scaling, 1),
        mee_enum      ("Filter",                   MA_OPT_FILTERING, filter, men_dummy),
+       mee_enum_h    ("Software Filter",          MA_OPT_FILTERING2, soft_filter, men_soft_filter, h_soft_filter),
 //     mee_onoff     ("Vsync",                    0, vsync, 1),
        mee_cust_h    ("Setup custom scaler",      MA_OPT_SCALER_C, menu_loop_cscaler, NULL, h_cscaler),
        mee_end,
@@ -1118,18 +1131,26 @@ void menu_set_filter_list(void *filters)
 
 #ifdef __ARM_NEON__
 
-static const char h_gpu_neon[] = "Configure built-in NEON GPU plugin";
+static const char h_gpu_neon[] =
+       "Configure built-in NEON GPU plugin";
+static const char h_gpu_neon_enhanced[] =
+       "Renders in double resolution at the cost of lower performance\n"
+       "(not available for high resolution games)";
+static const char h_gpu_neon_enhanced_hack[] =
+       "Speed hack for above option (glitches some games)";
 static const char *men_gpu_interlace[] = { "Off", "On", "Auto", NULL };
 
 static menu_entry e_menu_plugin_gpu_neon[] =
 {
        mee_enum      ("Enable interlace mode",      0, pl_rearmed_cbs.gpu_neon.allow_interlace, men_gpu_interlace),
+       mee_onoff_h   ("Enhanced resolution (slow)", 0, pl_rearmed_cbs.gpu_neon.enhancement_enable, 1, h_gpu_neon_enhanced),
+       mee_onoff_h   ("Enhanced res. speed hack",   0, pl_rearmed_cbs.gpu_neon.enhancement_no_main, 1, h_gpu_neon_enhanced_hack),
        mee_end,
 };
 
 static int menu_loop_plugin_gpu_neon(int id, int keys)
 {
-       int sel = 0;
+       static int sel = 0;
        me_loop(e_menu_plugin_gpu_neon, &sel);
        return 0;
 }
@@ -2247,6 +2268,7 @@ void menu_init(void)
 #ifndef __ARM_ARCH_7A__ /* XXX */
        me_enable(e_menu_gfx_options, MA_OPT_SCALER, 0);
        me_enable(e_menu_gfx_options, MA_OPT_FILTERING, 0);
+       me_enable(e_menu_gfx_options, MA_OPT_FILTERING2, 0);
        me_enable(e_menu_gfx_options, MA_OPT_SCALER_C, 0);
        me_enable(e_menu_keyconfig, MA_CTRL_NUBS_BTNS, 0);
 #else
index 2062acd..221be15 100644 (file)
@@ -22,9 +22,15 @@ enum g_scaler_opts {
        SCALE_CUSTOM,
 };
 
+enum g_soft_filter_opts {
+       SOFT_FILTER_NONE,
+       SOFT_FILTER_SCALE2X,
+       SOFT_FILTER_EAGLE2X,
+};
+
 extern int g_opts, g_scaler;
 extern int soft_scaling, analog_deadzone;
-extern int filter;
+extern int filter, soft_filter;
 
 extern int g_menuscreen_w;
 extern int g_menuscreen_h;
index 0957b94..bc1d6c5 100755 (executable)
@@ -5,10 +5,15 @@ nub0mode=`cat /proc/pandora/nub0/mode`
 nub1mode=`cat /proc/pandora/nub1/mode`
 /usr/pandora/scripts/op_nubchange.sh absolute absolute
 
+# 4MB for RAM (2+align) + 2MB for vram (1+overdraw) + 10MB for gpu_neon (8+overdraw)
+# no big deal if this fails, only performance loss
+sudo -n /usr/pandora/scripts/op_hugetlb.sh 16
+
 ./pcsx "$@"
 
 # restore stuff if pcsx crashes
 ./picorestore
 sudo -n /usr/pandora/scripts/op_lcdrate.sh 60
+sudo -n /usr/pandora/scripts/op_hugetlb.sh 0
 
 /usr/pandora/scripts/op_nubchange.sh $nub0mode $nub1mode
index b01c634..e5b6c04 100644 (file)
@@ -52,8 +52,9 @@ static int omap_setup_layer_(int fd, int enabled, int x, int y, int w, int h)
                        perror("SETUP_PLANE");
        }
 
-       if (mi.size < 640*512*3*3) {
-               mi.size = 640*512*3*3;
+       // up to 1024x512 (2x resolution enhancement)
+       if (mi.size < 1024*512*2 * 3) {
+               mi.size = 1024*512*2 * 3;
                ret = ioctl(fd, OMAPFB_SETUP_MEM, &mi);
                if (ret != 0) {
                        perror("SETUP_MEM");
index 9ec747d..b82450c 100644 (file)
@@ -65,6 +65,7 @@ static const struct in_default_bind in_evdev_defbinds[] = {
        { KEY_4,        IN_BINDTYPE_EMU, SACTION_NEXT_SSLOT },
        { KEY_5,        IN_BINDTYPE_EMU, SACTION_TOGGLE_FSKIP },
        { KEY_6,        IN_BINDTYPE_EMU, SACTION_SCREENSHOT },
+       { KEY_7,        IN_BINDTYPE_EMU, SACTION_SWITCH_DISPMODE },
        { 0, 0, 0 }
 };
 
index 1dafb7c..52a09b1 100644 (file)
@@ -305,12 +305,13 @@ static void spend_cycles(int loops)
 #define DMA_REG(x) memregl[(DMA_BASE6 + x) >> 2]
 
 /* this takes ~1.5ms, while ldm/stm ~1.95ms */
-static void raw_flip_dma(int x, int y)
+static void raw_flip_dma(const void *vram, int stride, int bgr24, int w, int h)
 {
+       unsigned int pixel_offset = psx_vram - (unsigned short *)vram;
        unsigned int dst = fb_paddrs[fb_work_buf] +
                        (fb_offset_y * 320 + fb_offset_x) * psx_bpp / 8;
-       int spsx_line = y + psx_offset_y;
-       int spsx_offset = (x + psx_offset_x) & 0x3f8;
+       int spsx_line = pixel_offset / 1024 + psx_offset_y;
+       int spsx_offset = (pixel_offset + psx_offset_x) & 0x3f8;
        int dst_stride = 320 * psx_bpp / 8;
        int len = psx_src_width * psx_bpp / 8;
        int i;
@@ -344,7 +345,7 @@ static void raw_flip_dma(int x, int y)
 
        if (psx_bpp == 16) {
                pl_vout_buf = g_menuscreen_ptr;
-               pl_print_hud(fb_offset_x);
+               pl_print_hud(w, h, fb_offset_x);
        }
 
        g_menuscreen_ptr = fb_flip();
@@ -354,26 +355,24 @@ static void raw_flip_dma(int x, int y)
 }
 
 #define make_flip_func(name, blitfunc)                                                  \
-static void name(int x, int y)                                                          \
+static void name(const void *vram_, int stride, int bgr24, int w, int h)                \
 {                                                                                       \
-        unsigned short *vram = psx_vram;                                                \
+        const unsigned short *vram = vram_;                                             \
         unsigned char *dst = (unsigned char *)g_menuscreen_ptr +                        \
                         (fb_offset_y * 320 + fb_offset_x) * psx_bpp / 8;                \
-        unsigned int src = (y + psx_offset_y) * 1024 + x + psx_offset_x;                \
         int dst_stride = 320 * psx_bpp / 8;                                             \
         int len = psx_src_width * psx_bpp / 8;                                          \
         int i;                                                                          \
                                                                                         \
         pcnt_start(PCNT_BLIT);                                                          \
                                                                                         \
-        for (i = psx_src_height; i > 0; i--, src += psx_step * 1024, dst += dst_stride) { \
-                src &= 1024*512-1;                                                      \
-                blitfunc(dst, vram + src, len);                                         \
-        }                                                                               \
+        vram += psx_offset_y * 1024 + psx_offset_x;                                     \
+        for (i = psx_src_height; i > 0; i--, vram += psx_step * 1024, dst += dst_stride)\
+                blitfunc(dst, vram, len);                                               \
                                                                                         \
         if (psx_bpp == 16) {                                                            \
                 pl_vout_buf = g_menuscreen_ptr;                                         \
-                pl_print_hud(fb_offset_x);                                              \
+                pl_print_hud(w, h, fb_offset_x);                                        \
         }                                                                               \
                                                                                         \
         g_menuscreen_ptr = fb_flip();                                                   \
@@ -402,20 +401,20 @@ void *plat_gvideo_set_mode(int *w_, int *h_, int *bpp_)
 
        switch (w + (bpp != 16) + !soft_scaling) {
        case 640:
-               pl_rearmed_cbs.pl_vout_raw_flip = raw_flip_soft_640;
+               pl_rearmed_cbs.pl_vout_flip = raw_flip_soft_640;
                w_max = 640;
                break;
        case 512:
-               pl_rearmed_cbs.pl_vout_raw_flip = raw_flip_soft_512;
+               pl_rearmed_cbs.pl_vout_flip = raw_flip_soft_512;
                w_max = 512;
                break;
        case 384:
        case 368:
-               pl_rearmed_cbs.pl_vout_raw_flip = raw_flip_soft_368;
+               pl_rearmed_cbs.pl_vout_flip = raw_flip_soft_368;
                w_max = 368;
                break;
        default:
-               pl_rearmed_cbs.pl_vout_raw_flip = have_warm ? raw_flip_dma : raw_flip_soft;
+               pl_rearmed_cbs.pl_vout_flip = have_warm ? raw_flip_dma : raw_flip_soft;
                w_max = 320;
                break;
        }
@@ -621,7 +620,7 @@ void plat_init(void)
        if (mixerdev == -1)
                perror("open(/dev/mixer)");
 
-       pl_rearmed_cbs.pl_vout_raw_flip = have_warm ? raw_flip_dma : raw_flip_soft;
+       pl_rearmed_cbs.pl_vout_flip = have_warm ? raw_flip_dma : raw_flip_soft;
        pl_rearmed_cbs.pl_vout_set_raw_vram = pl_vout_set_raw_vram;
 
        psx_src_width = 320;
index 4dbb9a7..3ee5947 100644 (file)
 #include "linux/fbdev.h"
 #include "common/fonts.h"
 #include "common/input.h"
+#include "common/plat.h"
 #include "menu.h"
 #include "main.h"
 #include "plat.h"
 #include "pcnt.h"
 #include "pl_gun_ts.h"
+#include "libpicofe/arm/neon_scale2x.h"
+#include "libpicofe/arm/neon_eagle2x.h"
 #include "../libpcsxcore/new_dynarec/new_dynarec.h"
 #include "../libpcsxcore/psemu_plugin_defs.h"
+#include "../plugins/gpulib/cspace.h"
 
 int in_type1, in_type2;
 int in_a1[2] = { 127, 127 }, in_a2[2] = { 127, 127 };
@@ -38,6 +42,7 @@ void *tsdev;
 void *pl_vout_buf;
 int g_layer_x, g_layer_y, g_layer_w, g_layer_h;
 static int pl_vout_w, pl_vout_h, pl_vout_bpp; /* output display/layer */
+static int pl_vout_scale;
 static int psx_w, psx_h, psx_bpp;
 static int vsync_cnt;
 static int is_pal, frame_interval, frame_interval1024;
@@ -113,10 +118,8 @@ static __attribute__((noinline)) void draw_active_chans(int vout_w, int vout_h)
        }
 }
 
-void pl_print_hud(int xborder)
+void pl_print_hud(int w, int h, int xborder)
 {
-       int w = pl_vout_w, h = pl_vout_h;
-
        if (h < 16)
                return;
 
@@ -184,55 +187,142 @@ static void update_layer_size(int w, int h)
        if (g_layer_h > g_menuscreen_h) g_layer_h = g_menuscreen_h;
 }
 
-static void *pl_vout_set_mode(int w, int h, int bpp)
+// XXX: this is platform specific really
+static int resolution_ok(int w, int h)
 {
+       return w <= 1024 && h <= 512;
+}
+
+static void pl_vout_set_mode(int w, int h, int bpp)
+{
+       int vout_w, vout_h, vout_bpp;
+
        // special h handling, Wipeout likes to change it by 1-6
        static int vsync_cnt_ms_prev;
        if ((unsigned int)(vsync_cnt - vsync_cnt_ms_prev) < 5*60)
                h = (h + 7) & ~7;
        vsync_cnt_ms_prev = vsync_cnt;
 
-       if (w == psx_w && h == psx_h && bpp == psx_bpp)
-               return pl_vout_buf;
+       vout_w = psx_w = w;
+       vout_h = psx_h = h;
+       vout_bpp = psx_bpp = bpp;
+
+       pl_vout_scale = 1;
+#ifdef __ARM_NEON__
+       if (soft_filter) {
+               if (resolution_ok(w * 2, h * 2) && bpp == 16) {
+                       vout_w *= 2;
+                       vout_h *= 2;
+                       pl_vout_scale = 2;
+               }
+               else {
+                       // filter unavailable
+                       hud_msg[0] = 0;
+               }
+       }
+#endif
 
-       pl_vout_w = psx_w = w;
-       pl_vout_h = psx_h = h;
-       pl_vout_bpp = psx_bpp = bpp;
+       if (pl_vout_buf != NULL && vout_w == pl_vout_w && vout_h == pl_vout_h
+           && vout_bpp == pl_vout_bpp)
+               return;
 
-       update_layer_size(pl_vout_w, pl_vout_h);
+       update_layer_size(vout_w, vout_h);
 
-       pl_vout_buf = plat_gvideo_set_mode(&pl_vout_w, &pl_vout_h, &pl_vout_bpp);
-       if (pl_vout_buf == NULL && pl_rearmed_cbs.pl_vout_raw_flip == NULL)
+       pl_vout_buf = plat_gvideo_set_mode(&vout_w, &vout_h, &vout_bpp);
+       if (pl_vout_buf == NULL)
                fprintf(stderr, "failed to set mode %dx%d@%d\n",
                        psx_w, psx_h, psx_bpp);
+       else {
+               pl_vout_w = vout_w;
+               pl_vout_h = vout_h;
+               pl_vout_bpp = vout_bpp;
+       }
 
        menu_notify_mode_change(pl_vout_w, pl_vout_h, pl_vout_bpp);
-
-       return pl_vout_buf;
 }
 
-// only used if raw flip is not defined
-static void *pl_vout_flip(void)
+static void pl_vout_flip(const void *vram, int stride, int bgr24, int w, int h)
 {
-       pl_rearmed_cbs.flip_cnt++;
+       static int doffs_old, clear_counter;
+       unsigned char *dest = pl_vout_buf;
+       const unsigned short *src = vram;
+       int dstride = pl_vout_w, h1 = h;
+       int doffs;
+
+       if (dest == NULL)
+               goto out;
+
+       if (vram == NULL) {
+               // blanking
+               memset(pl_vout_buf, 0, dstride * pl_vout_h * pl_vout_bpp / 8);
+               goto out;
+       }
+
+       // borders
+       doffs = (dstride - w * pl_vout_scale) / 2 & ~1;
+       dest += doffs * 2;
+
+       if (doffs > doffs_old)
+               clear_counter = 2;
+       doffs_old = doffs;
+
+       if (clear_counter > 0) {
+               memset(pl_vout_buf, 0, dstride * pl_vout_h * pl_vout_bpp / 8);
+               clear_counter--;
+       }
 
-       if (pl_vout_buf != NULL)
-               pl_print_hud(0);
+       if (bgr24)
+       {
+               if (pl_rearmed_cbs.only_16bpp) {
+                       for (; h1-- > 0; dest += dstride * 2, src += stride)
+                       {
+                               bgr888_to_rgb565(dest, src, w * 3);
+                       }
+               }
+               else {
+                       dest -= doffs * 2;
+                       dest += (doffs / 8) * 24;
 
+                       for (; h1-- > 0; dest += dstride * 3, src += stride)
+                       {
+                               bgr888_to_rgb888(dest, src, w * 3);
+                       }
+               }
+       }
+#ifdef __ARM_NEON__
+       else if (soft_filter == SOFT_FILTER_SCALE2X && pl_vout_scale == 2)
+       {
+               neon_scale2x_16_16(src, (void *)dest, w,
+                       stride * 2, dstride * 2, h1);
+       }
+       else if (soft_filter == SOFT_FILTER_EAGLE2X && pl_vout_scale == 2)
+       {
+               neon_eagle2x_16_16(src, (void *)dest, w,
+                       stride * 2, dstride * 2, h1);
+       }
+#endif
+       else
+       {
+               for (; h1-- > 0; dest += dstride * 2, src += stride)
+               {
+                       bgr555_to_rgb565(dest, src, w * 2);
+               }
+       }
+
+       pl_print_hud(w * pl_vout_scale, h * pl_vout_scale, 0);
+
+out:
        // let's flip now
        pl_vout_buf = plat_gvideo_flip();
-       return pl_vout_buf;
+       pl_rearmed_cbs.flip_cnt++;
 }
 
 static int pl_vout_open(void)
 {
        struct timeval now;
-       int h;
 
-       // force mode update
-       h = psx_h;
-       psx_h--;
-       pl_vout_buf = pl_vout_set_mode(psx_w, h, psx_bpp);
+       // force mode update on pl_vout_set_mode() call from gpulib/vout_pl
+       pl_vout_buf = NULL;
 
        plat_gvideo_open(is_pal);
 
@@ -249,6 +339,11 @@ static void pl_vout_close(void)
        plat_gvideo_close();
 }
 
+static void pl_set_gpu_caps(int caps)
+{
+       pl_rearmed_cbs.gpu_caps = caps;
+}
+
 void *pl_prepare_screenshot(int *w, int *h, int *bpp)
 {
        void *ret = plat_prepare_screenshot(w, h, bpp);
@@ -262,6 +357,75 @@ void *pl_prepare_screenshot(int *w, int *h, int *bpp)
        return pl_vout_buf;
 }
 
+/* display/rendering mode switcher */
+static int dispmode_default(void)
+{
+       pl_rearmed_cbs.gpu_neon.enhancement_enable = 0;
+       soft_filter = SOFT_FILTER_NONE;
+       snprintf(hud_msg, sizeof(hud_msg), "default mode");
+       return 1;
+}
+
+int dispmode_doubleres(void)
+{
+       if (!(pl_rearmed_cbs.gpu_caps & GPU_CAP_SUPPORTS_2X)
+           || !resolution_ok(psx_w * 2, psx_h * 2) || psx_bpp != 16)
+               return 0;
+
+       dispmode_default();
+       pl_rearmed_cbs.gpu_neon.enhancement_enable = 1;
+       snprintf(hud_msg, sizeof(hud_msg), "double resolution");
+       return 1;
+}
+
+int dispmode_scale2x(void)
+{
+       if (psx_bpp != 16)
+               return 0;
+
+       dispmode_default();
+       soft_filter = SOFT_FILTER_SCALE2X;
+       snprintf(hud_msg, sizeof(hud_msg), "scale2x");
+       return 1;
+}
+
+int dispmode_eagle2x(void)
+{
+       if (psx_bpp != 16)
+               return 0;
+
+       dispmode_default();
+       soft_filter = SOFT_FILTER_EAGLE2X;
+       snprintf(hud_msg, sizeof(hud_msg), "eagle2x");
+       return 1;
+}
+
+static int (*dispmode_switchers[])(void) = {
+       dispmode_default,
+#ifdef __ARM_NEON__
+       dispmode_doubleres,
+       dispmode_scale2x,
+       dispmode_eagle2x,
+#endif
+};
+
+static int dispmode_current;
+
+void pl_switch_dispmode(void)
+{
+       if (pl_rearmed_cbs.gpu_caps & GPU_CAP_OWNS_DISPLAY)
+               return;
+
+       while (1) {
+               dispmode_current++;
+               if (dispmode_current >=
+                   sizeof(dispmode_switchers) / sizeof(dispmode_switchers[0]))
+                       dispmode_current = 0;
+               if (dispmode_switchers[dispmode_current]())
+                       break;
+       }
+}
+
 #ifndef MAEMO
 static void update_analogs(void)
 {
@@ -442,16 +606,31 @@ void pl_timing_prepare(int is_pal_)
 
 static void pl_text_out16_(int x, int y, const char *text)
 {
-       int i, l, len = strlen(text), w = pl_vout_w;
-       unsigned short *screen = (unsigned short *)pl_vout_buf + x + y * w;
+       int i, l, w = pl_vout_w;
+       unsigned short *screen;
        unsigned short val = 0xffff;
 
-       for (i = 0; i < len; i++, screen += 8)
+       x &= ~1;
+       screen = (unsigned short *)pl_vout_buf + x + y * w;
+       for (i = 0; ; i++, screen += 8)
        {
+               char c = text[i];
+               if (c == 0)
+                       break;
+               if (c == ' ')
+                       continue;
+
                for (l = 0; l < 8; l++)
                {
-                       unsigned char fd = fontdata8x8[text[i] * 8 + l];
+                       unsigned char fd = fontdata8x8[c * 8 + l];
                        unsigned short *s = screen + l * w;
+                       unsigned int *s32 = (void *)s;
+
+                       s32[0] = (s32[0] >> 1) & 0x7bef7bef;
+                       s32[1] = (s32[1] >> 1) & 0x7bef7bef;
+                       s32[2] = (s32[2] >> 1) & 0x7bef7bef;
+                       s32[3] = (s32[3] >> 1) & 0x7bef7bef;
+
                        if (fd&0x80) s[0] = val;
                        if (fd&0x40) s[1] = val;
                        if (fd&0x20) s[2] = val;
@@ -484,12 +663,26 @@ static void pl_get_layer_pos(int *x, int *y, int *w, int *h)
        *h = g_layer_h;
 }
 
+static void *pl_mmap(unsigned int size) // size-only wrapper around plat_mmap(); returns its result unchanged
+{
+       return plat_mmap(0, size, 0, 0); // every argument other than size is passed as 0
+}
+
+static void pl_munmap(void *ptr, unsigned int size) // forwards ptr and size to plat_munmap() unchanged
+{
+       plat_munmap(ptr, size);
+}
+
 struct rearmed_cbs pl_rearmed_cbs = {
        pl_get_layer_pos,
        pl_vout_open,
        pl_vout_set_mode,
        pl_vout_flip,
        pl_vout_close,
+
+       .mmap = pl_mmap,
+       .munmap = pl_munmap,
+       .pl_set_gpu_caps = pl_set_gpu_caps,
 };
 
 /* watchdog */
index bcf74ac..332fbc2 100644 (file)
@@ -31,7 +31,8 @@ void  pl_text_out16(int x, int y, const char *texto, ...);
 void  pl_start_watchdog(void);
 void *pl_prepare_screenshot(int *w, int *h, int *bpp);
 void  pl_init(void);
-void  pl_print_hud(int xborder);
+void  pl_print_hud(int width, int height, int xborder);
+void  pl_switch_dispmode(void);
 
 void  pl_timing_prepare(int is_pal);
 void  pl_frame_limit(void);
@@ -41,12 +42,15 @@ void  pl_update_gun(int *xn, int *xres, int *y, int *in);
 struct rearmed_cbs {
        void  (*pl_get_layer_pos)(int *x, int *y, int *w, int *h);
        int   (*pl_vout_open)(void);
-       void *(*pl_vout_set_mode)(int w, int h, int bpp);
-       void *(*pl_vout_flip)(void);
+       void  (*pl_vout_set_mode)(int w, int h, int bpp);
+       void  (*pl_vout_flip)(const void *vram, int stride, int bgr24,
+                             int w, int h);
        void  (*pl_vout_close)(void);
-       // these are only used by some frontends
-       void  (*pl_vout_raw_flip)(int x, int y);
+       void *(*mmap)(unsigned int size);
+       void  (*munmap)(void *ptr, unsigned int size);
+       // only used by some frontends
        void  (*pl_vout_set_raw_vram)(void *vram);
+       void  (*pl_set_gpu_caps)(int caps);
        // some stats, for display by some plugins
        int flips_per_sec, cpu_usage;
        float vsps_cur; // current vsync/s
@@ -60,6 +64,8 @@ struct rearmed_cbs {
        unsigned int only_16bpp; // platform is 16bpp-only
        struct {
                int   allow_interlace; // 0 off, 1 on, 2 guess
+               int   enhancement_enable;
+               int   enhancement_no_main;
        } gpu_neon;
        struct {
                int   iUseDither;
@@ -78,10 +84,17 @@ struct rearmed_cbs {
                int   iUseMask, bOpaquePass, bAdvancedBlend, bUseFastMdec;
                int   iVRamSize, iTexGarbageCollection;
        } gpu_peopsgl;
+       // misc
+       int gpu_caps;
 };
 
 extern struct rearmed_cbs pl_rearmed_cbs;
 
+enum gpu_plugin_caps { // bit flags: tested against rearmed_cbs.gpu_caps, passed by GPU plugins to pl_set_gpu_caps()
+       GPU_CAP_OWNS_DISPLAY = (1 << 0), // while set, pl_switch_dispmode() returns without switching
+       GPU_CAP_SUPPORTS_2X = (1 << 1),
+};
+
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #endif
index 1cabd53..ddcd05b 100644 (file)
@@ -60,6 +60,16 @@ u8 **psxMemRLUT = NULL;
 0xbfc0_0000-0xbfc7_ffff                BIOS Mirror (512K) Uncached
 */
 
+#if 1 // active branch: plat_mmap()/plat_munmap() are only declared here, defined elsewhere
+void *plat_mmap(unsigned long addr, size_t size, int need_exec, int is_fixed);
+void  plat_munmap(void *ptr, size_t size);
+#else // disabled fallback: maps straight to mmap()/munmap(); need_exec and is_fixed are ignored, MAP_FIXED is always set
+#define plat_mmap(addr, size, need_exec, is_fixed) \
+       mmap((void *)addr, size, PROT_WRITE | PROT_READ, \
+       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0)
+#define plat_munmap munmap
+#endif
+
 int psxMemInit() {
        int i;
 
@@ -68,8 +78,7 @@ int psxMemInit() {
        memset(psxMemRLUT, 0, 0x10000 * sizeof(void *));
        memset(psxMemWLUT, 0, 0x10000 * sizeof(void *));
 
-       psxM = mmap((void *)0x80000000, 0x00210000,
-               PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+       psxM = plat_mmap(0x80000000, 0x00210000, 0, 1);
 #ifndef RAM_FIXED
        if (psxM == MAP_FAILED)
                psxM = mmap((void *)0x70000000, 0x00210000,
@@ -144,7 +153,7 @@ void psxMemReset() {
 }
 
 void psxMemShutdown() {
-       munmap(psxM, 0x00210000);
+       plat_munmap(psxM, 0x00210000);
        munmap(psxH, 0x1f800000);
        munmap(psxR, 0x80000);
 
index dffd52b..06a635d 100644 (file)
@@ -19,56 +19,26 @@ BOOL           bCheckMask = FALSE;
 unsigned short sSetMask;
 unsigned long  lSetMask;
 
-static void blit(void *vout_buf)
+static void blit(void) // computes the VRAM source window and passes it to rcbs->pl_vout_flip(); no pixel conversion is done here
 {
  int px = PSXDisplay.DisplayPosition.x & ~1; // XXX: align needed by bgr*_to_...
  int py = PSXDisplay.DisplayPosition.y;
  int w = PreviousPSXDisplay.Range.x1;
  int h = PreviousPSXDisplay.DisplayMode.y;
- int pitch = PreviousPSXDisplay.DisplayMode.x;
  unsigned short *srcs = psxVuw + py * 1024 + px;
- unsigned char *dest = vout_buf;
 
  if (w <= 0)
    return;
 
- pitch *= (PSXDisplay.RGB24 && !rcbs->only_16bpp) ? 3 : 2;
-
  // account for centering
  h -= PreviousPSXDisplay.Range.y0;
- dest += PreviousPSXDisplay.Range.y0 / 2 * pitch;
- dest += (PreviousPSXDisplay.Range.x0 & ~3) * 2; // must align here too..
-
- if (PSXDisplay.RGB24)
- {
-   if (!rcbs->only_16bpp)
-   {
-     for (; h-- > 0; dest += pitch, srcs += 1024)
-     {
-       bgr888_to_rgb888(dest, srcs, w * 3);
-     }
-   }
-   else
-   {
-     for (; h-- > 0; dest += pitch, srcs += 1024)
-     {
-       bgr888_to_rgb565(dest, srcs, w * 3);
-     }
-   }
- }
- else
- {
-   for (; h-- > 0; dest += pitch, srcs += 1024)
-   {
-     bgr555_to_rgb565(dest, srcs, w * 2);
-   }
- }
+
+ rcbs->pl_vout_flip(srcs, 1024, PSXDisplay.RGB24, w, h); // callback arguments in order: vram, stride, bgr24, w, h
 }
 
 void DoBufferSwap(void)
 {
  static int fbw, fbh, fb24bpp;
- static void *vout_buf;
 
  if (PreviousPSXDisplay.DisplayMode.x == 0 || PreviousPSXDisplay.DisplayMode.y == 0)
   return;
@@ -80,17 +50,12 @@ void DoBufferSwap(void)
   fbw = PreviousPSXDisplay.DisplayMode.x;
   fbh = PreviousPSXDisplay.DisplayMode.y;
   fb24bpp = PSXDisplay.RGB24;
-  vout_buf = rcbs->pl_vout_set_mode(fbw, fbh, fb24bpp ? 24 : 16);
+  rcbs->pl_vout_set_mode(fbw, fbh, fb24bpp ? 24 : 16);
  }
 
  pcnt_start(PCNT_BLIT);
- if (rcbs->pl_vout_raw_flip != NULL)
-  rcbs->pl_vout_raw_flip(PSXDisplay.DisplayPosition.x, PSXDisplay.DisplayPosition.y);
- else
-  blit(vout_buf);
+ blit();
  pcnt_end(PCNT_BLIT);
-
- vout_buf = rcbs->pl_vout_flip();
 }
 
 void DoClearScreenBuffer(void)
index 9fa08fe..3d20dfa 100644 (file)
@@ -1143,6 +1143,8 @@ void GPUrearmedCallbacks(const struct rearmed_cbs *cbs)
  dwFrameRateTicks = cbs->gpu_peops.dwFrameRateTicks;
  if (cbs->pl_vout_set_raw_vram)
   cbs->pl_vout_set_raw_vram(psxVub);
+ if (cbs->pl_set_gpu_caps)
+  cbs->pl_set_gpu_caps(0);
 
  skip_advice = &cbs->fskip_advice;
  fps_skip = 100.0f;
index 12aa0a3..d98520c 100644 (file)
@@ -265,9 +265,9 @@ long           lLowerpart;
 
 /////////////////////////////////////////////////////////////////////////////
 
-int renderer_init(void)
+static void set_vram(void *vram)
 {
- psxVub=(void *)gpu.vram;
+ psxVub=vram;
 
  psxVsb=(signed char *)psxVub;                         // different ways of accessing PSX VRAM
  psxVsw=(signed short *)psxVub;
@@ -276,6 +276,11 @@ int renderer_init(void)
  psxVul=(uint32_t *)psxVub;
 
  psxVuw_eom=psxVuw+1024*512;                           // pre-calc of end of vram
+}
+
+int renderer_init(void)
+{
+ set_vram(gpu.vram);
 
  PSXDisplay.RGB24        = FALSE;                      // init some stuff
  PSXDisplay.Interlaced   = FALSE;
@@ -294,6 +299,14 @@ int renderer_init(void)
  return 0;
 }
 
+void renderer_finish(void) // empty stub: this renderer does no work here
+{
+}
+
+void renderer_notify_res_change(void) // empty stub: this renderer does no work here
+{
+}
+
 extern const unsigned char cmd_lengths[256];
 
 int do_cmd_list(unsigned int *list, int list_len, int *last_cmd)
@@ -408,4 +421,7 @@ void renderer_set_config(const struct rearmed_cbs *cbs)
 {
  iUseDither = cbs->gpu_peops.iUseDither;
  dwActFixes = cbs->gpu_peops.dwActFixes;
+ if (cbs->pl_set_gpu_caps)
+  cbs->pl_set_gpu_caps(0);
+ set_vram(gpu.vram);
 }
index c25ad8b..068dc41 100644 (file)
@@ -479,10 +479,15 @@ switch((gdata>>24)&0xff)
 
 static int is_opened;
 
-int renderer_init(void)
+static void set_vram(void *vram)
 {
- psxVub=(void *)gpu.vram;
+ psxVub=vram;
  psxVuw=(unsigned short *)psxVub;
+}
+
+int renderer_init(void)
+{
+ set_vram(gpu.vram);
 
  PSXDisplay.RGB24        = FALSE;                      // init some stuff
  PSXDisplay.Interlaced   = FALSE;
@@ -500,6 +505,14 @@ int renderer_init(void)
  return 0;
 }
 
+void renderer_finish(void) // empty stub: this renderer does no work here
+{
+}
+
+void renderer_notify_res_change(void) // empty stub: this renderer does no work here
+{
+}
+
 extern const unsigned char cmd_lengths[256];
 
 // XXX: mostly dupe code from soft peops
@@ -702,6 +715,10 @@ void renderer_set_config(const struct rearmed_cbs *cbs_)
  bUseFastMdec = cbs->gpu_peopsgl.bUseFastMdec;
  iTexGarbageCollection = cbs->gpu_peopsgl.iTexGarbageCollection;
  iVRamSize = cbs->gpu_peopsgl.iVRamSize;
+ if (cbs->pl_set_gpu_caps)
+  cbs->pl_set_gpu_caps(GPU_CAP_OWNS_DISPLAY);
+
+ set_vram(gpu.vram);
 }
 
 void SetAspectRatio(void)
index 8a7342b..08bf0ee 100644 (file)
@@ -1,4 +1,4 @@
-CFLAGS += -ggdb -Wall -O2
+CFLAGS += -ggdb -Wall -O2 -DNDEBUG
 
 include ../../config.mak
 
index f299f79..d5cf3e9 100644 (file)
@@ -18,5 +18,7 @@ typedef unsigned long long int u64;
 #include "vector_ops.h"
 #include "psx_gpu.h"
 
+#define unlikely(x) __builtin_expect((x), 0) // branch hint: tells the compiler x is expected to evaluate to 0
+
 #endif
 
index 68996c1..2cba878 100644 (file)
@@ -47,7 +47,8 @@ u32 zero_block_spans = 0;
 u32 texture_cache_loads = 0;
 u32 false_modulated_blocks = 0;
 
-u32 reciprocal_table[512];
+/* double size for enhancement */
+u32 reciprocal_table[512 * 2];
 
 
 typedef s32 fixed_type;
@@ -453,7 +454,7 @@ void setup_blocks_shaded_untextured_undithered_unswizzled_indirect(
 
 void flush_render_block_buffer(psx_gpu_struct *psx_gpu)
 {
-  if((psx_gpu->interlace_mode & RENDER_INTERLACE_ENABLED) &&
+  if((psx_gpu->render_mode & RENDER_INTERLACE_ENABLED) &&
    (psx_gpu->primitive_type == PRIMITIVE_TYPE_SPRITE))
   {
     u32 num_blocks_dest = 0;
@@ -463,7 +464,7 @@ void flush_render_block_buffer(psx_gpu_struct *psx_gpu)
     u16 *vram_ptr = psx_gpu->vram_ptr;
     u32 i;
 
-    if(psx_gpu->interlace_mode & RENDER_INTERLACE_ODD)
+    if(psx_gpu->render_mode & RENDER_INTERLACE_ODD)
     {
       for(i = 0; i < psx_gpu->num_blocks; i++)
       {
@@ -566,7 +567,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
 
   vec_4x32u uvrg_base;
   vec_4x32u b_base;
-  vec_4x32u const_0x8000;
+  vec_4x32u uvrgb_phase;
 
   vec_4x16s d0_a_d3_c, d0_b, d0_c;
   vec_4x16s d1_a, d1_b, d1_c_d2_a;
@@ -595,12 +596,12 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   setup_gradient_calculation_input(1, b);
   setup_gradient_calculation_input(2, c);
 
-  dup_4x32b(const_0x8000, 0x8000);
+  dup_4x32b(uvrgb_phase, psx_gpu->uvrgb_phase);
   shl_long_4x16b(uvrg_base, x0_a_y0_c, 16);
   shl_long_4x16b(b_base, x0_b, 16);
 
-  add_4x32b(uvrg_base, uvrg_base, const_0x8000);
-  add_4x32b(b_base, b_base, const_0x8000);
+  add_4x32b(uvrg_base, uvrg_base, uvrgb_phase);
+  add_4x32b(b_base, b_base, uvrgb_phase);
 
   // Can probably pair these, but it'll require careful register allocation
   sub_4x16b(d0_a_d3_c, x1_a_y1_c, x0_a_y0_c);
@@ -766,6 +767,26 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
     printf("mismatch on %s %s: %x vs %x\n", #_a, #_b, _a, _b)                  \
 
 
+#ifndef NDEBUG // debug builds only: range-check one span_edge_data entry; a failed check stores through a null pointer (presumably to fault on the spot)
+#define setup_spans_debug_check(span_edge_data_element)                        \
+{                                                                              \
+  u32 _num_spans = &span_edge_data_element - psx_gpu->span_edge_data;          \
+  if (_num_spans > MAX_SPANS)                                                  \
+    *(int *)0 = 1;                                                             \
+  if (_num_spans < psx_gpu->num_spans)                                         \
+  {                                                                            \
+    if(span_edge_data_element.num_blocks > MAX_BLOCKS_PER_ROW)                 \
+      *(int *)0 = 1;                                                           \
+    if(span_edge_data_element.y > 2048)                                        \
+      *(int *)0 = 1;                                                           \
+  }                                                                            \
+}                                                                              \
+
+#else // NDEBUG defined: the check expands to nothing
+#define setup_spans_debug_check(span_edge_data_element)                        \
+
+#endif
+
 #define setup_spans_prologue_alternate_yes()                                   \
   vec_2x64s alternate_x;                                                       \
   vec_2x64s alternate_dx_dy;                                                   \
@@ -854,7 +875,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
                                                                                \
   dup_2x32b(edge_shifts, edge_shift);                                          \
   sub_2x32b(heights_b, heights, c_0x01);                                       \
-  shr_2x32b(height_reciprocals, edge_shifts, 12);                              \
+  shr_2x32b(height_reciprocals, edge_shifts, 10);                              \
                                                                                \
   mla_2x32b(heights_b, x_starts, heights);                                     \
   bic_immediate_4x16b(vector_cast(vec_4x16u, edge_shifts), 0xE0);              \
@@ -883,8 +904,8 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
   sub_2x32b(widths, x_ends, x_starts);                                         \
   width_alt = x_c - start_c;                                                   \
                                                                                \
-  shr_2x32b(height_reciprocals, edge_shifts, 12);                              \
-  height_reciprocal_alt = edge_shift_alt >> 12;                                \
+  shr_2x32b(height_reciprocals, edge_shifts, 10);                              \
+  height_reciprocal_alt = edge_shift_alt >> 10;                                \
                                                                                \
   bic_immediate_4x16b(vector_cast(vec_4x16u, edge_shifts), 0xE0);              \
   edge_shift_alt &= 0x1F;                                                      \
@@ -1069,6 +1090,7 @@ void compute_all_gradients(psx_gpu_struct *psx_gpu, vertex_struct *a,
     span_edge_data[i].num_blocks = left_right_x_16.high.e[i];                  \
     span_edge_data[i].right_mask = span_shift.e[i];                            \
     span_edge_data[i].y = y_x4.e[i];                                           \
+    setup_spans_debug_check(span_edge_data[i]);                                \
   }                                                                            \
                                                                                \
   span_edge_data += 4;                                                         \
@@ -1406,12 +1428,16 @@ void setup_spans_up_down(psx_gpu_struct *psx_gpu, vertex_struct *v_a,
     y_x4.e[3] = y_a + 3;
     setup_spans_adjust_edges_alternate_no(index_left, index_right);
 
+    // FIXME: overflow corner case
+    if(psx_gpu->num_spans + height_minor_b == MAX_SPANS)
+      height_minor_b &= ~3;
+
     psx_gpu->num_spans += height_minor_b;
-    do
+    while(height_minor_b > 0)
     {
       setup_spans_set_x4(none, down, no);
       height_minor_b -= 4;
-    } while(height_minor_b > 0);
+    }
   }
 
   left_split_triangles++;
@@ -1872,7 +1898,7 @@ void setup_blocks_##shading##_##texturing##_##dithering##_##sw##_##target(     \
     if(span_num_blocks)                                                        \
     {                                                                          \
       y = span_edge_data->y;                                                   \
-      fb_ptr = psx_gpu->vram_ptr + span_edge_data->left_x + (y * 1024);        \
+      fb_ptr = psx_gpu->vram_out_ptr + span_edge_data->left_x + (y * 1024);    \
                                                                                \
       setup_blocks_span_initialize_##shading##_##texturing();                  \
       setup_blocks_span_initialize_##dithering(texturing);                     \
@@ -2905,8 +2931,8 @@ char *render_block_flag_strings[] =
    (triangle_y_direction_##direction_c << 4) |                                 \
    (triangle_winding_##winding << 6))                                          \
 
-void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
- u32 flags)
+static int prepare_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
+ vertex_struct *vertexes_out[3])
 {
   s32 y_top, y_bottom;
   s32 triangle_area;
@@ -2927,7 +2953,7 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 #ifdef PROFILE
     trivial_rejects++;
 #endif
-    return;
+    return 0;
   }
 
   if(b->y < a->y)
@@ -2949,7 +2975,7 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 #ifdef PROFILE
     trivial_rejects++;
 #endif
-    return;
+    return 0;
   }
 
   if(triangle_area < 0)
@@ -2975,7 +3001,7 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 #ifdef PROFILE
     trivial_rejects++;
 #endif
-    return;
+    return 0;
   }
 
   if(invalidate_texture_cache_region_viewport(psx_gpu, a->x, y_top, c->x,
@@ -2984,13 +3010,28 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 #ifdef PROFILE
     trivial_rejects++;
 #endif
-    return;
+    return 0;
   }
 
-  psx_gpu->num_spans = 0;
   psx_gpu->triangle_area = triangle_area;
   psx_gpu->triangle_winding = triangle_winding;
 
+  vertexes_out[0] = a;
+  vertexes_out[1] = b;
+  vertexes_out[2] = c;
+
+  return 1;
+}
+
+static void render_triangle_p(psx_gpu_struct *psx_gpu,
+ vertex_struct *vertex_ptrs[3], u32 flags)
+{
+  psx_gpu->num_spans = 0;
+
+  vertex_struct *a = vertex_ptrs[0];
+  vertex_struct *b = vertex_ptrs[1];
+  vertex_struct *c = vertex_ptrs[2];
+
   s32 y_delta_a = b->y - a->y;
   s32 y_delta_b = c->y - b->y;
   s32 y_delta_c = c->y - a->y;
@@ -3002,7 +3043,7 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
   compute_all_gradients(psx_gpu, a, b, c);
 
   switch(y_direction_a | (y_direction_b << 2) | (y_direction_c << 4) |
-   (triangle_winding << 6))
+   (psx_gpu->triangle_winding << 6))
   {
     triangle_case(up, up, up, negative):
     triangle_case(up, up, flat, negative):
@@ -3081,11 +3122,11 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
   spans += psx_gpu->num_spans;
 #endif
 
-  if(psx_gpu->interlace_mode & RENDER_INTERLACE_ENABLED)
+  if(unlikely(psx_gpu->render_mode & RENDER_INTERLACE_ENABLED))
   {
     u32 i;
 
-    if(psx_gpu->interlace_mode & RENDER_INTERLACE_ODD)
+    if(psx_gpu->render_mode & RENDER_INTERLACE_ODD)
     {
       for(i = 0; i < psx_gpu->num_spans; i++)
       {
@@ -3126,6 +3167,14 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
    (psx_gpu);
 }
 
+void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, // entry point: prepare_triangle() first, then render_triangle_p()
+ u32 flags)
+{
+  vertex_struct *vertex_ptrs[3]; // filled by prepare_triangle() when it returns nonzero
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) // zero return: nothing is rendered
+    render_triangle_p(psx_gpu, vertex_ptrs, flags);
+}
+
 
 void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu);
 
@@ -3161,14 +3210,17 @@ void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu)
 #endif
 
 
-#define setup_sprite_tiled_initialize_4bpp()                                   \
+#define setup_sprite_tiled_initialize_4bpp_clut()                              \
   u16 *clut_ptr = psx_gpu->clut_ptr;                                           \
   vec_8x16u clut_a, clut_b;                                                    \
   vec_16x8u clut_low, clut_high;                                               \
                                                                                \
   load_8x16b(clut_a, clut_ptr);                                                \
   load_8x16b(clut_b, clut_ptr + 8);                                            \
-  unzip_16x8b(clut_low, clut_high, clut_a, clut_b);                            \
+  unzip_16x8b(clut_low, clut_high, clut_a, clut_b)                             \
+
+#define setup_sprite_tiled_initialize_4bpp()                                   \
+  setup_sprite_tiled_initialize_4bpp_clut();                                   \
                                                                                \
   if(psx_gpu->current_texture_mask & psx_gpu->dirty_textures_4bpp_mask)        \
     update_texture_4bpp_cache(psx_gpu)                                         \
@@ -3185,10 +3237,6 @@ void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu)
   load_64b(texels, texture_block_ptr)                                          \
 
 
-#define setup_sprite_tile_setup_block_yes(side, offset, texture_mode)          \
-
-#define setup_sprite_tile_setup_block_no(side, offset, texture_mode)           \
-
 #define setup_sprite_tile_add_blocks(tile_num_blocks)                          \
   num_blocks += tile_num_blocks;                                               \
   sprite_blocks += tile_num_blocks;                                            \
@@ -3334,34 +3382,36 @@ void texture_sprite_blocks_8bpp(psx_gpu_struct *psx_gpu)
 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
 
 
-#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode)  \
+#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
+ x4mode)                                                                       \
 do                                                                             \
 {                                                                              \
   sub_tile_height = column_data;                                               \
-  setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
-  setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge);                 \
+  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
+  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge);         \
 } while(0)                                                                     \
 
-#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode)   \
+#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
+ x4mode)                                                                       \
 do                                                                             \
 {                                                                              \
   u32 tiles_remaining = column_data >> 16;                                     \
   sub_tile_height = column_data & 0xFF;                                        \
-  setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
+  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
   tiles_remaining -= 1;                                                        \
                                                                                \
   while(tiles_remaining)                                                       \
   {                                                                            \
     sub_tile_height = 16;                                                      \
-    setup_sprite_tile_##edge_mode##_##texture_mode(edge);                      \
+    setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);              \
     tiles_remaining--;                                                         \
   }                                                                            \
                                                                                \
   sub_tile_height = (column_data >> 8) & 0xFF;                                 \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
-  setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge);                 \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
+  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge);         \
 } while(0)                                                                     \
 
 
@@ -3374,15 +3424,18 @@ do                                                                             \
   column_data |= (tile_height - 1) << 16                                       \
 
 
+#define RIGHT_MASK_BIT_SHIFT 8 // shift deriving right_mask_bits from left_mask_bits; picked by RIGHT_MASK_BIT_SHIFT##x4mode when x4mode is empty
+#define RIGHT_MASK_BIT_SHIFT_4x 16 // picked by the same token paste when x4mode is _4x
+
 #define setup_sprite_tile_column_width_single(texture_mode, multi_height,      \
- edge_mode, edge)                                                              \
+ edge_mode, edge, x4mode)                                                      \
 {                                                                              \
   setup_sprite_column_data_##multi_height();                                   \
   left_mask_bits = left_block_mask | right_block_mask;                         \
-  right_mask_bits = left_mask_bits >> 8;                                       \
+  right_mask_bits = left_mask_bits >> RIGHT_MASK_BIT_SHIFT##x4mode;            \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(edge_mode, edge,              \
-   texture_mode);                                                              \
+   texture_mode, x4mode);                                                      \
 }                                                                              \
 
 #define setup_sprite_tiled_advance_column()                                    \
@@ -3390,18 +3443,22 @@ do                                                                             \
   if((texture_offset_base & 0xF00) == 0)                                       \
     texture_offset_base -= (0x100 + 0xF00)                                     \
 
+#define FB_PTR_MULTIPLIER 1 // factor applied to fb_ptr_advance_column; picked by FB_PTR_MULTIPLIER##x4mode when x4mode is empty
+#define FB_PTR_MULTIPLIER_4x 2 // picked by the same token paste when x4mode is _4x
+
 #define setup_sprite_tile_column_width_multi(texture_mode, multi_height,       \
- left_mode, right_mode)                                                        \
+ left_mode, right_mode, x4mode)                                                \
 {                                                                              \
   setup_sprite_column_data_##multi_height();                                   \
-  s32 fb_ptr_advance_column = 16 - (1024 * height);                            \
+  s32 fb_ptr_advance_column = (16 - (1024 * height))                           \
+    * FB_PTR_MULTIPLIER##x4mode;                                               \
                                                                                \
   tile_width -= 2;                                                             \
   left_mask_bits = left_block_mask;                                            \
-  right_mask_bits = left_mask_bits >> 8;                                       \
+  right_mask_bits = left_mask_bits >> RIGHT_MASK_BIT_SHIFT##x4mode;            \
                                                                                \
   setup_sprite_tile_column_height_##multi_height(left_mode, right,             \
-   texture_mode);                                                              \
+   texture_mode, x4mode);                                                      \
   fb_ptr += fb_ptr_advance_column;                                             \
                                                                                \
   left_mask_bits = 0x00;                                                       \
@@ -3410,22 +3467,297 @@ do                                                                             \
   while(tile_width)                                                            \
   {                                                                            \
     setup_sprite_tiled_advance_column();                                       \
-    setup_sprite_tile_column_height_##multi_height(full, none, texture_mode);  \
+    setup_sprite_tile_column_height_##multi_height(full, none,                 \
+     texture_mode, x4mode);                                                    \
     fb_ptr += fb_ptr_advance_column;                                           \
     tile_width--;                                                              \
   }                                                                            \
                                                                                \
   left_mask_bits = right_block_mask;                                           \
-  right_mask_bits = left_mask_bits >> 8;                                       \
+  right_mask_bits = left_mask_bits >> RIGHT_MASK_BIT_SHIFT##x4mode;            \
                                                                                \
   setup_sprite_tiled_advance_column();                                         \
   setup_sprite_tile_column_height_##multi_height(right_mode, left,             \
-   texture_mode);                                                              \
+   texture_mode, x4mode);                                                      \
+}                                                                              \
+
+
+/* 4x mode variants of the tiled sprite setup macros */
+#define setup_sprite_tiled_initialize_4bpp_4x()   /* 4x init == CLUT init */   \
+  setup_sprite_tiled_initialize_4bpp_clut()                                    \
+
+#define setup_sprite_tiled_initialize_8bpp_4x()   /* expands to nothing */     \
+
+
+#define setup_sprite_tile_full_4bpp_4x(edge)   /* edge is not referenced */    \
+{                                                                              \
+  vec_8x8u texels_low, texels_high;                                            \
+  vec_8x16u pixels, pixels_wide;                                               \
+  setup_sprite_tile_add_blocks(sub_tile_height * 2 * 4);  /* 8 per row */      \
+  u32 left_mask_bits_a = left_mask_bits & 0xFF;      /* mask bits 0-7 */       \
+  u32 left_mask_bits_b = left_mask_bits >> 8;        /* mask bits 8 and up */  \
+  u32 right_mask_bits_a = right_mask_bits & 0xFF;    /* mask bits 0-7 */       \
+  u32 right_mask_bits_b = right_mask_bits >> 8;      /* mask bits 8 and up */  \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);  /* feeds left_mask blocks */ \
+    tbl_16(texels_low, texels, clut_low);                                      \
+    tbl_16(texels_high, texels, clut_high);                                    \
+    zip_8x16b(pixels, texels_low, texels_high);                                \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.low, pixels.low);    \
+    block->texels = pixels_wide; /* presumably each pixel doubled - confirm */ \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;      /* same texels again, 1024 u16s on */    \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr + 1024;    /* builder addresses rows as y * 1024 */ \
+    block++;                                                                   \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.high, pixels.high);  \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 1024 + 8;                                         \
+    block++;                                                                   \
+                                                                               \
+    setup_sprite_tile_fetch_texel_block_8bpp(8);  /* feeds right_mask blocks */\
+    tbl_16(texels_low, texels, clut_low);                                      \
+    tbl_16(texels_high, texels, clut_high);                                    \
+    zip_8x16b(pixels, texels_low, texels_high);                                \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.low, pixels.low);    \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 16;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 1024 + 16;                                        \
+    block++;                                                                   \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.high, pixels.high);  \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 24;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 1024 + 24;                                        \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                   /* two 1024-u16 rows per texel row */    \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
 }                                                                              \
 
+#define setup_sprite_tile_half_4bpp_4x(edge)   /* edge: left or right mask */  \
+{                                                                              \
+  vec_8x8u texels_low, texels_high;                                            \
+  vec_8x16u pixels, pixels_wide;                                               \
+  setup_sprite_tile_add_blocks(sub_tile_height * 4);      /* 4 per row */      \
+  u32 edge##_mask_bits_a = edge##_mask_bits & 0xFF;  /* mask bits 0-7 */       \
+  u32 edge##_mask_bits_b = edge##_mask_bits >> 8;    /* mask bits 8 and up */  \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);                               \
+    tbl_16(texels_low, texels, clut_low);                                      \
+    tbl_16(texels_high, texels, clut_high);                                    \
+    zip_8x16b(pixels, texels_low, texels_high);                                \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.low, pixels.low);    \
+    block->texels = pixels_wide; /* presumably each pixel doubled - confirm */ \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;      /* same texels again, 1024 u16s on */    \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr + 1024;    /* builder addresses rows as y * 1024 */ \
+    block++;                                                                   \
+                                                                               \
+    zip_4x32b(vector_cast(vec_4x32u, pixels_wide), pixels.high, pixels.high);  \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->texels = pixels_wide;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 1024 + 8;                                         \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                   /* two 1024-u16 rows per texel row */    \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
 
-#define setup_sprite_tiled_builder(texture_mode)                               \
-void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
+  
+#define setup_sprite_tile_full_8bpp_4x(edge)   /* edge is not referenced */    \
+{                                                                              \
+  setup_sprite_tile_add_blocks(sub_tile_height * 2 * 4);  /* 8 per row */      \
+  vec_16x8u texels_wide;                                                       \
+  u32 left_mask_bits_a = left_mask_bits & 0xFF;      /* mask bits 0-7 */       \
+  u32 left_mask_bits_b = left_mask_bits >> 8;        /* mask bits 8 and up */  \
+  u32 right_mask_bits_a = right_mask_bits & 0xFF;    /* mask bits 0-7 */       \
+  u32 right_mask_bits_b = right_mask_bits >> 8;      /* mask bits 8 and up */  \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);  /* feeds left_mask blocks */ \
+    zip_8x16b(vector_cast(vec_8x16u, texels_wide), texels, texels);            \
+    block->r = texels_wide.low;  /* presumably each texel doubled - confirm */ \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.low;       /* same texels again, 1024 u16s on */    \
+    block->draw_mask_bits = left_mask_bits_a;                                  \
+    block->fb_ptr = fb_ptr + 1024;    /* builder addresses rows as y * 1024 */ \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = left_mask_bits_b;                                  \
+    block->fb_ptr = fb_ptr + 1024 + 8;                                         \
+    block++;                                                                   \
+                                                                               \
+    setup_sprite_tile_fetch_texel_block_8bpp(8);  /* feeds right_mask blocks */\
+    zip_8x16b(vector_cast(vec_8x16u, texels_wide), texels, texels);            \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 16;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.low;                                                \
+    block->draw_mask_bits = right_mask_bits_a;                                 \
+    block->fb_ptr = fb_ptr + 1024 + 16;                                        \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 24;                                               \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = right_mask_bits_b;                                 \
+    block->fb_ptr = fb_ptr + 24 + 1024;                                        \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                   /* two 1024-u16 rows per texel row */    \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
+
+#define setup_sprite_tile_half_8bpp_4x(edge)   /* edge: left or right mask */  \
+{                                                                              \
+  setup_sprite_tile_add_blocks(sub_tile_height * 4);      /* 4 per row */      \
+  vec_16x8u texels_wide;                                                       \
+  u32 edge##_mask_bits_a = edge##_mask_bits & 0xFF;  /* mask bits 0-7 */       \
+  u32 edge##_mask_bits_b = edge##_mask_bits >> 8;    /* mask bits 8 and up */  \
+                                                                               \
+  while(sub_tile_height)                                                       \
+  {                                                                            \
+    setup_sprite_tile_fetch_texel_block_8bpp(0);                               \
+    zip_8x16b(vector_cast(vec_8x16u, texels_wide), texels, texels);            \
+    block->r = texels_wide.low;  /* presumably each texel doubled - confirm */ \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr;                                                    \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.low;       /* same texels again, 1024 u16s on */    \
+    block->draw_mask_bits = edge##_mask_bits_a;                                \
+    block->fb_ptr = fb_ptr + 1024;    /* builder addresses rows as y * 1024 */ \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 8;                                                \
+    block++;                                                                   \
+                                                                               \
+    block->r = texels_wide.high;                                               \
+    block->draw_mask_bits = edge##_mask_bits_b;                                \
+    block->fb_ptr = fb_ptr + 8 + 1024;                                         \
+    block++;                                                                   \
+                                                                               \
+    fb_ptr += 2048;                   /* two 1024-u16 rows per texel row */    \
+    texture_offset += 0x10;                                                    \
+    sub_tile_height--;                                                         \
+  }                                                                            \
+  texture_offset += 0xF00;                                                     \
+  psx_gpu->num_blocks = num_blocks;                                            \
+}                                                                              \
+
+  
+#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
+  texture_offset = texture_offset_base + 8;   /* start 8 texels in */          \
+  fb_ptr += 16      /* the 4x half-tile macros emit 16 u16s per 8 texels */    \
+
+#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
+  texture_offset = texture_offset_base        /* fb_ptr left unchanged */      \
+
+#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
+  setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x() /* by edge */    \
+
+#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
+  texture_offset = texture_offset_base        /* edge is not referenced */     \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
+  fb_ptr -= 16      /* undoes the += 16 of the half_right pre-adjust */        \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_left_4x() /* empty */   \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
+  setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x() /* by edge */   \
+
+#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)  /* empty */   \
+
+
+#define setup_sprite_offset_u_adjust()     /* non-4x: offsets used as-is */    \
+
+#define setup_sprite_comapre_left_block_mask()  /* sic: name pasted by builder */ \
+  ((left_block_mask & 0xFF) == 0xFF)       /* builder ORs this in as bit 2 */  \
+
+#define setup_sprite_comapre_right_block_mask() /* sic: name pasted by builder */ \
+  (((right_block_mask >> 8) & 0xFF) == 0xFF) /* builder ORs this in as bit 3 */\
+
+
+#define setup_sprite_offset_u_adjust_4x()  /* two statements, no trailing ; */ \
+  offset_u *= 2;                                                               \
+  offset_u_right = offset_u_right * 2 + 1                                      \
+
+#define setup_sprite_comapre_left_block_mask_4x()  /* sic; 16-bit test */      \
+  ((left_block_mask & 0xFFFF) == 0xFFFF)   /* builder ORs this in as bit 2 */  \
+
+#define setup_sprite_comapre_right_block_mask_4x() /* sic; 16-bit test */      \
+  (((right_block_mask >> 16) & 0xFFFF) == 0xFFFF)  /* ORed in as bit 3 */      \
+
+
+#define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
+void setup_sprite_##texture_mode##x4mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,\
  s32 u, s32 v, s32 width, s32 height, u32 color)                               \
 {                                                                              \
   s32 offset_u = u & 0xF;                                                      \
@@ -3437,8 +3769,10 @@ void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
   s32 tile_width = width_rounded / 16;                                         \
   u32 offset_u_right = width_rounded & 0xF;                                    \
                                                                                \
-  u32 left_block_mask = ~(0xFFFF << offset_u);                                 \
-  u32 right_block_mask = 0xFFFE << offset_u_right;                             \
+  setup_sprite_offset_u_adjust##x4mode();                                      \
+                                                                               \
+  u32 left_block_mask = ~(0xFFFFFFFF << offset_u);                             \
+  u32 right_block_mask = 0xFFFFFFFE << offset_u_right;                         \
                                                                                \
   u32 left_mask_bits;                                                          \
   u32 right_mask_bits;                                                         \
@@ -3455,19 +3789,19 @@ void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
   u32 texture_offset_base = texture_offset;                                    \
   u32 control_mask;                                                            \
                                                                                \
-  u16 *fb_ptr = psx_gpu->vram_ptr + (y * 1024) + (x - offset_u);               \
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + (x - offset_u);           \
   u32 num_blocks = psx_gpu->num_blocks;                                        \
   block_struct *block = psx_gpu->blocks + num_blocks;                          \
                                                                                \
   u16 *texture_block_ptr;                                                      \
   vec_8x8u texels;                                                             \
                                                                                \
-  setup_sprite_tiled_initialize_##texture_mode();                              \
+  setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
                                                                                \
   control_mask = tile_width == 1;                                              \
   control_mask |= (tile_height == 1) << 1;                                     \
-  control_mask |= ((left_block_mask & 0xFF) == 0xFF) << 2;                     \
-  control_mask |= (((right_block_mask >> 8) & 0xFF) == 0xFF) << 3;             \
+  control_mask |= setup_sprite_comapre_left_block_mask##x4mode() << 2;         \
+  control_mask |= setup_sprite_comapre_right_block_mask##x4mode() << 3;        \
                                                                                \
   sprites_##texture_mode++;                                                    \
                                                                                \
@@ -3475,64 +3809,77 @@ void setup_sprite_##texture_mode(psx_gpu_struct *psx_gpu, s32 x, s32 y,        \
   {                                                                            \
     default:                                                                   \
     case 0x0:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, full, full);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, full, full,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x1:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, multi, full, none);  \
+      setup_sprite_tile_column_width_single(texture_mode, multi, full, none,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x2:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, full, full);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, full, full,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x3:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, single, full, none); \
+      setup_sprite_tile_column_width_single(texture_mode, single, full, none,  \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x4:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, half, full);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, half, full,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x5:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, multi, half, right); \
+      setup_sprite_tile_column_width_single(texture_mode, multi, half, right,  \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x6:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, half, full);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, half, full,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x7:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, single, half, right);\
+      setup_sprite_tile_column_width_single(texture_mode, single, half, right, \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x8:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, full, half);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, full, half,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0x9:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, multi, half, left);  \
+      setup_sprite_tile_column_width_single(texture_mode, multi, half, left,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xA:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, full, half);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, full, half,   \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xB:                                                                  \
-      setup_sprite_tile_column_width_single(texture_mode, single, half, left); \
+      setup_sprite_tile_column_width_single(texture_mode, single, half, left,  \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xC:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, multi, half, half);   \
+      setup_sprite_tile_column_width_multi(texture_mode, multi, half, half,    \
+       x4mode);                                                                \
       break;                                                                   \
                                                                                \
     case 0xE:                                                                  \
-      setup_sprite_tile_column_width_multi(texture_mode, single, half, half);  \
+      setup_sprite_tile_column_width_multi(texture_mode, single, half, half,   \
+       x4mode);                                                                \
       break;                                                                   \
   }                                                                            \
 }                                                                              \
 
-
 void setup_sprite_4bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
  s32 width, s32 height, u32 color);
 void setup_sprite_8bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
@@ -3540,9 +3887,24 @@ void setup_sprite_8bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
 void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
  s32 width, s32 height, u32 color);
 
+void setup_sprite_4bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
+ s32 width, s32 height, u32 color);
+void setup_sprite_8bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
+ s32 width, s32 height, u32 color);
+void setup_sprite_16bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u, s32 v,
+ s32 width, s32 height, u32 color);
+
+void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
+ s32 v, s32 width, s32 height, u32 color);
+void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
+ s32 u, s32 v, s32 width, s32 height, u32 color);
+
 #ifndef NEON_BUILD
-setup_sprite_tiled_builder(4bpp);
-setup_sprite_tiled_builder(8bpp);
+setup_sprite_tiled_builder(4bpp,);
+setup_sprite_tiled_builder(8bpp,);
+
+setup_sprite_tiled_builder(4bpp,_4x);
+setup_sprite_tiled_builder(8bpp,_4x);
 
 void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
  s32 v, s32 width, s32 height, u32 color)
@@ -3550,7 +3912,7 @@ void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
   u32 left_offset = u & 0x7;
   u32 width_rounded = width + left_offset + 7;
 
-  u16 *fb_ptr = psx_gpu->vram_ptr + (y * 1024) + (s32)(x - left_offset);
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + (s32)(x - left_offset);
   u32 right_width = width_rounded & 0x7;
   u32 block_width = width_rounded / 8;
   u32 fb_ptr_pitch = (1024 + 8) - (block_width * 8);
@@ -3665,14 +4027,19 @@ void setup_sprite_16bpp(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
   }
 }
 
-#endif
-
 void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
  s32 v, s32 width, s32 height, u32 color)
 {
+  if((psx_gpu->render_state & (RENDER_STATE_MASK_EVALUATE |
+   RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND)) == 0)
+  {
+    setup_sprite_untextured_simple(psx_gpu, x, y, u, v, width, height, color);
+    return;
+  }
+
   u32 right_width = ((width - 1) & 0x7) + 1;
   u32 right_mask_bits = (0xFF << right_width);
-  u16 *fb_ptr = psx_gpu->vram_ptr + (y * 1024) + x;
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + x;
   u32 block_width = (width + 7) / 8;
   u32 fb_ptr_pitch = 1024 - ((block_width - 1) * 8);
   u32 blocks_remaining;
@@ -3735,6 +4102,66 @@ void setup_sprite_untextured(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,
   }
 }
 
+#endif
+
+// Solid-fills a width x height rectangle at (x, y) of the output framebuffer
+// (psx_gpu->vram_out_ptr, 1024 u16 pixels per row) by storing pixels directly
+// instead of queueing render blocks.
+//
+// color carries 8-bit r/g/b in bits 0-7 / 8-15 / 16-23; the top 5 bits of
+// each channel are packed into one 16-bit pixel and OR-ed with
+// psx_gpu->mask_msb. u and v are not used; they are present so the signature
+// matches the other setup_sprite_* functions.
+void setup_sprite_untextured_simple(psx_gpu_struct *psx_gpu, s32 x, s32 y,
+ s32 u, s32 v, s32 width, s32 height, u32 color)
+{
+  u32 r = color & 0xFF;
+  u32 g = (color >> 8) & 0xFF;
+  u32 b = (color >> 16) & 0xFF;
+  u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10) |
+   psx_gpu->mask_msb;
+  // The same pixel in both halves so one 32-bit store writes two pixels.
+  u32 color_32bpp = color_16bpp | (color_16bpp << 16);
+
+  u16 *vram_ptr16 = psx_gpu->vram_out_ptr + x + (y * 1024);
+  u32 *vram_ptr;
+
+  u32 num_width;
+
+  // A non-positive size would wrap the unsigned pixel counter below (and a
+  // negative height would keep the row loop running), so draw nothing.
+  if((width <= 0) || (height <= 0))
+    return;
+
+  // This function bypasses the block buffer, so anything still queued has to
+  // be drawn first or it would land on top of this fill afterwards. The
+  // count never exceeds MAX_BLOCKS here, so test for "any pending" instead.
+  if(psx_gpu->num_blocks)
+  {
+    flush_render_block_buffer(psx_gpu);
+  }
+
+  while(height)
+  {
+    num_width = width;
+
+    vram_ptr = (void *)vram_ptr16;
+    // Row starts on an odd pixel: write it alone so that the 32-bit stores
+    // that follow are 4-byte aligned.
+    if((long)vram_ptr16 & 2)
+    {
+      *vram_ptr16 = color_32bpp;
+      vram_ptr = (void *)(vram_ptr16 + 1);
+      num_width--;
+    }
+
+    // Bulk of the row: 8 pixels (four 32-bit stores) per iteration.
+    while(num_width >= 4 * 2)
+    {
+      vram_ptr[0] = color_32bpp;
+      vram_ptr[1] = color_32bpp;
+      vram_ptr[2] = color_32bpp;
+      vram_ptr[3] = color_32bpp;
+
+      vram_ptr += 4;
+      num_width -= 4 * 2;
+    }
+
+    // Remaining pixel pairs.
+    while(num_width >= 2)
+    {
+      *vram_ptr++ = color_32bpp;
+      num_width -= 2;
+    }
+
+    // Final odd pixel, if any.
+    if(num_width > 0)
+    {
+      *(u16 *)vram_ptr = color_32bpp;
+    }
+
+    vram_ptr16 += 1024;
+    height--;
+  }
+}
 
 
 #define setup_sprite_blocks_switch_textured(texture_mode)                      \
@@ -4155,9 +4582,6 @@ do                                                                             \
   {                                                                            \
     delta_y *= -1;                                                             \
                                                                                \
-    if(delta_y >= 512)                                                         \
-      return;                                                                  \
-                                                                               \
     if(delta_x > delta_y)                                                      \
     {                                                                          \
       draw_line_span_horizontal(decrement, shading, blending, dithering,       \
@@ -4171,9 +4595,6 @@ do                                                                             \
   }                                                                            \
   else                                                                         \
   {                                                                            \
-    if(delta_y >= 512)                                                         \
-      return;                                                                  \
-                                                                               \
     if(delta_x > delta_y)                                                      \
     {                                                                          \
       draw_line_span_horizontal(increment, shading, blending, dithering,       \
@@ -4188,7 +4609,7 @@ do                                                                             \
 
                                                                                 
 void render_line(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, u32 flags,
- u32 color)
+ u32 color, int double_resolution)
 {
   s32 color_r, color_g, color_b;
   u32 triangle_winding = 0;
@@ -4240,12 +4661,22 @@ void render_line(psx_gpu_struct *psx_gpu, vertex_struct *vertexes, u32 flags,
   delta_x = x_b - x_a;
   delta_y = y_b - y_a;
 
-  if(delta_x >= 1024)
+  if(delta_x >= 1024 || delta_y >= 512 || delta_y <= -512)
     return;
 
+  if(double_resolution)
+  {
+    x_a *= 2;
+    x_b *= 2;
+    y_a *= 2;
+    y_b *= 2;
+    delta_x *= 2;
+    delta_y *= 2;
+  }
+
   flags &= ~RENDER_FLAGS_TEXTURE_MAP;
 
-  vram_ptr = psx_gpu->vram_ptr + (y_a * 1024) + x_a;
+  vram_ptr = psx_gpu->vram_out_ptr + (y_a * 1024) + x_a;
 
   control_mask = 0x0;
 
@@ -4435,7 +4866,6 @@ void render_block_fill(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
   if((width == 0) || (height == 0))
     return;
 
-  flush_render_block_buffer(psx_gpu);
   invalidate_texture_cache_region(psx_gpu, x, y, x + width - 1, y + height - 1);
 
   u32 r = color & 0xFF;
@@ -4445,17 +4875,17 @@ void render_block_fill(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
    psx_gpu->mask_msb;
   u32 color_32bpp = color_16bpp | (color_16bpp << 16);
 
-  u32 *vram_ptr = (u32 *)(psx_gpu->vram_ptr + x + (y * 1024));
+  u32 *vram_ptr = (u32 *)(psx_gpu->vram_out_ptr + x + (y * 1024));
 
   u32 pitch = 512 - (width / 2);
   u32 num_width;
 
-  if(psx_gpu->interlace_mode & RENDER_INTERLACE_ENABLED)
+  if(psx_gpu->render_mode & RENDER_INTERLACE_ENABLED)
   {
     pitch += 512;
     height /= 2;
 
-    if(psx_gpu->interlace_mode & RENDER_INTERLACE_ODD)
+    if(psx_gpu->render_mode & RENDER_INTERLACE_ODD)
       vram_ptr += 512; 
   }
 
@@ -4482,6 +4912,50 @@ void render_block_fill(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
   }
 }
 
+// Solid-fills a width x height rectangle at (x, y) of the output framebuffer
+// (psx_gpu->vram_out_ptr, 1024 u16 pixels per row).
+//
+// color carries 8-bit r/g/b in bits 0-7 / 8-15 / 16-23 and is reduced to a
+// 16-bit pixel OR-ed with psx_gpu->mask_msb. width is clamped to 1024.
+// Whole groups of 16 pixels are written with 32-bit stores; a width that is
+// not a multiple of 16 gets its remainder written pixel by pixel instead of
+// wrapping the unsigned counter.
+// NOTE(review): the 32-bit stores assume x is even (row start 4-byte
+// aligned) - confirm against the callers.
+void render_block_fill_enh(psx_gpu_struct *psx_gpu, u32 color, u32 x, u32 y,
+ u32 width, u32 height)
+{
+  if((width == 0) || (height == 0))
+    return;
+
+  if(width > 1024)
+    width = 1024;
+
+  u32 r = color & 0xFF;
+  u32 g = (color >> 8) & 0xFF;
+  u32 b = (color >> 16) & 0xFF;
+  u32 color_16bpp = (r >> 3) | ((g >> 3) << 5) | ((b >> 3) << 10) |
+   psx_gpu->mask_msb;
+  // The same pixel in both halves so one 32-bit store writes two pixels.
+  u32 color_32bpp = color_16bpp | (color_16bpp << 16);
+
+  u16 *row_ptr = psx_gpu->vram_out_ptr + x + (y * 1024);
+
+  while(height)
+  {
+    u32 *vram_ptr = (u32 *)row_ptr;
+    u32 num_width = width;
+    u16 *tail_ptr;
+
+    // 16 pixels (eight 32-bit stores) per iteration.
+    while(num_width >= 16)
+    {
+      vram_ptr[0] = color_32bpp;
+      vram_ptr[1] = color_32bpp;
+      vram_ptr[2] = color_32bpp;
+      vram_ptr[3] = color_32bpp;
+      vram_ptr[4] = color_32bpp;
+      vram_ptr[5] = color_32bpp;
+      vram_ptr[6] = color_32bpp;
+      vram_ptr[7] = color_32bpp;
+
+      vram_ptr += 8;
+      num_width -= 16;
+    }
+
+    // Fewer than 16 pixels left: finish the row one pixel at a time.
+    tail_ptr = (u16 *)vram_ptr;
+    while(num_width)
+    {
+      *tail_ptr++ = color_16bpp;
+      num_width--;
+    }
+
+    // Step from the row base so the stride does not depend on width.
+    row_ptr += 1024;
+    height--;
+  }
+}
+
 void render_block_copy(psx_gpu_struct *psx_gpu, u16 *source, u32 x, u32 y,
  u32 width, u32 height, u32 pitch)
 {
@@ -4522,16 +4996,17 @@ void initialize_reciprocal_table(void)
   u32 height_reciprocal;
   s32 shift;
 
-  for(height = 1; height < 512; height++)
+  for(height = 1; height < sizeof(reciprocal_table)
+       / sizeof(reciprocal_table[0]); height++)
   {
     shift = __builtin_clz(height);
     height_normalized = height << shift;
-    height_reciprocal = ((1ULL << 50) + (height_normalized - 1)) /
+    height_reciprocal = ((1ULL << 51) + (height_normalized - 1)) /
      height_normalized;
 
-    shift = 32 - (50 - shift);
+    shift = 32 - (51 - shift);
 
-    reciprocal_table[height] = (height_reciprocal << 12) | shift;
+    reciprocal_table[height] = (height_reciprocal << 10) | shift;
   }
 }
 
@@ -4559,8 +5034,10 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
   psx_gpu->render_state = 0;
   psx_gpu->render_state_base = 0;
   psx_gpu->num_blocks = 0;
+  psx_gpu->uvrgb_phase = 0x8000;
 
   psx_gpu->vram_ptr = vram;
+  psx_gpu->vram_out_ptr = vram;
 
   psx_gpu->texture_page_base = psx_gpu->vram_ptr;
   psx_gpu->texture_page_ptr = psx_gpu->vram_ptr;
@@ -4573,7 +5050,7 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
   psx_gpu->texture_mask_width = 0xFF;
   psx_gpu->texture_mask_height = 0xFF;
 
-  psx_gpu->interlace_mode = 0;
+  psx_gpu->render_mode = 0;
 
   memset(psx_gpu->vram_ptr, 0, sizeof(u16) * 1024 * 512);
 
@@ -4596,6 +5073,8 @@ void initialize_psx_gpu(psx_gpu_struct *psx_gpu, u16 *vram)
   psx_gpu->dither_table[3] = dither_table_row(3, -1, 2, -2);
 
   psx_gpu->primitive_type = PRIMITIVE_TYPE_UNKNOWN;
+
+  psx_gpu->enhancement_x_threshold = 256;
 }
 
 u64 get_us(void)
@@ -4660,3 +5139,4 @@ void triangle_benchmark(psx_gpu_struct *psx_gpu)
 
 #endif
 
+#include "psx_gpu_4x.c"
index 53a8717..846658c 100644 (file)
@@ -56,8 +56,8 @@ typedef enum
 typedef enum
 {
   RENDER_INTERLACE_ENABLED     = 0x1,
-  RENDER_INTERLACE_ODD         = 0x2
-} render_interlace_enum;
+  RENDER_INTERLACE_ODD         = 0x2,
+} render_mode_enum;
 
 typedef struct
 {
@@ -122,7 +122,6 @@ typedef struct
   vec_4x32u g_block_span;
   vec_4x32u b_block_span;
 
-  // 72 bytes
   u32 b;
   u32 b_dy;
 
@@ -138,25 +137,21 @@ typedef struct
   u32 triangle_color;
   u32 dither_table[4];
 
+  u32 uvrgb_phase;
+
   struct render_block_handler_struct *render_block_handler;
   void *texture_page_ptr;
   void *texture_page_base;
   u16 *clut_ptr;
   u16 *vram_ptr;
+  u16 *vram_out_ptr;
 
-  // 26 bytes
   u16 render_state_base;
   u16 render_state;
 
   u16 num_spans;
   u16 num_blocks;
 
-  s16 offset_x;
-  s16 offset_y;
-
-  u16 clut_settings;
-  u16 texture_settings;
-
   s16 viewport_start_x;
   s16 viewport_start_y;
   s16 viewport_end_x;
@@ -164,7 +159,6 @@ typedef struct
 
   u16 mask_msb;
 
-  // 8 bytes
   u8 triangle_winding;
 
   u8 display_area_draw_enable;
@@ -178,11 +172,27 @@ typedef struct
   u8 texture_window_y;
 
   u8 primitive_type;
-  u8 interlace_mode;
+  u8 render_mode;
+
+  s16 offset_x;
+  s16 offset_y;
+
+  u16 clut_settings;
+  u16 texture_settings;
+
+  // enhancement stuff
+  u16 *enhancement_buf_ptr;
+  u16 *enhancement_current_buf_ptr;
+  u32 enhancement_x_threshold;
+  s16 saved_viewport_start_x;
+  s16 saved_viewport_start_y;
+  s16 saved_viewport_end_x;
+  s16 saved_viewport_end_y;
+  u8 enhancement_buf_by_x16[64];
 
   // Align up to 64 byte boundary to keep the upcoming buffers cache line
-  // aligned
-  //u8 reserved_a[0];
+  // aligned, also make reachable with single immediate addition
+  u8 reserved_a[164];
 
   // 8KB
   block_struct blocks[MAX_BLOCKS_PER_ROW];
@@ -224,7 +234,7 @@ void render_triangle(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
 void render_sprite(psx_gpu_struct *psx_gpu, s32 x, s32 y, u32 u, u32 v,
  s32 width, s32 height, u32 flags, u32 color);
 void render_line(psx_gpu_struct *gpu, vertex_struct *vertexes, u32 flags,
- u32 color);
+ u32 color, int double_resolution);
 
 u32 texture_region_mask(s32 x1, s32 y1, s32 x2, s32 y2);
 
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_4x.c b/plugins/gpu_neon/psx_gpu/psx_gpu_4x.c
new file mode 100644 (file)
index 0000000..83c6680
--- /dev/null
@@ -0,0 +1,384 @@
+// Picks the enhancement buffer to draw into for a given x coordinate.\r
+// enhancement_buf_by_x16[] is looked up by 16-pixel column (x / 16) and the\r
+// entry is used as a buffer number: each step is 1 << 20 u16 elements past\r
+// enhancement_buf_ptr. x is evaluated once and is not range-checked here.\r
+#define select_enhancement_buf_ptr(psx_gpu, x) \\r
+  ((psx_gpu)->enhancement_buf_ptr + \\r
+   ((psx_gpu)->enhancement_buf_by_x16[(x) / 16] << 20))\r
+\r
+#ifndef NEON_BUILD\r
+void setup_sprite_16bpp_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, s32 u,\r
+ s32 v, s32 width, s32 height, u32 color)\r
+{\r
+  u32 left_offset = u & 0x7;\r
+  u32 width_rounded = width + left_offset + 7;\r
+\r
+  u16 *fb_ptr = psx_gpu->vram_out_ptr + (y * 1024) + (s32)(x - left_offset * 2);\r
+  u32 right_width = width_rounded & 0x7;\r
+  u32 block_width = width_rounded / 8;\r
+  u32 fb_ptr_pitch = (2048 + 16) - (block_width * 16);\r
+\r
+  u32 left_mask_bits = ~(0xFFFF << (left_offset * 2));\r
+  u32 right_mask_bits = 0xFFFC << (right_width * 2);\r
+\r
+  u32 texture_offset_base = u + (v * 1024);\r
+  u32 texture_mask =\r
+   psx_gpu->texture_mask_width | (psx_gpu->texture_mask_height * 1024);\r
+\r
+  u32 blocks_remaining;\r
+  u32 num_blocks = psx_gpu->num_blocks;\r
+  block_struct *block = psx_gpu->blocks + num_blocks;\r
+\r
+  u16 *texture_page_ptr = psx_gpu->texture_page_ptr;\r
+  u16 *texture_block_ptr;\r
+\r
+  texture_offset_base &= ~0x7;\r
+\r
+  sprites_16bpp++;\r
+\r
+  if(block_width == 1)\r
+  {\r
+    u32 mask_bits = left_mask_bits | right_mask_bits;\r
+    u32 mask_bits_a = mask_bits & 0xFF;\r
+    u32 mask_bits_b = mask_bits >> 8;\r
+    \r
+    vec_8x16u texels;\r
+    vec_8x16u texels_wide;\r
+\r
+    while(height)\r
+    {\r
+      num_blocks += 4;\r
+      sprite_blocks += 4;\r
+\r
+      if(num_blocks > MAX_BLOCKS)\r
+      {\r
+        flush_render_block_buffer(psx_gpu);\r
+        num_blocks = 4;\r
+        block = psx_gpu->blocks;\r
+      }\r
+      \r
+      texture_block_ptr =\r
+       texture_page_ptr + (texture_offset_base & texture_mask);\r
+\r
+      load_128b(texels, texture_block_ptr);\r
+      \r
+      zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.low, texels.low);\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = mask_bits_a;\r
+      block->fb_ptr = fb_ptr;          \r
+      block++;\r
+      \r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = mask_bits_a;\r
+      block->fb_ptr = fb_ptr + 1024;          \r
+      block++;\r
+      \r
+      zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.high, texels.high);\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = mask_bits_b;\r
+      block->fb_ptr = fb_ptr + 8;\r
+      block++;\r
+      \r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = mask_bits_b;\r
+      block->fb_ptr = fb_ptr + 8 + 1024;          \r
+      block++;      \r
+\r
+      texture_offset_base += 1024;\r
+      fb_ptr += 2048;\r
+\r
+      height--;\r
+      psx_gpu->num_blocks = num_blocks;\r
+    }\r
+  }\r
+  else\r
+  {\r
+    u32 texture_offset;\r
+    \r
+    u32 left_mask_bits_a = left_mask_bits & 0xFF;\r
+    u32 left_mask_bits_b = left_mask_bits >> 8;\r
+    u32 right_mask_bits_a = right_mask_bits & 0xFF;\r
+    u32 right_mask_bits_b = right_mask_bits >> 8;\r
+    \r
+    vec_8x16u texels;\r
+    vec_8x16u texels_wide;    \r
+\r
+    while(height)\r
+    {\r
+      blocks_remaining = block_width - 2;\r
+      num_blocks += block_width * 4;\r
+      sprite_blocks += block_width * 4;\r
+\r
+      if(num_blocks > MAX_BLOCKS)\r
+      {\r
+        flush_render_block_buffer(psx_gpu);\r
+        num_blocks = block_width * 4;\r
+        block = psx_gpu->blocks;\r
+      }\r
+\r
+      texture_offset = texture_offset_base;\r
+      texture_offset_base += 1024;\r
+\r
+      texture_block_ptr = texture_page_ptr + (texture_offset & texture_mask);\r
+      \r
+      load_128b(texels, texture_block_ptr);\r
+\r
+      zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.low, texels.low);\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = left_mask_bits_a;\r
+      block->fb_ptr = fb_ptr;\r
+      block++;\r
+      \r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = left_mask_bits_a;\r
+      block->fb_ptr = fb_ptr + 1024;\r
+      block++;      \r
+\r
+      zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.high, texels.high);\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = left_mask_bits_b;\r
+      block->fb_ptr = fb_ptr + 8;\r
+      block++;  \r
+      \r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = left_mask_bits_b;\r
+      block->fb_ptr = fb_ptr + 8 + 1024;\r
+      block++;  \r
+      \r
+      texture_offset += 8;\r
+      fb_ptr += 16;\r
+\r
+      while(blocks_remaining)\r
+      {\r
+        texture_block_ptr = texture_page_ptr + (texture_offset & texture_mask);\r
+        load_128b(texels, texture_block_ptr);\r
+\r
+        zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.low, texels.low);\r
+        block->texels = texels_wide;\r
+        block->draw_mask_bits = 0;\r
+        block->fb_ptr = fb_ptr;\r
+        block++;\r
+        \r
+        block->texels = texels_wide;\r
+        block->draw_mask_bits = 0;\r
+        block->fb_ptr = fb_ptr + 1024;\r
+        block++;      \r
+\r
+        zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.high, texels.high);\r
+        block->texels = texels_wide;\r
+        block->draw_mask_bits = 0;\r
+        block->fb_ptr = fb_ptr + 8;\r
+        block++;\r
+        \r
+        block->texels = texels_wide;\r
+        block->draw_mask_bits = 0;\r
+        block->fb_ptr = fb_ptr + 8 + 1024;\r
+        block++;\r
+        \r
+        texture_offset += 8;\r
+        fb_ptr += 16;\r
+\r
+        blocks_remaining--;\r
+      }\r
+\r
+      texture_block_ptr = texture_page_ptr + (texture_offset & texture_mask);\r
+      load_128b(texels, texture_block_ptr);\r
+      \r
+      zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.low, texels.low);\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = right_mask_bits_a;\r
+      block->fb_ptr = fb_ptr;\r
+      block++;\r
+      \r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = right_mask_bits_a;\r
+      block->fb_ptr = fb_ptr + 1024;\r
+      block++;      \r
+\r
+      zip_4x32b(vector_cast(vec_4x32u, texels_wide), texels.high, texels.high);\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = right_mask_bits_b;\r
+      block->fb_ptr = fb_ptr + 8;\r
+      block++;\r
+\r
+      block->texels = texels_wide;\r
+      block->draw_mask_bits = right_mask_bits_b;\r
+      block->fb_ptr = fb_ptr + 8 + 1024;      \r
+      block++;\r
+\r
+      fb_ptr += fb_ptr_pitch;\r
+\r
+      height--;\r
+      psx_gpu->num_blocks = num_blocks;\r
+    }\r
+  }\r
+}\r
+\r
+#endif\r
+\r
+// Untextured sprite setup for the 4x path: hands off to\r
+// setup_sprite_untextured with width and height doubled. x, y, u, v and\r
+// color are forwarded unchanged.\r
+static void setup_sprite_untextured_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y,\r
+ s32 u, s32 v, s32 width, s32 height, u32 color)\r
+{\r
+  s32 width_4x = width * 2;\r
+  s32 height_4x = height * 2;\r
+\r
+  setup_sprite_untextured(psx_gpu, x, y, u, v, width_4x, height_4x, color);\r
+}\r
+\r
+// Setup-function selection for the 4x sprite handler table: textured\r
+// entries name setup_sprite_<texture_mode>_4x, untextured entries always\r
+// name setup_sprite_untextured_4x (texture_mode is ignored there).\r
+#define setup_sprite_blocks_switch_textured_4x(texture_mode)                   \\r
+  setup_sprite_##texture_mode##_4x                                             \\r
+\r
+#define setup_sprite_blocks_switch_untextured_4x(texture_mode)                 \\r
+  setup_sprite_untextured_4x                                                   \\r
+\r
+#define setup_sprite_blocks_switch_4x(texturing, texture_mode)                 \\r
+  setup_sprite_blocks_switch_##texturing##_4x(texture_mode)                    \\r
+\r
+  \r
+// The macros below expand into the comma-separated initializer list of\r
+// render_sprite_block_handlers_4x. Each level expands the level beneath it\r
+// once per value of its own parameter, so from outermost to innermost the\r
+// entries vary by: texture mode, blend mode, mask evaluate, shading,\r
+// dithering, texturing, blending, modulation. The innermost macro emits one\r
+// { setup, texture, shade, blend } entry; only the setup member differs\r
+// from the non-4x names. Note that it passes unshaded/undithered to\r
+// shade_blocks_switch regardless of the shading and dithering parameters.\r
+#define render_sprite_blocks_switch_block_modulation_4x(texture_mode,          \\r
+ blend_mode, mask_evaluate, shading, dithering, texturing, blending,           \\r
+ modulation)                                                                   \\r
+{                                                                              \\r
+  setup_sprite_blocks_switch_4x(texturing, texture_mode),                      \\r
+  texture_sprite_blocks_switch_##texturing(texture_mode),                      \\r
+  shade_blocks_switch(unshaded, texturing, modulation, undithered, blending,   \\r
+   mask_evaluate),                                                             \\r
+  blend_blocks_switch(texturing, blending, blend_mode, mask_evaluate)          \\r
+}                                                                              \\r
+\r
+#define render_sprite_blocks_switch_block_blending_4x(texture_mode,            \\r
+ blend_mode, mask_evaluate, shading, dithering, texturing, blending)           \\r
+  render_sprite_blocks_switch_block_modulation_4x(texture_mode, blend_mode,    \\r
+   mask_evaluate, shading, dithering, texturing, blending, modulated),         \\r
+  render_sprite_blocks_switch_block_modulation_4x(texture_mode, blend_mode,    \\r
+   mask_evaluate, shading, dithering, texturing, blending, unmodulated)        \\r
+\r
+#define render_sprite_blocks_switch_block_texturing_4x(texture_mode,           \\r
+ blend_mode, mask_evaluate, shading, dithering, texturing)                     \\r
+  render_sprite_blocks_switch_block_blending_4x(texture_mode, blend_mode,      \\r
+   mask_evaluate, shading, dithering, texturing, unblended),                   \\r
+  render_sprite_blocks_switch_block_blending_4x(texture_mode, blend_mode,      \\r
+   mask_evaluate, shading, dithering, texturing, blended)                      \\r
+\r
+#define render_sprite_blocks_switch_block_dithering_4x(texture_mode,           \\r
+ blend_mode, mask_evaluate, shading, dithering)                                \\r
+  render_sprite_blocks_switch_block_texturing_4x(texture_mode, blend_mode,     \\r
+   mask_evaluate, shading, dithering, untextured),                             \\r
+  render_sprite_blocks_switch_block_texturing_4x(texture_mode, blend_mode,     \\r
+   mask_evaluate, shading, dithering, textured)                                \\r
+\r
+#define render_sprite_blocks_switch_block_shading_4x(texture_mode, blend_mode, \\r
+ mask_evaluate, shading)                                                       \\r
+  render_sprite_blocks_switch_block_dithering_4x(texture_mode, blend_mode,     \\r
+   mask_evaluate, shading, undithered),                                        \\r
+  render_sprite_blocks_switch_block_dithering_4x(texture_mode, blend_mode,     \\r
+   mask_evaluate, shading, dithered)                                           \\r
+\r
+#define render_sprite_blocks_switch_block_mask_evaluate_4x(texture_mode,       \\r
+ blend_mode, mask_evaluate)                                                    \\r
+  render_sprite_blocks_switch_block_shading_4x(texture_mode, blend_mode,       \\r
+   mask_evaluate, unshaded),                                                   \\r
+  render_sprite_blocks_switch_block_shading_4x(texture_mode, blend_mode,       \\r
+   mask_evaluate, shaded)                                                      \\r
+\r
+#define render_sprite_blocks_switch_block_blend_mode_4x(texture_mode,          \\r
+ blend_mode)                                                                   \\r
+  render_sprite_blocks_switch_block_mask_evaluate_4x(texture_mode, blend_mode, \\r
+   off),                                                                       \\r
+  render_sprite_blocks_switch_block_mask_evaluate_4x(texture_mode, blend_mode, \\r
+   on)                                                                         \\r
+\r
+#define render_sprite_blocks_switch_block_texture_mode_4x(texture_mode)        \\r
+  render_sprite_blocks_switch_block_blend_mode_4x(texture_mode, average),      \\r
+  render_sprite_blocks_switch_block_blend_mode_4x(texture_mode, add),          \\r
+  render_sprite_blocks_switch_block_blend_mode_4x(texture_mode, subtract),     \\r
+  render_sprite_blocks_switch_block_blend_mode_4x(texture_mode, add_fourth)    \\r
+\r
+#define render_sprite_blocks_switch_block_4x()                                 \\r
+  render_sprite_blocks_switch_block_texture_mode_4x(4bpp),                     \\r
+  render_sprite_blocks_switch_block_texture_mode_4x(8bpp),                     \\r
+  render_sprite_blocks_switch_block_texture_mode_4x(16bpp),                    \\r
+  render_sprite_blocks_switch_block_texture_mode_4x(4bpp)                      \\r
+\r
+\r
+// Handler table for 4x sprites; render_sprite_4x indexes it directly with\r
+// its computed render_state. The fourth texture-mode group repeats the 4bpp\r
+// handlers (NOTE(review): presumably a fallback for an unused texture mode\r
+// value - confirm).\r
+render_block_handler_struct render_sprite_block_handlers_4x[] =\r
+{\r
+  render_sprite_blocks_switch_block_4x()\r
+};\r
+\r
+\r
+// Draws a sprite into the doubled-resolution enhancement buffer.\r
+//\r
+// x, y, width and height are in native (1x) pixels and u, v in texels. The\r
+// sprite is clipped against the viewport, the output buffer is chosen from\r
+// the clipped x, and x / y are then doubled before being handed to the 4x\r
+// setup function selected from render_sprite_block_handlers_4x. Width and\r
+// height are passed on undoubled. Pending blocks are flushed when the render\r
+// state, primitive type or color changes.\r
+void render_sprite_4x(psx_gpu_struct *psx_gpu, s32 x, s32 y, u32 u, u32 v,\r
+ s32 width, s32 height, u32 flags, u32 color)\r
+{\r
+  s32 x_right = x + width - 1;\r
+  s32 y_bottom = y + height - 1;\r
+  u32 render_state;\r
+  render_block_handler_struct *handler;\r
+  setup_sprite_function_type *setup_blocks;\r
+\r
+#ifdef PROFILE\r
+  sprites++;\r
+#endif\r
+\r
+  // Clip left and top edges, moving the texture origin by the same amount.\r
+  if(x < psx_gpu->viewport_start_x)\r
+  {\r
+    u32 clip_left = psx_gpu->viewport_start_x - x;\r
+    x += clip_left;\r
+    u += clip_left;\r
+    width -= clip_left;\r
+  }\r
+\r
+  if(y < psx_gpu->viewport_start_y)\r
+  {\r
+    s32 clip_top = psx_gpu->viewport_start_y - y;\r
+    y += clip_top;\r
+    v += clip_top;\r
+    height -= clip_top;\r
+  }\r
+\r
+  // Clip right and bottom edges.\r
+  if(x_right > psx_gpu->viewport_end_x)\r
+    width -= x_right - psx_gpu->viewport_end_x;\r
+\r
+  if(y_bottom > psx_gpu->viewport_end_y)\r
+    height -= y_bottom - psx_gpu->viewport_end_y;\r
+\r
+  // Nothing left inside the viewport.\r
+  if((width <= 0) || (height <= 0))\r
+    return;\r
+\r
+  // The buffer is chosen from the native x, before it is doubled.\r
+  psx_gpu->vram_out_ptr = select_enhancement_buf_ptr(psx_gpu, x);\r
+\r
+  x *= 2;\r
+  y *= 2;\r
+\r
+#ifdef PROFILE\r
+  span_pixels += width * height;\r
+  spans += height;\r
+#endif\r
+\r
+  // Sprite-relevant flags plus the base state with the dither bit cleared.\r
+  render_state = flags & (RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND |\r
+   RENDER_FLAGS_TEXTURE_MAP);\r
+  render_state |= (psx_gpu->render_state_base & ~RENDER_STATE_DITHER);\r
+\r
+  if((psx_gpu->primitive_type != PRIMITIVE_TYPE_SPRITE) ||\r
+   (psx_gpu->render_state != render_state))\r
+  {\r
+    psx_gpu->render_state = render_state;\r
+    flush_render_block_buffer(psx_gpu);\r
+#ifdef PROFILE\r
+    state_changes++;\r
+#endif\r
+  }\r
+\r
+  psx_gpu->primitive_type = PRIMITIVE_TYPE_SPRITE;\r
+\r
+  color &= 0xFFFFFF;\r
+\r
+  if(color != psx_gpu->triangle_color)\r
+  {\r
+    flush_render_block_buffer(psx_gpu);\r
+    psx_gpu->triangle_color = color;\r
+  }\r
+\r
+  // Only the handler lookup sees this bit; psx_gpu->render_state keeps the\r
+  // value stored above.\r
+  if(color == 0x808080)\r
+    render_state |= RENDER_FLAGS_MODULATE_TEXELS;\r
+\r
+  handler = &(render_sprite_block_handlers_4x[render_state]);\r
+  psx_gpu->render_block_handler = handler;\r
+\r
+  setup_blocks = (setup_sprite_function_type *)handler->setup_blocks;\r
+  setup_blocks(psx_gpu, x, y, u, v, width, height, color);\r
+}\r
+\r
index 294685a..4e1e403 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2011 Gilead Kutnick "Exophase" <exophase@gmail.com>
+ * Copyright (C) 2012 Gražvydas Ignotas "notaz" <notasas@gmail.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
 #define MAX_BLOCKS                                        64
 #define MAX_BLOCKS_PER_ROW                                128
 
-#define psx_gpu_test_mask_offset                          0
-#define psx_gpu_uvrg_offset                               16
-#define psx_gpu_uvrg_dx_offset                            32
-#define psx_gpu_uvrg_dy_offset                            48
-#define psx_gpu_u_block_span_offset                       64
-#define psx_gpu_v_block_span_offset                       80
-#define psx_gpu_r_block_span_offset                       96
-#define psx_gpu_g_block_span_offset                       112
-#define psx_gpu_b_block_span_offset                       128
-
-#define psx_gpu_b_dx_offset                               132
-
-#define psx_gpu_b_offset                                  144
-#define psx_gpu_b_dy_offset                               148
-#define psx_gpu_triangle_area_offset                      152
-#define psx_gpu_texture_window_settings_offset            156
-#define psx_gpu_current_texture_mask_offset               160
-#define psx_gpu_viewport_mask_offset                      164
-#define psx_gpu_dirty_textures_4bpp_mask_offset           168
-#define psx_gpu_dirty_textures_8bpp_mask_offset           172
-#define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 176
-#define psx_gpu_triangle_color_offset                     180
-#define psx_gpu_dither_table_offset                       184
-#define psx_gpu_render_block_handler_offset               200
-#define psx_gpu_texture_page_ptr_offset                   204
-#define psx_gpu_texture_page_base_offset                  208
-#define psx_gpu_clut_ptr_offset                           212
-#define psx_gpu_vram_ptr_offset                           216
-
-#define psx_gpu_render_state_base_offset                  220
-#define psx_gpu_render_state_offset                       222
-#define psx_gpu_num_spans_offset                          224
-#define psx_gpu_num_blocks_offset                         226
-#define psx_gpu_offset_x_offset                           228
-#define psx_gpu_offset_y_offset                           230
-#define psx_gpu_clut_settings_offset                      232
-#define psx_gpu_texture_settings_offset                   234
-#define psx_gpu_viewport_start_x_offset                   236
-#define psx_gpu_viewport_start_y_offset                   238
-#define psx_gpu_viewport_end_x_offset                     240
-#define psx_gpu_viewport_end_y_offset                     242
-#define psx_gpu_mask_msb_offset                           244
-                                                          
-#define psx_gpu_triangle_winding_offset                   246
-#define psx_gpu_display_area_draw_enable_offset           247
-#define psx_gpu_current_texture_page_offset               248
-#define psx_gpu_last_8bpp_texture_page_offset             249
-#define psx_gpu_texture_mask_width_offset                 250
-#define psx_gpu_texture_mask_height_offset                251
-#define psx_gpu_texture_window_x_offset                   252
-#define psx_gpu_texture_window_y_offset                   253
-#define psx_gpu_primitive_type_offset                     254
-
-#define psx_gpu_reserved_a_offset                         255
-
-#define psx_gpu_blocks_offset                             0x0100
-#define psx_gpu_span_uvrg_offset_offset                   0x2100
-#define psx_gpu_span_edge_data_offset                     0x4100
-#define psx_gpu_span_b_offset_offset                      0x5100
+#define RENDER_STATE_MASK_EVALUATE                        0x20
+#define RENDER_FLAGS_MODULATE_TEXELS                      0x1
+#define RENDER_FLAGS_BLEND                                0x2
+
+#include "psx_gpu_offsets.h"
+
+#define psx_gpu_b_dx_offset (psx_gpu_b_block_span_offset + 4)
 
 #define edge_data_left_x_offset                           0
 #define edge_data_num_blocks_offset                       2
 #define uvrg_dx3l                                         d6
 #define uvrg_dx3h                                         d7
 
+#define uvrgb_phase                                       q13
 
 .align 4
 
@@ -369,11 +319,16 @@ function(compute_all_gradients)
   vmull.s16 ga_uvrg_y, d0_b, d1_b
   rsbmi ga_bx, ga_bx, #0
 
+  @ r12 = psx_gpu->uvrgb_phase
+  ldr r12, [ psx_gpu, #psx_gpu_uvrgb_phase_offset ]
+
   vmlsl.s16 ga_uvrg_y, d2_b, d3_b
   movs gs_by, ga_by, asr #31
 
   vshr.u64 d0, d30, #22
-  mov b_base, b0, lsl #16
+  add b_base, r12, b0, lsl #16
+
+  vdup.u32 uvrgb_phase, r12
 
   rsbmi ga_by, ga_by, #0
   vclt.s32 gs_uvrg_x, ga_uvrg_x, #0  @ gs_uvrg_x = ga_uvrg_x < 0
@@ -382,7 +337,6 @@ function(compute_all_gradients)
   ldrb r12, [ psx_gpu, #psx_gpu_triangle_winding_offset ]
   vclt.s32 gs_uvrg_y, ga_uvrg_y, #0  @ gs_uvrg_y = ga_uvrg_y < 0
 
-  add b_base, b_base, #0x8000
   rsb r12, r12, #0                   @ r12 = -(triangle->winding)
 
   vdup.u32 w_mask, r12               @ w_mask = { -w, -w, -w, -w }
@@ -391,7 +345,7 @@ function(compute_all_gradients)
   vshll.u16 uvrg_base, uvrg0, #16    @ uvrg_base = uvrg0 << 16
   vdup.u32 r_shift, r14              @ r_shift = { shift, shift, shift, shift }
 
-  vorr.u32 uvrg_base, #0x8000
+  vadd.u32 uvrg_base, uvrgb_phase
   vabs.s32 ga_uvrg_x, ga_uvrg_x      @ ga_uvrg_x = abs(ga_uvrg_x)
 
   vmov area_r_s, s0                  @ area_r_s = triangle_reciprocal
@@ -657,7 +611,7 @@ function(compute_all_gradients)
                                                                                \
   vdup.u32 edge_shifts, temp;                                                  \
   vsub.u32 heights_b, heights, c_0x01;                                         \
-  vshr.u32 height_reciprocals, edge_shifts, #12;                               \
+  vshr.u32 height_reciprocals, edge_shifts, #10;                               \
                                                                                \
   vmla.s32 heights_b, x_starts, heights;                                       \
   vbic.u16 edge_shifts, #0xE0;                                                 \
@@ -682,8 +636,8 @@ function(compute_all_gradients)
   vsub.u32 heights_b, heights, c_0x01;                                         \
   sub height_b_alt, height_minor_b, #1;                                        \
                                                                                \
-  vshr.u32 height_reciprocals, edge_shifts, #12;                               \
-  lsr height_reciprocal_alt, edge_shift_alt, #12;                              \
+  vshr.u32 height_reciprocals, edge_shifts, #10;                               \
+  lsr height_reciprocal_alt, edge_shift_alt, #10;                              \
                                                                                \
   vmla.s32 heights_b, x_starts, heights;                                       \
   mla height_b_alt, height_minor_b, start_c, height_b_alt;                     \
@@ -1221,6 +1175,10 @@ function(setup_spans_up_down)
 
   ldrh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
   add temp, temp, height_minor_b
+
+  cmp temp, #MAX_SPANS
+  beq 5f
+
   strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
 
  2:                                                     
@@ -1236,6 +1194,15 @@ function(setup_spans_up_down)
   setup_spans_prologue_b()
   bal 4b
 
+ 5:
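+  // FIXME: overflow corner case: taken when num_spans + height_minor_b == MAX_SPANS; height_minor_b is rounded down to a multiple of 4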
+  sub temp, temp, height_minor_b
+  bics height_minor_b, #3
+  add temp, temp, height_minor_b
+  strh temp, [ psx_gpu, #psx_gpu_num_spans_offset ]
+  bne 2b
+  bal 1b
+
 .pool
 
 #undef span_uvrg_offset
@@ -1393,7 +1360,7 @@ function(setup_blocks_shaded_textured_dithered_##swizzling##_indirect)         \
   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
                                                                                \
   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
                                                                                \
   cmp span_num_blocks, #0;                                                     \
   beq 1f;                                                                      \
@@ -1660,7 +1627,7 @@ function(setup_blocks_unshaded_textured_dithered_##swizzling##_indirect)       \
   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
                                                                                \
   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
                                                                                \
   cmp span_num_blocks, #0;                                                     \
   beq 1f;                                                                      \
@@ -1855,7 +1822,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_indirect)
   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
   ldrh y, [ span_edge_data, #edge_data_y_offset ]
 
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
 
   cmp span_num_blocks, #0
   beq 1f
@@ -1975,7 +1942,7 @@ function(setup_blocks_unshaded_untextured_undithered_unswizzled_direct)
   ldrh span_num_blocks, [ span_edge_data, #edge_data_num_blocks_offset ]
   ldrh y, [ span_edge_data, #edge_data_y_offset ]
 
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
 
   cmp span_num_blocks, #0
   beq 1f
@@ -2162,7 +2129,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_indirect)     \
   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
                                                                                \
   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
                                                                                \
   cmp span_num_blocks, #0;                                                     \
   beq 1f;                                                                      \
@@ -2402,7 +2369,7 @@ function(setup_blocks_shaded_untextured_##dithering##_unswizzled_direct)       \
   add dither_offset_ptr, psx_gpu, #psx_gpu_dither_table_offset;                \
                                                                                \
   ldrh y, [ span_edge_data, #edge_data_y_offset ];                             \
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
                                                                                \
   cmp span_num_blocks, #0;                                                     \
   beq 1f;                                                                      \
@@ -3239,6 +3206,7 @@ function(shade_blocks_##shading##_textured_modulated_##dithering##_##target)   \
   shade_blocks_textured_modulated_load_bdm_##shading();                        \
   vshrn.u16 texels_b, texels, #7;                                              \
                                                                                \
+  pld [ block_ptr_load_a ];                                                    \
   vmovn.u16 texels_r, texels;                                                  \
   vmlal.u8 pixels, pixels_r_low, d64_1;                                        \
                                                                                \
@@ -3437,10 +3405,12 @@ function(shade_blocks_textured_unmodulated_direct)
    [ draw_mask_bits_ptr, :16 ], c_64
   vbif.u16 fb_pixels, pixels, draw_mask_combined
 
-  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
-
   sub fb_ptr_cmp, fb_ptr_next, fb_ptr
+  pld [ fb_ptr_next, #64 ]
+
   add fb_ptr_cmp, fb_ptr_cmp, #14
+  vld1.u32 { pixels }, [ block_ptr_load, :128 ], c_64
+
   cmp fb_ptr_cmp, #28
   bls 4f
 
@@ -3799,11 +3769,15 @@ function(blend_blocks_textured_add_##mask_evaluate)                            \
   vorr.u16 blend_pixels, fb_pixels_rb, fb_pixels_g;                            \
   vand.u16 pixels_mg, pixels, d128_0x83E0;                                     \
                                                                                \
-  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
-  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
+  sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
+  pld [ fb_ptr_next, #64 ];                                                    \
                                                                                \
   sub fb_ptr_cmp, fb_ptr_next, fb_ptr;                                         \
+  vbit.u16 blend_pixels, fb_pixels, draw_mask;                                 \
+                                                                               \
   add fb_ptr_cmp, fb_ptr_cmp, #14;                                             \
+  vld1.u32 { draw_mask }, [ draw_mask_ptr, :128 ], c_64;                       \
+                                                                               \
   cmp fb_ptr_cmp, #28;                                                         \
   bls 2f;                                                                      \
                                                                                \
@@ -4456,6 +4430,12 @@ function(render_block_fill_body)
 #define draw_mask_fb_ptr_left                             d2
 #define draw_mask_fb_ptr_right                            d3
 
+/*
+ * Mask / framebuffer-pointer register aliases used by the _4x sprite macros.
+ * left_a / left_b name the same registers as draw_mask_fb_ptr_left / _right
+ * (d2 / d3); right_a / right_b are d10 / d11, i.e. the two halves of q5
+ * (draw_masks_fb_ptrs2).
+ */
+#define draw_mask_fb_ptr_left_a                           d2
+#define draw_mask_fb_ptr_left_b                           d3
+#define draw_mask_fb_ptr_right_a                          d10
+#define draw_mask_fb_ptr_right_b                          d11
+#define draw_masks_fb_ptrs2                               q5
+
 #define clut_low_a                                        d4
 #define clut_low_b                                        d5
 #define clut_high_a                                       d6
@@ -4467,37 +4447,24 @@ function(render_block_fill_body)
 #define clut_a                                            q2
 #define clut_b                                            q3
 
-#define texels_low                                        d10
-#define texels_high                                       d11
-
+#define texels_low                                        d12
+#define texels_high                                       d13
 
-setup_sprite_flush_blocks_single:
-  vpush { q1 - q4 }
-
-  stmdb sp!, { r0 - r3, r12, r14 }
-  bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, r12, r14 }
-
-  vpop { q1 - q4 }
-
-  add block, psx_gpu, #psx_gpu_blocks_offset
-
-  mov num_blocks, sub_tile_height
-  bx lr
+#define texels_wide_low                                   d14
+#define texels_wide_high                                  d15
+#define texels_wide                                       q7
 
 
-setup_sprite_flush_blocks_double:
-  vpush { q1 - q4 }
+/*
+ * Flushes the queued render blocks and rewinds the block write pointer.
+ * q1 - q5 and r0 - r3, r12, r14 are saved around the call to
+ * flush_render_block_buffer; afterwards block = psx_gpu + psx_gpu_blocks_offset.
+ * num_blocks is not written here: the call site visible in this file sets it
+ * with movgt immediately before the blgt to this routine.
+ * NOTE(review): q5 presumably joined the save list because
+ * draw_masks_fb_ptrs2 lives there - confirm.
+ */
+setup_sprite_flush_blocks:
+  vpush { q1 - q5 }
 
   stmdb sp!, { r0 - r3, r12, r14 }
   bl flush_render_block_buffer
   ldmia sp!, { r0 - r3, r12, r14 }
 
-  vpop { q1 - q4 }
+  vpop { q1 - q5 }
 
   add block, psx_gpu, #psx_gpu_blocks_offset
-
-  mov num_blocks, sub_tile_height, lsl #1
   bx lr
 
 
@@ -4535,8 +4502,6 @@ setup_sprite_update_texture_8bpp_cache:
   blne setup_sprite_update_texture_8bpp_cache                                  \
 
 
-#define setup_sprite_tile_setup_block_no(side, offset, texture_mode)           \
-
 #define setup_sprite_block_count_single()                                      \
   sub_tile_height                                                              \
 
@@ -4547,7 +4512,8 @@ setup_sprite_update_texture_8bpp_cache:
   add num_blocks, num_blocks, setup_sprite_block_count_##type();               \
   cmp num_blocks, #MAX_BLOCKS;                                                 \
                                                                                \
-  blgt setup_sprite_flush_blocks_##type                                        \
+  movgt num_blocks, setup_sprite_block_count_##type();                         \
+  blgt setup_sprite_flush_blocks                                               \
 
 
 #define setup_sprite_tile_full_4bpp(edge)                                      \
@@ -4729,31 +4695,33 @@ setup_sprite_update_texture_8bpp_cache:
 #define setup_sprite_tile_column_edge_post_adjust_full(edge)                   \
 
 
-#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode)  \
+/*
+ * Emits a column consisting of one tile: sub_tile_height is taken straight
+ * from column_data, then the pre-adjust, tile and post-adjust macros chosen by
+ * edge_mode / texture_mode are expanded.  x4mode is pasted onto the end of
+ * each of those macro names (it may be empty, or a suffix such as _4x).
+ */
+#define setup_sprite_tile_column_height_single(edge_mode, edge, texture_mode,  \
+ x4mode)                                                                       \
   mov sub_tile_height, column_data;                                            \
-  setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
-  setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge)                  \
+  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
+  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
 
-#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode)   \
+/*
+ * Emits a column made of several tiles.  column_data is unpacked as:
+ *   bits 0-7   height of the first tile,
+ *   bits 8-15  height of the last tile,
+ *   bits 16+   tiles_remaining counter.
+ * After the first tile, tiles_remaining - 1 tiles of height 16 are emitted
+ * (none if the counter is 1), followed by the last tile.  The pre-adjust macro
+ * runs once before the first tile and the post-adjust macro once after the
+ * last.  x4mode is pasted onto the end of every expanded macro name (it may be
+ * empty, or a suffix such as _4x).
+ */
+#define setup_sprite_tile_column_height_multi(edge_mode, edge, texture_mode,   \
+ x4mode)                                                                       \
   and sub_tile_height, column_data, #0xFF;                                     \
   mov tiles_remaining, column_data, lsr #16;                                   \
-  setup_sprite_tile_column_edge_pre_adjust_##edge_mode(edge);                  \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
+  setup_sprite_tile_column_edge_pre_adjust_##edge_mode##x4mode(edge);          \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
                                                                                \
   subs tiles_remaining, tiles_remaining, #1;                                   \
   beq 2f;                                                                      \
                                                                                \
  3:                                                                            \
   mov sub_tile_height, #16;                                                    \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
   subs tiles_remaining, tiles_remaining, #1;                                   \
   bne 3b;                                                                      \
                                                                                \
  2:                                                                            \
   uxtb sub_tile_height, column_data, ror #8;                                   \
-  setup_sprite_tile_##edge_mode##_##texture_mode(edge);                        \
-  setup_sprite_tile_column_edge_post_adjust_##edge_mode(edge)                  \
+  setup_sprite_tile_##edge_mode##_##texture_mode##x4mode(edge);                \
+  setup_sprite_tile_column_edge_post_adjust_##edge_mode##x4mode(edge)          \
 
 
 #define setup_sprite_column_data_single()                                      \
@@ -4772,17 +4740,30 @@ setup_sprite_update_texture_8bpp_cache:
                                                                                \
   orr column_data, column_data, height_rounded, lsl #8                         \
 
-#define setup_sprite_tile_column_width_single(texture_mode, multi_height,      \
- edge_mode, edge)                                                              \
- setup_sprite_##texture_mode##_single_##multi_height##_##edge_mode##_##edge:   \
+/*
+ * Mask setup used for the first (left) column: byte 0 of block_masks is
+ * broadcast to every byte of draw_mask_fb_ptr_left and byte 1 to every byte
+ * of draw_mask_fb_ptr_right.
+ */
+#define setup_sprite_setup_left_draw_mask_fb_ptr()                             \
+  vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
+  vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
+
+/*
+ * Broadcasts bytes 0 and 1 of block_masks into draw_mask_fb_ptr_left / _right
+ * and, interleaved with that, computes
+ * fb_ptr_advance_column = 32 - (height << 11).
+ */
+#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column()              \
+  mov fb_ptr_advance_column, #32;                                              \
+  vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
+                                                                               \
+  sub fb_ptr_advance_column, height, lsl #11;                                  \
+  vdup.u8 draw_mask_fb_ptr_right, block_masks[1]                               \
+
+/*
+ * Mask setup used for the last (right) column: bytes 4 and 5 of block_masks
+ * are broadcast into draw_mask_fb_ptr_left / _right.
+ */
+#define setup_sprite_setup_right_draw_mask_fb_ptr()                            \
+  vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
+  vdup.u8 draw_mask_fb_ptr_right, block_masks[5]                               \
+
+/*
+ * Defines the routine
+ *   setup_sprite_<tm>_single_<multi_height>_<edge_mode>_<edge><x4mode>
+ * which emits exactly one tile column.  block_masks is ORed with a copy of
+ * itself rotated by one 32-bit lane, the draw masks are loaded through
+ * setup_sprite_setup_left_draw_mask_fb_ptr<x4mode>, the column is emitted with
+ * the given edge_mode / edge, and the routine returns by popping r4 - r11 and
+ * pc from the stack.
+ */
+#define setup_sprite_tile_column_width_single(tm, multi_height, edge_mode,     \
+ edge, x4mode)                                                                 \
+ setup_sprite_##tm##_single_##multi_height##_##edge_mode##_##edge##x4mode:     \
   setup_sprite_column_data_##multi_height();                                   \
   vext.32 block_masks_shifted, block_masks, block_masks, #1;                   \
   vorr.u32 block_masks, block_masks, block_masks_shifted;                      \
-  vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
-  vdup.u8 draw_mask_fb_ptr_right, block_masks[1];                              \
+  setup_sprite_setup_left_draw_mask_fb_ptr##x4mode();                          \
                                                                                \
-  setup_sprite_tile_column_height_##multi_height(edge_mode, edge,              \
-   texture_mode);                                                              \
+  setup_sprite_tile_column_height_##multi_height(edge_mode, edge, tm, x4mode); \
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 #define setup_sprite_tiled_advance_column()                                    \
@@ -4791,39 +4772,337 @@ setup_sprite_update_texture_8bpp_cache:
   subeq texture_offset_base, texture_offset_base, #(0x100 + 0xF00)             \
 
 #define setup_sprite_tile_column_width_multi(tm, multi_height, left_mode,      \
- right_mode)                                                                   \
- setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode:        \
+ right_mode, x4mode)                                                           \
+ setup_sprite_##tm##_multi_##multi_height##_##left_mode##_##right_mode##x4mode:\
+  /* Multi-column routine: first column uses left_mode, then */                \
+  /* tile_width - 2 columns in full mode with zeroed draw masks, */            \
+  /* and the final column uses right_mode with the right masks. */             \
+  /* Returns by popping r4 - r11 and pc. */                                    \
   setup_sprite_column_data_##multi_height();                                   \
-  mov fb_ptr_advance_column, #32;                                              \
                                                                                \
-  sub fb_ptr_advance_column, height, lsl #11;                                  \
-  vdup.u8 draw_mask_fb_ptr_left, block_masks[0];                               \
+  setup_sprite_setup_left_draw_mask_fb_ptr_advance_column##x4mode();           \
                                                                                \
-  vdup.u8 draw_mask_fb_ptr_right, block_masks[1];                              \
-  setup_sprite_tile_column_height_##multi_height(left_mode, right, tm);        \
+  setup_sprite_tile_column_height_##multi_height(left_mode, right, tm, x4mode);\
                                                                                \
   subs tile_width, tile_width, #2;                                             \
   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
                                                                                \
-  vmov.u8 draw_masks_fb_ptrs, #0;                                              \
   beq 1f;                                                                      \
                                                                                \
+  /* only reached when there is at least one middle column */                  \
+  vmov.u8 draw_masks_fb_ptrs, #0;                                              \
+  vmov.u8 draw_masks_fb_ptrs2, #0;                                             \
+                                                                               \
  0:                                                                            \
   setup_sprite_tiled_advance_column();                                         \
-  setup_sprite_tile_column_height_##multi_height(full, none, tm);              \
+  setup_sprite_tile_column_height_##multi_height(full, none, tm, x4mode);      \
   add fb_ptr, fb_ptr, fb_ptr_advance_column;                                   \
   subs tile_width, tile_width, #1;                                             \
   bne 0b;                                                                      \
                                                                                \
  1:                                                                            \
-  vdup.u8 draw_mask_fb_ptr_left, block_masks[4];                               \
-  vdup.u8 draw_mask_fb_ptr_right, block_masks[5];                              \
+  setup_sprite_setup_right_draw_mask_fb_ptr##x4mode();                         \
                                                                                \
   setup_sprite_tiled_advance_column();                                         \
-  setup_sprite_tile_column_height_##multi_height(right_mode, left, tm);        \
+  setup_sprite_tile_column_height_##multi_height(right_mode, left, tm, x4mode);\
   ldmia sp!, { r4 - r11, pc }                                                  \
 
 
+/*
+ * Regular (non-4x) versions of the offset / block-mask helpers.  Each has a
+ * _4x-suffixed counterpart defined further down.
+ */
+
+/* Expands to nothing in the regular path. */
+#define setup_sprite_offset_u_adjust()                                         \
+
+/* Keep only the low 8 bits of left_block_mask. */
+#define setup_sprite_get_left_block_mask()                                     \
+  and left_block_mask, left_block_mask, #0xFF                                  \
+
+/* Sets the flags from left_block_mask compared with 0xFF (all 8 bits set). */
+#define setup_sprite_compare_left_block_mask()                                 \
+  cmp left_block_mask, #0xFF                                                   \
+
+/* Replace right_block_mask with its bits 8-15, zero-extended. */
+#define setup_sprite_get_right_block_mask()                                    \
+  uxtb right_block_mask, right_block_mask, ror #8                              \
+
+/* Sets the flags from right_block_mask compared with 0xFF. */
+#define setup_sprite_compare_right_block_mask()                                \
+  cmp right_block_mask, #0xFF                                                  \
+
+
+
+/* 4x stuff: _4x-suffixed counterparts of the sprite setup macros */
+
+/*
+ * fb_ptr2 is a scratch framebuffer pointer that shares its register with
+ * column_data; the _4x tile macros save column_data on the stack before
+ * using it and restore it afterwards.
+ */
+#define fb_ptr2 column_data
+
+/*
+ * fb_ptr -= offset_u * 2 (using the original offset_u), then
+ * offset_u_right = offset_u_right * 2 + 1 and offset_u = offset_u * 2.
+ */
+#define setup_sprite_offset_u_adjust_4x()                                      \
+  sub fb_ptr, fb_ptr, offset_u, lsl #1;                                        \
+  lsl offset_u_right, #1;                                                      \
+  lsl offset_u, #1;                                                            \
+  add offset_u_right, #1                                                       \
+
+/* Sign-extend the low 16 bits of left_block_mask. */
+#define setup_sprite_get_left_block_mask_4x()                                  \
+  sxth left_block_mask, left_block_mask                                        \
+
+/* Sets the flags from left_block_mask compared with all-ones (0xFFFFFFFF). */
+#define setup_sprite_compare_left_block_mask_4x()                              \
+  cmp left_block_mask, #0xFFFFFFFF                                             \
+
+/* Replace right_block_mask with its upper 16 bits, sign-extended. */
+#define setup_sprite_get_right_block_mask_4x()                                 \
+  sxth right_block_mask, right_block_mask, ror #16                             \
+
+/* Sets the flags from right_block_mask compared with all-ones (0xFFFFFFFF). */
+#define setup_sprite_compare_right_block_mask_4x()                             \
+  cmp right_block_mask, #0xFFFFFFFF                                            \
+
+
+/*
+ * Doubles every 16-bit element of texels_ (a 64-bit register at the call
+ * sites in this file): the value is copied into both halves of texels_wide
+ * and the halves are zipped, leaving e0,e0,e1,e1 in texels_wide_low and
+ * e2,e2,e3,e3 in texels_wide_high.
+ */
+#define widen_texels_16bpp(texels_)                                            \
+  vmov texels_wide_low, texels_;                                               \
+  vmov texels_wide_high, texels_;                                              \
+  vzip.16 texels_wide_low, texels_wide_high                                    \
+
+/*
+ * Byte-wise version of the above: every byte of texels_ appears twice, bytes
+ * 0-3 doubled in texels_wide_low and bytes 4-7 doubled in texels_wide_high.
+ */
+#define widen_texels_8bpp(texels_)                                             \
+  vmov texels_wide_low, texels_;                                               \
+  vmov texels_wide_high, texels_;                                              \
+  vzip.8 texels_wide_low, texels_wide_high                                     \
+
+/*
+ * Writes one render block: texels_ is stored at block_ (128-bit aligned
+ * access), then fb_ptr_ is inserted into the upper 32-bit lane of
+ * draw_mask_fb_ptr_ and that 64-bit register is stored at block_ + 40.
+ * block_ is advanced by 64 bytes in total.  Note that draw_mask_fb_ptr_ is
+ * modified (its lane 1 is overwritten).
+ */
+#define write_block_16bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)         \
+  vst1.u32 { texels_ }, [ block_, :128 ];                                      \
+  add block_, block_, #40;                                                     \
+                                                                               \
+  vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
+  vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ];                             \
+  add block_, block_, #24                                                      \
+
+/*
+ * 8bpp variant: texels_ is stored with 64-bit alignment, the mask / fb_ptr_
+ * register is stored 24 bytes further on, and block_ again advances by 64
+ * bytes in total (24 + 40).
+ */
+/* assumes 16-byte offset already added to block_ */
+#define write_block_8bpp(texels_, block_, draw_mask_fb_ptr_, fb_ptr_)          \
+  vst1.u32 { texels_ }, [ block_, :64 ];                                       \
+  add block_, block_, #24;                                                     \
+                                                                               \
+  vmov.u32 draw_mask_fb_ptr_[1], fb_ptr_;                                      \
+  vst1.u32 { draw_mask_fb_ptr_ }, [ block_, :64 ];                             \
+  add block_, block_, #40                                                      \
+
+/*
+ * Emits four render blocks from the texels in texels_low / texels_high:
+ *   widened texels_low,  mask a, at fb_ptr
+ *   widened texels_low,  mask a, at fb_ptr + 1024*2
+ *   widened texels_high, mask b, at fb_ptr + 8*2
+ *   widened texels_high, mask b, at fb_ptr + 8*2 + 1024*2
+ * fb_ptr itself is left unchanged; fb_ptr_tmp is clobbered as scratch.
+ * NOTE(review): the 1024*2 byte step is presumably one framebuffer row, so
+ * each source texel ends up as a 2x2 group - confirm against the buffer stride.
+ */
+#define do_texture_block_16bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,             \
+ draw_mask_fb_ptr_b_)                                                          \
+  widen_texels_16bpp(texels_low);                                              \
+  add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
+                                                                               \
+  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr);          \
+                                                                               \
+  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);      \
+  widen_texels_16bpp(texels_high);                                             \
+                                                                               \
+  add fb_ptr_tmp, fb_ptr, #8*2;                                                \
+  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);      \
+                                                                               \
+  add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
+  write_block_16bpp(texels_wide, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)       \
+
+/*
+ * 8bpp variant: texels is widened once; texels_wide_low is written twice with
+ * mask a (at fb_ptr and fb_ptr + 1024*2) and texels_wide_high twice with
+ * mask b (at fb_ptr + 8*2 and fb_ptr + 8*2 + 1024*2).  fb_ptr_tmp is
+ * clobbered as scratch.
+ */
+#define do_texture_block_8bpp_4x(fb_ptr_tmp, draw_mask_fb_ptr_a_,              \
+ draw_mask_fb_ptr_b_)                                                          \
+  widen_texels_8bpp(texels);                                                   \
+  add fb_ptr_tmp, fb_ptr, #1024*2;                                             \
+                                                                               \
+  write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr);       \
+  write_block_8bpp(texels_wide_low, block, draw_mask_fb_ptr_a_, fb_ptr_tmp);   \
+                                                                               \
+  add fb_ptr_tmp, fb_ptr, #8*2;                                                \
+  write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp);  \
+                                                                               \
+  add fb_ptr_tmp, fb_ptr_tmp, #1024*2;                                         \
+  write_block_8bpp(texels_wide_high, block, draw_mask_fb_ptr_b_, fb_ptr_tmp)   \
+
+
+/*
+ * Loads 32 bytes from the pointer stored at psx_gpu_clut_ptr_offset into
+ * clut_a / clut_b and de-interleaves them with vuzp.u8, so that clut_a holds
+ * the even-indexed bytes and clut_b the odd-indexed bytes.
+ */
+#define setup_sprite_tiled_initialize_4bpp_4x()                                \
+  ldr clut_ptr, [ psx_gpu, #psx_gpu_clut_ptr_offset ];                         \
+  vld1.u32 { clut_a, clut_b }, [ clut_ptr, :128 ];                             \
+                                                                               \
+  vuzp.u8 clut_a, clut_b                                                       \
+
+/* Expands to nothing: no initialization in the 8bpp 4x path. */
+#define setup_sprite_tiled_initialize_8bpp_4x()                                \
+
+
+/*
+ * The block-count macros expand to a shifted-register operand, not to an
+ * instruction; single_4x is sub_tile_height * 4.
+ */
+#define setup_sprite_block_count_single_4x()                                   \
+  sub_tile_height, lsl #2                                                      \
+
+/* double_4x is sub_tile_height * 8. */
+#define setup_sprite_block_count_double_4x()                                   \
+  sub_tile_height, lsl #(1+2)                                                  \
+
+/*
+ * 4x variant of the full 4bpp tile.  After accounting for the blocks via
+ * setup_sprite_tile_add_blocks(double_4x), it loops sub_tile_height times.
+ * Each iteration loads two 8-byte texel groups (at texture_offset and
+ * texture_offset + 8, both wrapped with texture_mask and based at
+ * texture_page_ptr), looks each up in the CLUT low / high byte tables with
+ * vtbl, re-interleaves the bytes with vzip.8 and passes the result to
+ * do_texture_block_16bpp_4x: the first group with the left mask pair at
+ * fb_ptr, the second with the right mask pair at fb_ptr + 16*2.
+ * Per iteration fb_ptr advances by 2048*2 bytes in total and texture_offset
+ * by 0x10; after the loop texture_offset gets a further 0xF00 and num_blocks
+ * is stored to psx_gpu.  column_data is saved on the stack for the duration
+ * because fb_ptr2, the scratch pointer, is an alias for it.  The edge
+ * argument is not used.
+ */
+#define setup_sprite_tile_full_4bpp_4x(edge)                                   \
+  setup_sprite_tile_add_blocks(double_4x);                                     \
+  str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
+                                                                               \
+ 4:                                                                            \
+  and texture_block_ptr, texture_offset, texture_mask;                         \
+  pld [ fb_ptr ];                                                              \
+                                                                               \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+  vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
+                                                                               \
+  add texture_block_ptr, texture_offset, #8;                                   \
+  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
+                                                                               \
+  and texture_block_ptr, texture_block_ptr, texture_mask;                      \
+  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
+                                                                               \
+  vzip.8 texels_low, texels_high;                                              \
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                  \
+   draw_mask_fb_ptr_left_b);                                                   \
+                                                                               \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+  pld [ fb_ptr, #2048 ];                                                       \
+                                                                               \
+  vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
+  add fb_ptr, fb_ptr, #16*2;                                                   \
+                                                                               \
+  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
+  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
+                                                                               \
+  vzip.8 texels_low, texels_high;                                              \
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                 \
+   draw_mask_fb_ptr_right_b);                                                  \
+                                                                               \
+  add texture_offset, texture_offset, #0x10;                                   \
+  add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
+                                                                               \
+  subs sub_tile_height, sub_tile_height, #1;                                   \
+  bne 4b;                                                                      \
+                                                                               \
+  ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
+  add texture_offset, texture_offset, #0xF00;                                  \
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
+
+
+// setup_sprite_tile_half_4bpp_4x(edge): emit one half-width tile column for
+// the 4x path. `edge` (left/right) is token-pasted to select the
+// draw_mask_fb_ptr_<edge>_a/_b mask registers.
+//
+// Before the loop: setup_sprite_tile_add_blocks(single_4x) is expanded and
+// column_data is saved on the stack (8-byte pre-decrement); the existing
+// "fb_ptr2" note suggests column_data shares its register with fb_ptr2
+// (TODO confirm against the alias table).
+//
+// Each pass of local label 4 (sub_tile_height passes):
+//   - loads 8 bytes from texture_page_ptr + (texture_offset & texture_mask)
+//   - looks them up through the clut_low / clut_high tables (vtbl.8) and
+//     interleaves the two result vectors (vzip.8)
+//   - expands do_texture_block_16bpp_4x with fb_ptr2 and the edge masks
+//   - advances texture_offset by 0x10 and fb_ptr by 2048 * 2 bytes
+//
+// After the loop: column_data is restored, texture_offset gets a further
+// 0xF00, and num_blocks is written back to psx_gpu as a halfword.
+//
+// NOTE(review): the second "add texture_block_ptr, texture_page_ptr, ..."
+// after the vld1 is not consumed by any visible line before the next pass
+// recomputes texture_block_ptr; it looks redundant unless
+// do_texture_block_16bpp_4x reads texture_block_ptr - confirm before removing.
+#define setup_sprite_tile_half_4bpp_4x(edge)                                   \
+  setup_sprite_tile_add_blocks(single_4x);                                     \
+  str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
+                                                                               \
+ 4:                                                                            \
+  and texture_block_ptr, texture_offset, texture_mask;                         \
+  pld [ fb_ptr ];                                                              \
+                                                                               \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+  vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
+                                                                               \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+  vtbl.8 texels_low, { clut_low_a, clut_low_b }, texels;                       \
+                                                                               \
+  vtbl.8 texels_high, { clut_high_a, clut_high_b }, texels;                    \
+  add texture_offset, texture_offset, #0x10;                                   \
+                                                                               \
+  vzip.8 texels_low, texels_high;                                              \
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,              \
+   draw_mask_fb_ptr_##edge##_b);                                               \
+                                                                               \
+  pld [ fb_ptr, #2048 ];                                                       \
+  add fb_ptr, fb_ptr, #2048 * 2;                                               \
+                                                                               \
+  subs sub_tile_height, sub_tile_height, #1;                                   \
+  bne 4b;                                                                      \
+                                                                               \
+  ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
+  add texture_offset, texture_offset, #0xF00;                                  \
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
+
+
+// setup_sprite_tile_full_8bpp_4x(edge): emit one full-width tile column for
+// the 4x path. The `edge` argument is not referenced in the body; the left
+// and right mask register pairs are used explicitly.
+//
+// Before the loop: setup_sprite_tile_add_blocks(double_4x) is expanded,
+// `block` is biased by +16 for the duration of the loop (undone after it),
+// and column_data is saved on the stack (8-byte pre-decrement).
+//
+// Each pass of local label 4 (sub_tile_height passes):
+//   - loads 8 bytes at (texture_offset & texture_mask) and expands
+//     do_texture_block_8bpp_4x with the left masks
+//   - moves fb_ptr forward by 16 * 2 bytes
+//   - loads 8 bytes at ((texture_offset + 8) & texture_mask) and expands
+//     do_texture_block_8bpp_4x with the right masks
+//   - advances texture_offset by 0x10 and fb_ptr by (2048 - 16) * 2 bytes,
+//     so fb_ptr moves 2048 * 2 bytes per pass in total
+//
+// After the loop: the +16 bias on `block` is removed, column_data is
+// restored, texture_offset gets a further 0xF00, and num_blocks is written
+// back to psx_gpu as a halfword.
+#define setup_sprite_tile_full_8bpp_4x(edge)                                   \
+  setup_sprite_tile_add_blocks(double_4x);                                     \
+  add block, block, #16;                                                       \
+  str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
+                                                                               \
+ 4:                                                                            \
+  and texture_block_ptr, texture_offset, texture_mask;                         \
+  pld [ fb_ptr ];                                                              \
+                                                                               \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+  vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
+                                                                               \
+  add texture_block_ptr, texture_offset, #8;                                   \
+  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_left_a,                   \
+   draw_mask_fb_ptr_left_b);                                                   \
+                                                                               \
+  pld [ fb_ptr, #2048 ];                                                       \
+  and texture_block_ptr, texture_block_ptr, texture_mask;                      \
+                                                                               \
+  add fb_ptr, fb_ptr, #16*2;                                                   \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+                                                                               \
+  vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
+                                                                               \
+  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_right_a,                  \
+   draw_mask_fb_ptr_right_b);                                                  \
+                                                                               \
+  add texture_offset, texture_offset, #0x10;                                   \
+  add fb_ptr, fb_ptr, #(2048 - 16) * 2;                                        \
+                                                                               \
+  subs sub_tile_height, sub_tile_height, #1;                                   \
+  bne 4b;                                                                      \
+                                                                               \
+  sub block, block, #16;                                                       \
+  ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
+  add texture_offset, texture_offset, #0xF00;                                  \
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
+
+  
+// setup_sprite_tile_half_8bpp_4x(edge): emit one half-width tile column for
+// the 4x path. `edge` (left/right) is token-pasted to select the
+// draw_mask_fb_ptr_<edge>_a/_b mask registers.
+//
+// Before the loop: setup_sprite_tile_add_blocks(single_4x) is expanded,
+// `block` is biased by +16 for the duration of the loop (undone after it),
+// and column_data is saved on the stack (8-byte pre-decrement).
+//
+// Each pass of local label 4 (sub_tile_height passes) loads 8 bytes from
+// texture_page_ptr + (texture_offset & texture_mask), expands
+// do_texture_block_8bpp_4x with fb_ptr2 and the edge masks, then advances
+// texture_offset by 0x10 and fb_ptr by 2048 * 2 bytes.
+//
+// After the loop: the +16 bias on `block` is removed, column_data is
+// restored, texture_offset gets a further 0xF00, and num_blocks is written
+// back to psx_gpu as a halfword.
+#define setup_sprite_tile_half_8bpp_4x(edge)                                   \
+  setup_sprite_tile_add_blocks(single_4x);                                     \
+  add block, block, #16;                                                       \
+  str column_data, [sp, #-8]!; /* fb_ptr2 */                                   \
+                                                                               \
+ 4:                                                                            \
+  and texture_block_ptr, texture_offset, texture_mask;                         \
+  pld [ fb_ptr ];                                                              \
+                                                                               \
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr;                  \
+  vld1.u32 { texels }, [ texture_block_ptr, :64 ];                             \
+                                                                               \
+  pld [ fb_ptr, #2048 ];                                                       \
+  do_texture_block_8bpp_4x(fb_ptr2, draw_mask_fb_ptr_##edge##_a,               \
+   draw_mask_fb_ptr_##edge##_b);                                               \
+                                                                               \
+  add texture_offset, texture_offset, #0x10;                                   \
+  add fb_ptr, fb_ptr, #2048 * 2;                                               \
+                                                                               \
+  subs sub_tile_height, sub_tile_height, #1;                                   \
+  bne 4b;                                                                      \
+                                                                               \
+  sub block, block, #16;                                                       \
+  ldr column_data, [sp], #8; /* fb_ptr2 */                                     \
+  add texture_offset, texture_offset, #0xF00;                                  \
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]                     \
+
+// Per-column pre/post adjustment helpers for the 4x sprite tile code.
+//
+//   pre_adjust, half/right : texture_offset = texture_offset_base + 8 and
+//                            fb_ptr is moved forward by 16 * 2 bytes
+//   pre_adjust, half/left  : texture_offset = texture_offset_base
+//   pre_adjust, full       : texture_offset = texture_offset_base
+//                            (the edge argument is ignored)
+//   post_adjust, half/right: undoes the 16 * 2 byte fb_ptr move
+//   post_adjust, half/left : expands to nothing
+//   post_adjust, full      : expands to nothing
+//
+// The *_half_4x(edge) wrappers only dispatch to the _left/_right variant by
+// token pasting.
+#define setup_sprite_tile_column_edge_pre_adjust_half_right_4x()               \
+  add texture_offset, texture_offset_base, #8;                                 \
+  add fb_ptr, fb_ptr, #16 * 2                                                  \
+
+#define setup_sprite_tile_column_edge_pre_adjust_half_left_4x()                \
+  mov texture_offset, texture_offset_base                                      \
+
+#define setup_sprite_tile_column_edge_pre_adjust_half_4x(edge)                 \
+  setup_sprite_tile_column_edge_pre_adjust_half_##edge##_4x()                  \
+
+#define setup_sprite_tile_column_edge_pre_adjust_full_4x(edge)                 \
+  mov texture_offset, texture_offset_base                                      \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_right_4x()              \
+  sub fb_ptr, fb_ptr, #16 * 2                                                  \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_left_4x()               \
+
+#define setup_sprite_tile_column_edge_post_adjust_half_4x(edge)                \
+  setup_sprite_tile_column_edge_post_adjust_half_##edge##_4x()                 \
+
+#define setup_sprite_tile_column_edge_post_adjust_full_4x(edge)                \
+
+
+// Draw-mask setup for the 4x sprite tile code. Every variant writes all four
+// mask registers (draw_mask_fb_ptr_left_a/_b and draw_mask_fb_ptr_right_a/_b)
+// by broadcasting one byte of block_masks across each register:
+//   "left"  variants use block_masks bytes 0..3
+//   "right" variant  uses block_masks bytes 4..7
+// The *_advance_column variant additionally sets
+//   fb_ptr_advance_column = 32 * 2 - (height << 12)
+// ("lsl #11 + 1" is a shift by 12).
+#define setup_sprite_setup_left_draw_mask_fb_ptr_4x()                          \
+  vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
+  vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
+  vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
+  vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
+
+#define setup_sprite_setup_left_draw_mask_fb_ptr_advance_column_4x()           \
+  mov fb_ptr_advance_column, #32 * 2;                                          \
+  vdup.u8 draw_mask_fb_ptr_left_a, block_masks[0];                             \
+  vdup.u8 draw_mask_fb_ptr_left_b, block_masks[1];                             \
+  sub fb_ptr_advance_column, height, lsl #11 + 1;                              \
+  vdup.u8 draw_mask_fb_ptr_right_a, block_masks[2];                            \
+  vdup.u8 draw_mask_fb_ptr_right_b, block_masks[3]                             \
+
+#define setup_sprite_setup_right_draw_mask_fb_ptr_4x()                         \
+  vdup.u8 draw_mask_fb_ptr_left_a, block_masks[4];                             \
+  vdup.u8 draw_mask_fb_ptr_left_b, block_masks[5];                             \
+  vdup.u8 draw_mask_fb_ptr_right_a, block_masks[6];                            \
+  vdup.u8 draw_mask_fb_ptr_right_b, block_masks[7]                             \
+
+
 // r0: psx_gpu
 // r1: x
 // r2: y
@@ -4833,34 +5112,48 @@ setup_sprite_update_texture_8bpp_cache:
 // [ sp + 8 ]: height
 // [ sp + 12 ]: color (unused)
 
-#define setup_sprite_tiled_builder(texture_mode)                               \
-                                                                               \
-setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full);       \
-setup_sprite_tile_column_width_single(texture_mode, multi,  full, none);       \
-setup_sprite_tile_column_width_multi(texture_mode,  single, full, full);       \
-setup_sprite_tile_column_width_single(texture_mode, single, full, none);       \
-setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full);       \
-setup_sprite_tile_column_width_single(texture_mode, multi,  half, right);      \
-setup_sprite_tile_column_width_multi(texture_mode,  single, half, full);       \
-setup_sprite_tile_column_width_single(texture_mode, single, half, right);      \
-setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half);       \
-setup_sprite_tile_column_width_single(texture_mode, multi,  half, left);       \
-setup_sprite_tile_column_width_multi(texture_mode,  single, full, half);       \
-setup_sprite_tile_column_width_single(texture_mode, single, half, left);       \
-setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half);       \
-setup_sprite_tile_column_width_multi(texture_mode,  single, half, half);       \
+#define setup_sprite_tiled_builder(texture_mode, x4mode)                       \
+                                                                               \
+setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, full,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_single(texture_mode, multi,  full, none,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  single, full, full,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_single(texture_mode, single, full, none,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, full,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_single(texture_mode, multi,  half, right,       \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  single, half, full,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_single(texture_mode, single, half, right,       \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  multi,  full, half,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_single(texture_mode, multi,  half, left,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  single, full, half,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_single(texture_mode, single, half, left,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  multi,  half, half,        \
+  x4mode);                                                                     \
+setup_sprite_tile_column_width_multi(texture_mode,  single, half, half,        \
+  x4mode);                                                                     \
                                                                                \
 .align 4;                                                                      \
                                                                                \
-function(setup_sprite_##texture_mode)                                          \
+function(setup_sprite_##texture_mode##x4mode)                                  \
   stmdb sp!, { r4 - r11, r14 };                                                \
-  setup_sprite_tiled_initialize_##texture_mode();                              \
+  setup_sprite_tiled_initialize_##texture_mode##x4mode();                      \
                                                                                \
   ldr v, [ sp, #36 ];                                                          \
   and offset_u, u, #0xF;                                                       \
                                                                                \
   ldr width, [ sp, #40 ];                                                      \
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ];                           \
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ];                       \
                                                                                \
   ldr height, [ sp, #44 ];                                                     \
   add fb_ptr, fb_ptr, y, lsl #11;                                              \
@@ -4883,11 +5176,13 @@ function(setup_sprite_##texture_mode)                                          \
                                                                                \
   /* texture_offset_base = VH-UH-UL-00                                       */\
   bfi texture_offset_base, u, #4, #8;                                          \
-  movw right_block_mask, #0xFFFE;                                              \
+  mov right_block_mask, #0xFFFFFFFE;                                           \
+                                                                               \
+  setup_sprite_offset_u_adjust##x4mode();                                      \
                                                                                \
   /* texture_offset_base = VH-UH-VL-00                                       */\
   bfi texture_offset_base, v, #4, #4;                                          \
-  movw left_block_mask, #0xFFFF;                                               \
+  mov left_block_mask, #0xFFFFFFFF;                                            \
                                                                                \
   mov tile_height, height_rounded, lsr #4;                                     \
   mvn left_block_mask, left_block_mask, lsl offset_u;                          \
@@ -4907,16 +5202,16 @@ function(setup_sprite_##texture_mode)                                          \
                                                                                \
   /* texture_mask = HH-WH-HL-WL                                              */\
   bfi texture_mask, texture_mask_rev, #8, #4;                                  \
-  and left_block_mask, left_block_mask, #0xFF;                                 \
+  setup_sprite_get_left_block_mask##x4mode();                                  \
                                                                                \
   mov control_mask, #0;                                                        \
-  cmp left_block_mask, #0xFF;                                                  \
+  setup_sprite_compare_left_block_mask##x4mode();                              \
                                                                                \
-  uxtb right_block_mask, right_block_mask, ror #8;                             \
+  setup_sprite_get_right_block_mask##x4mode();                                 \
   orreq control_mask, control_mask, #0x4;                                      \
                                                                                \
   ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ];                    \
-  cmp right_block_mask, #0xFF;                                                 \
+  setup_sprite_compare_right_block_mask##x4mode();                             \
                                                                                \
   orreq control_mask, control_mask, #0x8;                                      \
   cmp tile_width, #1;                                                          \
@@ -4931,25 +5226,31 @@ function(setup_sprite_##texture_mode)                                          \
   ldr pc, [ pc, control_mask, lsl #2 ];                                        \
   nop;                                                                         \
                                                                                \
- .word setup_sprite_##texture_mode##_multi_multi_full_full;                    \
- .word setup_sprite_##texture_mode##_single_multi_full_none;                   \
- .word setup_sprite_##texture_mode##_multi_single_full_full;                   \
- .word setup_sprite_##texture_mode##_single_single_full_none;                  \
- .word setup_sprite_##texture_mode##_multi_multi_half_full;                    \
- .word setup_sprite_##texture_mode##_single_multi_half_right;                  \
- .word setup_sprite_##texture_mode##_multi_single_half_full;                   \
- .word setup_sprite_##texture_mode##_single_single_half_right;                 \
- .word setup_sprite_##texture_mode##_multi_multi_full_half;                    \
- .word setup_sprite_##texture_mode##_single_multi_half_left;                   \
- .word setup_sprite_##texture_mode##_multi_single_full_half;                   \
- .word setup_sprite_##texture_mode##_single_single_half_left;                  \
- .word setup_sprite_##texture_mode##_multi_multi_half_half;                    \
+ .word setup_sprite_##texture_mode##_multi_multi_full_full##x4mode;            \
+ .word setup_sprite_##texture_mode##_single_multi_full_none##x4mode;           \
+ .word setup_sprite_##texture_mode##_multi_single_full_full##x4mode;           \
+ .word setup_sprite_##texture_mode##_single_single_full_none##x4mode;          \
+ .word setup_sprite_##texture_mode##_multi_multi_half_full##x4mode;            \
+ .word setup_sprite_##texture_mode##_single_multi_half_right##x4mode;          \
+ .word setup_sprite_##texture_mode##_multi_single_half_full##x4mode;           \
+ .word setup_sprite_##texture_mode##_single_single_half_right##x4mode;         \
+ .word setup_sprite_##texture_mode##_multi_multi_full_half##x4mode;            \
+ .word setup_sprite_##texture_mode##_single_multi_half_left##x4mode;           \
+ .word setup_sprite_##texture_mode##_multi_single_full_half##x4mode;           \
+ .word setup_sprite_##texture_mode##_single_single_half_left##x4mode;          \
+ .word setup_sprite_##texture_mode##_multi_multi_half_half##x4mode;            \
  .word 0x00000000;                                                             \
- .word setup_sprite_##texture_mode##_multi_single_half_half                    \
+ .word setup_sprite_##texture_mode##_multi_single_half_half##x4mode;           \
 
 
-setup_sprite_tiled_builder(4bpp);
-setup_sprite_tiled_builder(8bpp);
+setup_sprite_tiled_builder(4bpp,);
+setup_sprite_tiled_builder(8bpp,);
+
+#undef draw_mask_fb_ptr_left
+#undef draw_mask_fb_ptr_right
+
+setup_sprite_tiled_builder(4bpp, _4x);
+setup_sprite_tiled_builder(8bpp, _4x);
 
 
 #undef block_ptr
@@ -5038,6 +5339,12 @@ function(texture_sprite_blocks_8bpp)
 #undef texture_mask
 #undef num_blocks
 #undef texture_offset
+#undef texels_low
+#undef texels_high
+#undef texels_wide_low
+#undef texels_wide_high
+#undef texels_wide
+#undef fb_ptr2
 
 #define psx_gpu                                           r0
 #define x                                                 r1
@@ -5049,6 +5356,7 @@ function(texture_sprite_blocks_8bpp)
 #define left_offset                                       r8
 #define width_rounded                                     r9
 #define right_width                                       r10
+
 #define block_width                                       r11
 
 #define texture_offset_base                               r1
@@ -5059,6 +5367,7 @@ function(texture_sprite_blocks_8bpp)
 #define fb_ptr                                            r7
 #define texture_offset                                    r8
 #define blocks_remaining                                  r9
+#define fb_ptr2                                           r10
 #define fb_ptr_pitch                                      r12
 #define texture_block_ptr                                 r14
 
@@ -5077,29 +5386,23 @@ function(texture_sprite_blocks_8bpp)
 #define draw_mask_fb_ptr                                  d2
 #define texels                                            q2
 
+#define draw_mask_fb_ptr_a                                d2
+#define draw_mask_fb_ptr_b                                d3
+#define texels_low                                        d4
+#define texels_high                                       d5
+#define texels_wide_low                                   d6
+#define texels_wide_high                                  d7
+#define texels_wide                                       q3
 
-setup_sprites_16bpp_flush_single:
-  vpush { d0 - d2 }
-
-  stmdb sp!, { r0 - r3, r12, r14 }
-  bl flush_render_block_buffer
-  ldmia sp!, { r0 - r3, r12, r14 }
-
-  vpop { d0 - d2 }
-
-  add block, psx_gpu, #psx_gpu_blocks_offset
-  mov num_blocks, #1
-
-  bx lr
 
-setup_sprites_16bpp_flush_row:
-  vpush { d0 - d2 }
+setup_sprites_16bpp_flush:
+  vpush { d0 - d3 }
 
   stmdb sp!, { r0 - r3, r12, r14 }
   bl flush_render_block_buffer
   ldmia sp!, { r0 - r3, r12, r14 }
 
-  vpop { d0 - d2 }
+  vpop { d0 - d3 }
 
   add block, psx_gpu, #psx_gpu_blocks_offset
   mov num_blocks, block_width
@@ -5108,7 +5411,7 @@ setup_sprites_16bpp_flush_row:
 
 function(setup_sprite_16bpp)
   stmdb sp!, { r4 - r11, r14 }
-  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_ptr_offset ]
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
 
   ldr v, [ sp, #36 ]
   add fb_ptr, fb_ptr, y, lsl #11
@@ -5164,7 +5467,7 @@ function(setup_sprite_16bpp)
  1:
   add num_blocks, num_blocks, #1
   cmp num_blocks, #MAX_BLOCKS
-  blgt setup_sprites_16bpp_flush_single
+  blgt setup_sprites_16bpp_flush
 
   and texture_block_ptr, texture_offset_base, texture_mask
   subs height, height, #1
@@ -5193,7 +5496,7 @@ function(setup_sprite_16bpp)
   mov texture_offset, texture_offset_base
 
   cmp num_blocks, #MAX_BLOCKS
-  blgt setup_sprites_16bpp_flush_row
+  blgt setup_sprites_16bpp_flush
 
   add texture_offset_base, texture_offset_base, #2048
   and texture_block_ptr, texture_offset, texture_mask
@@ -5264,6 +5567,290 @@ function(setup_sprite_16bpp)
   ldmia sp!, { r4 - r11, pc }
 
 
+// 4x version
+// FIXME: duplicate code with normal version :(
+#undef draw_mask_fb_ptr
+
+// setup_sprite_16bpp_4x: queue render blocks for a 16bpp textured sprite on
+// the 4x path.
+//
+// psx_gpu, x, y and u are taken from registers (aliases defined earlier in
+// the file); v, width and height are read from [sp + 36], [sp + 40] and
+// [sp + 44] after the 9-register push below.
+//
+// Setup, as computed below:
+//   fb_ptr              = vram_out_ptr + (y << 11) + (x << 1), then moved
+//                         back by 4 bytes per texel of (u & 7)
+//   texture_offset_base = (u * 2 + (v << 11)) with the low 4 bits cleared
+//   texture_mask        = texture_mask_width * 2 + (texture_mask_height << 11)
+//   block_width         = ((width + 7 + (u & 7)) >> 3) * 4, i.e. num_blocks
+//                         grows by 4 for every 8-texel group in a row
+//   left_mask_bits      = ~(0xFFFF << (2 * (u & 7)))
+//   right_mask_bits     = 0xFFFC << (2 * (width_rounded & 7))
+//   fb_ptr_pitch        = (2048 + 16) * 2 - 32 * (number of 8-texel groups)
+//
+// Rows made of a single 8-texel group take the first loop (label 1) with the
+// left and right masks OR-ed together; wider rows take the loop at label 0
+// (left-masked group, unmasked middle groups, right-masked group).
+// Whenever num_blocks would exceed MAX_BLOCKS, setup_sprites_16bpp_flush is
+// called first. Each row ends by writing num_blocks back to psx_gpu.
+function(setup_sprite_16bpp_4x)
+  stmdb sp!, { r4 - r11, r14 }
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
+
+  ldr v, [ sp, #36 ]
+  add fb_ptr, fb_ptr, y, lsl #11
+
+  ldr width, [ sp, #40 ]
+  add fb_ptr, fb_ptr, x, lsl #1
+
+  ldr height, [ sp, #44 ]
+  and left_offset, u, #0x7
+
+  add texture_offset_base, u, u
+  add width_rounded, width, #7
+
+  add texture_offset_base, v, lsl #11
+  movw left_mask_bits, #0xFFFF
+  
+  ldrb texture_mask_width, [ psx_gpu, #psx_gpu_texture_mask_width_offset ]
+  add width_rounded, width_rounded, left_offset
+
+  lsl left_offset, #1
+
+  ldrb texture_mask_height, [ psx_gpu, #psx_gpu_texture_mask_height_offset ]
+  sub fb_ptr, fb_ptr, left_offset, lsl #1
+
+  add texture_mask, texture_mask_width, texture_mask_width
+  movw right_mask_bits, #0xFFFC
+
+  and right_width, width_rounded, #0x7
+  mvn left_mask_bits, left_mask_bits, lsl left_offset
+
+  lsl right_width, #1
+
+  add texture_mask, texture_mask_height, lsl #11
+  mov block_width, width_rounded, lsr #3
+
+  mov right_mask_bits, right_mask_bits, lsl right_width
+  movw fb_ptr_pitch, #(2048 + 16) * 2
+
+  sub fb_ptr_pitch, fb_ptr_pitch, block_width, lsl #4+1
+  // low word of block_masks = left_mask_bits, high word = right_mask_bits
+  vmov block_masks, left_mask_bits, right_mask_bits
+
+  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  add block, psx_gpu, #psx_gpu_blocks_offset
+
+  bic texture_offset_base, texture_offset_base, #0xF
+  cmp block_width, #1
+
+  ldr texture_page_ptr, [ psx_gpu, #psx_gpu_texture_page_ptr_offset ]
+  add block, block, num_blocks, lsl #6
+
+  // lsl without the S suffix leaves the flags alone, so the bne below still
+  // tests the "cmp block_width, #1" result from before the scaling
+  lsl block_width, #2
+  bne 0f
+
+  // single 8-texel group per row: merge left and right masks into one
+  vext.32 block_masks_shifted, block_masks, block_masks, #1
+  vorr.u32 block_masks, block_masks, block_masks_shifted
+  vdup.u8 draw_mask_fb_ptr_a, block_masks[0]
+  vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
+
+ 1:
+  add num_blocks, num_blocks, block_width
+  cmp num_blocks, #MAX_BLOCKS
+  blgt setup_sprites_16bpp_flush
+
+  and texture_block_ptr, texture_offset_base, texture_mask
+  // flags set here are consumed by the "bne 1b" at the end of the row
+  subs height, height, #1
+
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr
+  vld1.u32 { texels }, [ texture_block_ptr, :128 ]
+
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
+
+  add texture_offset_base, texture_offset_base, #2048
+  add fb_ptr, fb_ptr, #2048*2
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  bne 1b
+
+  ldmia sp!, { r4 - r11, pc }
+
+  // multi-group rows: one pass of label 0 per row
+ 0:
+  add num_blocks, num_blocks, block_width
+  mov texture_offset, texture_offset_base
+
+  vdup.u8 draw_mask_fb_ptr_a, block_masks[0] // left_mask_bits
+  vdup.u8 draw_mask_fb_ptr_b, block_masks[1]
+
+  cmp num_blocks, #MAX_BLOCKS
+  blgt setup_sprites_16bpp_flush
+
+  add texture_offset_base, texture_offset_base, #2048
+  and texture_block_ptr, texture_offset, texture_mask
+
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr
+  vld1.u32 { texels }, [ texture_block_ptr, :128 ]
+
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
+
+  // blocks left for the middle groups: total minus the first and last group
+  subs blocks_remaining, block_width, #2*4
+  add texture_offset, texture_offset, #16
+
+  // middle groups are emitted with an all-zero mask
+  vmov.u8 draw_mask_fb_ptr_a, #0
+  vmov.u8 draw_mask_fb_ptr_b, #0
+
+  add fb_ptr, fb_ptr, #16*2
+  beq 2f
+
+ 1:
+  and texture_block_ptr, texture_offset, texture_mask
+  subs blocks_remaining, blocks_remaining, #4
+
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr
+  vld1.u32 { texels }, [ texture_block_ptr, :128 ]
+
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
+  add texture_offset, texture_offset, #16
+
+  add fb_ptr, fb_ptr, #16*2
+  bgt 1b
+
+ 2:
+  vdup.u8 draw_mask_fb_ptr_a, block_masks[4] // right_mask_bits
+  vdup.u8 draw_mask_fb_ptr_b, block_masks[5]
+
+  and texture_block_ptr, texture_offset, texture_mask
+  add texture_block_ptr, texture_page_ptr, texture_block_ptr
+
+  vld1.u32 { texels }, [ texture_block_ptr, :128 ]
+
+  do_texture_block_16bpp_4x(fb_ptr2, draw_mask_fb_ptr_a, draw_mask_fb_ptr_b)
+  subs height, height, #1
+
+  // fb_ptr_pitch plus the per-group 16*2 steps above nets 2048*2 bytes per row
+  add fb_ptr, fb_ptr, fb_ptr_pitch
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+
+  bne 0b
+
+  ldmia sp!, { r4 - r11, pc }
+
+
+// Register aliases for setup_sprite_untextured.
+// Several names below share one register, so a value must be fully consumed
+// before its partner is written:
+//   r1: x / color_r          r2: y / color_g
+//   r5: right_width / block  r6: right_mask_bits / blocks_remaining
+//   r8: color / color_b      q2: test_mask / draw_mask
+#undef width
+#undef right_width
+#undef right_mask_bits
+#undef color
+#undef height
+#undef blocks_remaining
+#undef colors
+#undef right_mask
+#undef test_mask
+#undef draw_mask
+
+#define psx_gpu                                           r0
+#define x                                                 r1
+#define y                                                 r2
+#define width                                             r3
+#define right_width                                       r5
+#define right_mask_bits                                   r6
+#define fb_ptr                                            r7
+#define color                                             r8
+#define height                                            r9
+#define fb_ptr_pitch                                      r12
+
+// referenced by setup_sprites_16bpp_flush
+#define num_blocks                                        r4
+#define block                                             r5
+#define block_width                                       r11
+
+#define color_r                                           r1
+#define color_g                                           r2
+#define color_b                                           r8
+#define blocks_remaining                                  r6
+
+#define colors                                            q0
+#define right_mask                                        q1
+#define test_mask                                         q2
+#define draw_mask                                         q2
+#define draw_mask_bits_fb_ptr                             d6
+
+
+.align 3
+
+// setup_sprite_untextured: queue render blocks for a flat-colored rectangle.
+//
+// If none of RENDER_STATE_MASK_EVALUATE, RENDER_FLAGS_MODULATE_TEXELS or
+// RENDER_FLAGS_BLEND is set in the render-state halfword, this branches to
+// setup_sprite_untextured_simple before anything is pushed.
+//
+// Otherwise psx_gpu, x and y come from registers, and width, height and
+// color are read from [sp + 40], [sp + 44] and [sp + 48] after the
+// 9-register push.
+//
+// Derived values:
+//   block_width = (width + 7) >> 3            blocks per row
+//   right_width = ((width - 1) & 7) + 1       1..8, used for the last block
+//   right_mask  = vtst(dup16(0xff << right_width), test_mask), where
+//                 test_mask is the 16 bytes at the start of psx_gpu
+//   colors      = 15-bit value r | (g << 5) | (b << 10) built from bits
+//                 3..7, 11..15 and 19..23 of color, broadcast to 8 lanes
+//   fb_ptr_pitch = 1024 * 2 - (block_width - 1) * 16 bytes, so fb_ptr
+//                 advances 1024 * 2 bytes per row in total
+//
+// Each queued block is 64 bytes: 16 bytes of mask at +0, 16 bytes of colors
+// at +16, and an 8-byte pair { 0, fb_ptr } at +40. Blocks before the last in
+// a row get an all-zero mask; the last one gets right_mask.
+// setup_sprites_16bpp_flush is called when num_blocks would exceed
+// MAX_BLOCKS, and num_blocks is written back to psx_gpu after every row.
+function(setup_sprite_untextured)
+  ldrh r12, [ psx_gpu, #psx_gpu_render_state_offset ]
+  tst r12, #(RENDER_STATE_MASK_EVALUATE | RENDER_FLAGS_MODULATE_TEXELS         \
+    | RENDER_FLAGS_BLEND)
+  beq setup_sprite_untextured_simple
+
+  stmdb sp!, { r4 - r11, r14 }
+
+  ldr width, [ sp, #40 ]
+  ldr fb_ptr, [ psx_gpu, #psx_gpu_vram_out_ptr_offset ]
+
+  ldr height, [ sp, #44 ]
+  add fb_ptr, fb_ptr, y, lsl #11
+
+  add fb_ptr, fb_ptr, x, lsl #1
+  sub right_width, width, #1
+
+  ldr color, [ sp, #48 ]
+  and right_width, #7
+
+  add block_width, width, #7
+  add right_width, #1
+
+  lsr block_width, #3
+  mov right_mask_bits, #0xff
+
+  sub fb_ptr_pitch, block_width, #1
+  lsl right_mask_bits, right_width
+
+  lsl fb_ptr_pitch, #3+1
+  // x and y are no longer needed from here on: color_r/color_g reuse r1/r2
+  ubfx color_r, color, #3, #5
+
+  rsb fb_ptr_pitch, #1024*2
+  ubfx color_g, color, #11, #5
+
+  // test_mask lives in q2 only until the vtst below; q2 is draw_mask later
+  vld1.u32 { test_mask }, [ psx_gpu, :128 ]
+  ubfx color_b, color, #19, #5
+
+  vdup.u16 right_mask, right_mask_bits
+  orr color, color_r, color_b, lsl #10
+
+  ldrh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  orr color, color, color_g, lsl #5
+
+  vtst.u16 right_mask, right_mask, test_mask
+  // block reuses r5, which held right_width until the shift above
+  add block, psx_gpu, #psx_gpu_blocks_offset
+
+  vdup.u16 colors, color
+  add block, block, num_blocks, lsl #6
+
+
+setup_sprite_untextured_height_loop:
+  add num_blocks, block_width
+  sub blocks_remaining, block_width, #1
+
+  cmp num_blocks, #MAX_BLOCKS
+  blgt setup_sprites_16bpp_flush
+
+  cmp blocks_remaining, #0
+  ble 1f
+
+  vmov.u8 draw_mask, #0 /* zero_mask */
+  vmov.u8 draw_mask_bits_fb_ptr, #0
+
+  // all blocks of the row except the last one
+ 0:
+  vst1.u32 { draw_mask }, [ block, :128 ]!
+  subs blocks_remaining, #1
+
+  vst1.u32 { colors }, [ block, :128 ]
+  add block, block, #24
+
+  vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
+  vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
+  
+  add block, block, #24
+  add fb_ptr, #8*2
+  bgt 0b
+
+  // last block of the row, masked with right_mask
+ 1:
+  vst1.u32 { right_mask }, [ block, :128 ]!
+  // flags set here are consumed by the bgt at the end of the row
+  subs height, #1
+
+  vst1.u32 { colors }, [ block, :128 ]
+  add block, block, #24
+
+  vmov.u32 draw_mask_bits_fb_ptr[1], fb_ptr
+  vst1.u32 { draw_mask_bits_fb_ptr }, [ block, :64 ]
+  
+  add block, block, #24
+  add fb_ptr, fb_ptr_pitch
+
+  strh num_blocks, [ psx_gpu, #psx_gpu_num_blocks_offset ]
+  bgt setup_sprite_untextured_height_loop
+
+  ldmia sp!, { r4 - r11, pc }
+
+
+
 #undef texture_page_ptr
 #undef vram_ptr
 #undef dirty_textures_mask
@@ -5461,3 +6048,40 @@ function(update_texture_8bpp_cache_slice)
   vpop { q0 - q3 }
   ldmia sp!, { r4 - r11, pc }
 
+
+/* void scale2x_tiles8(void *dst, const void *src, int w8, int h) */
+function(scale2x_tiles8)
+  push { r4, r14 }
+
+  mov r4, r1                          /* r4 = start of the current source row */
+  add r12, r0, #1024*2                /* r12 = output row 2048 bytes below r0 */
+  mov r14, r2                         /* r14 = tiles (w8) left in this row */
+
+0:
+  vld1.u16 { q0 }, [ r1, :128 ]!      /* tile A: 8 halfword pixels */
+  vld1.u16 { q2 }, [ r1, :128 ]!      /* tile B: next 8, loaded even if not stored */
+  vmov q1, q0
+  vmov q3, q2
+  vzip.16 q0, q1                      /* zip with own copy: every pixel doubled */
+  vzip.16 q2, q3
+  subs r14, #2                        /* two tiles per pass; flags feed blt/bgt below */
+  vst1.u16 { q0, q1 }, [ r0, :128 ]!  /* tile A to the upper output row */
+  vst1.u16 { q0, q1 }, [ r12, :128 ]! /* tile A to the lower output row */
+  blt 1f                              /* only one tile was left: skip tile B */
+  vst1.u16 { q2, q3 }, [ r0, :128 ]!
+  vst1.u16 { q2, q3 }, [ r12, :128 ]!
+  bgt 0b                              /* more tiles in this row */
+1:
+  subs r3, #1                         /* one source row done; flags feed the bgt below */
+  mov r14, r2                         /* reload the per-row tile count */
+  add r0, #1024*2*2                   /* advance by two output rows... */
+  add r4, #1024*2                     /* next source row (2048 byte pitch) */
+  sub r0, r2, lsl #4+1                /* ...minus the w8 * 32 bytes stored this row */
+  mov r1, r4
+  add r12, r0, #1024*2
+  bgt 0b                              /* rows (h) remaining */
+  nop
+
+  pop { r4, pc }
+
+// vim:filetype=armasm
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets.h
new file mode 100644 (file)
index 0000000..1307891
--- /dev/null
@@ -0,0 +1,57 @@
+#define psx_gpu_test_mask_offset                          0x0
+#define psx_gpu_uvrg_offset                               0x10
+#define psx_gpu_uvrg_dx_offset                            0x20
+#define psx_gpu_uvrg_dy_offset                            0x30
+#define psx_gpu_u_block_span_offset                       0x40
+#define psx_gpu_v_block_span_offset                       0x50
+#define psx_gpu_r_block_span_offset                       0x60
+#define psx_gpu_g_block_span_offset                       0x70
+#define psx_gpu_b_block_span_offset                       0x80
+#define psx_gpu_b_offset                                  0x90
+#define psx_gpu_b_dy_offset                               0x94
+#define psx_gpu_triangle_area_offset                      0x98
+#define psx_gpu_texture_window_settings_offset            0x9c
+#define psx_gpu_current_texture_mask_offset               0xa0
+#define psx_gpu_viewport_mask_offset                      0xa4
+#define psx_gpu_dirty_textures_4bpp_mask_offset           0xa8
+#define psx_gpu_dirty_textures_8bpp_mask_offset           0xac
+#define psx_gpu_dirty_textures_8bpp_alternate_mask_offset 0xb0
+#define psx_gpu_triangle_color_offset                     0xb4
+#define psx_gpu_dither_table_offset                       0xb8
+#define psx_gpu_uvrgb_phase_offset                        0xc8
+#define psx_gpu_render_block_handler_offset               0xcc
+#define psx_gpu_texture_page_ptr_offset                   0xd0
+#define psx_gpu_texture_page_base_offset                  0xd4
+#define psx_gpu_clut_ptr_offset                           0xd8
+#define psx_gpu_vram_ptr_offset                           0xdc
+#define psx_gpu_vram_out_ptr_offset                       0xe0
+#define psx_gpu_render_state_base_offset                  0xe4
+#define psx_gpu_render_state_offset                       0xe6
+#define psx_gpu_num_spans_offset                          0xe8
+#define psx_gpu_num_blocks_offset                         0xea
+#define psx_gpu_viewport_start_x_offset                   0xec
+#define psx_gpu_viewport_start_y_offset                   0xee
+#define psx_gpu_viewport_end_x_offset                     0xf0
+#define psx_gpu_viewport_end_y_offset                     0xf2
+#define psx_gpu_mask_msb_offset                           0xf4
+#define psx_gpu_triangle_winding_offset                   0xf6
+#define psx_gpu_display_area_draw_enable_offset           0xf7
+#define psx_gpu_current_texture_page_offset               0xf8
+#define psx_gpu_last_8bpp_texture_page_offset             0xf9
+#define psx_gpu_texture_mask_width_offset                 0xfa
+#define psx_gpu_texture_mask_height_offset                0xfb
+#define psx_gpu_texture_window_x_offset                   0xfc
+#define psx_gpu_texture_window_y_offset                   0xfd
+#define psx_gpu_primitive_type_offset                     0xfe
+#define psx_gpu_render_mode_offset                        0xff
+#define psx_gpu_offset_x_offset                           0x100
+#define psx_gpu_offset_y_offset                           0x102
+#define psx_gpu_clut_settings_offset                      0x104
+#define psx_gpu_texture_settings_offset                   0x106
+#define psx_gpu_blocks_offset                             0x200
+#define psx_gpu_span_uvrg_offset_offset                   0x2200
+#define psx_gpu_span_edge_data_offset                     0x4200
+#define psx_gpu_span_b_offset_offset                      0x5200
+#define psx_gpu_texture_4bpp_cache_offset                 0x5a00
+#define psx_gpu_texture_8bpp_even_cache_offset            0x205a00
+#define psx_gpu_texture_8bpp_odd_cache_offset             0x305a00
diff --git a/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c b/plugins/gpu_neon/psx_gpu/psx_gpu_offsets_update.c
new file mode 100644 (file)
index 0000000..5adfb75
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Build-time helper that writes psx_gpu_offsets.h: one
+ * "#define psx_gpu_<member>_offset 0x<hex>" line per listed psx_gpu_struct
+ * member, giving its byte offset for use from assembly code.
+ */
+#include <stdio.h>
+#include <stddef.h>
+
+#include "common.h"
+
+/* Print one offset define; offsetof() yields size_t, so cast it for %x. */
+#define WRITE_OFFSET(f, member) \
+       fprintf(f, "#define %-50s0x%x\n", \
+               "psx_gpu_" #member "_offset", \
+               (unsigned int)offsetof(psx_gpu_struct, member))
+
+int main(void)
+{
+       FILE *f;
+       int failed;
+
+       /* only builds with 4 byte pointers are accepted */
+       if (sizeof(f) != 4) {
+               fprintf(stderr, "bad pointer size\n");
+               return 1;
+       }
+
+       f = fopen("psx_gpu_offsets.h", "w");
+       if (f == NULL) {
+               perror("fopen");
+               return 1;
+       }
+
+       WRITE_OFFSET(f, test_mask);
+       WRITE_OFFSET(f, uvrg);
+       WRITE_OFFSET(f, uvrg_dx);
+       WRITE_OFFSET(f, uvrg_dy);
+       WRITE_OFFSET(f, u_block_span);
+       WRITE_OFFSET(f, v_block_span);
+       WRITE_OFFSET(f, r_block_span);
+       WRITE_OFFSET(f, g_block_span);
+       WRITE_OFFSET(f, b_block_span);
+       WRITE_OFFSET(f, b);
+       WRITE_OFFSET(f, b_dy);
+       WRITE_OFFSET(f, triangle_area);
+       WRITE_OFFSET(f, texture_window_settings);
+       WRITE_OFFSET(f, current_texture_mask);
+       WRITE_OFFSET(f, viewport_mask);
+       WRITE_OFFSET(f, dirty_textures_4bpp_mask);
+       WRITE_OFFSET(f, dirty_textures_8bpp_mask);
+       WRITE_OFFSET(f, dirty_textures_8bpp_alternate_mask);
+       WRITE_OFFSET(f, triangle_color);
+       WRITE_OFFSET(f, dither_table);
+       WRITE_OFFSET(f, uvrgb_phase);
+       WRITE_OFFSET(f, render_block_handler);
+       WRITE_OFFSET(f, texture_page_ptr);
+       WRITE_OFFSET(f, texture_page_base);
+       WRITE_OFFSET(f, clut_ptr);
+       WRITE_OFFSET(f, vram_ptr);
+       WRITE_OFFSET(f, vram_out_ptr);
+       WRITE_OFFSET(f, render_state_base);
+       WRITE_OFFSET(f, render_state);
+       WRITE_OFFSET(f, num_spans);
+       WRITE_OFFSET(f, num_blocks);
+       WRITE_OFFSET(f, viewport_start_x);
+       WRITE_OFFSET(f, viewport_start_y);
+       WRITE_OFFSET(f, viewport_end_x);
+       WRITE_OFFSET(f, viewport_end_y);
+       WRITE_OFFSET(f, mask_msb);
+       WRITE_OFFSET(f, triangle_winding);
+       WRITE_OFFSET(f, display_area_draw_enable);
+       WRITE_OFFSET(f, current_texture_page);
+       WRITE_OFFSET(f, last_8bpp_texture_page);
+       WRITE_OFFSET(f, texture_mask_width);
+       WRITE_OFFSET(f, texture_mask_height);
+       WRITE_OFFSET(f, texture_window_x);
+       WRITE_OFFSET(f, texture_window_y);
+       WRITE_OFFSET(f, primitive_type);
+       WRITE_OFFSET(f, render_mode);
+       WRITE_OFFSET(f, offset_x);
+       WRITE_OFFSET(f, offset_y);
+       WRITE_OFFSET(f, clut_settings);
+       WRITE_OFFSET(f, texture_settings);
+       WRITE_OFFSET(f, blocks);
+       WRITE_OFFSET(f, span_uvrg_offset);
+       WRITE_OFFSET(f, span_edge_data);
+       WRITE_OFFSET(f, span_b_offset);
+       WRITE_OFFSET(f, texture_4bpp_cache);
+       WRITE_OFFSET(f, texture_8bpp_even_cache);
+       WRITE_OFFSET(f, texture_8bpp_odd_cache);
+
+       /* a failed write may only show in the stream error flag or at close */
+       failed = ferror(f);
+       if (fclose(f) != 0)
+               failed = 1;
+       if (failed) {
+               fprintf(stderr, "failed to write psx_gpu_offsets.h\n");
+               return 1;
+       }
+
+       return 0;
+}
index 920c638..67da86e 100644 (file)
@@ -92,6 +92,7 @@ void update_texture_ptr(psx_gpu_struct *psx_gpu)
 
 void set_texture(psx_gpu_struct *psx_gpu, u32 texture_settings)
 {
+  texture_settings &= 0x1FF;
   if(psx_gpu->texture_settings != texture_settings)
   {
     u32 new_texture_page = texture_settings & 0x1F;
@@ -152,6 +153,52 @@ void set_triangle_color(psx_gpu_struct *psx_gpu, u32 triangle_color)
   }
 }
 
+static void do_fill(psx_gpu_struct *psx_gpu, u32 x, u32 y,
+ u32 width, u32 height, u32 color)
+{
+  // Origin/extent of each piece per axis; [1] is used only on wraparound.
+  u32 piece_x[2] = { 0 }, piece_w[2] = { 0 }, piece_y[2] = { 0 }, piece_h[2] = { 0 };
+  u32 num_cols = 1, num_rows = 1, row, col;
+
+  // Widen the span to 16 pixel granularity: x rounds down, width rounds up.
+  x &= ~0xF;
+  width = ((width + 0xF) & ~0xF);
+
+  flush_render_block_buffer(psx_gpu);
+
+  piece_x[0] = x;
+  piece_w[0] = width;
+  if(unlikely((x + width) > 1024))
+  {
+    // Crosses x = 1024: the remainder continues from x = 0.
+    piece_w[0] = 1024 - x;
+    piece_x[1] = 0;
+    piece_w[1] = width - piece_w[0];
+    num_cols = 2;
+  }
+
+  piece_y[0] = y;
+  piece_h[0] = height;
+  if(unlikely((y + height) > 512))
+  {
+    // Crosses y = 512: the remainder continues from y = 0.
+    piece_h[0] = 512 - y;
+    piece_y[1] = 0;
+    piece_h[1] = height - piece_h[0];
+    num_rows = 2;
+  }
+
+  // Rows outermost: both pieces at the original y are filled first.
+  for(row = 0; row < num_rows; row++)
+  {
+    for(col = 0; col < num_cols; col++)
+    {
+      render_block_fill(psx_gpu, color, piece_x[col], piece_y[row],
+       piece_w[col], piece_h[row]);
+    }
+  }
+}
+
 #define sign_extend_12bit(value)                                               \
   (((s32)((value) << 20)) >> 20)                                               \
 
@@ -235,45 +282,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         u32 height = list_s16[5] & 0x1FF;
         u32 color = list[0] & 0xFFFFFF;
 
-        x &= ~0xF;
-        width = ((width + 0xF) & ~0xF);
-
-        if((x + width) > 1024)
-        {
-          u32 width_a = 1024 - x;
-          u32 width_b = width - width_a;
-
-          if((y + height) > 512)
-          {
-            u32 height_a = 512 - y;
-            u32 height_b = height - height_a;
-
-            render_block_fill(psx_gpu, color, x, y, width_a, height_a);
-            render_block_fill(psx_gpu, color, 0, y, width_b, height_a);
-            render_block_fill(psx_gpu, color, x, 0, width_a, height_b);
-            render_block_fill(psx_gpu, color, 0, 0, width_b, height_b);
-          }
-          else
-          {
-            render_block_fill(psx_gpu, color, x, y, width_a, height);
-            render_block_fill(psx_gpu, color, 0, y, width_b, height);
-          }
-        }
-        else
-        {
-          if((y + height) > 512)
-          {
-            u32 height_a = 512 - y;
-            u32 height_b = height - height_a;
-
-            render_block_fill(psx_gpu, color, x, y, width, height_a);
-            render_block_fill(psx_gpu, color, x, 0, width, height_b);
-          }
-          else
-          {
-            render_block_fill(psx_gpu, color, x, y, width, height);
-          }
-        }
+        do_fill(psx_gpu, x, y, width, height, color);
                        break;
       }
   
@@ -399,7 +408,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         vertexes[1].x = list_s16[4] + psx_gpu->offset_x;
         vertexes[1].y = list_s16[5] + psx_gpu->offset_y;
 
-        render_line(psx_gpu, vertexes, current_command, list[0]);
+        render_line(psx_gpu, vertexes, current_command, list[0], 0);
                        break;
       }
   
@@ -420,7 +429,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
           vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
-          render_line(psx_gpu, vertexes, current_command, list[0]);
+          render_line(psx_gpu, vertexes, current_command, list[0], 0);
 
           list_position++;
           num_vertexes++;
@@ -451,7 +460,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         vertexes[1].x = list_s16[6] + psx_gpu->offset_x;
         vertexes[1].y = list_s16[7] + psx_gpu->offset_y;
 
-        render_line(psx_gpu, vertexes, current_command, 0);
+        render_line(psx_gpu, vertexes, current_command, 0, 0);
                        break;
       }
  
@@ -481,7 +490,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
           vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
           vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
 
-          render_line(psx_gpu, vertexes, current_command, 0);
+          render_line(psx_gpu, vertexes, current_command, 0, 0);
 
           list_position += 2;
           num_vertexes++;
@@ -592,12 +601,22 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
                        break;
       }
   
-               case 0x80:          //  vid -> vid
-        render_block_move(psx_gpu, list_s16[2] & 0x3FF, list_s16[3] & 0x1FF,
-         list_s16[4] & 0x3FF, list_s16[5] & 0x1FF,
-         ((list_s16[6] - 1) & 0x3FF) + 1, ((list_s16[7] - 1) & 0x1FF) + 1);
-                       break;
+      case 0x80:          //  vid -> vid
+      {
+        u32 sx = list_s16[2] & 0x3FF;
+        u32 sy = list_s16[3] & 0x1FF;
+        u32 dx = list_s16[4] & 0x3FF;
+        u32 dy = list_s16[5] & 0x1FF;
+        u32 w = ((list_s16[6] - 1) & 0x3FF) + 1;
+        u32 h = ((list_s16[7] - 1) & 0x1FF) + 1;
+
+        if (sx == dx && sy == dy && psx_gpu->mask_msb == 0)
+          break;
+
+        render_block_move(psx_gpu, sx, sy, dx, dy, w, h);
+        break;
+      } 
+
 #ifdef PCSX
                case 0xA0:          //  sys -> vid
                case 0xC0:          //  vid -> sys
@@ -626,7 +645,7 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
 #endif
 
                case 0xE1:
-        set_texture(psx_gpu, list[0] & 0x1FF);
+        set_texture(psx_gpu, list[0]);
 
         if(list[0] & (1 << 9))
           psx_gpu->render_state_base |= RENDER_STATE_DITHER;
@@ -669,11 +688,21 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
         }
         SET_Ex(2, list[0]);
         break;
-               }
+      }
+
+      case 0xE3:
+      {
+        s16 viewport_start_x = list[0] & 0x3FF;
+        s16 viewport_start_y = (list[0] >> 10) & 0x1FF;
+
+        if(viewport_start_x == psx_gpu->viewport_start_x &&
+         viewport_start_y == psx_gpu->viewport_start_y)
+        {
+          break;
+        }
   
-               case 0xE3:
-        psx_gpu->viewport_start_x = list[0] & 0x3FF;
-        psx_gpu->viewport_start_y = (list[0] >> 10) & 0x1FF;
+        psx_gpu->viewport_start_x = viewport_start_x;
+        psx_gpu->viewport_start_y = viewport_start_y;
 
 #ifdef TEXTURE_CACHE_4BPP
         psx_gpu->viewport_mask =
@@ -681,12 +710,23 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
          psx_gpu->viewport_start_y, psx_gpu->viewport_end_x,
          psx_gpu->viewport_end_y);
 #endif
-                       SET_Ex(3, list[0]);
-                       break;
-  
-               case 0xE4:
-        psx_gpu->viewport_end_x = list[0] & 0x3FF;
-        psx_gpu->viewport_end_y = (list[0] >> 10) & 0x1FF;
+        SET_Ex(3, list[0]);
+        break;
+      }
+
+      case 0xE4:
+      {
+        s16 viewport_end_x = list[0] & 0x3FF;
+        s16 viewport_end_y = (list[0] >> 10) & 0x1FF;
+
+        if(viewport_end_x == psx_gpu->viewport_end_x &&
+         viewport_end_y == psx_gpu->viewport_end_y)
+        {
+          break;
+        }
+
+        psx_gpu->viewport_end_x = viewport_end_x;
+        psx_gpu->viewport_end_y = viewport_end_y;
 
 #ifdef TEXTURE_CACHE_4BPP
         psx_gpu->viewport_mask =
@@ -694,10 +734,11 @@ u32 gpu_parse(psx_gpu_struct *psx_gpu, u32 *list, u32 size, u32 *last_command)
          psx_gpu->viewport_start_y, psx_gpu->viewport_end_x,
          psx_gpu->viewport_end_y);
 #endif
-                       SET_Ex(4, list[0]);
-                       break;
+        SET_Ex(4, list[0]);
+        break;
+      }
   
-               case 0xE5:
+      case 0xE5:
       {
         s32 offset_x = list[0] << 21;
         s32 offset_y = list[0] << 10;
@@ -741,3 +782,786 @@ breakloop:
   return list - list_start;
 }
 
+#ifdef PCSX
+
+#define ENH_BUF_TABLE_STEP (1024 / sizeof(psx_gpu->enhancement_buf_by_x16))
+
+static void update_enhancement_buf_table_from_hres(psx_gpu_struct *psx_gpu)
+{ // Refill enhancement_buf_by_x16[] with buffer indices derived from enhancement_x_threshold.
+  u32 b, x, s; // b: buffer index (0..3), x: table slot, s: next x boundary
+
+  b = 0;
+  s = psx_gpu->enhancement_x_threshold;
+  for (x = 0; x < sizeof(psx_gpu->enhancement_buf_by_x16); x++) // sizeof as slot count: assumes one-byte entries - TODO confirm
+  {
+    if (b < 3 && x * ENH_BUF_TABLE_STEP >= s - ENH_BUF_TABLE_STEP - 1) // NOTE(review): u32 math, wraps if s < STEP + 1
+    { // slot reached the boundary (minus STEP + 1 of slack): move to the next buffer
+      s += psx_gpu->enhancement_x_threshold;
+      b++;
+    }
+    psx_gpu->enhancement_buf_by_x16[x] = b;
+  }
+}
+
+static void update_enhancement_buf_table_from_x(psx_gpu_struct *psx_gpu,
+ u32 x0, u32 len)
+{ // Tag the table slots covering len pixels from x0 with buffer index min(x0 / len, 3).
+  u32 x, b; // x: what is left of x0; b: resulting buffer index
+
+  if (len == 0) // nothing to tag, and the subtraction loop below would never end
+    return;
+  for (x = x0, b = 0; x >= len && b < 3; b++) // x0 / len by subtraction, capped at 3
+    x -= len;
+
+  memset(psx_gpu->enhancement_buf_by_x16 + x0 / ENH_BUF_TABLE_STEP,
+   b, (len + ENH_BUF_TABLE_STEP - 1) / ENH_BUF_TABLE_STEP);
+}
+
+#define select_enhancement_buf(psx_gpu) /* pick the buffer for the saved viewport start x */ \
+  psx_gpu->enhancement_current_buf_ptr = \
+    select_enhancement_buf_ptr(psx_gpu, psx_gpu->saved_viewport_start_x)
+
+#define enhancement_disable() do { /* output to vram_ptr, saved viewport as is */ \
+  psx_gpu->vram_out_ptr = psx_gpu->vram_ptr; \
+  psx_gpu->viewport_start_x = psx_gpu->saved_viewport_start_x; \
+  psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y; \
+  psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x; \
+  psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y; \
+  psx_gpu->uvrgb_phase = 0x8000; \
+} while (0)
+
+#define enhancement_enable() do { /* output to the current 2x buffer, viewport doubled */ \
+  psx_gpu->vram_out_ptr = psx_gpu->enhancement_current_buf_ptr; \
+  psx_gpu->viewport_start_x = psx_gpu->saved_viewport_start_x * 2; \
+  psx_gpu->viewport_start_y = psx_gpu->saved_viewport_start_y * 2; \
+  psx_gpu->viewport_end_x = psx_gpu->saved_viewport_end_x * 2 + 1; /* end is inclusive */ \
+  psx_gpu->viewport_end_y = psx_gpu->saved_viewport_end_y * 2 + 1; \
+  psx_gpu->uvrgb_phase = 0x1000; \
+} while (0)
+
+#define shift_vertices3(v) do { /* double x/y of the vertices behind v[0..2] */ \
+  (v)[0]->x *= 2; \
+  (v)[0]->y *= 2; \
+  (v)[1]->x *= 2; \
+  (v)[1]->y *= 2; \
+  (v)[2]->x *= 2; \
+  (v)[2]->y *= 2; \
+} while (0)
+
+#define unshift_vertices3(v) do { /* halve x/y again, undoing shift_vertices3 */ \
+  (v)[0]->x /= 2; \
+  (v)[0]->y /= 2; \
+  (v)[1]->x /= 2; \
+  (v)[1]->y /= 2; \
+  (v)[2]->x /= 2; \
+  (v)[2]->y /= 2; \
+} while (0)
+
+#define shift_triangle_area() /* x and y both doubled, so the area is 4x */ \
+  psx_gpu->triangle_area *= 4
+
+extern void scale2x_tiles8(void *dst, const void *src, int w8, int h);
+
+#ifndef NEON_BUILD
+// TODO? no C fallback yet: without NEON_BUILD this is an empty stub
+void scale2x_tiles8(void *dst, const void *src, int w8, int h) {}
+#endif
+
+static int disable_main_render;
+
+static void do_triangle_enhanced(psx_gpu_struct *psx_gpu,
+ vertex_struct *vertexes, u32 current_command)
+{
+  vertex_struct *tri[3];
+
+  if (prepare_triangle(psx_gpu, vertexes, tri)) {
+    if (disable_main_render == 0) // native resolution pass
+      render_triangle_p(psx_gpu, tri, current_command);
+
+    // 2x pass: switch output and viewport, then double coordinates and area
+    enhancement_enable();
+    shift_vertices3(tri);
+    shift_triangle_area();
+    render_triangle_p(psx_gpu, tri, current_command);
+  }
+}
+
+static void do_quad_enhanced(psx_gpu_struct *psx_gpu, vertex_struct *vertexes,
+ u32 current_command)
+{ // Draw a quad as two triangles: vertexes[0..2], then vertexes[1..3].
+  vertex_struct *vertex_ptrs[3];
+
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) {
+    if (!disable_main_render) // native resolution pass
+      render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+
+    enhancement_enable(); // 2x pass: doubled viewport, coordinates and area
+    shift_vertices3(vertex_ptrs);
+    shift_triangle_area();
+    render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+    unshift_vertices3(vertex_ptrs); // presumably vertex_ptrs point into vertexes, which the second triangle reuses - confirm
+  }
+  enhancement_disable(); // back to the native state before preparing the second triangle
+  if (prepare_triangle(psx_gpu, &vertexes[1], vertex_ptrs)) {
+    if (!disable_main_render) // native resolution pass
+      render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+
+    enhancement_enable(); // 2x pass; coordinates are left doubled afterwards
+    shift_vertices3(vertex_ptrs);
+    shift_triangle_area();
+    render_triangle_p(psx_gpu, vertex_ptrs, current_command);
+  }
+}
+
+#if 0
+
+#define fill_vertex(i, x_, y_, u_, v_, rgb_) \
+  vertexes[i].x = x_; \
+  vertexes[i].y = y_; \
+  vertexes[i].u = u_; \
+  vertexes[i].v = v_; \
+  vertexes[i].r = rgb_; \
+  vertexes[i].g = (rgb_) >> 8; \
+  vertexes[i].b = (rgb_) >> 16
+
+static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
+ u32 u, u32 v, u32 w, u32 h, u32 cmd_rgb)
+{ // Variant under #if 0 (not compiled): draws the sprite at 2x as two triangles.
+  vertex_struct *vertex_ptrs[3];
+  u32 flags = (cmd_rgb >> 24); // command byte
+  u32 color = cmd_rgb & 0xffffff; // low 24 bits
+  u32 render_state_base_saved = psx_gpu->render_state_base; // restored on exit
+  int x1, y1;
+  u8 u1, v1;
+
+  flags &=
+   (RENDER_FLAGS_MODULATE_TEXELS | RENDER_FLAGS_BLEND |
+   RENDER_FLAGS_TEXTURE_MAP);
+
+  set_triangle_color(psx_gpu, color);
+  if(color == 0x808080)
+    flags |= RENDER_FLAGS_MODULATE_TEXELS;
+
+  psx_gpu->render_state_base &= ~RENDER_STATE_DITHER; // dither off for this draw
+  enhancement_enable();
+
+  x1 = x + w;
+  y1 = y + h;
+  u1 = u + w; // u8: wraps when the sum exceeds 255
+  v1 = v + h;
+  // FIXME.. u1/v1 wrapped past 255: clamp to 0xff instead
+  if (u1 < u) u1 = 0xff;
+  if (v1 < v) v1 = 0xff;
+
+  // 0-2
+  // |/
+  // 1
+  fill_vertex(0, x,  y,  u,  v,  color);
+  fill_vertex(1, x,  y1, u,  v1, color);
+  fill_vertex(2, x1, y,  u1, v,  color);
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) {
+    shift_vertices3(vertex_ptrs);
+    shift_triangle_area();
+    render_triangle_p(psx_gpu, vertex_ptrs, flags);
+  }
+
+  //   0
+  //  /|
+  // 1-2
+  fill_vertex(0, x1, y,  u1, v,  color);
+  fill_vertex(1, x,  y1, u,  v1, color);
+  fill_vertex(2, x1, y1, u1, v1, color);
+  if (prepare_triangle(psx_gpu, vertexes, vertex_ptrs)) {
+    shift_vertices3(vertex_ptrs);
+    shift_triangle_area();
+    render_triangle_p(psx_gpu, vertex_ptrs, flags);
+  }
+
+  psx_gpu->render_state_base = render_state_base_saved;
+}
+#else
+static void do_sprite_enhanced(psx_gpu_struct *psx_gpu, int x, int y,
+ u32 u, u32 v, u32 w, u32 h, u32 cmd_rgb)
+{
+  // cmd_rgb: the top byte is handed over as the flags argument,
+  // the low 24 bits as the color
+  render_sprite_4x(psx_gpu, x, y, u, v, w, h,
+   cmd_rgb >> 24, cmd_rgb & 0xffffff);
+}
+#endif
+
+u32 gpu_parse_enhanced(psx_gpu_struct *psx_gpu, u32 *list, u32 size,
+ u32 *last_command)
+{
+  u32 current_command = 0, command_length;
+
+  u32 *list_start = list;
+  u32 *list_end = list + (size / 4);
+
+  psx_gpu->saved_viewport_start_x = psx_gpu->viewport_start_x;
+  psx_gpu->saved_viewport_start_y = psx_gpu->viewport_start_y;
+  psx_gpu->saved_viewport_end_x = psx_gpu->viewport_end_x;
+  psx_gpu->saved_viewport_end_y = psx_gpu->viewport_end_y;
+  select_enhancement_buf(psx_gpu);
+
+  for(; list < list_end; list += 1 + command_length)
+  {
+    s16 *list_s16 = (void *)list;
+    current_command = *list >> 24;
+    command_length = command_lengths[current_command];
+    if (list + 1 + command_length > list_end) {
+      current_command = (u32)-1;
+      break;
+    }
+
+    enhancement_disable();
+
+    switch(current_command)
+    {
+      case 0x00:
+        break;
+  
+      case 0x02:
+      {
+        u32 x = list_s16[2] & 0x3FF;
+        u32 y = list_s16[3] & 0x1FF;
+        u32 width = list_s16[4] & 0x3FF;
+        u32 height = list_s16[5] & 0x1FF;
+        u32 color = list[0] & 0xFFFFFF;
+
+        x &= ~0xF;
+        width = ((width + 0xF) & ~0xF);
+
+        do_fill(psx_gpu, x, y, width, height, color);
+
+        psx_gpu->vram_out_ptr = select_enhancement_buf_ptr(psx_gpu, x);
+        x *= 2;
+        y *= 2;
+        width *= 2;
+        height *= 2;
+        render_block_fill_enh(psx_gpu, color, x, y, width, height);
+        break;
+      }
+  
+      case 0x20 ... 0x23:
+      {
+        set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
+  
+        get_vertex_data_xy(0, 2);
+        get_vertex_data_xy(1, 4);
+        get_vertex_data_xy(2, 6);
+
+        do_triangle_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x24 ... 0x27:
+      {
+        set_clut(psx_gpu, list_s16[5]);
+        set_texture(psx_gpu, list_s16[9]);
+        set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
+  
+        get_vertex_data_xy_uv(0, 2);
+        get_vertex_data_xy_uv(1, 6);
+        get_vertex_data_xy_uv(2, 10);
+  
+        do_triangle_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x28 ... 0x2B:
+      {
+        set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
+  
+        get_vertex_data_xy(0, 2);
+        get_vertex_data_xy(1, 4);
+        get_vertex_data_xy(2, 6);
+        get_vertex_data_xy(3, 8);
+
+        do_quad_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x2C ... 0x2F:
+      {
+        set_clut(psx_gpu, list_s16[5]);
+        set_texture(psx_gpu, list_s16[9]);
+        set_triangle_color(psx_gpu, list[0] & 0xFFFFFF);
+  
+        get_vertex_data_xy_uv(0, 2);   
+        get_vertex_data_xy_uv(1, 6);   
+        get_vertex_data_xy_uv(2, 10);  
+        get_vertex_data_xy_uv(3, 14);
+  
+        do_quad_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x30 ... 0x33:
+      {
+        get_vertex_data_xy_rgb(0, 0);
+        get_vertex_data_xy_rgb(1, 4);
+        get_vertex_data_xy_rgb(2, 8);
+  
+        do_triangle_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x34:
+      case 0x35:
+      case 0x36:
+      case 0x37:
+      {
+        set_clut(psx_gpu, list_s16[5]);
+        set_texture(psx_gpu, list_s16[11]);
+  
+        get_vertex_data_xy_uv_rgb(0, 0);
+        get_vertex_data_xy_uv_rgb(1, 6);
+        get_vertex_data_xy_uv_rgb(2, 12);
+
+        do_triangle_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x38:
+      case 0x39:
+      case 0x3A:
+      case 0x3B:
+      {
+        get_vertex_data_xy_rgb(0, 0);
+        get_vertex_data_xy_rgb(1, 4);
+        get_vertex_data_xy_rgb(2, 8);
+        get_vertex_data_xy_rgb(3, 12);
+  
+        do_quad_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x3C:
+      case 0x3D:
+      case 0x3E:
+      case 0x3F:
+      {
+        set_clut(psx_gpu, list_s16[5]);
+        set_texture(psx_gpu, list_s16[11]);
+  
+        get_vertex_data_xy_uv_rgb(0, 0);
+        get_vertex_data_xy_uv_rgb(1, 6);
+        get_vertex_data_xy_uv_rgb(2, 12);
+        get_vertex_data_xy_uv_rgb(3, 18);
+
+        do_quad_enhanced(psx_gpu, vertexes, current_command);
+        break;
+      }
+  
+      case 0x40 ... 0x47:
+      {
+        vertexes[0].x = list_s16[2] + psx_gpu->offset_x;
+        vertexes[0].y = list_s16[3] + psx_gpu->offset_y;
+        vertexes[1].x = list_s16[4] + psx_gpu->offset_x;
+        vertexes[1].y = list_s16[5] + psx_gpu->offset_y;
+
+        render_line(psx_gpu, vertexes, current_command, list[0], 0);
+        enhancement_enable();
+        render_line(psx_gpu, vertexes, current_command, list[0], 1);
+        break;
+      }
+  
+      case 0x48 ... 0x4F:
+      {
+        u32 num_vertexes = 1;
+        u32 *list_position = &(list[2]);
+        u32 xy = list[1];
+
+        vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
+        vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
+      
+        xy = *list_position;
+        while(1)
+        {
+          vertexes[0] = vertexes[1];
+
+          vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
+          vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
+
+          enhancement_disable();
+          render_line(psx_gpu, vertexes, current_command, list[0], 0);
+          enhancement_enable();
+          render_line(psx_gpu, vertexes, current_command, list[0], 1);
+
+          list_position++;
+          num_vertexes++;
+
+          if(list_position >= list_end)
+            break;
+
+          xy = *list_position;
+          if((xy & 0xF000F000) == 0x50005000)
+            break;
+        }
+
+        command_length += (num_vertexes - 2);
+        break;
+      }
+  
+      case 0x50 ... 0x57:
+      {
+        vertexes[0].r = list[0] & 0xFF;
+        vertexes[0].g = (list[0] >> 8) & 0xFF;
+        vertexes[0].b = (list[0] >> 16) & 0xFF;
+        vertexes[0].x = list_s16[2] + psx_gpu->offset_x;
+        vertexes[0].y = list_s16[3] + psx_gpu->offset_y;
+
+        vertexes[1].r = list[2] & 0xFF;
+        vertexes[1].g = (list[2] >> 8) & 0xFF;
+        vertexes[1].b = (list[2] >> 16) & 0xFF;
+        vertexes[1].x = list_s16[6] + psx_gpu->offset_x;
+        vertexes[1].y = list_s16[7] + psx_gpu->offset_y;
+
+        render_line(psx_gpu, vertexes, current_command, 0, 0);
+        enhancement_enable();
+        render_line(psx_gpu, vertexes, current_command, 0, 1);
+        break;
+      }
+      case 0x58 ... 0x5F:
+      {
+        u32 num_vertexes = 1;
+        u32 *list_position = &(list[2]);
+        u32 color = list[0];
+        u32 xy = list[1];
+
+        vertexes[1].r = color & 0xFF;
+        vertexes[1].g = (color >> 8) & 0xFF;
+        vertexes[1].b = (color >> 16) & 0xFF;
+        vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
+        vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
+      
+        color = list_position[0];
+        while(1)
+        {
+          xy = list_position[1];
+
+          vertexes[0] = vertexes[1];
+
+          vertexes[1].r = color & 0xFF;
+          vertexes[1].g = (color >> 8) & 0xFF;
+          vertexes[1].b = (color >> 16) & 0xFF;
+          vertexes[1].x = (xy & 0xFFFF) + psx_gpu->offset_x;
+          vertexes[1].y = (xy >> 16) + psx_gpu->offset_y;
+
+          enhancement_disable();
+          render_line(psx_gpu, vertexes, current_command, 0, 0);
+          enhancement_enable();
+          render_line(psx_gpu, vertexes, current_command, 0, 1);
+
+          list_position += 2;
+          num_vertexes++;
+
+          if(list_position >= list_end)
+            break;
+
+          color = list_position[0];
+          if((color & 0xF000F000) == 0x50005000)
+            break;
+        }
+
+        command_length += ((num_vertexes - 2) * 2);
+        break;
+      }
+  
+      case 0x60 ... 0x63:
+      {        
+        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+        u32 width = list_s16[4] & 0x3FF;
+        u32 height = list_s16[5] & 0x1FF;
+
+        render_sprite(psx_gpu, x, y, 0, 0, width, height, current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, 0, 0, width, height, list[0]);
+        break;
+      }
+  
+      case 0x64 ... 0x67:
+      {        
+        u32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        u32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+        u8 u = list_s16[4];
+        u8 v = list_s16[4] >> 8;
+        u32 width = list_s16[6] & 0x3FF;
+        u32 height = list_s16[7] & 0x1FF;
+
+        set_clut(psx_gpu, list_s16[5]);
+
+        render_sprite(psx_gpu, x, y, u, v, width, height,
+         current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, u, v, width, height, list[0]);
+        break;
+      }
+  
+      case 0x68:
+      case 0x69:
+      case 0x6A:
+      case 0x6B:
+      {
+        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+
+        render_sprite(psx_gpu, x, y, 0, 0, 1, 1, current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, 0, 0, 1, 1, list[0]);
+        break;
+      }
+  
+      case 0x70:
+      case 0x71:
+      case 0x72:
+      case 0x73:
+      {        
+        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+
+        render_sprite(psx_gpu, x, y, 0, 0, 8, 8, current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, 0, 0, 8, 8, list[0]);
+        break;
+      }
+  
+      case 0x74:
+      case 0x75:
+      case 0x76:
+      case 0x77:
+      {        
+        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+        u8 u = list_s16[4];
+        u8 v = list_s16[4] >> 8;
+
+        set_clut(psx_gpu, list_s16[5]);
+
+        render_sprite(psx_gpu, x, y, u, v, 8, 8,
+         current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, u, v, 8, 8, list[0]);
+        break;
+      }
+  
+      case 0x78:
+      case 0x79:
+      case 0x7A:
+      case 0x7B:
+      {        
+        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+
+        render_sprite(psx_gpu, x, y, 0, 0, 16, 16, current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, 0, 0, 16, 16, list[0]);
+        break;
+      }
+  
+      case 0x7C:
+      case 0x7D:
+      case 0x7E:
+      case 0x7F:
+      {        
+        s32 x = sign_extend_11bit(list_s16[2] + psx_gpu->offset_x);
+        s32 y = sign_extend_11bit(list_s16[3] + psx_gpu->offset_y);
+        u8 u = list_s16[4];
+        u8 v = list_s16[4] >> 8;
+
+        set_clut(psx_gpu, list_s16[5]);
+
+        render_sprite(psx_gpu, x, y, u, v, 16, 16, current_command, list[0]);
+        do_sprite_enhanced(psx_gpu, x, y, u, v, 16, 16, list[0]);
+        break;
+      }
+  
+      case 0x80:          //  vid -> vid
+      {
+        u32 sx = list_s16[2] & 0x3FF;
+        u32 sy = list_s16[3] & 0x1FF;
+        u32 dx = list_s16[4] & 0x3FF;
+        u32 dy = list_s16[5] & 0x1FF;
+        u32 w = ((list_s16[6] - 1) & 0x3FF) + 1;
+        u32 h = ((list_s16[7] - 1) & 0x1FF) + 1;
+        u16 *buf;
+
+        if (sx == dx && sy == dy && psx_gpu->mask_msb == 0)
+          break;
+
+        render_block_move(psx_gpu, sx, sy, dx, dy, w, h);
+        if (dy + h > 512)
+          h = 512 - dy;
+        sx = sx & ~7; // FIXME?
+        dx = dx * 2 & ~7;
+        dy *= 2;
+        w = (w + 7) / 8;
+        buf = select_enhancement_buf_ptr(psx_gpu, dx / 2);
+        scale2x_tiles8(buf + dy * 1024 + dx,
+          psx_gpu->vram_ptr + sy * 1024 + sx, w, h);
+        break;
+      }
+      case 0xA0:          //  sys -> vid
+      case 0xC0:          //  vid -> sys
+        goto breakloop;
+
+      case 0xE1:
+        set_texture(psx_gpu, list[0]);
+
+        if(list[0] & (1 << 9))
+          psx_gpu->render_state_base |= RENDER_STATE_DITHER;
+        else
+          psx_gpu->render_state_base &= ~RENDER_STATE_DITHER;
+
+        psx_gpu->display_area_draw_enable = (list[0] >> 10) & 0x1;
+        SET_Ex(1, list[0]);
+        break;
+  
+      case 0xE2:
+      {
+        // TODO: Clean
+        u32 texture_window_settings = list[0];
+        u32 tmp, x, y, w, h;
+
+        if(texture_window_settings != psx_gpu->texture_window_settings)
+        {
+          tmp = (texture_window_settings & 0x1F) | 0x20;
+          for(w = 8; (tmp & 1) == 0; tmp >>= 1, w <<= 1);
+
+          tmp = ((texture_window_settings >> 5) & 0x1f) | 0x20;
+          for (h = 8; (tmp & 1) == 0; tmp >>= 1, h <<= 1);
+
+          tmp = 32 - (w >> 3);
+          x = ((texture_window_settings >> 10) & tmp) << 3;
+
+          tmp = 32 - (h >> 3);
+          y = ((texture_window_settings >> 15) & tmp) << 3;
+
+          flush_render_block_buffer(psx_gpu);
+          
+          psx_gpu->texture_window_settings = texture_window_settings;
+          psx_gpu->texture_window_x = x;
+          psx_gpu->texture_window_y = y;
+          psx_gpu->texture_mask_width = w - 1;
+          psx_gpu->texture_mask_height = h - 1;
+
+          update_texture_ptr(psx_gpu);
+        }
+        SET_Ex(2, list[0]);
+        break;
+      }
+  
+      case 0xE3:
+      {
+        s16 viewport_start_x = list[0] & 0x3FF;
+        s16 viewport_start_y = (list[0] >> 10) & 0x1FF;
+        u32 d;
+
+        if(viewport_start_x == psx_gpu->viewport_start_x &&
+         viewport_start_y == psx_gpu->viewport_start_y)
+        {
+          break;
+        }
+        psx_gpu->viewport_start_x = viewport_start_x;
+        psx_gpu->viewport_start_y = viewport_start_y;
+        psx_gpu->saved_viewport_start_x = viewport_start_x;
+        psx_gpu->saved_viewport_start_y = viewport_start_y;
+
+        d = (u32)psx_gpu->viewport_end_x - (u32)viewport_start_x + 1;
+        if((u32)psx_gpu->enhancement_x_threshold - d <= 16)
+        {
+          update_enhancement_buf_table_from_x(psx_gpu,
+           viewport_start_x, d);
+        }
+        select_enhancement_buf(psx_gpu);
+
+#ifdef TEXTURE_CACHE_4BPP
+        psx_gpu->viewport_mask =
+         texture_region_mask(psx_gpu->viewport_start_x,
+         psx_gpu->viewport_start_y, psx_gpu->viewport_end_x,
+         psx_gpu->viewport_end_y);
+#endif
+        SET_Ex(3, list[0]);
+        break;
+      }
+
+      case 0xE4:
+      {
+        s16 viewport_end_x = list[0] & 0x3FF;
+        s16 viewport_end_y = (list[0] >> 10) & 0x1FF;
+        u32 d;
+
+        if(viewport_end_x == psx_gpu->viewport_end_x &&
+         viewport_end_y == psx_gpu->viewport_end_y)
+        {
+          break;
+        }
+
+        psx_gpu->viewport_end_x = viewport_end_x;
+        psx_gpu->viewport_end_y = viewport_end_y;
+        psx_gpu->saved_viewport_end_x = viewport_end_x;
+        psx_gpu->saved_viewport_end_y = viewport_end_y;
+
+        d = (u32)viewport_end_x - (u32)psx_gpu->viewport_start_x + 1;
+        if((u32)psx_gpu->enhancement_x_threshold - d <= 16)
+        {
+          update_enhancement_buf_table_from_x(psx_gpu,
+           psx_gpu->viewport_start_x, d);
+        }
+        select_enhancement_buf(psx_gpu);
+
+#ifdef TEXTURE_CACHE_4BPP
+        psx_gpu->viewport_mask =
+         texture_region_mask(psx_gpu->viewport_start_x,
+         psx_gpu->viewport_start_y, psx_gpu->viewport_end_x,
+         psx_gpu->viewport_end_y);
+#endif
+        SET_Ex(4, list[0]);
+        break;
+      }
+  
+      case 0xE5:
+      {
+        s32 offset_x = list[0] << 21;
+        s32 offset_y = list[0] << 10;
+        psx_gpu->offset_x = offset_x >> 21;
+        psx_gpu->offset_y = offset_y >> 21; 
+  
+        SET_Ex(5, list[0]);
+        break;
+      }
+
+      case 0xE6:
+      {
+        u32 mask_settings = list[0];
+        u16 mask_msb = mask_settings << 15;
+
+        if(list[0] & 0x2)
+          psx_gpu->render_state_base |= RENDER_STATE_MASK_EVALUATE;
+        else
+          psx_gpu->render_state_base &= ~RENDER_STATE_MASK_EVALUATE;
+
+        if(mask_msb != psx_gpu->mask_msb)
+        {
+          flush_render_block_buffer(psx_gpu);
+          psx_gpu->mask_msb = mask_msb;
+        }
+
+        SET_Ex(6, list[0]);
+        break;
+      }
+  
+      default:
+        break;
+    }
+  }
+
+  enhancement_disable();
+
+breakloop:
+  if (last_command != NULL)
+    *last_command = current_command;
+  return list - list_start;
+}
+
+#endif /* PCSX */
+
+// vim:shiftwidth=2:expandtab
index 210f44d..58cca29 100644 (file)
@@ -13,7 +13,7 @@ ASFLAGS = $(CFLAGS)
 OBJ += psx_gpu_arm_neon.o
 endif
 ifndef DEBUG
-CFLAGS += -O2 -fno-strict-aliasing
+CFLAGS += -O2 -DNDEBUG -fno-strict-aliasing
 endif
 
 OBJ += psx_gpu.o psx_gpu_parse.o psx_gpu_main.o
index c11955d..c91e7d9 100644 (file)
@@ -394,6 +394,10 @@ build_vector_types(s);
   foreach_element(8, (dest).e[_i] =                                            \
    (u8)(source_a).e[_i] | ((u8)(source_b).e[_i] << 8))                         \
 
+#define zip_4x32b(dest, source_a, source_b)                                    \
+  foreach_element(4, (dest).e[_i] =                                            \
+   (u16)(source_a).e[_i] | ((u16)(source_b).e[_i] << 16))                      \
+
 #define zip_2x64b(dest, source_a, source_b)                                    \
   foreach_element(2, (dest).e[_i] =                                            \
    (u64)(source_a).e[_i] | ((u64)(source_b).e[_i] << 32))                      \
index ff31c27..ad01761 100644 (file)
@@ -9,11 +9,13 @@
  */
 
 #include <stdio.h>
+#include <sys/mman.h>
 
 extern const unsigned char cmd_lengths[256];
 #define command_lengths cmd_lengths
 
 static unsigned int *ex_regs;
+static int initialized;
 
 #define PCSX
 #define SET_Ex(r, v) \
@@ -27,20 +29,102 @@ static psx_gpu_struct egpu __attribute__((aligned(256)));
 
 int do_cmd_list(uint32_t *list, int count, int *last_cmd)
 {
-  int ret = gpu_parse(&egpu, list, count * 4, (u32 *)last_cmd);
+  int ret;
+  // pick the parser: the enhanced one only while gpu.state.enhancement_active is set
+  if (gpu.state.enhancement_active)
+    ret = gpu_parse_enhanced(&egpu, list, count * 4, (u32 *)last_cmd);
+  else
+    ret = gpu_parse(&egpu, list, count * 4, (u32 *)last_cmd);
 
   ex_regs[1] &= ~0x1ff;
   ex_regs[1] |= egpu.texture_settings & 0x1ff;
   return ret;
 }
 
+#define ENHANCEMENT_BUF_SIZE (1024 * 1024 * 2 * 4 + 4096 * 2)
+
+static uint16_t *get_enhancement_bufer(int *x, int *y, int *w, int *h,
+ int *vram_h) // x, y, w, h are in/out (doubled in place); *vram_h is out
+{
+  uint16_t *ret = select_enhancement_buf_ptr(&egpu, *x); // looked up with the unscaled x
+  // convert the caller's rectangle to the doubled coordinates
+  *x *= 2;
+  *y *= 2;
+  *w = *w * 2;
+  *h = *h * 2;
+  *vram_h = 1024; // height in rows reported back to the caller
+  return ret;
+}
+
+static void map_enhancement_buffer(void) // maps the buffer and publishes (or clears) the accessor hook
+{
+  // currently we use 4x 1024*1024 buffers instead of single 2048*1024
+  // to be able to reuse 1024-width code better (triangle setup,
+  // dithering phase, lines).
+  egpu.enhancement_buf_ptr = gpu.mmap(ENHANCEMENT_BUF_SIZE);
+  if (egpu.enhancement_buf_ptr == NULL) {
+    fprintf(stderr, "failed to map enhancement buffer\n");
+    gpu.get_enhancement_bufer = NULL; // check_mode_change() keeps enhancement inactive while this is NULL
+  }
+  else {
+    egpu.enhancement_buf_ptr += 4096 / 2; // skip a leading pad; renderer_finish() undoes this before munmap
+    gpu.get_enhancement_bufer = get_enhancement_bufer;
+  }
+}
+
 int renderer_init(void)
 {
-  initialize_psx_gpu(&egpu, gpu.vram);
+  if (gpu.vram != NULL) { // vram may not be mapped yet; renderer_set_config() initializes later then
+    initialize_psx_gpu(&egpu, gpu.vram);
+    initialized = 1;
+  }
+  // map the enhancement buffer only once, and only if an mmap callback was provided
+  if (gpu.mmap != NULL && egpu.enhancement_buf_ptr == NULL)
+    map_enhancement_buffer();
+
   ex_regs = gpu.ex_regs;
   return 0;
 }
 
+void renderer_finish(void) // releases the enhancement buffer and clears the initialized flag
+{
+  if (egpu.enhancement_buf_ptr != NULL) {
+    egpu.enhancement_buf_ptr -= 4096 / 2; // undo the offset applied in map_enhancement_buffer()
+    gpu.munmap(egpu.enhancement_buf_ptr, ENHANCEMENT_BUF_SIZE);
+  }
+  egpu.enhancement_buf_ptr = NULL;
+  egpu.enhancement_current_buf_ptr = NULL;
+  initialized = 0; // lets renderer_init()/renderer_set_config() initialize egpu again
+}
+
+static __attribute__((noinline)) void
+sync_enhancement_buffers(int x, int y, int w, int h) // re-upscale a vram rectangle into the enhancement buffers
+{
+  const int step_x = 1024 / sizeof(egpu.enhancement_buf_by_x16); // x span per table entry (assumes 1-byte entries -- confirm)
+  u16 *src, *dst;
+  int w1, fb_index;
+  // widen [x, x + w) outward to step_x boundaries
+  w += x & (step_x - 1);
+  x &= ~(step_x - 1);
+  w = (w + step_x - 1) & ~(step_x - 1);
+  if (y + h > 512) // do not read past row 511 of vram
+    h = 512 - y;
+  // handle one run of adjacent columns sharing the same buffer index per iteration
+  while (w > 0) {
+    fb_index = egpu.enhancement_buf_by_x16[x / step_x];
+    for (w1 = 0; w > 0; w1++, w -= step_x) // w1 counts the table entries in this run
+      if (fb_index != egpu.enhancement_buf_by_x16[x / step_x + w1])
+        break;
+
+    src = gpu.vram + y * 1024 + x;
+    dst = select_enhancement_buf_ptr(&egpu, x);
+    dst += (y * 1024 + x) * 2; // destination offset is twice the source offset
+    scale2x_tiles8(dst, src, w1 * step_x / 8, h); // width passed in units of 8 pixels
+
+    x += w1 * step_x;
+  }
+}
+
 void renderer_sync_ecmds(uint32_t *ecmds)
 {
   gpu_parse(&egpu, ecmds + 1, 6 * 4, NULL);
@@ -49,6 +133,8 @@ void renderer_sync_ecmds(uint32_t *ecmds)
 void renderer_update_caches(int x, int y, int w, int h)
 {
   update_texture_cache_region(&egpu, x, y, x + w - 1, y + h - 1);
+  if (gpu.state.enhancement_active && !gpu.status.rgb24) // also refresh the 2x copy of the same rectangle
+    sync_enhancement_buffers(x, y, w, h);
 }
 
 void renderer_flush_queues(void)
@@ -58,13 +144,44 @@ void renderer_flush_queues(void)
 
 void renderer_set_interlace(int enable, int is_odd)
 {
-  egpu.interlace_mode &= ~(RENDER_INTERLACE_ENABLED|RENDER_INTERLACE_ODD);
+  egpu.render_mode &= ~(RENDER_INTERLACE_ENABLED|RENDER_INTERLACE_ODD); // clear both interlace bits first
   if (enable)
-    egpu.interlace_mode |= RENDER_INTERLACE_ENABLED;
+    egpu.render_mode |= RENDER_INTERLACE_ENABLED;
   if (is_odd)
-    egpu.interlace_mode |= RENDER_INTERLACE_ODD;
+    egpu.render_mode |= RENDER_INTERLACE_ODD; // set independently of 'enable'
 }
 
+void renderer_notify_res_change(void) // called by GPUwriteStatus after the display resolution is updated
+{
+  // note: must keep it multiple of 8
+  if (egpu.enhancement_x_threshold != gpu.screen.hres) // rebuild the table only when hres actually changed
+  {
+    egpu.enhancement_x_threshold = gpu.screen.hres;
+    update_enhancement_buf_table_from_hres(&egpu);
+  }
+}
+
+#include "../../frontend/plugin_lib.h"
+
 void renderer_set_config(const struct rearmed_cbs *cbs)
 {
+  static int enhancement_was_on; // previous enhancement_enable value, kept across calls
+
+  disable_main_render = cbs->gpu_neon.enhancement_no_main;
+  if (egpu.enhancement_buf_ptr != NULL && cbs->gpu_neon.enhancement_enable
+      && !enhancement_was_on)
+  {
+    sync_enhancement_buffers(0, 0, 1024, 512); // off -> on: upscale the whole vram once
+  }
+  enhancement_was_on = cbs->gpu_neon.enhancement_enable;
+  // deferred init for the case where renderer_init() ran before vram was mapped
+  if (!initialized) {
+    initialize_psx_gpu(&egpu, gpu.vram);
+    initialized = 1;
+  }
+  // map the enhancement buffer if that has not happened yet
+  if (gpu.mmap != NULL && egpu.enhancement_buf_ptr == NULL)
+    map_enhancement_buffer();
+  if (cbs->pl_set_gpu_caps) // optional callback
+    cbs->pl_set_gpu_caps(GPU_CAP_SUPPORTS_2X);
 }
index 46552ac..c111d78 100644 (file)
@@ -824,7 +824,6 @@ void  GPU_updateLace(void)
 extern "C" {
 
 static const struct rearmed_cbs *cbs;
-static void *screen_buf;
 static s16 old_res_horz, old_res_vert, old_rgb24;
 
 static void blit(void)
@@ -832,12 +831,10 @@ static void blit(void)
        u16 *base = (u16 *)GPU_FrameBuffer;
        s16 isRGB24 = (GPU_GP1 & 0x00200000) ? 1 : 0;
        s16 h0, x0, y0, w0, h1;
-       u32 fb_offs;
-       u8  *dest;
 
        x0 = DisplayArea[0] & ~1; // alignment needed by blitter
        y0 = DisplayArea[1];
-       fb_offs = FRAME_OFFSET(x0, y0);
+       base += FRAME_OFFSET(x0, y0);
 
        w0 = DisplayArea[2];
        h0 = DisplayArea[3];  // video mode
@@ -853,62 +850,10 @@ static void blit(void)
                old_res_horz = w0;
                old_res_vert = h1;
                old_rgb24 = (s16)isRGB24;
-               screen_buf = cbs->pl_vout_set_mode(w0, h1, isRGB24 ? 24 : 16);
+               cbs->pl_vout_set_mode(w0, h1, isRGB24 ? 24 : 16);
        }
-       dest = (u8 *)screen_buf;
 
-       if (isRGB24)
-       {
-               if (!cbs->only_16bpp)
-               {
-                       for (; h1-- > 0; dest += w0 * 3, fb_offs += 1024)
-                       {
-                               fb_offs &= 1024*512-1;
-                               bgr888_to_rgb888(dest, base + fb_offs, w0 * 3);
-                       }
-               }
-               else
-               {
-                       for (; h1-- > 0; dest += w0 * 2, fb_offs += 1024)
-                       {
-                               fb_offs &= 1024*512-1;
-                               bgr888_to_rgb565(dest, base + fb_offs, w0 * 3);
-                       }
-               }
-       }
-       else
-       {
-               for (; h1-- > 0; dest += w0 * 2, fb_offs += 1024)
-               {
-                       fb_offs &= 1024*512-1;
-                       bgr555_to_rgb565(dest, base + fb_offs, w0 * 2);
-               }
-       }
-
-       screen_buf = cbs->pl_vout_flip();
-}
-
-static void blit_raw(void)
-{
-       s16 isRGB24 = (GPU_GP1 & 0x00200000) ? 1 : 0;
-       s16 h0, w0, h1;
-
-       w0 = DisplayArea[2];
-       h0 = DisplayArea[3];  // video mode
-       h1 = DisplayArea[5] - DisplayArea[4]; // display needed
-       if (h0 == 480) h1 = Min2(h1*2,480);
-
-       if (h1 <= 0)
-               return;
-
-       if (w0 != old_res_horz || h1 != old_res_vert || isRGB24 != old_rgb24)
-       {
-               old_res_horz = w0;
-               old_res_vert = h1;
-               old_rgb24 = (s16)isRGB24;
-               screen_buf = cbs->pl_vout_set_mode(w0, h1, isRGB24 ? 24 : 16);
-       }
-       cbs->pl_vout_raw_flip(DisplayArea[0], DisplayArea[1]);
+       cbs->pl_vout_flip(base, 1024, isRGB24, w0, h1);
 }
 
 void GPU_updateLace(void)
@@ -920,10 +865,7 @@ void GPU_updateLace(void)
                return;
 
        if (!wasSkip) {
-               if (cbs->pl_vout_raw_flip != NULL)
-                       blit_raw();
-               else
-                       blit();
+               blit();
                fb_dirty = false;
                skCount = 0;
        }
@@ -939,7 +881,6 @@ void GPU_updateLace(void)
 long GPUopen(unsigned long *, char *, char *)
 {
        cbs->pl_vout_open();
-       screen_buf = cbs->pl_vout_flip();
        return 0;
 }
 
@@ -966,6 +907,8 @@ void GPUrearmedCallbacks(const struct rearmed_cbs *cbs_)
                cbs_->pl_vout_set_raw_vram((void *)GPU_FrameBuffer);
 
        cbs = cbs_;
+       if (cbs->pl_set_gpu_caps)
+               cbs->pl_set_gpu_caps(0);
 }
 
 } /* extern "C" */
index 38e7ce1..de16721 100644 (file)
@@ -154,6 +154,14 @@ int renderer_init(void)
        return 0;
 }
 
+void renderer_finish(void)
+{
+}
+
+void renderer_notify_res_change(void)
+{
+}
+
 extern const unsigned char cmd_lengths[256];
 
 int do_cmd_list(unsigned int *list, int list_len, int *last_cmd)
@@ -525,6 +533,8 @@ void renderer_set_config(const struct rearmed_cbs *cbs)
   enableAbbeyHack = cbs->gpu_unai.abe_hack;
   light = !cbs->gpu_unai.no_light;
   blend = !cbs->gpu_unai.no_blend;
+
+  GPU_FrameBuffer = (u16 *)gpu.vram;
 }
 
 #endif
index 46e92d1..b61bff6 100644 (file)
@@ -24,7 +24,7 @@
 //#define log_anomaly gpu_log
 #define log_anomaly(...)
 
-struct psx_gpu gpu __attribute__((aligned(2048)));
+struct psx_gpu gpu;
 
 static noinline int do_cmd_buffer(uint32_t *data, int count);
 static void finish_vram_transfer(int is_read);
@@ -133,6 +133,22 @@ static noinline void get_gpu_info(uint32_t data)
   }
 }
 
+// double, for overdraw guard
+#define VRAM_SIZE (1024 * 512 * 2 * 2)
+
+static int map_vram(void) // returns 0 on success, -1 if the mmap callback returned NULL
+{
+  gpu.vram = gpu.mmap(VRAM_SIZE);
+  if (gpu.vram != NULL) {
+    gpu.vram += 4096 / 2; // 2048 uint16_t = 4096 bytes in; GPUshutdown() undoes this before munmap
+    return 0;
+  }
+  else {
+    fprintf(stderr, "could not map vram, expect crashes\n");
+    return -1;
+  }
+}
+
 long GPUinit(void)
 {
   int ret;
@@ -145,12 +161,26 @@ long GPUinit(void)
   gpu.cmd_len = 0;
   do_reset();
 
+  if (gpu.mmap != NULL) {
+    if (map_vram() != 0)
+      ret = -1;
+  }
   return ret;
 }
 
 long GPUshutdown(void)
 {
-  return vout_finish();
+  long ret; // result of vout_finish(), returned to the caller
+
+  renderer_finish();
+  ret = vout_finish();
+  if (gpu.vram != NULL) {
+    gpu.vram -= 4096 / 2; // back to the address returned by gpu.mmap in map_vram()
+    gpu.munmap(gpu.vram, VRAM_SIZE);
+  }
+  gpu.vram = NULL;
+
+  return ret;
 }
 
 void GPUwriteStatus(uint32_t data)
@@ -207,6 +237,7 @@ void GPUwriteStatus(uint32_t data)
       gpu.screen.vres = vres[(gpu.status.reg >> 19) & 3];
       update_width();
       update_height();
+      renderer_notify_res_change();
       break;
     default:
       if ((cmd & 0xf0) == 0x10)
@@ -582,13 +613,13 @@ long GPUfreeze(uint32_t type, struct GPUFreeze *freeze)
     case 1: // save
       if (gpu.cmd_len > 0)
         flush_cmd_buffer();
-      memcpy(freeze->psxVRam, gpu.vram, sizeof(gpu.vram));
+      memcpy(freeze->psxVRam, gpu.vram, 1024 * 512 * 2);
       memcpy(freeze->ulControl, gpu.regs, sizeof(gpu.regs));
       memcpy(freeze->ulControl + 0xe0, gpu.ex_regs, sizeof(gpu.ex_regs));
       freeze->ulStatus = gpu.status.reg;
       break;
     case 0: // load
-      memcpy(gpu.vram, freeze->psxVRam, sizeof(gpu.vram));
+      memcpy(gpu.vram, freeze->psxVRam, 1024 * 512 * 2);
       memcpy(gpu.regs, freeze->ulControl, sizeof(gpu.regs));
       memcpy(gpu.ex_regs, freeze->ulControl + 0xe0, sizeof(gpu.ex_regs));
       gpu.status.reg = freeze->ulStatus;
@@ -669,6 +700,14 @@ void GPUrearmedCallbacks(const struct rearmed_cbs *cbs)
   gpu.state.hcnt = cbs->gpu_hcnt;
   gpu.state.frame_count = cbs->gpu_frame_count;
   gpu.state.allow_interlace = cbs->gpu_neon.allow_interlace;
+  gpu.state.enhancement_enable = cbs->gpu_neon.enhancement_enable;
+
+  gpu.mmap = cbs->mmap;
+  gpu.munmap = cbs->munmap;
+
+  // delayed vram mmap
+  if (gpu.vram == NULL)
+    map_vram();
 
   if (cbs->pl_vout_set_raw_vram)
     cbs->pl_vout_set_raw_vram(gpu.vram);
index 1cbe38c..d11f991 100644 (file)
@@ -17,10 +17,9 @@ extern "C" {
 #define CMD_BUFFER_LEN          1024
 
 struct psx_gpu {
-  uint16_t vram[1024 * 512];
-  uint16_t guard[1024 * 512]; // overdraw guard
   uint32_t cmd_buffer[CMD_BUFFER_LEN];
   uint32_t regs[16];
+  uint16_t *vram;
   union {
     uint32_t reg;
     struct {
@@ -67,6 +66,8 @@ struct psx_gpu {
     uint32_t old_interlace:1;
     uint32_t allow_interlace:2;
     uint32_t blanked:1;
+    uint32_t enhancement_enable:1;
+    uint32_t enhancement_active:1;
     uint32_t *frame_count;
     uint32_t *hcnt; /* hsync count */
     struct {
@@ -87,6 +88,10 @@ struct psx_gpu {
     uint32_t last_flip_frame;
     uint32_t pending_fill[3];
   } frameskip;
+  uint16_t *(*get_enhancement_bufer)
+    (int *x, int *y, int *w, int *h, int *vram_h);
+  void *(*mmap)(unsigned int size);
+  void  (*munmap)(void *ptr, unsigned int size);
 };
 
 extern struct psx_gpu gpu;
@@ -98,11 +103,13 @@ int do_cmd_list(uint32_t *list, int count, int *last_cmd);
 struct rearmed_cbs;
 
 int  renderer_init(void);
+void renderer_finish(void);
 void renderer_sync_ecmds(uint32_t * ecmds);
 void renderer_update_caches(int x, int y, int w, int h);
 void renderer_flush_queues(void);
 void renderer_set_interlace(int enable, int is_odd);
 void renderer_set_config(const struct rearmed_cbs *config);
+void renderer_notify_res_change(void);
 
 int  vout_init(void);
 int  vout_finish(void);
index 0bd1ecf..11307e2 100644 (file)
@@ -15,7 +15,6 @@
 #include "../../frontend/plugin_lib.h"
 
 static const struct rearmed_cbs *cbs;
-static void *screen_buf;
 
 int vout_init(void)
 {
@@ -27,90 +26,74 @@ int vout_finish(void)
   return 0;
 }
 
-static void check_mode_change(void)
+static void check_mode_change(int force) // force != 0: call pl_vout_set_mode even if nothing changed
 {
   static uint32_t old_status;
   static int old_h;
+  int w = gpu.screen.hres;
+  int h = gpu.screen.h;
+  // enhancement needs the renderer hook, the user setting, a mode of at most 512x256 and 16bpp
+  gpu.state.enhancement_active =
+    gpu.get_enhancement_bufer != NULL && gpu.state.enhancement_enable
+    && w <= 512 && h <= 256 && !gpu.status.rgb24;
+  // the output mode is doubled while enhancement is active
+  if (gpu.state.enhancement_active) {
+    w *= 2;
+    h *= 2;
+  }
 
   // width|rgb24 change?
-  if ((gpu.status.reg ^ old_status) & ((7<<16)|(1<<21)) || gpu.screen.h != old_h)
+  if (force || (gpu.status.reg ^ old_status) & ((7<<16)|(1<<21)) || h != old_h)
   {
     old_status = gpu.status.reg;
-    old_h = gpu.screen.h;
-    screen_buf = cbs->pl_vout_set_mode(gpu.screen.hres, gpu.screen.h,
+    old_h = h; // stores the possibly doubled height, so an enhancement toggle is seen as a change
+
+    cbs->pl_vout_set_mode(w, h,
       (gpu.status.rgb24 && !cbs->only_16bpp) ? 24 : 16);
   }
 }
 
-static void blit(void)
+void vout_update(void) // hands the current display area (plain or 2x-enhanced) to pl_vout_flip
 {
   int x = gpu.screen.x & ~1; // alignment needed by blitter
   int y = gpu.screen.y;
   int w = gpu.screen.w;
   int h = gpu.screen.h;
   uint16_t *vram = gpu.vram;
-  int stride = gpu.screen.hres;
-  int fb_offs, doffs;
-  uint8_t *dest;
+  int vram_h = 512; // rows in plain vram; replaced by get_enhancement_bufer() when enhanced
 
-  dest = (uint8_t *)screen_buf;
-  if (dest == NULL)
+  if (w == 0 || h == 0) // nothing to show
     return;
 
-  fb_offs = y * 1024 + x;
+  check_mode_change(0);
+  if (gpu.state.enhancement_active) // switches vram/x/y/w/h/vram_h to the doubled buffer
+    vram = gpu.get_enhancement_bufer(&x, &y, &w, &h, &vram_h);
 
-  // only do centering, at least for now
-  doffs = (stride - w) / 2 & ~1;
-
-  if (gpu.status.rgb24)
-  {
-    if (cbs->only_16bpp) {
-      dest += doffs * 2;
-      for (; h-- > 0; dest += stride * 2, fb_offs += 1024)
-      {
-        fb_offs &= 1024*512-1;
-        bgr888_to_rgb565(dest, vram + fb_offs, w * 3);
-      }
-    }
-    else {
-      dest += (doffs / 8) * 24;
-      for (; h-- > 0; dest += stride * 3, fb_offs += 1024)
-      {
-        fb_offs &= 1024*512-1;
-        bgr888_to_rgb888(dest, vram + fb_offs, w * 3);
-      }
-    }
-  }
-  else
-  {
-    dest += doffs * 2;
-    for (; h-- > 0; dest += stride * 2, fb_offs += 1024)
-    {
-      fb_offs &= 1024*512-1;
-      bgr555_to_rgb565(dest, vram + fb_offs, w * 2);
+  if (y + h > vram_h) { // display area runs past the last row
+    if (y + h - vram_h > h / 2) {
+      // wrap: more than half is past the end, show the wrapped rows from the top
+      h -= vram_h - y; // must use the old y: rows left over after reaching the bottom
+      y = 0;
     }
+    else
+      // clip: keep only the rows that fit above the bottom
+      h = vram_h - y;
   }
 
-  screen_buf = cbs->pl_vout_flip();
-}
+  vram += y * 1024 + x; // 1024 pixels per row, matching the stride passed below
 
-void vout_update(void)
-{
-  check_mode_change();
-  if (cbs->pl_vout_raw_flip)
-    cbs->pl_vout_raw_flip(gpu.screen.x, gpu.screen.y);
-  else
-    blit();
+  cbs->pl_vout_flip(vram, 1024, gpu.status.rgb24, w, h);
 }
 
 void vout_blank(void)
 {
-  check_mode_change();
-  if (cbs->pl_vout_raw_flip == NULL) {
-    int bytespp = gpu.status.rgb24 ? 3 : 2;
-    memset(screen_buf, 0, gpu.screen.hres * gpu.screen.h * bytespp);
-    screen_buf = cbs->pl_vout_flip();
+  int w = gpu.screen.hres;
+  int h = gpu.screen.h;
+  if (gpu.state.enhancement_active) { // same doubled size that check_mode_change() sets
+    w *= 2;
+    h *= 2;
   }
+  cbs->pl_vout_flip(NULL, 1024, gpu.status.rgb24, w, h); // NULL source: presumably means "blank frame" -- confirm in frontend
 }
 
 long GPUopen(void **unused)
@@ -119,7 +102,7 @@ long GPUopen(void **unused)
   gpu.frameskip.frame_ready = 1;
 
   cbs->pl_vout_open();
-  screen_buf = cbs->pl_vout_flip();
+  check_mode_change(1);
   return 0;
 }