Backport GPU Unai plugin from PCSX4ALL
author negativeExponent <negativeExponent@users.noreply.github.com>
Sat, 17 Aug 2019 01:31:06 +0000 (09:31 +0800)
committer negativeExponent <negativeExponent@users.noreply.github.com>
Sat, 17 Aug 2019 01:33:48 +0000 (09:33 +0800)
- backports gpu unai plugin from PCSX4ALL
- sync necessary files with notaz/master to allow building standalone app

54 files changed:
Makefile
blackberry_qnx/.cproject [new file with mode: 0644]
blackberry_qnx/.project [new file with mode: 0644]
debian_maemo/buildpkg [new file with mode: 0644]
debian_maemo/changelog [new file with mode: 0644]
debian_maemo/compat [new file with mode: 0644]
debian_maemo/control [new file with mode: 0644]
debian_maemo/copyright [new file with mode: 0644]
debian_maemo/dirs [new file with mode: 0644]
debian_maemo/docs [new file with mode: 0644]
debian_maemo/files [new file with mode: 0644]
debian_maemo/install [new file with mode: 0644]
debian_maemo/rules [new file with mode: 0644]
frontend/320240/caanoo.gpe [new file with mode: 0755]
frontend/320240/haptic_s.cfg [new file with mode: 0644]
frontend/320240/haptic_w.cfg [new file with mode: 0644]
frontend/320240/pcsx26.png [new file with mode: 0644]
frontend/320240/pcsx_rearmed.ini [new file with mode: 0644]
frontend/320240/pcsxb.png [new file with mode: 0644]
frontend/320240/pollux_set.c [new file with mode: 0644]
frontend/320240/skin/background.png [new file with mode: 0644]
frontend/320240/skin/font.png [new file with mode: 0644]
frontend/320240/skin/readme.txt [new file with mode: 0644]
frontend/320240/skin/selector.png [new file with mode: 0644]
frontend/320240/skin/skin.txt [new file with mode: 0644]
frontend/320240/ui_gp2x.h [new file with mode: 0644]
frontend/libretro.c
frontend/libretro_core_options.h
frontend/main.c
frontend/menu.c
frontend/plugin_lib.h
jni/Android.mk
maemo/hildon.c [new file with mode: 0644]
maemo/maemo_common.h [new file with mode: 0644]
maemo/maemo_xkb.c [new file with mode: 0644]
plugins/gpu_unai/Makefile
plugins/gpu_unai/README_senquack.txt [new file with mode: 0644]
plugins/gpu_unai/gpu.cpp
plugins/gpu_unai/gpu.h
plugins/gpu_unai/gpu_blit.h
plugins/gpu_unai/gpu_command.h
plugins/gpu_unai/gpu_fixedpoint.h
plugins/gpu_unai/gpu_inner.h
plugins/gpu_unai/gpu_inner_blend.h
plugins/gpu_unai/gpu_inner_blend_arm5.h [new file with mode: 0644]
plugins/gpu_unai/gpu_inner_blend_arm7.h [new file with mode: 0644]
plugins/gpu_unai/gpu_inner_light.h
plugins/gpu_unai/gpu_inner_quantization.h [new file with mode: 0644]
plugins/gpu_unai/gpu_raster_image.h
plugins/gpu_unai/gpu_raster_line.h
plugins/gpu_unai/gpu_raster_polygon.h
plugins/gpu_unai/gpu_raster_sprite.h
plugins/gpu_unai/gpu_unai.h [new file with mode: 0644]
plugins/gpu_unai/gpulib_if.cpp

index 06e4fcc..b44c1f2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -150,6 +150,9 @@ OBJS += plugins/dfxvideo/gpulib_if.o
 endif
 ifeq "$(BUILTIN_GPU)" "unai"
 CFLAGS += -DGPU_UNAI
+CFLAGS += -DUSE_GPULIB=1
+#CFLAGS += -DINLINE="static __inline__"
+#CFLAGS += -Dasm="__asm__ __volatile__"
 OBJS += plugins/gpu_unai/gpulib_if.o
 ifeq "$(ARCH)" "arm"
 OBJS += plugins/gpu_unai/gpu_arm.o
diff --git a/blackberry_qnx/.cproject b/blackberry_qnx/.cproject
new file mode 100644 (file)
index 0000000..565f4a9
--- /dev/null
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?>
+
+<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+       <storageModule moduleId="org.eclipse.cdt.core.settings">
+               <cconfiguration id="com.qnx.qcc.toolChain.1762498539">
+                       <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.qnx.qcc.toolChain.1762498539" moduleId="org.eclipse.cdt.core.settings" name="Device-Debug">
+                               <externalSettings/>
+                               <extensions>
+                                       <extension id="com.qnx.tools.ide.qde.core.QDEBynaryParser" point="org.eclipse.cdt.core.BinaryParser"/>
+                                       <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                                       <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+                                       <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                               </extensions>
+                       </storageModule>
+                       <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+                               <configuration artifactName="${ProjName}" buildProperties="" description="" id="com.qnx.qcc.toolChain.1762498539" name="Device-Debug" parent="org.eclipse.cdt.build.core.emptycfg">
+                                       <folderInfo id="com.qnx.qcc.toolChain.1762498539.1561488424" name="/" resourcePath="">
+                                               <toolChain id="com.qnx.qcc.toolChain.682312592" name="com.qnx.qcc.toolChain" superClass="com.qnx.qcc.toolChain">
+                                                       <option id="com.qnx.qcc.option.os.1720929524" name="Target OS:" superClass="com.qnx.qcc.option.os"/>
+                                                       <option id="com.qnx.qcc.option.cpu.2107899725" name="Target CPU:" superClass="com.qnx.qcc.option.cpu" value="com.qnx.qcc.option.gen.cpu.armle-v7" valueType="enumerated"/>
+                                                       <option id="com.qnx.qcc.option.compiler.596535986" name="Compiler:" superClass="com.qnx.qcc.option.compiler"/>
+                                                       <option id="com.qnx.qcc.option.runtime.742171011" name="Runtime:" superClass="com.qnx.qcc.option.runtime"/>
+                                                       <targetPlatform archList="all" binaryParser="com.qnx.tools.ide.qde.core.QDEBynaryParser" id="com.qnx.qcc.targetPlatform.982231418" osList="all" superClass="com.qnx.qcc.targetPlatform"/>
+                                                       <builder arguments="-C .. -f Makefile.libretro platform=qnx" command="make" id="com.qnx.qcc.toolChain.1762498539.480897078" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
+                                                       <tool id="com.qnx.qcc.tool.compiler.267897021" name="QCC Compiler" superClass="com.qnx.qcc.tool.compiler">
+                                                               <option id="com.qnx.qcc.option.compiler.optlevel.1293751119" name="Optimization Level" superClass="com.qnx.qcc.option.compiler.optlevel" value="com.qnx.qcc.option.compiler.optlevel.0" valueType="enumerated"/>
+                                                               <option id="com.qnx.qcc.option.compiler.includePath.365274483" name="Include Directories (-I)" superClass="com.qnx.qcc.option.compiler.includePath" valueType="includePath">
+                                                                       <listOptionValue builtIn="false" value="${QNX_TARGET}/usr/include/freetype2"/>
+                                                                       <listOptionValue builtIn="false" value="${QNX_TARGET}/../target-override/usr/include"/>
+                                                               </option>
+                                                               <inputType id="com.qnx.qcc.inputType.compiler.116424583" superClass="com.qnx.qcc.inputType.compiler"/>
+                                                       </tool>
+                                                       <tool id="com.qnx.qcc.tool.assembler.1307903249" name="QCC Assembler" superClass="com.qnx.qcc.tool.assembler">
+                                                               <inputType id="com.qnx.qcc.inputType.assembler.1838739065" superClass="com.qnx.qcc.inputType.assembler"/>
+                                                       </tool>
+                                                       <tool id="com.qnx.qcc.tool.linker.1852803277" name="QCC Linker" superClass="com.qnx.qcc.tool.linker"/>
+                                                       <tool id="com.qnx.qcc.tool.archiver.1682937256" name="QCC Archiver" superClass="com.qnx.qcc.tool.archiver"/>
+                                               </toolChain>
+                                       </folderInfo>
+                               </configuration>
+                       </storageModule>
+                       <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+               </cconfiguration>
+               <cconfiguration id="com.qnx.qcc.toolChain.1815033502">
+                       <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.qnx.qcc.toolChain.1815033502" moduleId="org.eclipse.cdt.core.settings" name="Device-Release">
+                               <externalSettings/>
+                               <extensions>
+                                       <extension id="com.qnx.tools.ide.qde.core.QDEBynaryParser" point="org.eclipse.cdt.core.BinaryParser"/>
+                                       <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                                       <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+                                       <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                               </extensions>
+                       </storageModule>
+                       <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+                               <configuration artifactName="${ProjName}" buildProperties="" description="" id="com.qnx.qcc.toolChain.1815033502" name="Device-Release" parent="org.eclipse.cdt.build.core.emptycfg">
+                                       <folderInfo id="com.qnx.qcc.toolChain.1815033502.1093640979" name="/" resourcePath="">
+                                               <toolChain id="com.qnx.qcc.toolChain.1811843468" name="com.qnx.qcc.toolChain" superClass="com.qnx.qcc.toolChain">
+                                                       <option id="com.qnx.qcc.option.os.66936807" name="Target OS:" superClass="com.qnx.qcc.option.os"/>
+                                                       <option id="com.qnx.qcc.option.cpu.1884625209" name="Target CPU:" superClass="com.qnx.qcc.option.cpu" value="com.qnx.qcc.option.gen.cpu.armle-v7" valueType="enumerated"/>
+                                                       <option id="com.qnx.qcc.option.compiler.903071639" name="Compiler:" superClass="com.qnx.qcc.option.compiler"/>
+                                                       <option id="com.qnx.qcc.option.runtime.901433789" name="Runtime:" superClass="com.qnx.qcc.option.runtime"/>
+                                                       <targetPlatform archList="all" binaryParser="com.qnx.tools.ide.qde.core.QDEBynaryParser" id="com.qnx.qcc.targetPlatform.1169345860" osList="all" superClass="com.qnx.qcc.targetPlatform"/>
+                                                       <builder id="com.qnx.qcc.toolChain.1815033502.1831895405" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
+                                                       <tool id="com.qnx.qcc.tool.compiler.401658009" name="QCC Compiler" superClass="com.qnx.qcc.tool.compiler">
+                                                               <option id="com.qnx.qcc.option.compiler.optlevel.20820451" name="Optimization Level" superClass="com.qnx.qcc.option.compiler.optlevel" value="com.qnx.qcc.option.compiler.optlevel.0" valueType="enumerated"/>
+                                                               <option id="com.qnx.qcc.option.compiler.includePath.2022402746" name="Include Directories (-I)" superClass="com.qnx.qcc.option.compiler.includePath" valueType="includePath">
+                                                                       <listOptionValue builtIn="false" value="${QNX_TARGET}/usr/include/freetype2"/>
+                                                                       <listOptionValue builtIn="false" value="${QNX_TARGET}/../target-override/usr/include"/>
+                                                               </option>
+                                                               <inputType id="com.qnx.qcc.inputType.compiler.1180700251" superClass="com.qnx.qcc.inputType.compiler"/>
+                                                       </tool>
+                                                       <tool id="com.qnx.qcc.tool.assembler.1403530230" name="QCC Assembler" superClass="com.qnx.qcc.tool.assembler">
+                                                               <inputType id="com.qnx.qcc.inputType.assembler.1360707586" superClass="com.qnx.qcc.inputType.assembler"/>
+                                                       </tool>
+                                                       <tool id="com.qnx.qcc.tool.linker.577346665" name="QCC Linker" superClass="com.qnx.qcc.tool.linker"/>
+                                                       <tool id="com.qnx.qcc.tool.archiver.637344581" name="QCC Archiver" superClass="com.qnx.qcc.tool.archiver"/>
+                                               </toolChain>
+                                       </folderInfo>
+                               </configuration>
+                       </storageModule>
+                       <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+               </cconfiguration>
+               <cconfiguration id="com.qnx.qcc.toolChain.1271074456">
+                       <storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.qnx.qcc.toolChain.1271074456" moduleId="org.eclipse.cdt.core.settings" name="Simulator-Debug">
+                               <externalSettings/>
+                               <extensions>
+                                       <extension id="com.qnx.tools.ide.qde.core.QDEBynaryParser" point="org.eclipse.cdt.core.BinaryParser"/>
+                                       <extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                                       <extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+                                       <extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+                               </extensions>
+                       </storageModule>
+                       <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+                               <configuration artifactName="${ProjName}" buildProperties="" description="" id="com.qnx.qcc.toolChain.1271074456" name="Simulator-Debug" parent="org.eclipse.cdt.build.core.emptycfg">
+                                       <folderInfo id="com.qnx.qcc.toolChain.1271074456.2095507025" name="/" resourcePath="">
+                                               <toolChain id="com.qnx.qcc.toolChain.563285451" name="com.qnx.qcc.toolChain" superClass="com.qnx.qcc.toolChain">
+                                                       <option id="com.qnx.qcc.option.os.2028959839" name="Target OS:" superClass="com.qnx.qcc.option.os"/>
+                                                       <option id="com.qnx.qcc.option.cpu.460119393" name="Target CPU:" superClass="com.qnx.qcc.option.cpu"/>
+                                                       <option id="com.qnx.qcc.option.compiler.318948553" name="Compiler:" superClass="com.qnx.qcc.option.compiler"/>
+                                                       <option id="com.qnx.qcc.option.runtime.1244314155" name="Runtime:" superClass="com.qnx.qcc.option.runtime"/>
+                                                       <targetPlatform archList="all" binaryParser="com.qnx.tools.ide.qde.core.QDEBynaryParser" id="com.qnx.qcc.targetPlatform.2005367550" osList="all" superClass="com.qnx.qcc.targetPlatform"/>
+                                                       <builder id="com.qnx.qcc.toolChain.1271074456.325666051" keepEnvironmentInBuildfile="false" managedBuildOn="false" name="Gnu Make Builder" superClass="org.eclipse.cdt.build.core.settings.default.builder"/>
+                                                       <tool id="com.qnx.qcc.tool.compiler.821983732" name="QCC Compiler" superClass="com.qnx.qcc.tool.compiler">
+                                                               <option id="com.qnx.qcc.option.compiler.optlevel.1701209030" name="Optimization Level" superClass="com.qnx.qcc.option.compiler.optlevel" value="com.qnx.qcc.option.compiler.optlevel.0" valueType="enumerated"/>
+                                                               <option id="com.qnx.qcc.option.compiler.includePath.1616908655" name="Include Directories (-I)" superClass="com.qnx.qcc.option.compiler.includePath" valueType="includePath">
+                                                                       <listOptionValue builtIn="false" value="${QNX_TARGET}/usr/include/freetype2"/>
+                                                                       <listOptionValue builtIn="false" value="${QNX_TARGET}/../target-override/usr/include"/>
+                                                               </option>
+                                                               <inputType id="com.qnx.qcc.inputType.compiler.1059435667" superClass="com.qnx.qcc.inputType.compiler"/>
+                                                       </tool>
+                                                       <tool id="com.qnx.qcc.tool.assembler.1920350417" name="QCC Assembler" superClass="com.qnx.qcc.tool.assembler">
+                                                               <inputType id="com.qnx.qcc.inputType.assembler.618235584" superClass="com.qnx.qcc.inputType.assembler"/>
+                                                       </tool>
+                                                       <tool id="com.qnx.qcc.tool.linker.1321150712" name="QCC Linker" superClass="com.qnx.qcc.tool.linker"/>
+                                                       <tool id="com.qnx.qcc.tool.archiver.1860233844" name="QCC Archiver" superClass="com.qnx.qcc.tool.archiver"/>
+                                               </toolChain>
+                                       </folderInfo>
+                               </configuration>
+                       </storageModule>
+                       <storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+               </cconfiguration>
+       </storageModule>
+       <storageModule moduleId="cdtBuildSystem" version="4.0.0">
+               <project id="pcsx_rearmed.null.446260429" name="pcsx_rearmed"/>
+       </storageModule>
+       <storageModule moduleId="scannerConfiguration">
+               <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.qnx.tools.ide.qde.managedbuilder.core.qccScannerInfo"/>
+               <scannerConfigBuildInfo instanceId="com.qnx.qcc.toolChain.1815033502">
+                       <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.qnx.tools.ide.qde.managedbuilder.core.qccScannerInfo"/>
+               </scannerConfigBuildInfo>
+               <scannerConfigBuildInfo instanceId="com.qnx.qcc.toolChain.1762498539">
+                       <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.qnx.tools.ide.qde.managedbuilder.core.qccScannerInfo"/>
+               </scannerConfigBuildInfo>
+               <scannerConfigBuildInfo instanceId="com.qnx.qcc.toolChain.1271074456">
+                       <autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.qnx.tools.ide.qde.managedbuilder.core.qccScannerInfo"/>
+               </scannerConfigBuildInfo>
+       </storageModule>
+       <storageModule moduleId="refreshScope" versionNumber="1">
+               <resource resourceType="PROJECT" workspacePath="/pcsx_rearmed"/>
+       </storageModule>
+</cproject>
diff --git a/blackberry_qnx/.project b/blackberry_qnx/.project
new file mode 100644 (file)
index 0000000..c8e1e20
--- /dev/null
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+       <name>pcsx_rearmed</name>
+       <comment></comment>
+       <projects>
+       </projects>
+       <buildSpec>
+               <buildCommand>
+                       <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+                       <triggers>clean,full,incremental,</triggers>
+                       <arguments>
+                               <dictionary>
+                                       <key>?name?</key>
+                                       <value></value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.append_environment</key>
+                                       <value>true</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.autoBuildTarget</key>
+                                       <value>all</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.buildArguments</key>
+                                       <value>-C .. -f Makefile.libretro platform=qnx</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.buildCommand</key>
+                                       <value>make</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.cleanBuildTarget</key>
+                                       <value>clean</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.contents</key>
+                                       <value>org.eclipse.cdt.make.core.activeConfigSettings</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.enableAutoBuild</key>
+                                       <value>false</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.enableCleanBuild</key>
+                                       <value>true</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.enableFullBuild</key>
+                                       <value>true</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.fullBuildTarget</key>
+                                       <value>all</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.stopOnError</key>
+                                       <value>true</value>
+                               </dictionary>
+                               <dictionary>
+                                       <key>org.eclipse.cdt.make.core.useDefaultBuildCmd</key>
+                                       <value>false</value>
+                               </dictionary>
+                       </arguments>
+               </buildCommand>
+               <buildCommand>
+                       <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+                       <triggers>full,incremental,</triggers>
+                       <arguments>
+                       </arguments>
+               </buildCommand>
+               <buildCommand>
+                       <name>com.qnx.tools.bbt.xml.core.bbtXMLValidationBuilder</name>
+                       <arguments>
+                       </arguments>
+               </buildCommand>
+       </buildSpec>
+       <natures>
+               <nature>org.eclipse.cdt.core.cnature</nature>
+               <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+               <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+               <nature>com.qnx.tools.ide.bbt.core.bbtnature</nature>
+       </natures>
+</projectDescription>
diff --git a/debian_maemo/buildpkg b/debian_maemo/buildpkg
new file mode 100644 (file)
index 0000000..4c34f94
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/bash -e
+
+NAME=`head debian/changelog -n1 | sed -n 's/^\(.*\) (\(.*\)) .*/\1-\2/p'`
+[[ -z $NAME ]] && { echo "Could not extract package name and version from debian/changelog" 2>&1; exit 1; }
+
+rm -rf ../$NAME
+cp -r ../`basename $PWD` ../$NAME
+cd ../$NAME
+rm -rf .git*
+find . -depth -name .svn -type d -exec rm -r {} \;
+find . -name '*~' -exec rm {} \;
+
+LD_LIBRARY_PATH=/usr/lib dpkg-buildpackage -rfakeroot $*
diff --git a/debian_maemo/changelog b/debian_maemo/changelog
new file mode 100644 (file)
index 0000000..e3395de
--- /dev/null
@@ -0,0 +1,112 @@
+pcsxrearmed (0.4.0.14.13) unstable; urgency=low
+
+   * Updated source to notaz git version
+
+ -- sakya <sakya_tg@yahoo.it>  Fri,  15 Feb 2013 12:50:28 +0200
+
+pcsxrearmed (0.4.0.14.12) unstable; urgency=low
+
+   * Fixed a problem with controller and vibration (Gran Turismo 2, Wipeout 3)
+   * Added dependency to libts
+
+ -- sakya <sakya_tg@yahoo.it>  Wed,  16 May 2012 17:09:33 +0200
+
+pcsxrearmed (0.4.0.14.11) unstable; urgency=low
+
+   * Added option -guncon and -gunnotrigger to activate guncon controller type
+
+ -- sakya <sakya_tg@yahoo.it>  Wed,  16 May 2012 09:37:12 +0200
+
+pcsxrearmed (0.4.0.14.10) unstable; urgency=low
+
+   * Added option -corners to set action to execute when clicking on display corners
+   * Fixed problem with notification using gles plugin
+   * Fixed controller problem with game "Heart Of Darkness" (maybe others?)
+
+ -- sakya <sakya_tg@yahoo.it>  Fri, 11 May 2012 16:38:29 +0200
+
+pcsxrearmed (0.4.0.14.9) unstable; urgency=low
+
+   * Added support to .mdf extension
+   * Added option -vibration to activate vibration
+
+ -- sakya <sakya_tg@yahoo.it>  Tue,  1 May 2012 12:19:49 +0200
+
+pcsxrearmed (0.4.0.14.8) unstable; urgency=low
+
+  * Added option -disc to set the initial disc in multi discs images (used when loading a savestate with -load)
+  * Added option -autosave
+  * Fixed disc change for multi discs images (PBP)
+  * Merged commits from Notaz git
+    * drc: inv: fix ram ofset and mirror handling
+    * support emulated RAM mapped at offset
+
+ -- sakya <sakya_tg@yahoo.it>  Fri,  20 Apr 2012 20:27:19 +0200
+
+pcsxrearmed (0.4.0.14.7) unstable; urgency=low
+
+  * Fixed -displayon
+
+ -- sakya <sakya_tg@yahoo.it>  Sun,  15 Apr 2012 17:22:08 +0200
+
+pcsxrearmed (0.4.0.14.6) unstable; urgency=low
+
+  * Added option -keys to set the keys config file
+  * Fixed L1/L2/R1/R2
+  * Added autopause on incoming call
+
+ -- sakya <sakya_tg@yahoo.it>  Wed,  13 Apr 2012 12:51:35 +0200
+
+pcsxrearmed (0.4.0.14.5) unstable; urgency=low
+
+  * Fixed accelerometer using gles
+  * Added -analog option to use the accelerometer as the analog pad
+  * Added options to set accelerometer sens, max value, y_def
+  * Added -displayon option to keep the display on (useful when playing using the accelerometer)
+
+ -- sakya <sakya_tg@yahoo.it>  Tue,  10 Apr 2012 15:34:11 +0200
+
+pcsxrearmed (0.4.0.14.4) unstable; urgency=low
+
+  * Fixed -load option
+  * Added disc change (configured a new key)
+
+ -- sakya <sakya_tg@yahoo.it>  Fri,  06 Apr 2012 13:54:56 +0200
+
+pcsxrearmed (0.4.0.14.3) unstable; urgency=low
+
+  * Added options to set various gles settings
+  * Fixed save state slot selection
+  * Added notification on save state slot change
+
+ -- sakya <sakya_tg@yahoo.it>  Wed,  04 Apr 2012 10:20:18 +0200
+
+pcsxrearmed (0.4.0.14.2) unstable; urgency=low
+
+  * Fixed fullscreen using gpu-gles
+  * Fixed crash when saving savestate using gpu-gles
+  * Added options to set spu reverb and interpolation (disabled by default)
+
+ -- sakya <sakya_tg@yahoo.it>  Sun,  01 Apr 2012 11:42:20 +0200
+
+pcsxrearmed (0.4.0.14.1) unstable; urgency=low
+
+  * Added option to set psx region (NTSC/PAL/Auto)
+  * Use PulseAudio (better audio)
+
+ -- sakya <sakya_tg@yahoo.it>  Wed,  30 Mar 2012 09:44:51 +0200
+
+pcsxrearmed (0.4.0.14) unstable; urgency=low
+
+  * Updated to r14
+  * Added --help
+  * PCSX4All
+
+ -- sakya <sakya_tg@yahoo.it>  Sun,  27 Dec 2011 00:02:27 +0200
+
+pcsxrearmed (0.4.0.12.2) unstable; urgency=low
+
+  * gpu-gles
+
+
+ -- Bonapart <bonapart@programist.ru>  Sun,  27 Dec 2011 00:02:27 +0200
diff --git a/debian_maemo/compat b/debian_maemo/compat
new file mode 100644 (file)
index 0000000..7ed6ff8
--- /dev/null
@@ -0,0 +1 @@
+5
diff --git a/debian_maemo/control b/debian_maemo/control
new file mode 100644 (file)
index 0000000..4469ed8
--- /dev/null
@@ -0,0 +1,115 @@
+Source: pcsxrearmed
+Section: user/games
+Priority: extra
+Maintainer: Bonapart <bonapart@programist.ru>
+Build-Depends: debhelper (>= 5), zlib1g-dev, libhildon1-dev,  libpulse-dev, libasound2-dev, libbz2-dev, libgles1-sgx-img-dev, opengles-sgx-img-common-dev, libosso-dev, libdbus-1-dev, libhildonfm2-dev, libts-dev
+Standards-Version: 3.7.3
+
+Package: pcsxrearmed
+Architecture: armel
+Depends: ${shlibs:Depends}, libts-0.0-0
+Description: Sony PlayStation emulator
+XSBC-Homepage: http://notaz.gp2x.de/pcsx_rearmed.php
+XSBC-Bugtracker: http://notaz.gp2x.de/pcsx_rearmed.php
+XB-Maemo-Display-Name: PCSX-ReArmed
+XB-Maemo-Icon-26:
+ iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAAAXNSR0IArs4c
+ 6QAAEStJREFUaN7Fmn+wXVV1xz9r733uufe+e9+DJIRESEICEQghAUQBI8Uo
+ ik7jrxm1o+3ooB2RkWrtONqZlhn7Y8bRUVudqVZsoTjFH2CrSKsVEFIQIr8C
+ JARC0CDkBwkkIcl7L7n3nLP36h97n3Pvo07/7c2cOfeenB9rr/Vd3/Vd6zxh
+ 7DM52b9isj+5YcGC+a9bunTpOVNTU91WK5Msy1ECIgYRMGKxzmGtQQBrHSZ9
+ R8C5DOcc1hiMtTjrsM4hAtZYWq1W/H9nyZzDZVk6x9LKWmSteH2rlZNlLhw/
+ fnzXszt3fmHjxo13bH74oReBwbbtO6r0ODht5UpnfXXdytOXX+mswVcl1hqc
+ swjgQ0BV8T7gfSCEQFBQFFQJqqhquptpHCJiMGIQKxhjsdZircMai8kczibj
+ XUaWZdi0d1lGlrVwmdN2nsvypUtZvmwpw7LY8dSTT37yX264/sGgOrNl67ZS
+ AFavPudTF5y76u9fOniQZ3/7PLPHjuG9Z8wSJO7QeAABVIW0DFAFBEVANF0j
+ AGh9XBWV9L3eRFAx6dr6XFDiMWMNy049VS9Yu4o/et+7xBj78u5duz78lS9/
+ 8U6UoZx59lnZ2StXbMP7lT+7cyNFUWCsbQwmyLhTo1EiydvpSFogIpiIMQST
+ zhOMMSAmnmcMIhYxgogFYxBjEYnnYOIWn2OjA4IiRjhj+TL98uc/K5X3W7Zs
+ 2fKBf/72Pz5tZmdmz5jodJZtfeppyqqK+LQx3EYtpm0wRChYazBWMCYaasRg
+ TFwAUu/TsgyIiecytkAxBjG1sTZuzfe4GIwDk4GxqMmQLAdx7PjNs/KNG75H
+ 3srWLFmy5E3P79rj7Omnr7is08o+8Otnn6MYFtGIErQCyZXsqxW6B3S3gQqw
+ /I6PNgEZwSPmQAMVYxERVCSSQTI8et4iYtF6MckhcXECJuWSGPbvf1EvumCN
+ nDA1taIoypuNr6pFg6JgMEjGe2Ch0vpmSXdnQeuqQPeeivZtBbI6oIWiKJqS
+ WLU2vM6POh2SIZjEXpIMtglGdSTib9IxEsSoFy9mBFtjmTleyJ33bMIYOevQ
+ oUMnuLIs5w+LkuCr+HAP+dcqWu8PFD8SZAEwA9kGxawuOXZeBjMGXJ0BNMk4
+ gpIZYbr2ZvJ6hJBrftcLEGPTfVJ+EBcfUo6pgohQhMCBgwdB4fDhw5MG6JdV
+ RQgKQZAO2IuVMIDB+xzhcWF4lWP4FUPYZJATf1dCp0ROxps69LXR1mGsTZtD
+ TNrbWE/i9/jbmFEiRx6oCSCRRHJZURTMzMx0nUK7LD2KIgZ0CPoM2KWQf8tj
+ TlPkZGX4GRdNdiTvzzW+Nrg2voaIqb1uHUYsYm2KRNxURnBSTfdB4k5r6q0x
+ qogIIShFWVKWZWa8962y8g0eRGH4GYd/Usg/GsjerHQ2VnRuK7GvC6iXhjpH
+ xo+MyPOcdruNsTZyughiXFxI42mHuAyswzgLNrGRNWANYg1gUCOoMQ2DYUZ5
+ JgKqahxI5n0Y4cJBeMxw7LwM+6ZA62880o454C4rmb1U4AkLrXHjY6UNqpx6
+ ymKm+n2qEBgUFUemZzk8PYsPinMxIsa6xDAR71rXjFj/CJo8WdNDXeQTY4iA
+ tZHVnKo61VBfCW3FXqzoUCl/bnB/ECj+0uGurGj/bcBdESifcBGLKQIN5oFu
+ p8OCBfPpdDp0u13yvI1X5fnd+9jx210cOjJDnmUY4yLOE9xACEmamHoRQZtF
+ aar0oBgxOJfFBYgxTjXxeGVgnqdzd4nuh9mLshizFkieHDKcyzam5nNjICjt
+ djS81+sxMTHB5OQkvV6P89euoZXn3Pfgo9y16RGKMiDWRu8bQVTQEEAiPQsh
+ KoAwqjApDBgjZM5F54mIa5IwA31OKL5kMYthYkeJe2ege39Bfm3Ab4fqJy7i
+ PVjEW/AGCel3sHTb0fh+v8/k5CQ+KEemZ+h0u8yfN48PvvddXPvpq5ma6qMK
+ 1goGk2qdIKYmZlMrLiBE+ISAAkaELMswxohx1mZSaxgBnFD8hePYBx3V92PW
+ hEeFwecyjr+lA7stsgDMUkWWhLidGpBXBWRxQPvKZH+yWcA9v9rMN2+8mWd2
+ Pk+/36fX6/G615zHtZ++GucsqpIkR6obgJFXVPfa+QliYgwuy7DW4sQYGdFg
+ UsMC1fct1Q8c+ZdKqq9lhBds1D3WYt8zwP3JMNFc/RTBiPLcXdtZ9/LFdHtd
+ +v0+/ck+Yi0vHXyZXq9Ht9ul3W5z2bqLeNsb13HrHffQytvUVdSIRPyPVfna
+ +HpJxiSpL4LTEDAxHFFC12IssUz51y2oBNOSVPqF8MMu1X90QWykR7EY46iq
+ ite89VImzp6Ixvf7tLIW1rbodDtMTEzQ6XTI85w8z7nowvP54U/vJs9Hyanq
+ myKpKKIpAppEsWqCXhRlTlUxSbvXJTtSlYyS1tTcnKrt0CKFbSqqcZHnfeVY
+ 0DupMb7X65HlOVhD3mo33s/znCzLWLl8Ka08IxBtUA1NUBVFJeoukdg0ee+Z
+ mZllMBwgCM45cSGEEPHX0OzcRqaWv4y+R/GVqmqSCJJ6iM5Ep2GeXq9H1soR
+ sWSt1hzjsyxj3oknMJG3KGqmqTsmiaCvo1BWnmJYUJRDKu9HjZIqzhijseSb
+ RlXW+zoqkSJMoyZreWDHKms8Bt1urzG+1+vhsgwjllaWkef5qB+2lsoHfEj8
+ PoZ3QQjeU5QFRVFSlRUh+Oa8qIK1gVAwxozEWLpN3Tr6oIhRjIJY0yhHsbEB
+ qQWaMRapPN1uh36/z8TEBN1ul8xliAHrsjnGG2PY88I+Zo4dZ6I3EVnHB4qy
+ ZDgYUpYVGkJqbUO0SAEJqAYq72MEAG+SFtc6DwREU6+bIhCA4AMWgzhpEnek
+ c2KE6iJWV+I4jbA4Z2vubqK96aHHokQuo5eHRYEGT0hDBG0wLU0VltRi+qqK
+ CxARX+uKV+J/1HhHcRXVo+C9EvB4FWwAYwLz5p+IwZC1crrdLp1Oh3a7jRhD
+ 5QODwSAZBoPhkLvv/RXfuP4mPIbS+5iA49t4m9QcjwUt+EBRFrV0I4y8ImOV
+ Q5qOKiZzrUINKlBVnrzlWLFsCWefdQbr113CLx94BCNCp9Oh0+nQarWoqujZ
+ m370U3bs3MWik+Zx5OhRfnHPJnrdLsOyZPr4MarKY63BGtMYKnXhIsKmXogP
+ nqIogajuGwjVTKQyd+yROnQCUJYVJ05NsWb1Kt74hks4c+XpnPKqxfT7fZ58
+ 5lmyVqsxPsuilsqtsPrMM7jktWs5/bRl9HtdrvnYh7HWMhgMefHAQR7c/Dg/
+ u3Mjj27djnEWI6aJQNJ0zcd7T1EMI4RUtaqTamxWMoJOUoo+KKcsWsglF13I
+ 773hEk6av4B+vzcnYV8+fKQpUs45nHMcPTrNT757Heede04zwfhfcAXeuv5S
+ Pvepj3PHXffw2c9/kWd378Vg0DAXRqrgK89gMCSEMJbEafxBrVoZNS55nnP5
+ G9dx+frLmgJVU+TR6Vnu+O/7+Pdb/4vHt2zjfe/ZEKdsaTRTVSWLTl6Ic5ay
+ qjg2e4ytTz3NDf96C4889jjHBkP+/E+v5sMfeC/tPOcdb38LU/0+7//jT3J0
+ enpsXDDKCx88xTAtQKFsIKRjA6t6WqZw9qvP4PL1lzE5OcnExATtdocHHn6M
+ +x/azP0PPkorb7P23HO4MG8nre6oqdmK4cCBg+zavYc77v4lP7/rHrZse4oz
+ Vqxg7bnn8o4r1vPmy9Y1zATw4oGDBO/RkPJRx1pKjRAaDCMpOIGqSWIZTdqU
+ URIfPjpNnrdHnncZe/e/xAknnMDVH/0Q6y+9hFVnvZp/+s73CaqM15V+v8s1
+ n72W5/fsY3Z2lt9/y3o+9fErOX/NOSxfthTnRoOmF/bt58bv3sz1N93CzOzs
+ qJGRxEAoSCB4z3A4yoFiVAdovB6naQYV4bnd+/jCV7/BuzdcwaVvuJjTly/i
+ z675GO12m06n00gDl2WNWhyfXBw9Ms2nr7qSd2+4gkUnL6Tdzkd5oPCrBx/m
+ m9d/h433PcjR6RmGRTXWTupcFKnifcWwzgERKWMEatYZCdjxAe2uF/bxtW/d
+ yNe/fSNrV69i1dmvZvHCRbSyDLGW/fv38/O77uX1//CVOcl5+MhRbv/xTSw6
+ eWGjX7z3TE9P88Nbf8oNN93MY1ufjHoqiUljYncW6sSlrgFRRnhfMSzqHFAt
+ jJFGsMXRRuR+bSZjMULORSmxdfuveXzbMzFlxiYS5XDI3n37eenAQebPOzFC
+ CeLMCXjpwEGe3P4Mt/zoNv7z9l/w/J59KYKt6NwxxlFlrIAx+h0ghEAxLOIC
+ gmpRj/5C7XV5xfiiRkRiVWcdtOq5T2z9xBjaeYu/+uLf8YN/u5ULzlvNhisu
+ ZzAcsOnBh9m9dy+3/+JeNm95giNHZ8haGZNT/cjxGtvGKB9CQkpACQRCwkBI
+ 8jouoCxLvPfqNIRBnQOicY6TXBvlrMaGew60dCT8JIUcBWMNBw4d5t5ND3Pf
+ A4/wvVt+wuzsMbZt/zXT09MMyxJjLZ1Op1GUUmuexsPjsoH4m7lsFEKgrLXQ
+ KInndIdzJs9xRhPHHLFKayrtBiQZks4REWxm0QAHXz4MCoNiCBiyzKVnNJiI
+ 9wmRYXTMeG2eERKca4qJLBRqNaqqRZ7nWOua1q2REPWKAs1UTALxfVkawMYb
+ +9F8RxQNZvz9B6iMTbLjTUMIc7yu9Z4wFoUR89T72ODXL0oEZ4yUcQFWVcbZ
+ TZtmRhtRFXm4rtIRXqnZVghGRzYzPmtPhKraDK8aVtG52J/LOjp6hUVI3lVE
+ RK2NswsHlK34llCa7BeNuKd2uU0tTkA1hiJG1qNqmh6C1F01EJyzklFyJspJ
+ JBNSDigStDlWGz96AZGco4qNr31qOS1F1spod9ujKXP9Uq5OgWa0B0ZHN2s8
+ Qz3XTBBibpvYxLSBgI4aljpx0Yb3m1eHiaHGtZ8I5O0cUMqy9KaqyuPe+4OL
+ Fy4kc07rmzH2kDjqCOnBPr4F0YCGufuAJwSPBo+G6hVbOlc9IV2j9bWEdN8k
+ F9SPYEZgjH5oOdHFCxcwHBb7q6oqzNGj00cOHjz05No1q1lw4qSoD801UlfC
+ Bn5a3yfOMTUWn7gF8CkR/4+NEFknfq/vXz9z9LwRtBI0gxK8Z+H8E2XtmtXs
+ 3bv38enp6VlzbHb25W3bnnhANQzeseFtKBWVL0bYk9jU61jySTP1Do2XGsrT
+ MOa5sU1HXZXq+LFRtyK11yXMmciB4qsCoeRd73w7RVEc37x586YjR44ctsba
+ 7NChQ3m73T7ltRe+Zumac1ax9Yltenx6RsSmF26M6aRxlkpo1zQASEgevQTU
+ MRgSKTJomHMMJb7pb/7RRDbWBk91/DiTUxN6zVUfkWVLlnDvvffed//99/8Y
+ +I0VkWCtMc/s2DHw3i9at+71C996+Xrp9Sd01/O7ZHD8GASP+gpClfbxt097
+ rY/7ilAlvPt4bHwLVUmo4n1COten65rzqmL0PXgmum3eueEK/ciH/lBOmJrU
+ 2267bevtt9/+XeBRYJ8Att3OT6oqf6b3/pJ58+a96ROf+MRFZ5115sTk5JTp
+ 93vSzttYF//OwaU/2siyLPatiSqNiXI8TjhoeoIRe0gjr4MGqjI6oKoqyqKk
+ KAqGxRBf+TQIGFKWJbMzszozOxt27tw5e9111z1w4MCBO4H7gKeBlwWg3c5b
+ 3of5VVWtUNVzgfMvvvji80477bQFS5Ys6VprTd301IaNG2iMaWBR9wKNwa/Y
+ 69j8sqoqQhpeeR8Nr+V22sKePXuO7969+6WHHnroseT1x4GdwCGgbFzU7XSy
+ wXDYDyGcDKwAlgOLgUmgzf/PZwgcAV4Ank2G7wemgRLgfwDIFWZCNtkwCgAA
+ AABJRU5ErkJggg==
diff --git a/debian_maemo/copyright b/debian_maemo/copyright
new file mode 100644 (file)
index 0000000..75a6b06
--- /dev/null
@@ -0,0 +1,2 @@
+this package was maemonized by Roman Deninberg <bonapart@programist.ru>
+Mon, 10 Jan 2011 02:00:13 +0100
diff --git a/debian_maemo/dirs b/debian_maemo/dirs
new file mode 100644 (file)
index 0000000..33359b8
--- /dev/null
@@ -0,0 +1 @@
+usr/games
diff --git a/debian_maemo/docs b/debian_maemo/docs
new file mode 100644 (file)
index 0000000..e845566
--- /dev/null
@@ -0,0 +1 @@
+README
diff --git a/debian_maemo/files b/debian_maemo/files
new file mode 100644 (file)
index 0000000..0cc57dd
--- /dev/null
@@ -0,0 +1 @@
+pcsxrearmed_0.4.0.14.13_armel.deb user/games extra
diff --git a/debian_maemo/install b/debian_maemo/install
new file mode 100644 (file)
index 0000000..a260186
--- /dev/null
@@ -0,0 +1,6 @@
+pcsx opt/maemo/usr/games/
+plugins/spunull/spunull.so opt/maemo/usr/games/plugins
+plugins/gpu_unai/gpu_unai.so opt/maemo/usr/games/plugins
+#plugins/gpu_unai/gpuPCSX4ALL.so opt/maemo/usr/games/plugins
+plugins/dfxvideo/gpu_peops.so opt/maemo/usr/games/plugins
+plugins/gpu-gles/gpu_gles.so opt/maemo/usr/games/plugins
diff --git a/debian_maemo/rules b/debian_maemo/rules
new file mode 100644 (file)
index 0000000..5230bf7
--- /dev/null
@@ -0,0 +1,68 @@
+#!/usr/bin/make -f
+# -*- makefile -*-
+# Debian packaging rules for the pcsxrearmed package (maemo platform build).
+
+#export DH_VERBOSE=1
+
+DEB_HOST_GNU_TYPE   ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEB_BUILD_GNU_TYPE  ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+DEB_HOST_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
+
+#GAME_VERSION := $(shell head debian/changelog -n1 | sed -n 's/.* (\(.*\)) .*/\1/p')
+CFLAGS = -Wall -g
+
+# "noopt" in DEB_BUILD_OPTIONS selects an unoptimized build
+ifneq (,$(findstring noopt,$(DEB_BUILD_OPTIONS)))
+       CFLAGS += -O0
+else
+       CFLAGS += -O2
+endif
+
+build: build-stamp
+
+# configure and build once, then strip the main binary and three of the
+# plugins; build-stamp marks the build as done
+build-stamp:
+       dh_testdir
+       ./configure --platform=maemo --gpu=neon --sound-drivers=pulseaudio --enable-neon
+       $(MAKE)
+       strip pcsx
+       strip plugins/gpu_unai/gpu_unai.so
+       strip plugins/gpu-gles/gpu_gles.so
+       strip plugins/spunull/spunull.so
+       touch build-stamp
+
+# remove the stamp and all build products, including the plugins
+clean:
+       dh_testdir
+       dh_testroot
+       rm -f build-stamp
+       dh_clean
+       $(MAKE) clean clean_plugins
+
+# stage the package tree; the screenshots directory is created
+# world-writable (mode 777) and chown'ed to "user"
+install: build
+       dh_testdir
+       dh_testroot
+       dh_installdirs
+       mkdir -p "$(CURDIR)"/debian/pcsxrearmed/opt/maemo/usr/games/screenshots
+       chmod 777 "$(CURDIR)"/debian/pcsxrearmed/opt/maemo/usr/games/screenshots
+       chown user "$(CURDIR)"/debian/pcsxrearmed/opt/maemo/usr/games/screenshots
+       dh_install
+
+# no architecture-independent packages: nothing to do beyond build/install
+binary-indep: build install
+
+# assemble the architecture-dependent .deb via the debhelper sequence
+binary-arch: build install
+       dh_testdir
+       dh_testroot
+       dh_installchangelogs
+       dh_installdocs
+       #dh_installmenu
+       dh_link
+       dh_strip
+       dh_compress
+       dh_fixperms
+       dh_installdeb
+       dh_makeshlibs
+       dh_shlibdeps
+       dh_gencontrol
+       #maemo-optify
+       dh_md5sums
+       dh_builddeb
+
+binary: binary-indep binary-arch
+.PHONY: build clean binary-indep binary-arch binary install
diff --git a/frontend/320240/caanoo.gpe b/frontend/320240/caanoo.gpe
new file mode 100755 (executable)
index 0000000..9d6154a
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Launcher: tunes RAM timings when /dev/accel exists, remounts the SD card
+# without "sync" while pcsx runs, then restores the mount options and
+# replaces itself with gp2xmenu.
+
+# Wiz's timings are already good, apply this for Caanoo
+if [ -e /dev/accel ]; then
+  ./pollux_set "ram_timings=3,9,4,1,1,1,1"
+fi
+
+# the sync mount causes problems when writing saves,
+# probably due to many write calls, so have to get rid of it
+if grep mmcblk /proc/mounts | grep -q '\<sync\>'; then
+  # remember the current mount options (4th field of /proc/mounts)
+  oldmount=`grep mmcblk /proc/mounts | grep '\<sync\>' | awk '{print $4}'`
+  mount /dev/mmcblk0p1 /mnt/sd/ -o remount,dirsync,noatime
+fi
+
+# run the emulator with the launcher's arguments, then flush pending writes
+./pcsx "$@"
+sync
+
+# oldmount is only set when the remount above happened; put the options back
+if [ -n "$oldmount" ]; then
+  mount /dev/mmcblk0p1 /mnt/sd/ -o remount,$oldmount
+fi
+
+# replace this shell with gp2xmenu
+cd /usr/gp2x
+exec ./gp2xmenu
diff --git a/frontend/320240/haptic_s.cfg b/frontend/320240/haptic_s.cfg
new file mode 100644 (file)
index 0000000..624056d
--- /dev/null
@@ -0,0 +1,3 @@
+0      126
+100    -126
+115    0
diff --git a/frontend/320240/haptic_w.cfg b/frontend/320240/haptic_w.cfg
new file mode 100644 (file)
index 0000000..3585a71
--- /dev/null
@@ -0,0 +1,3 @@
+0       54
+100     -126
+105     0
diff --git a/frontend/320240/pcsx26.png b/frontend/320240/pcsx26.png
new file mode 100644 (file)
index 0000000..ed220a0
Binary files /dev/null and b/frontend/320240/pcsx26.png differ
diff --git a/frontend/320240/pcsx_rearmed.ini b/frontend/320240/pcsx_rearmed.ini
new file mode 100644 (file)
index 0000000..b15497f
--- /dev/null
@@ -0,0 +1,6 @@
+[info]
+name="PCSX ReARMed"
+icon="/pcsx_rearmed/pcsx26.png"
+path="/pcsx_rearmed/pcsx.gpe"
+title="/pcsx_rearmed/pcsxb.png"
+group="GAMES"
diff --git a/frontend/320240/pcsxb.png b/frontend/320240/pcsxb.png
new file mode 100644 (file)
index 0000000..ff5a48a
Binary files /dev/null and b/frontend/320240/pcsxb.png differ
diff --git a/frontend/320240/pollux_set.c b/frontend/320240/pollux_set.c
new file mode 100644 (file)
index 0000000..f49e777
--- /dev/null
@@ -0,0 +1,389 @@
+/*
+ * quick tool to set various timings for Wiz
+ *
+ * Copyright (c) Gražvydas "notaz" Ignotas, 2009
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the organization nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * HTOTAL:    X VTOTAL:  341
+ * HSWIDTH:   1 VSWIDTH:   0
+ * HASTART:  37 VASTART:  17
+ * HAEND:   277 VAEND:   337
+ *
+ * 120Hz
+ * pcd  8, 447: + 594us
+ * pcd  9, 397: +  36us
+ * pcd 10, 357: - 523us
+ * pcd 11, 325: +1153us
+ *
+ * 'lcd_timings=397,1,37,277,341,0,17,337;dpc_clkdiv0=9'
+ * 'ram_timings=2,9,4,1,1,1,1'
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#include "pollux_set.h"
+#define BINARY
+
+/* parse stuff */
+/*
+ * Parse an "lcd_timings" value: eight comma-separated decimal integers.
+ *
+ * str  - text to parse, starting at the first number
+ * data - int array with room for eight values, filled in the order read
+ *
+ * Returns the number of characters of str that were consumed, or -1 when
+ * fewer than eight values could be converted.
+ */
+static int parse_lcd_timings(const char *str, void *data)
+{
+       int *lcd_timings = data;
+       const char *p = str;
+       int ret, c;
+       ret = sscanf(str, "%d,%d,%d,%d,%d,%d,%d,%d",
+                       &lcd_timings[0], &lcd_timings[1], &lcd_timings[2], &lcd_timings[3],
+                       &lcd_timings[4], &lcd_timings[5], &lcd_timings[6], &lcd_timings[7]);
+       if (ret != 8)
+               return -1;
+       /* the sscanf() result is only a field count, so walk the string again
+        * to find where the value ends: skip seven commas */
+       for (c = 0; c < 7 && *p != 0; p++)
+               if (*p == ',')
+                       c++;
+       if (c != 7)
+               return -1;
+       /* skip last number (digits only) */
+       while ('0' <= *p && *p <= '9')
+               p++;
+
+       return p - str;
+}
+
+/*
+ * Parse a "ram_timings" value: seven comma-separated numbers, the first of
+ * which (CAS) may be fractional.
+ *
+ * str  - text to parse, starting at the first number
+ * data - int array with room for seven values
+ *
+ * CAS is accepted only as 2, 2.5 or 3 and is stored in data[0] as the code
+ * 1, 2 or 3 respectively; the remaining six integers are stored as read in
+ * data[1..6].
+ *
+ * Returns the number of characters consumed, or -1 on a conversion failure
+ * or an unsupported CAS value.
+ */
+static int parse_ram_timings(const char *str, void *data)
+{
+       int *ram_timings = data;
+       const char *p = str;
+       int ret, c;
+       float cas;
+
+       ret = sscanf(p, "%f,%d,%d,%d,%d,%d,%d",
+                       &cas, &ram_timings[1], &ram_timings[2], &ram_timings[3],
+                       &ram_timings[4], &ram_timings[5], &ram_timings[6]);
+       if (ret != 7)
+               return -1;
+       /* translate the CAS latency into the stored code */
+       if (cas == 2)
+               ram_timings[0] = 1;
+       else if (cas == 2.5)
+               ram_timings[0] = 2;
+       else if (cas == 3)
+               ram_timings[0] = 3;
+       else
+               return -1;
+       /* find the end of the value: skip six commas... */
+       for (c = 0; c < 6 && *p != 0; p++)
+               if (*p == ',')
+                       c++;
+       if (c != 6)
+               return -1;
+       /* ...then the digits of the last number */
+       while ('0' <= *p && *p <= '9')
+               p++;
+
+       return p - str;
+}
+
+/*
+ * Parse a single base-10 number with strtoul() and store it in the int
+ * that data points to.
+ *
+ * Returns the number of characters consumed, or -1 if str does not start
+ * with a number (strtoul() consumed nothing).
+ */
+static int parse_decimal(const char *str, void *data)
+{
+       char *ep;
+
+       *(int *)data = strtoul(str, &ep, 10);
+       if (ep == str)
+               return -1;
+
+       return ep - str;
+}
+
+/* validate and apply stuff */
+/*
+ * Validate and write the eight LCD timing values.
+ *
+ * memregs - mapped register block, accessed as 16-bit words
+ * data    - int array of eight values (as filled by parse_lcd_timings)
+ *
+ * Every value must fit in 16 bits; if any does not, nothing is written and
+ * -1 is returned. Otherwise the values are stored in eight consecutive
+ * 16-bit registers starting at byte offset 0x307c and 0 is returned.
+ */
+static int apply_lcd_timings(volatile unsigned short *memregs, void *data)
+{
+       int *lcd_timings = data;
+       int i;
+
+       /* validate everything first so a bad value leaves the registers alone */
+       for (i = 0; i < 8; i++) {
+               if (lcd_timings[i] & ~0xffff) {
+                       fprintf(stderr, "pollux_set: invalid lcd timing %d: %d\n", i, lcd_timings[i]);
+                       return -1;
+               }
+       }
+
+       /* memregs is indexed in 16-bit words, hence the byte offset >> 1 */
+       for (i = 0; i < 8; i++)
+               memregs[(0x307c>>1) + i] = lcd_timings[i];
+
+       return 0;
+}
+
+/*
+ * Per-field adjustment and accepted range for the seven RAM timing values,
+ * in the same order as the ram_timings[] array (see the trailing comments).
+ * apply_ram_timings() adds adj to each value and then requires the result
+ * to lie within [min, max].
+ */
+static const struct {
+       signed char adj;        /* how to adjust value passed by user */
+       signed short min;       /* range of */
+       signed short max;       /* allowed values (inclusive) */
+}
+ram_ranges[] = {
+       {  0,  1,  3 }, /* cas (cl) */
+       { -2,  0, 15 }, /* trc */
+       { -2,  0, 15 }, /* tras */
+       {  0,  0, 15 }, /* twr */
+       {  0,  0, 15 }, /* tmrd */
+       {  0,  0, 15 }, /* trp */
+       {  0,  0, 15 }, /* trcd */
+};
+
+/*
+ * Validate and write the seven RAM timing values.
+ *
+ * memregs - mapped register block, accessed as 16-bit words
+ * data    - int array of seven values (as filled by parse_ram_timings)
+ *
+ * Each value is first adjusted by ram_ranges[i].adj and range-checked.
+ * The adjustment is applied to the array in place, so the caller's data is
+ * modified even when -1 is returned for an out-of-range value.
+ *
+ * On success the values are packed into the 16-bit registers at byte
+ * offsets 0x14802 and 0x14804, bit 15 of the latter is set, and the
+ * function polls (bounded to 0x100000 iterations) until that bit reads
+ * back as clear. Returns 0; the poll running out is not reported.
+ */
+static int apply_ram_timings(volatile unsigned short *memregs, void *data)
+{
+       int *ram_timings = data;
+       int i, val;
+
+       for (i = 0; i < 7; i++)
+       {
+               ram_timings[i] += ram_ranges[i].adj;
+               if (ram_timings[i] < ram_ranges[i].min || ram_timings[i] > ram_ranges[i].max) {
+                       fprintf(stderr, "pollux_set: invalid RAM timing %d\n", i);
+                       return -1;
+               }
+       }
+
+       /* keep bits 8-11, replace the rest with tMRD / tRP / tRCD */
+       val = memregs[0x14802>>1] & 0x0f00;
+       val |= (ram_timings[4] << 12) | (ram_timings[5] << 4) | ram_timings[6];
+       memregs[0x14802>>1] = val;
+
+       /* keep bit 14, replace the rest with CAS / tRC / tRAS / tWR */
+       val = memregs[0x14804>>1] & 0x4000;
+       val |= (ram_timings[0] << 12) | (ram_timings[1] << 8) |
+               (ram_timings[2] << 4) | ram_timings[3];
+       val |= 0x8000;
+       memregs[0x14804>>1] = val;
+
+       /* bounded wait for bit 15 to clear */
+       for (i = 0; i < 0x100000 && (memregs[0x14804>>1] & 0x8000); i++)
+               ;
+
+       return 0;
+}
+
+/*
+ * Validate and write the clock divider given by "dpc_clkdiv0"/"clkdiv0".
+ *
+ * memregs - mapped register block, accessed as 16-bit words
+ * data    - pointer to the int divider value
+ *
+ * The divider must be in 1..64 (divider - 1 has to fit in 6 bits). The
+ * value divider - 1 is stored in bits 4-9 of the 16-bit register at byte
+ * offset 0x31c4; the other bits are preserved.
+ *
+ * Returns 0 on success, -1 if the divider is out of range.
+ */
+static int apply_dpc_clkdiv0(volatile unsigned short *memregs, void *data)
+{
+       int pcd = *(int *)data;
+       int tmp;
+
+       if ((pcd - 1) & ~0x3f) {
+               fprintf(stderr, "pollux_set: invalid lcd clkdiv0: %d\n", pcd);
+               return -1;
+       }
+
+       pcd = (pcd - 1) & 0x3f;
+       tmp = memregs[0x31c4>>1];
+       memregs[0x31c4>>1] = (tmp & ~0x3f0) | (pcd << 4);
+
+       return 0;
+}
+
+/*
+ * Set the CPU clock to the requested frequency.
+ *
+ * memregs - mapped register block; accessed here as 32-bit words
+ * data    - pointer to the int frequency in MHz
+ *
+ * With PDIV fixed at 9 and SDIV at 0, MDIV is computed as
+ * mhz * PDIV / SYS_CLK_FREQ and must fit in 10 bits, otherwise -1 is
+ * returned before any register is touched. The AHB divider is the smallest
+ * integer that brings mhz / divider down to 250 or less.
+ *
+ * Writes the AHB divider (minus one) to bits 6-9 of the register at 0xf000,
+ * the PDIV/MDIV/SDIV word to 0xf004, sets bit 15 of 0xf07c and polls
+ * (bounded to 0x100000 iterations) until that bit reads back as clear.
+ * Prints the resulting clocks to stdout and returns 0.
+ */
+static int apply_cpuclk(volatile unsigned short *memregs, void *data)
+{
+       /* same mapping, viewed as 32-bit registers */
+       volatile unsigned int *memregl = (volatile void *)memregs;
+       int mhz = *(int *)data;
+       int adiv, mdiv, pdiv, sdiv = 0;
+       int i, vf000, vf004;
+
+       // m = MDIV, p = PDIV, s = SDIV
+       #define SYS_CLK_FREQ 27
+       pdiv = 9;
+       mdiv = (mhz * pdiv) / SYS_CLK_FREQ;
+       if (mdiv & ~0x3ff)
+               return -1;
+       vf004 = (pdiv<<18) | (mdiv<<8) | sdiv;
+
+       // pick the smallest AHB divider that keeps the AHB clock at 250MHz or lower
+       for (adiv = 1; mhz / adiv > 250; adiv++)
+               ;
+
+       vf000 = memregl[0xf000>>2];
+       vf000 = (vf000 & ~0x3c0) | ((adiv - 1) << 6);
+       memregl[0xf000>>2] = vf000;
+       memregl[0xf004>>2] = vf004;
+       memregl[0xf07c>>2] |= 0x8000;
+       // bounded wait for bit 15 to clear
+       for (i = 0; (memregl[0xf07c>>2] & 0x8000) && i < 0x100000; i++)
+               ;
+
+       printf("clock set to %dMHz, AHB set to %dMHz\n", mhz, mhz / adiv);
+       return 0;
+}
+
+/* storage for parsed values; each is referenced from all_params[] below */
+static int lcd_timings[8];
+static int ram_timings[7];
+static int dpc_clkdiv0;
+static int cpuclk;
+
+/* value formats shown in error messages and in usage() */
+static const char lcd_t_help[] = "htotal,hswidth,hastart,haend,vtotal,vswidth,vastart,vaend";
+static const char ram_t_help[] = "CAS,tRC,tRAS,tWR,tMRD,tRP,tRCD";
+
+/*
+ * Table of supported "name=value" parameters.
+ *
+ * name  - parameter name as written in the settings string
+ * help  - description of the expected value format
+ * parse - converts the text after '=' into *data; returns the number of
+ *         characters consumed or -1 on error
+ * apply - validates *data and writes it to the registers; returns 0 or -1
+ * data  - storage shared by parse and apply
+ *
+ * Entries are applied in table order, not in the order they appear in the
+ * settings string.
+ */
+static const struct {
+       const char *name;
+       const char *help;
+       int (*parse)(const char *str, void *data);
+       int (*apply)(volatile unsigned short *memregs, void *data);
+       void *data;
+}
+all_params[] = {
+       { "lcd_timings", lcd_t_help, parse_lcd_timings, apply_lcd_timings, lcd_timings  },
+       { "ram_timings", ram_t_help, parse_ram_timings, apply_ram_timings, ram_timings  },
+       { "dpc_clkdiv0", "divider",  parse_decimal,     apply_dpc_clkdiv0, &dpc_clkdiv0 },
+       { "clkdiv0",     "divider",  parse_decimal,     apply_dpc_clkdiv0, &dpc_clkdiv0 }, /* alias */
+       { "cpuclk",      "MHZ",      parse_decimal,     apply_cpuclk,      &cpuclk      },
+};
+#define ALL_PARAM_COUNT (sizeof(all_params) / sizeof(all_params[0]))
+
+/*
+ * set timings based on preformatted string
+ *
+ * The string is a list of "name=value" items; ';' and ' ' between items
+ * are skipped, e.g. "lcd_timings=397,1,37,277,341,0,17,337;dpc_clkdiv0=9".
+ * The whole string is parsed before anything is written to memregs.
+ * Unknown names are reported on stderr and skipped up to the next ';'.
+ *
+ * returns 0 on success, -1 if str is NULL or the value of a known
+ * parameter fails to parse. A parameter that parses but is rejected by its
+ * apply callback is reported on stderr and does not change the return value.
+ */
+int pollux_set(volatile unsigned short *memregs, const char *str)
+{
+       int parsed_params[ALL_PARAM_COUNT];
+       int applied_params[ALL_PARAM_COUNT];
+       int applied_something = 0;
+       const char *p, *po;
+       int i, ret;
+
+       if (str == NULL)
+               return -1;
+
+       memset(parsed_params, 0, sizeof(parsed_params));
+       memset(applied_params, 0, sizeof(applied_params));
+
+       /* pass 1: parse every item into its parameter's data storage */
+       p = str;
+       while (1)
+       {
+again:
+               while (*p == ';' || *p == ' ')
+                       p++;
+               if (*p == 0)
+                       break;
+
+               for (i = 0; i < ALL_PARAM_COUNT; i++)
+               {
+                       int param_len = strlen(all_params[i].name);
+                       if (strncmp(p, all_params[i].name, param_len) == 0 && p[param_len] == '=')
+                       {
+                               /* step over "name=" and hand the value to the parser */
+                               p += param_len + 1;
+                               ret = all_params[i].parse(p, all_params[i].data);
+                               if (ret < 0) {
+                                       /* NOTE(review): "%-10s" pads to at least 10 columns but does
+                                        * not truncate; "%.10s" may have been intended - confirm */
+                                       fprintf(stderr, "pollux_set parser: error at %-10s\n", p);
+                                       fprintf(stderr, "  valid format is: <%s>\n", all_params[i].help);
+                                       return -1;
+                               }
+                               parsed_params[i] = 1;
+                               p += ret;
+                               /* restart the outer loop at the next item */
+                               goto again;
+                       }
+               }
+
+               /* Unknown param. Attempt to be forward compatible and ignore it. */
+               for (po = p; *p != 0 && *p != ';'; p++)
+                       ;
+
+               fprintf(stderr, "unhandled param: ");
+               fwrite(po, 1, p - po, stderr);
+               fprintf(stderr, "\n");
+       }
+
+       /* validate and apply */
+       for (i = 0; i < ALL_PARAM_COUNT; i++)
+       {
+               if (!parsed_params[i])
+                       continue;
+
+               ret = all_params[i].apply(memregs, all_params[i].data);
+               if (ret < 0) {
+                       fprintf(stderr, "pollux_set: failed to apply %s (bad value?)\n",
+                               all_params[i].name);
+                       continue;
+               }
+
+               applied_something = 1;
+               applied_params[i] = 1;
+       }
+
+       /* summarize what was applied as a comma-separated list on stdout */
+       if (applied_something)
+       {
+               int c;
+               printf("applied: ");
+               for (i = c = 0; i < ALL_PARAM_COUNT; i++)
+               {
+                       if (!applied_params[i])
+                               continue;
+                       if (c != 0)
+                               printf(", ");
+                       printf("%s", all_params[i].name);
+                       c++;
+               }
+               printf("\n");
+       }
+
+       return 0;
+}
+
+#ifdef BINARY
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+/*
+ * Print the command-line syntax and the list of supported parameters
+ * (name and value format from all_params[]) to stdout.
+ *
+ * binary - program name to show in the usage line
+ */
+static void usage(const char *binary)
+{
+       int i;
+       printf("usage:\n%s <set_str[;set_str[;...]]>\n"
+               "set_str:\n", binary);
+       for (i = 0; i < ALL_PARAM_COUNT; i++)
+               printf("  %s=<%s>\n", all_params[i].name, all_params[i].help);
+}
+
+/*
+ * Command-line entry point: expects exactly one argument, the settings
+ * string, which is passed to pollux_set().
+ *
+ * Maps 0x20000 bytes of /dev/mem at offset 0xc0000000 as the register
+ * block, applies the settings, then unmaps and closes the device.
+ *
+ * Returns 1 on a usage error or when /dev/mem cannot be opened or mapped;
+ * otherwise returns the result of pollux_set().
+ */
+int main(int argc, char *argv[])
+{
+       volatile unsigned short *memregs;
+       int ret, memdev;
+
+       if (argc != 2) {
+               usage(argv[0]);
+               return 1;
+       }
+
+       memdev = open("/dev/mem", O_RDWR);
+       if (memdev == -1)
+       {
+               perror("open(/dev/mem) failed");
+               return 1;
+       }
+
+       memregs = mmap(0, 0x20000, PROT_READ|PROT_WRITE, MAP_SHARED, memdev, 0xc0000000);
+       if (memregs == MAP_FAILED)
+       {
+               perror("mmap(memregs) failed");
+               close(memdev);
+               return 1;
+       }
+
+       ret = pollux_set(memregs, argv[1]);
+
+       munmap((void *)memregs, 0x20000);
+       close(memdev);
+
+       return ret;
+}
+#endif
diff --git a/frontend/320240/skin/background.png b/frontend/320240/skin/background.png
new file mode 100644 (file)
index 0000000..0efdd18
Binary files /dev/null and b/frontend/320240/skin/background.png differ
diff --git a/frontend/320240/skin/font.png b/frontend/320240/skin/font.png
new file mode 100644 (file)
index 0000000..c526a08
Binary files /dev/null and b/frontend/320240/skin/font.png differ
diff --git a/frontend/320240/skin/readme.txt b/frontend/320240/skin/readme.txt
new file mode 100644 (file)
index 0000000..dd83963
--- /dev/null
@@ -0,0 +1,8 @@
+The skin images can be customized, but there are several limitations:\r
+\r
+background.png - must be 320x240 image with 24bit RGB colors.\r
+font.png       - must be 128x160 8bit grayscale image.\r
+selector.png   - must be 8x10 8bit grayscale image.\r
+\r
+Font and selector colors can be changed by editing skin.txt.\r
+\r
diff --git a/frontend/320240/skin/selector.png b/frontend/320240/skin/selector.png
new file mode 100644 (file)
index 0000000..5062cc2
Binary files /dev/null and b/frontend/320240/skin/selector.png differ
diff --git a/frontend/320240/skin/skin.txt b/frontend/320240/skin/skin.txt
new file mode 100644 (file)
index 0000000..1d6979f
--- /dev/null
@@ -0,0 +1,4 @@
+// html-style hex color codes, ex. ff0000 is red, 0000ff is blue, etc.\r
+text_color=ffffc0\r
+selection_color=808010\r
+\r
diff --git a/frontend/320240/ui_gp2x.h b/frontend/320240/ui_gp2x.h
new file mode 100644 (file)
index 0000000..a9c4413
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef UI_FEATURES_H
+#define UI_FEATURES_H
+
+#define MENU_BIOS_PATH "pcsx_rearmed/bios/"
+#define MENU_SHOW_VARSCALER 0
+#define MENU_SHOW_VOUTMODE 0
+#define MENU_SHOW_SCALER2 1
+#define MENU_SHOW_NUBS_BTNS 0
+#define MENU_SHOW_VIBRATION 1
+#define MENU_SHOW_DEADZONE 1
+#define MENU_SHOW_MINIMIZE 0
+#define MENU_SHOW_FULLSCREEN 0
+#define MENU_SHOW_VOLUME 1
+
+#endif // UI_FEATURES_H
index b8d17ab..919fabb 100644 (file)
@@ -78,6 +78,7 @@ static bool display_internal_fps = false;
 static unsigned frame_count = 0;
 static bool libretro_supports_bitmasks = false;
 static int show_advanced_gpu_peops_settings = -1;
+static int show_advanced_gpu_unai_settings  = -1;
 
 static unsigned previous_width = 0;
 static unsigned previous_height = 0;
@@ -1423,6 +1424,7 @@ static void update_variables(bool in_flight)
       if (strcmp(var.value, "disabled") == 0) {
          pl_rearmed_cbs.gpu_peops.iUseDither = 0;
          pl_rearmed_cbs.gpu_peopsgl.bDrawDither = 0;
+         pl_rearmed_cbs.gpu_unai.dithering = 0;
 #ifdef __ARM_NEON__
          pl_rearmed_cbs.gpu_neon.allow_dithering = 0;
 #endif
@@ -1430,6 +1432,7 @@ static void update_variables(bool in_flight)
       else if (strcmp(var.value, "enabled") == 0) {
          pl_rearmed_cbs.gpu_peops.iUseDither = 1;
          pl_rearmed_cbs.gpu_peopsgl.bDrawDither = 1;
+         pl_rearmed_cbs.gpu_unai.dithering = 1;
 #ifdef __ARM_NEON__
          pl_rearmed_cbs.gpu_neon.allow_dithering = 1;
 #endif
@@ -1757,6 +1760,96 @@ static void update_variables(bool in_flight)
    }
 #endif
 
+#ifdef GPU_UNAI
+   /*
+    * GPU UNAI core options. Each option is queried from the frontend and
+    * copied into pl_rearmed_cbs.gpu_unai only when the query succeeded AND
+    * returned a value; var.value stays NULL otherwise and must not be
+    * passed to strcmp().
+    */
+   var.key = "pcsx_rearmed_gpu_unai_ilace_force";
+   var.value = NULL;
+
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "disabled") == 0)
+         pl_rearmed_cbs.gpu_unai.ilace_force = 0;
+      else if (strcmp(var.value, "enabled") == 0)
+         pl_rearmed_cbs.gpu_unai.ilace_force = 1;
+   }
+
+   var.key = "pcsx_rearmed_gpu_unai_pixel_skip";
+   var.value = NULL;
+
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "disabled") == 0)
+         pl_rearmed_cbs.gpu_unai.pixel_skip = 0;
+      else if (strcmp(var.value, "enabled") == 0)
+         pl_rearmed_cbs.gpu_unai.pixel_skip = 1;
+   }
+
+   var.key = "pcsx_rearmed_gpu_unai_lighting";
+   var.value = NULL;
+
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "disabled") == 0)
+         pl_rearmed_cbs.gpu_unai.lighting = 0;
+      else if (strcmp(var.value, "enabled") == 0)
+         pl_rearmed_cbs.gpu_unai.lighting = 1;
+   }
+
+   var.key = "pcsx_rearmed_gpu_unai_fast_lighting";
+   var.value = NULL;
+
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "disabled") == 0)
+         pl_rearmed_cbs.gpu_unai.fast_lighting = 0;
+      else if (strcmp(var.value, "enabled") == 0)
+         pl_rearmed_cbs.gpu_unai.fast_lighting = 1;
+   }
+
+   var.key = "pcsx_rearmed_gpu_unai_blending";
+   var.value = NULL;
+
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      if (strcmp(var.value, "disabled") == 0)
+         pl_rearmed_cbs.gpu_unai.blending = 0;
+      else if (strcmp(var.value, "enabled") == 0)
+         pl_rearmed_cbs.gpu_unai.blending = 1;
+   }
+
+   var.key = "pcsx_rearmed_show_gpu_unai_settings";
+   var.value = NULL;
+
+   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
+   {
+      int show_advanced_gpu_unai_settings_prev = show_advanced_gpu_unai_settings;
+
+      /* anything other than "disabled" shows the advanced options */
+      show_advanced_gpu_unai_settings = 1;
+      if (strcmp(var.value, "disabled") == 0)
+         show_advanced_gpu_unai_settings = 0;
+
+      /* only notify the frontend when the visibility actually changed */
+      if (show_advanced_gpu_unai_settings != show_advanced_gpu_unai_settings_prev)
+      {
+         unsigned i;
+         struct retro_core_option_display option_display;
+         char gpu_unai_option[5][40] = {
+            "pcsx_rearmed_gpu_unai_blending",
+            "pcsx_rearmed_gpu_unai_lighting",
+            "pcsx_rearmed_gpu_unai_fast_lighting",
+            "pcsx_rearmed_gpu_unai_ilace_force",
+            "pcsx_rearmed_gpu_unai_pixel_skip",
+         };
+
+         option_display.visible = show_advanced_gpu_unai_settings;
+
+         for (i = 0; i < 5; i++)
+         {
+            option_display.key = gpu_unai_option[i];
+            environ_cb(RETRO_ENVIRONMENT_SET_CORE_OPTIONS_DISPLAY, &option_display);
+         }
+      }
+   }
+#endif // GPU_UNAI
+
    if (in_flight) {
       // inform core things about possible config changes
       plugin_call_rearmed_cbs();
index 6513e1c..bfb21a5 100644 (file)
@@ -534,6 +534,76 @@ struct retro_core_option_definition option_defs_us[] = {
    },
 #endif
 
+    /* GPU UNAI Advanced Options */
+#ifdef GPU_UNAI
+   {
+      "pcsx_rearmed_show_gpu_unai_settings",
+      "Advance GPU UNAI/PCSX4All Settings",
+      "Shows or hides advanced gpu settings. A core restart might be needed for settings to take effect. NOTE: Quick Menu must be toggled for this setting to take effect.",
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL},
+      },
+      "disabled",
+   },
+   {
+      "pcsx_rearmed_gpu_unai_blending",
+      "(GPU) Enable Blending",
+      NULL,
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL},
+      },
+      "enabled",
+   },
+   {
+      "pcsx_rearmed_gpu_unai_lighting",
+      "(GPU) Enable Lighting",
+      NULL,
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL},
+      },
+      "enabled",
+   },
+   {
+      "pcsx_rearmed_gpu_unai_fast_lighting",
+      "(GPU) Enable Fast Lighting",
+      NULL,
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL},
+      },
+      "enabled",
+   },
+   {
+      "pcsx_rearmed_gpu_unai_ilace_force",
+      "(GPU) Enable Forced Interlace",
+      NULL,
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL},
+      },
+      "disabled",
+   },
+   {
+      "pcsx_rearmed_gpu_unai_pixel_skip",
+      "(GPU) Enable Pixel Skip",
+      NULL,
+      {
+         { "disabled", NULL },
+         { "enabled",  NULL },
+         { NULL, NULL},
+      },
+      "disabled",
+   },
+#endif /* GPU UNAI Advanced Settings */
+
    {
       "pcsx_rearmed_show_bios_bootlogo",
       "Show Bios Bootlogo",
index c0bfd0f..b6b5411 100644 (file)
@@ -130,6 +130,13 @@ void emu_set_default_config(void)
        pl_rearmed_cbs.gpu_neon.enhancement_no_main = 0;
        pl_rearmed_cbs.gpu_peops.iUseDither = 0;
        pl_rearmed_cbs.gpu_peops.dwActFixes = 1<<7;
+       pl_rearmed_cbs.gpu_unai.ilace_force = 0;
+       pl_rearmed_cbs.gpu_unai.pixel_skip = 1;
+       pl_rearmed_cbs.gpu_unai.lighting = 1;
+       pl_rearmed_cbs.gpu_unai.fast_lighting = 1;
+       pl_rearmed_cbs.gpu_unai.blending = 1;
+       pl_rearmed_cbs.gpu_unai.dithering = 0;
+       // old gpu_unai config
        pl_rearmed_cbs.gpu_unai.abe_hack =
        pl_rearmed_cbs.gpu_unai.no_light =
        pl_rearmed_cbs.gpu_unai.no_blend = 0;
index babe109..47523b2 100644 (file)
@@ -305,14 +305,14 @@ static void menu_sync_config(void)
        cycle_multiplier = 10000 / psx_clock;
 
        switch (in_type_sel1) {
-       case 1:  in_type1 = PSE_PAD_TYPE_ANALOGPAD; break;
-       case 2:  in_type1 = PSE_PAD_TYPE_NEGCON;    break;
-       default: in_type1 = PSE_PAD_TYPE_STANDARD;
+       case 1:  in_type[0] = PSE_PAD_TYPE_ANALOGPAD; break;
+       case 2:  in_type[0] = PSE_PAD_TYPE_NEGCON;    break;
+       default: in_type[0] = PSE_PAD_TYPE_STANDARD;
        }
        switch (in_type_sel2) {
-       case 1:  in_type2 = PSE_PAD_TYPE_ANALOGPAD; break;
-       case 2:  in_type2 = PSE_PAD_TYPE_NEGCON;    break;
-       default: in_type2 = PSE_PAD_TYPE_STANDARD;
+       case 1:  in_type[1] = PSE_PAD_TYPE_ANALOGPAD; break;
+       case 2:  in_type[1] = PSE_PAD_TYPE_NEGCON;    break;
+       default: in_type[1] = PSE_PAD_TYPE_STANDARD;
        }
        if (in_evdev_allow_abs_only != allow_abs_only_old) {
                in_probe();
@@ -422,6 +422,12 @@ static const struct {
        CE_INTVAL_V(frameskip, 3),
        CE_INTVAL_P(gpu_peops.iUseDither),
        CE_INTVAL_P(gpu_peops.dwActFixes),
+       CE_INTVAL_P(gpu_unai.ilace_force),
+       CE_INTVAL_P(gpu_unai.pixel_skip),
+       CE_INTVAL_P(gpu_unai.lighting),
+       CE_INTVAL_P(gpu_unai.fast_lighting),
+       CE_INTVAL_P(gpu_unai.blending),
+       CE_INTVAL_P(gpu_unai.dithering),
        CE_INTVAL_P(gpu_unai.lineskip),
        CE_INTVAL_P(gpu_unai.abe_hack),
        CE_INTVAL_P(gpu_unai.no_light),
@@ -1358,10 +1364,16 @@ static int menu_loop_plugin_gpu_neon(int id, int keys)
 
 static menu_entry e_menu_plugin_gpu_unai[] =
 {
-       mee_onoff     ("Skip every 2nd line",        0, pl_rearmed_cbs.gpu_unai.lineskip, 1),
-       mee_onoff     ("Abe's Odyssey hack",         0, pl_rearmed_cbs.gpu_unai.abe_hack, 1),
-       mee_onoff     ("Disable lighting",           0, pl_rearmed_cbs.gpu_unai.no_light, 1),
-       mee_onoff     ("Disable blending",           0, pl_rearmed_cbs.gpu_unai.no_blend, 1),
+       //mee_onoff     ("Skip every 2nd line",        0, pl_rearmed_cbs.gpu_unai.lineskip, 1),
+       //mee_onoff     ("Abe's Odyssey hack",         0, pl_rearmed_cbs.gpu_unai.abe_hack, 1),
+       //mee_onoff     ("Disable lighting",           0, pl_rearmed_cbs.gpu_unai.no_light, 1),
+       //mee_onoff     ("Disable blending",           0, pl_rearmed_cbs.gpu_unai.no_blend, 1),
+       mee_onoff     ("Interlace",                  0, pl_rearmed_cbs.gpu_unai.ilace_force, 1),
+       mee_onoff     ("Dithering",                  0, pl_rearmed_cbs.gpu_unai.dithering, 1),
+       mee_onoff     ("Lighting",                   0, pl_rearmed_cbs.gpu_unai.lighting, 1),
+       mee_onoff     ("Fast lighting",              0, pl_rearmed_cbs.gpu_unai.fast_lighting, 1),
+       mee_onoff     ("Blending",                   0, pl_rearmed_cbs.gpu_unai.blending, 1),
+       mee_onoff     ("Pixel skip",                 0, pl_rearmed_cbs.gpu_unai.pixel_skip, 1),
        mee_end,
 };
 
index 92e62e9..d51c5e7 100644 (file)
@@ -80,6 +80,13 @@ struct rearmed_cbs {
                int   dwFrameRateTicks;
        } gpu_peops;
        struct {
+               int ilace_force;
+               int pixel_skip;
+               int lighting;
+               int fast_lighting;
+               int blending;
+               int dithering;
+               // old gpu_unai config for compatibility
                int   abe_hack;
                int   no_light, no_blend;
                int   lineskip;
index 2867791..3ebdf97 100644 (file)
@@ -140,10 +140,14 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
                  $(FRONTEND_DIR)/cspace_neon.S
   SOURCES_C   += $(NEON_DIR)/psx_gpu_if.c
 else ifeq ($(TARGET_ARCH_ABI),armeabi)
+  COREFLAGS += -DUSE_GPULIB=1 -DGPU_UNAI
+  COREFLAGS += -DINLINE="static __inline__" -Dasm="__asm__ __volatile__"
   SOURCES_ASM += $(UNAI_DIR)/gpu_arm.S \
                  $(FRONTEND_DIR)/cspace_arm.S
   SOURCES_C += $(UNAI_DIR)/gpulib_if.cpp
 else
+  COREFLAGS += -DUSE_GPULIB=1 -DGPU_UNAI
+  COREFLAGS += -DINLINE="static __inline__" -Dasm="__asm__ __volatile__"
   SOURCES_C += $(UNAI_DIR)/gpulib_if.cpp
 endif
 
diff --git a/maemo/hildon.c b/maemo/hildon.c
new file mode 100644 (file)
index 0000000..7e9cd9f
--- /dev/null
@@ -0,0 +1,843 @@
+#include <gtk/gtk.h>
+#include <glib.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <hildon/hildon.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "../frontend/plugin_lib.h"
+#include "../frontend/main.h"
+#include "../libpcsxcore/misc.h"
+#include "../include/psemu_plugin_defs.h"
+#include "../libpcsxcore/cdrom.h"
+#include "../libpcsxcore/cdriso.h"
+#include "../plugins/dfinput/main.h"
+#include "../frontend/libpicofe/readpng.h"
+#include "maemo_common.h"
+#include <libosso.h>
+#include <dbus/dbus.h>
+
+/* Full-screen area in pixels; the picture is centred inside it (see maemo_init). */
+#define X_RES           800
+#define Y_RES           480
+/* Size the emulated picture is scaled to when it is not stretched (see plat_gvideo_set_mode). */
+#define D_WIDTH                        640
+#define D_HEIGHT               480
+
+/* D-Bus interface, object path and member checked by on_msg_recieved(). */
+#define CALL_SIGNAL_IF "com.nokia.csd.Call"
+#define CALL_SIGNAL_PATH "/com/nokia/csd/call"
+#define CALL_INCOMING_SIG "Coming"
+
+/* Match rule registered on the system bus in maemo_init(). */
+#define DBUS_RULE_CALL_INCOMING "type='signal',interface='" CALL_SIGNAL_IF \
+                                "',path='" CALL_SIGNAL_PATH \
+                                "',member='" CALL_INCOMING_SIG "'"
+
+/* libosso context: created in maemo_init(), released in maemo_finish(). */
+osso_context_t* osso = NULL;
+/* Loop flag of displayThread(); cleared by maemo_finish() to stop the thread. */
+int bRunning = TRUE;
+/* When non-zero, maemo_init() starts displayThread(). */
+extern int bKeepDisplayOn;
+/* When non-zero, quit() saves the state to slot 99 before exiting. */
+extern int bAutosaveOnExit;
+/* Action code per screen corner, dispatched in window_button_proxy(). */
+extern int cornerActions[4];
+/* Path of the key configuration file read by maemo_init(). */
+extern char keys_config_file[MAXPATHLEN];
+/* Thread running displayThread(); stays (pthread_t)0 when never started. */
+static pthread_t display_thread = (pthread_t)0;
+/* Initialised to the D_WIDTH x D_HEIGHT rectangle centred in X_RES x Y_RES. */
+int g_layer_x = (X_RES - D_WIDTH) / 2;
+int g_layer_y = (Y_RES - D_HEIGHT) / 2;
+int g_layer_w = D_WIDTH, g_layer_h = D_HEIGHT;
+
+/* Current frame buffer image; replaced in plat_gvideo_set_mode(). */
+static GdkImage *image;
+/* Animation actor holding `drawing`; positioned and scaled over the window. */
+static HildonAnimationActor *actor;
+/* Main window (NULL when maemo_init() ran with g_maemo_opts & 8) and the GtkImage showing `image`. */
+static GtkWidget *window, *drawing = NULL;
+
+/* Width/height passed to the last plat_gvideo_set_mode(); used when writing screenshots. */
+static int pl_buf_w, pl_buf_h;
+/* Hardware keycode -> action index from the key config file, -1 when unmapped. */
+int keymap[65536];
+/* Keycodes of the first four entries of the key config file. */
+int direction_keys[4];
+
+// map psx4m compatible keymap to PSX keys
+// Indexed by the action index stored in keymap[]; key_press_event() uses it
+// for actions 0..13 and handles higher action indices itself.
+static const unsigned char keymap2[14] = {
+       DKEY_LEFT,   // 0
+       DKEY_RIGHT,
+       DKEY_UP,
+       DKEY_DOWN,
+       DKEY_CIRCLE,
+       DKEY_CROSS,  // 5
+       DKEY_TRIANGLE,
+       DKEY_SQUARE,
+       DKEY_SELECT,
+       DKEY_START,
+       DKEY_L1,     // 10
+       DKEY_R1,
+       DKEY_L2,
+       DKEY_R2,
+};
+
+/*
+ * Shut the frontend down and terminate the process with status 0.
+ * maemo_finish() runs first so the display thread and the osso context are
+ * released before the GTK main loop is asked to quit. Does not return.
+ */
+void hildon_quit()
+{
+       maemo_finish();
+       gtk_main_quit();
+       exit(0);
+}
+
+/* Position of the pending button press; -1 when no press is pending (see window_button_proxy). */
+gdouble press_x = -1;
+gdouble press_y = -1;
+
+/* Forward declarations; maemo_x11_update_keys() is not defined in this part of the file. */
+int maemo_x11_update_keys();
+void show_notification(char* text);
+
+/*
+ * Move the active savestate slot by `delta`, wrapping inside 0..9, and show
+ * the new slot to the user (displayed 1-based).
+ */
+void change_slot(int delta)
+{
+       char message[50];
+       int slot = state_slot + delta;
+
+       if (slot > 9)
+               slot = 0;
+       else if (slot < 0)
+               slot = 9;
+       state_slot = slot;
+
+       sprintf(message,"Savestate slot: %i",slot + 1);
+       show_notification(message);
+}
+
+/*
+ * Save the emulator state to `state_slot` and, when a frame buffer image is
+ * available, write a PNG screenshot of it; then notify the user.
+ *
+ * The screenshot path is built with a bounded snprintf(): file_name comes
+ * from the selected disc image, so its length is not under our control. If
+ * the path would not fit, the screenshot is skipped instead of overflowing
+ * `buf` or writing to a truncated name.
+ */
+void save(int state_slot)
+{
+       emu_save_state(state_slot);
+       if (image && image->mem){
+               char buf[MAXPATHLEN];
+               int len = snprintf(buf, sizeof(buf), "/opt/maemo/usr/games/screenshots%s.%3.3d",file_name,state_slot);
+               if (len > 0 && (size_t)len < sizeof(buf))
+                       writepng(buf, image->mem, pl_buf_w,pl_buf_h);
+       }
+       char message[50];
+       snprintf(message, sizeof(message), "Saved savestate slot: %i",state_slot + 1);
+       show_notification(message);
+}
+
+/*
+ * Leave the emulator. When bAutosaveOnExit is set, the state is first saved
+ * to slot 99 together with a screenshot (if a frame buffer image exists).
+ *
+ * The screenshot path is built with a bounded snprintf() because file_name
+ * has no length guarantee here; an over-long path skips the screenshot
+ * instead of overflowing `buf`. Does not return (hildon_quit() exits).
+ */
+void quit()
+{
+       if (bAutosaveOnExit){
+               show_notification("Autosaving");
+               emu_save_state(99);
+               if (image && image->mem){
+                       char buf[MAXPATHLEN];
+                       int len = snprintf(buf, sizeof(buf), "/opt/maemo/usr/games/screenshots%s.%3.3d",file_name,99);
+                       if (len > 0 && (size_t)len < sizeof(buf))
+                               writepng(buf, image->mem, pl_buf_w,pl_buf_h);
+               }
+       }
+       hildon_quit();
+}
+
+/*
+ * Ask a yes/no question in a modal dialog.
+ * Returns TRUE when the user answers "yes", or when there is no window to
+ * show the dialog on; FALSE otherwise.
+ *
+ * `text` is passed as an argument to a "%s" format: the last fixed parameter
+ * of gtk_message_dialog_new() is a printf-style format string, so handing it
+ * arbitrary text would misinterpret any '%' it contains.
+ */
+int show_confirmbox(char* text)
+{
+       if (!window)
+               return TRUE;
+
+       GtkWidget *dialog;
+       dialog = gtk_message_dialog_new (GTK_WINDOW(window),
+                                        GTK_DIALOG_DESTROY_WITH_PARENT,
+                                        GTK_MESSAGE_QUESTION,
+                                        GTK_BUTTONS_YES_NO,
+                                        "%s", text);
+       gint result = gtk_dialog_run (GTK_DIALOG (dialog));
+       gtk_widget_destroy (dialog);
+       if (result == GTK_RESPONSE_YES)
+               return TRUE;
+       return FALSE;
+}
+
+/*
+ * Button press/release handler of the main window implementing "corner taps".
+ *
+ * A tap counts for a corner only when both the press and the release lie
+ * within `sens` pixels of that corner of the X_RES x Y_RES area. Corners are
+ * numbered 0: left/top, 1: right/top, 2: right/bottom, 3: left/bottom and
+ * the action comes from cornerActions[]:
+ *   1 save state (after confirmation), 2 load state (after confirmation),
+ *   3 next slot, 4 previous slot, 5 quit (after confirmation).
+ *
+ * Returns FALSE so the event keeps propagating. GTK reads a gboolean from
+ * handlers of "button_press_event"/"button_release_event"; a void handler
+ * would leave that value undefined.
+ */
+static gboolean
+window_button_proxy(GtkWidget *widget,
+                                   GdkEventButton *event,
+                                   gpointer user_data)
+{
+       int corner = -1;
+       const int sens = 100;
+
+       switch (event->type){
+       case GDK_BUTTON_PRESS:
+               /* remember where the tap started */
+               press_x = event->x;
+               press_y = event->y;
+               break;
+       case GDK_BUTTON_RELEASE:
+               if (press_x < sens && press_y < sens && event->x < sens && event->y < sens)
+                       corner = 0;
+               else if (press_x > X_RES - sens && press_y < sens && event->x > X_RES - sens && event->y < sens)
+                       corner = 1;
+               else if (press_x > X_RES - sens && press_y > Y_RES - sens && event->x > X_RES - sens && event->y > Y_RES - sens)
+                       corner = 2;
+               else if (press_x < sens && press_y > Y_RES - sens && event->x < sens && event->y > Y_RES - sens)
+                       corner = 3;
+
+               /* the press has been consumed */
+               press_x = -1;
+               press_y = -1;
+               break;
+       default:
+               break;
+       }
+
+       if (corner >= 0){
+               switch (cornerActions[corner]){
+                       case 1:
+                               if (show_confirmbox("Save savestate?"))
+                                       save(state_slot);
+                               break;
+                       case 2:
+                               if (show_confirmbox("Load savestate?"))
+                                       emu_load_state(state_slot);
+                               break;
+                       case 3:
+                               change_slot(1);
+                               break;
+                       case 4:
+                               change_slot(-1);
+                               break;
+                       case 5:
+                               if (show_confirmbox("Quit?"))
+                                       quit();
+                               break;
+                       default:
+                               break;
+               }
+       }
+
+       return FALSE;
+}
+
+/*
+ * Thread body that keeps the display from blanking.
+ *
+ * While bRunning is set it sends "req_display_blanking_pause" to
+ * com.nokia.mce on the system bus, then waits up to 8 x 0.5 s, checking
+ * bRunning after every half second so maemo_finish() can stop it promptly.
+ *
+ * The message is released whenever it was created, including the case where
+ * no system bus connection is available (previously it leaked then).
+ */
+static void *displayThread(void *arg)
+{
+       DBusConnection* system_bus = (DBusConnection*)osso_get_sys_dbus_connection(osso);
+       DBusMessage* msg = dbus_message_new_method_call("com.nokia.mce",
+                                                       "/com/nokia/mce/request",
+                                                       "com.nokia.mce.request",
+                                                       "req_display_blanking_pause");
+       if (msg && system_bus) {
+               bRunning = TRUE;
+               while (bRunning) {
+                       dbus_connection_send(system_bus, msg, NULL);
+                       dbus_connection_flush(system_bus);
+                       int i = 0;
+                       for (i=0; i<8; i++){
+                               usleep(500000);
+                               if (!bRunning)
+                                       break;
+                       }
+               }
+       }
+       if (msg)
+               dbus_message_unref(msg);
+
+       pthread_exit(0);
+       return NULL;
+}
+
+/*
+ * Show a short notification.
+ * With a window: a Hildon information banner with a timeout of 3000.
+ * Without a window: a "SystemNoteInfoprint" call to
+ * org.freedesktop.Notifications on the osso D-Bus connection.
+ */
+void show_notification(char* text)
+{
+       if (window){
+               GtkWidget* banner = hildon_banner_show_information(GTK_WIDGET(window), NULL, text);
+               hildon_banner_set_timeout(HILDON_BANNER(banner), 3000);
+               return;
+       }
+
+       DBusConnection* bus = (DBusConnection*)osso_get_dbus_connection(osso);
+       DBusMessage* note = dbus_message_new_method_call("org.freedesktop.Notifications",
+                                                        "/org/freedesktop/Notifications",
+                                                        "org.freedesktop.Notifications",
+                                                        "SystemNoteInfoprint");
+       if (!note)
+               return;
+
+       DBusMessageIter it;
+       char* payload = text;
+       dbus_message_iter_init_append(note, &it);
+       if (dbus_message_iter_append_basic(&it, DBUS_TYPE_STRING, &payload)) {
+               dbus_connection_send(bus, note, NULL);
+               dbus_connection_flush(bus);
+       }
+       dbus_message_unref(note);
+}
+
+/*
+ * Show `text` in a modal information dialog with an OK button and wait until
+ * it is dismissed. Does nothing when there is no window.
+ *
+ * `text` goes through a "%s" format because gtk_message_dialog_new() treats
+ * its last fixed parameter as a printf-style format string.
+ */
+void show_messagebox(char* text)
+{
+       if (!window)
+               return;
+
+       GtkWidget *dialog;
+       dialog = gtk_message_dialog_new (GTK_WINDOW(window),
+                                        GTK_DIALOG_DESTROY_WITH_PARENT,
+                                        GTK_MESSAGE_INFO,
+                                        GTK_BUTTONS_OK,
+                                        "%s", text);
+       gtk_dialog_run (GTK_DIALOG (dialog));
+       gtk_widget_destroy (dialog);
+}
+
+#include <hildon/hildon-file-chooser-dialog.h>
+/*
+ * Let the user pick a new disc image with a Hildon file chooser and swap it
+ * in: the CD image is set, the cdr plugin reloaded and reopened, file_name
+ * is updated to the "/basename" part of the chosen path, and a lid-open
+ * event is simulated (case reported open for 3 seconds).
+ *
+ * The chooser may return NULL (no selection, or a selection without a local
+ * path), and strrchr() returns NULL for a path without '/'; both cases are
+ * checked before the pointers are used.
+ */
+void change_disc()
+{
+       GtkWidget *dialog;
+       dialog = hildon_file_chooser_dialog_new (GTK_WINDOW(window), GTK_FILE_CHOOSER_ACTION_OPEN);
+       gtk_window_set_title (GTK_WINDOW (dialog), "Change disc");
+
+       /* start the chooser at the current image, or at the user's documents */
+       char currentFile[MAXPATHLEN];
+       snprintf(currentFile, sizeof(currentFile), "%s", GetIsoFile());
+       if (strlen(currentFile))
+               gtk_file_chooser_set_filename (GTK_FILE_CHOOSER(dialog), currentFile);
+       else
+               gtk_file_chooser_set_current_folder (GTK_FILE_CHOOSER(dialog), "/home/user/MyDocs/");
+
+       GtkFileFilter *filter=gtk_file_filter_new();
+       gtk_file_filter_add_pattern (filter,"*.bin");
+       gtk_file_filter_add_pattern (filter,"*.BIN");
+       gtk_file_filter_add_pattern (filter,"*.iso");
+       gtk_file_filter_add_pattern (filter,"*.ISO");
+       gtk_file_filter_add_pattern (filter,"*.img");
+       gtk_file_filter_add_pattern (filter,"*.IMG");
+       gtk_file_filter_add_pattern (filter,"*.z");
+       gtk_file_filter_add_pattern (filter,"*.Z");
+       gtk_file_filter_add_pattern (filter,"*.znx");
+       gtk_file_filter_add_pattern (filter,"*.ZNX");
+       gtk_file_filter_add_pattern (filter,"*.pbp");
+       gtk_file_filter_add_pattern (filter,"*.PBP");
+       gtk_file_filter_add_pattern (filter,"*.mdf");
+       gtk_file_filter_add_pattern (filter,"*.MDF");
+       gtk_file_chooser_set_filter (GTK_FILE_CHOOSER (dialog),filter);
+
+       if (gtk_dialog_run (GTK_DIALOG (dialog)) == GTK_RESPONSE_OK) {
+               char *filename = gtk_file_chooser_get_filename (GTK_FILE_CHOOSER (dialog));
+
+               if (filename) {
+                       CdromId[0] = '\0';
+                       CdromLabel[0] = '\0';
+
+                       set_cd_image(filename);
+                       if (ReloadCdromPlugin() < 0)
+                               printf("Failed to load cdr plugin\n");
+
+                       if (CDR_open() < 0)
+                               printf("Failed to open cdr plugin\n");
+
+                       /* file_name keeps the leading '/', as the screenshot paths expect */
+                       const char *base = strrchr(filename,'/');
+                       if (base)
+                               strcpy(file_name, base);
+
+                       SetCdOpenCaseTime(time(NULL) + 3);
+                       LidInterrupt();
+               }
+               g_free (filename);
+       }
+
+       gtk_widget_destroy (dialog);
+}
+
+/*
+ * Let the user pick one disc of a multi-disc image from a list dialog
+ * ("Disc 1" .. "Disc N", N = cdrIsoMultidiskCount) and switch to it:
+ * cdrIsoMultidiskSelect is updated, the cdr plugin is closed and reopened,
+ * and a lid-open event is simulated (case reported open for 3 seconds).
+ *
+ * Resource handling: the GtkTreePath of the selection is freed after its
+ * index has been read, and the dialog is destroyed on every path, including
+ * a failed CDR_open().
+ */
+void change_multi_disc()
+{
+    HildonDialog* dialog = HILDON_DIALOG(hildon_dialog_new());
+    gtk_window_set_title (GTK_WINDOW (dialog), "Change disc");
+    gtk_window_set_default_size(GTK_WINDOW (dialog), 480, 300);
+
+    GtkWidget* sw = hildon_pannable_area_new ();
+    gtk_box_pack_start (GTK_BOX(GTK_DIALOG(dialog)->vbox), sw, TRUE, TRUE, 0);
+
+    GtkWidget* tree_view = hildon_gtk_tree_view_new (HILDON_UI_MODE_EDIT);
+    gtk_widget_set_name (tree_view, "fremantle-widget");
+
+    gtk_tree_view_set_rules_hint (GTK_TREE_VIEW (tree_view), TRUE);
+
+    /* one row per disc */
+    int i;
+    GtkListStore *store = gtk_list_store_new (1, G_TYPE_STRING);
+    for (i = 0; i < cdrIsoMultidiskCount; i++) {
+        gchar *str;
+
+        str = g_strdup_printf ("Disc %d", i+1);
+        gtk_list_store_insert_with_values (store, NULL, i, 0, str, -1);
+        g_free (str);
+    }
+    GtkTreeModel* model =  GTK_TREE_MODEL (store);
+
+    gtk_tree_view_set_model (GTK_TREE_VIEW (tree_view), model);
+    g_object_unref (model);
+
+    GtkTreeSelection* selection = gtk_tree_view_get_selection (GTK_TREE_VIEW (tree_view));
+    gtk_tree_selection_set_mode (selection, GTK_SELECTION_SINGLE);
+
+    GtkCellRenderer* renderer = gtk_cell_renderer_text_new ();
+    g_object_set (renderer,
+                  "xalign", 0.5,
+                  "weight", PANGO_WEIGHT_NORMAL,
+                  NULL);
+
+    gtk_tree_view_insert_column_with_attributes (GTK_TREE_VIEW (tree_view),
+                                                 0, "Column 0",
+                                                 renderer,
+                                                 "text", 0,
+                                                 NULL);
+
+    /* preselect the current disc; the buffer holds any int in decimal */
+    char current[16];
+    snprintf(current, sizeof(current), "%i", cdrIsoMultidiskSelect);
+    GtkTreePath* current_path = gtk_tree_path_new_from_string(current);
+    gtk_tree_selection_select_path (selection, current_path);
+    gtk_tree_path_free(current_path);
+
+    gtk_widget_set_size_request (tree_view, 480, 800);
+    gtk_container_add (GTK_CONTAINER (sw), tree_view);
+
+    hildon_dialog_add_button (dialog, GTK_STOCK_OK, GTK_RESPONSE_ACCEPT);
+
+    gtk_widget_show_all (GTK_WIDGET(dialog));
+    gint result = gtk_dialog_run (GTK_DIALOG (dialog));
+    if (result == GTK_RESPONSE_ACCEPT) {
+        GtkTreeModel* sel_model;
+        GtkTreeIter iter;
+        if (gtk_tree_selection_get_selected(selection, &sel_model, &iter)){
+            GtkTreePath* sel_path = gtk_tree_model_get_path(sel_model, &iter);
+            gint* indices = sel_path ? gtk_tree_path_get_indices(sel_path) : NULL;
+
+            if (indices) {
+                cdrIsoMultidiskSelect = indices[0];
+                CdromId[0] = '\0';
+                CdromLabel[0] = '\0';
+
+                CDR_close();
+                if (CDR_open() < 0) {
+                    printf("Failed to load cdr plugin\n");
+                } else {
+                    SetCdOpenCaseTime(time(NULL) + 3);
+                    LidInterrupt();
+                }
+            }
+            if (sel_path)
+                gtk_tree_path_free(sel_path);
+        }
+    }
+    gtk_widget_destroy(GTK_WIDGET(dialog));
+}
+
+/*
+ * D-Bus filter installed by maemo_init(). When a message arrives on
+ * CALL_SIGNAL_PATH with member CALL_INCOMING_SIG, a blocking "Paused"
+ * message box is shown. Always reports the message as not yet handled.
+ */
+static DBusHandlerResult on_msg_recieved(DBusConnection* connection G_GNUC_UNUSED, DBusMessage* message, void* data)
+{
+       const char* obj_path = dbus_message_get_path(message);
+       const char* member = NULL;
+
+       /* the member is only looked at for messages on the call path */
+       if (obj_path && g_str_equal(obj_path, CALL_SIGNAL_PATH))
+               member = dbus_message_get_member(message);
+
+       if (member && g_str_equal(member, CALL_INCOMING_SIG))
+               show_messagebox("Paused");
+
+       return DBUS_HANDLER_RESULT_NOT_YET_HANDLED;
+}
+
+/*
+ * Key press/release handler of the main window: forwards the hardware
+ * keycode to key_press_event() with type 1 for a press, 2 for a release and
+ * 0 for anything else.
+ *
+ * Returns FALSE so the event keeps propagating. GTK reads a gboolean from
+ * handlers of "key-press-event"/"key-release-event"; a void handler would
+ * leave that value undefined.
+ */
+static gboolean
+window_key_proxy(GtkWidget *widget,
+                    GdkEventKey *event,
+                    gpointer user_data)
+{
+       int type = 0;
+
+       if (event->type == GDK_KEY_PRESS)
+               type = 1;
+       else if (event->type == GDK_KEY_RELEASE)
+               type = 2;
+
+       key_press_event(event->hardware_keycode, type);
+       return FALSE;
+}
+
+/* Keycode of the key currently held down (0 after a release); used to drop repeated press events. */
+int last_key_pressed = 0;
+/*
+ * Translate a hardware key event into emulator input.
+ *
+ * key2: hardware keycode, used as index into keymap[].
+ * type: 1 = press, 2 = release, anything else is treated as neither.
+ *
+ * Action indices (value of keymap[key2]):
+ *   0..13  PSX pad button taken from keymap2[]
+ *   14     quit
+ *   15..18 diagonals (two direction buttons at once)
+ *   19/20  save / load the current state slot (on press only)
+ *   21/22  next / previous state slot (on press only)
+ *   23     change disc (on press only)
+ * Unmapped keys (keymap value < 0) and repeated presses of the key that is
+ * already recorded in last_key_pressed are ignored.
+ */
+inline void key_press_event(int key2,int type)
+{
+       int psxkey1 = -1, psxkey2 = -1;
+       int key=keymap[key2];
+
+       if (key < 0)
+               return;
+
+       // a second press of the same key without a release in between is dropped
+       if (type == 1 && key2 == last_key_pressed)
+               return;
+       last_key_pressed = type == 1 ? key2 : 0;
+
+       //printf("Key: %i %s\n", key2, type == 1 ? "Pressed" : (type == 2 ? "Released" : "Unknown"));
+       if (key < ARRAY_SIZE(keymap2)){
+               psxkey1 = keymap2[key];
+       }else switch (key) {
+               case 14:
+                       // note: not limited to presses; quit() is called for any event type
+                       quit();
+                       break;
+               case 15:
+                       psxkey1 = DKEY_UP;
+                       psxkey2 = DKEY_LEFT;
+                       break;
+               case 16:
+                       psxkey1 = DKEY_UP;
+                       psxkey2 = DKEY_RIGHT;
+                       break;
+               case 17:
+                       psxkey1 = DKEY_DOWN;
+                       psxkey2 = DKEY_LEFT;
+                       break;
+               case 18:
+                       psxkey1 = DKEY_DOWN;
+                       psxkey2 = DKEY_RIGHT;
+                       break;
+               case 19:
+                       if (type == 1)
+                               save(state_slot);
+                       return;
+               case 20:
+                       if (type == 1)
+                               emu_load_state(state_slot);
+                       return;
+               case 21:
+                       if (type == 1)
+                               change_slot(1);
+                       return;
+               case 22:
+                       if (type == 1)
+                               change_slot(-1);
+                       return;
+               case 23:
+                       if (type == 1){
+                               if (cdrIsoMultidiskCount > 1)
+                                       change_multi_disc();
+                               else
+                                       change_disc();
+                       }
+                       return;
+       }
+
+       // GunCon: the four face buttons set/clear bits in in_state_gun; other keys have no effect here
+       if (in_type1 == PSE_PAD_TYPE_GUNCON){
+               if (type == 1) {
+                       switch (psxkey1){
+                               case DKEY_CROSS:
+                                       in_state_gun |= SACTION_GUN_A;
+                                       break;          
+                               case DKEY_CIRCLE:
+                                       in_state_gun |= SACTION_GUN_B;
+                                       break;          
+                               case DKEY_TRIANGLE:
+                                       in_state_gun |= SACTION_GUN_TRIGGER2;
+                                       break;          
+                               case DKEY_SQUARE:
+                                       in_state_gun |= SACTION_GUN_TRIGGER;
+                                       break;          
+                       }
+               }else if (type == 2) {
+                       switch (psxkey1){
+                               case DKEY_CROSS:
+                                       in_state_gun &= ~SACTION_GUN_A;
+                                       break;          
+                               case DKEY_CIRCLE:
+                                       in_state_gun &= ~SACTION_GUN_B;
+                                       break;          
+                               case DKEY_TRIANGLE:
+                                       in_state_gun &= ~SACTION_GUN_TRIGGER2;
+                                       break;          
+                               case DKEY_SQUARE:
+                                       in_state_gun &= ~SACTION_GUN_TRIGGER;
+                                       break;          
+                       }
+               }
+       }else{
+               // other pad types: set the button bits in in_keystate on press
+               if (type == 1) {
+               if (psxkey1 >= 0)
+                       in_keystate |= 1 << psxkey1;
+               if (psxkey2 >= 0)
+                       in_keystate |= 1 << psxkey2;
+
+                       // analog pad: a direction key also pushes the in_a1 axis to its extreme
+                       // (only psxkey1 is considered, so diagonals move the vertical axis only)
+                       if (in_type1 == PSE_PAD_TYPE_ANALOGPAD){
+                               switch(psxkey1){
+                                       case DKEY_LEFT:
+                                               in_a1[0] = 0;
+                                               break;
+                                       case DKEY_RIGHT:
+                                               in_a1[0] = 255;
+                                               break;
+                                       case DKEY_UP:
+                                               in_a1[1] = 0;
+                                               break;
+                                       case DKEY_DOWN:
+                                               in_a1[1] = 255;
+                                               break;
+                               }
+       }
+               }
+               // release: clear the button bits and set the affected axis back to 127
+               else if (type == 2) {
+               if (psxkey1 >= 0)
+                       in_keystate &= ~(1 << psxkey1);
+               if (psxkey2 >= 0)
+                       in_keystate &= ~(1 << psxkey2);
+
+                       if (in_type1 == PSE_PAD_TYPE_ANALOGPAD){
+                               switch(psxkey1){
+                                       case DKEY_LEFT:
+                                       case DKEY_RIGHT:
+                                               in_a1[0] = 127;
+                                               break;
+                                       case DKEY_UP:
+                                       case DKEY_DOWN:
+                                               in_a1[1] = 127;
+                                               break;
+                               }
+                       }
+               emu_set_action(SACTION_NONE);
+       }
+       }
+}
+
+/* Platform shutdown entry point: delegates to hildon_quit(), which exits the process. */
+void plat_finish()
+{
+       hildon_quit();
+}
+
+/*
+ * Recompute the accelerometer scale factors from accelOptions.maxValue and
+ * accelOptions.sens. Both axes use the same factor:
+ * 255 / (2 * (maxValue - sens)).
+ */
+void set_accel_multipliers()
+{
+       const double span = (accelOptions.maxValue - accelOptions.sens) * 2.0;
+
+       accelOptions.xMultiplier = 255.0 / span;
+       accelOptions.yMultiplier = 255.0 / span;
+}
+
+#include <gdk/gdkx.h>
+/*
+ * Initialise the Maemo frontend.
+ *
+ * Steps, in order:
+ *  - create the osso context and register on_msg_recieved() as a filter for
+ *    the incoming-call signal on the system bus;
+ *  - read up to 24 keycodes from keys_config_file; entry i assigns action
+ *    index i to that keycode in keymap[], and the first four keycodes are
+ *    also kept in direction_keys[];
+ *  - report the controller type (GunCon disables all corner actions);
+ *  - unless g_maemo_opts & 8 is set, create the fullscreen window, hook up
+ *    the input handlers and the animation actor that shows the picture;
+ *  - compute the accelerometer multipliers and, if bKeepDisplayOn is set,
+ *    start the display thread.
+ *
+ * Returns 0 on success, 1 when the key configuration file cannot be opened.
+ *
+ * Parsing is defensive: reading stops at the first entry that is missing or
+ * not a number, and keycodes outside keymap[] are skipped, so a short or
+ * malformed file can no longer cause reads of an uninitialised value or
+ * writes outside keymap[].
+ */
+int maemo_init(int *argc, char ***argv)
+{
+       osso = osso_initialize("pcsxrearmed", PACKAGE_VERSION, FALSE, NULL);
+
+       DBusConnection* system_bus = (DBusConnection*)osso_get_sys_dbus_connection(osso);
+       dbus_bus_add_match(system_bus, DBUS_RULE_CALL_INCOMING, NULL);
+       dbus_connection_add_filter(system_bus, on_msg_recieved, NULL, NULL);
+
+       FILE* pFile;
+       pFile = fopen(keys_config_file, "r");
+       if (pFile == NULL){
+               fprintf(stderr, "Error opening keys config file %s\n", keys_config_file);
+               return 1;
+       }
+       printf("Keys config read from %s\n", keys_config_file);
+
+       int i=0;
+       for (i=0;i<65536;i++)
+               keymap[i]=-1;
+       for(i=0;i<24;i++){
+               int ch;
+               if (fscanf(pFile, "%i",&ch) != 1){
+                       fprintf(stderr, "Keys config file %s: entry %i is missing or invalid\n", keys_config_file, i);
+                       break;
+               }
+               /* a keycode that cannot index keymap[] leaves the action unassigned */
+               if (ch < 0 || ch >= 65536)
+                       continue;
+               keymap[ch]=i;
+               if (i < 4)
+                       direction_keys[i] = ch;
+       }
+       fclose(pFile);
+
+       switch (in_type1){
+               case PSE_PAD_TYPE_GUNCON:
+                       memset(cornerActions, 0, sizeof(cornerActions));
+                       printf("Controller set to GUNCON (SLPH-00034)\n");
+                       break;
+               case PSE_PAD_TYPE_STANDARD:
+                       printf("Controller set to standard (SCPH-1080)\n");
+                       break;
+               case PSE_PAD_TYPE_ANALOGPAD:
+                       printf("Controller set to analog (SCPH-1150)\n");
+                       break;
+               default:
+                       break;
+       }
+
+       if (in_enable_vibration)
+               printf("Vibration enabled\n");
+
+       if (!(g_maemo_opts&8)){
+               gtk_init (argc, argv);
+
+               window = hildon_stackable_window_new ();
+               gtk_widget_realize (window);
+               gtk_window_fullscreen (GTK_WINDOW(window));
+
+               /* button handlers are only needed when some corner has an action */
+               if (cornerActions[0] + cornerActions[1] + cornerActions[2] + cornerActions[3] > 0){
+                       g_signal_connect (G_OBJECT (window), "button_release_event",
+                                               G_CALLBACK (window_button_proxy), NULL);
+                       g_signal_connect (G_OBJECT (window), "button_press_event",
+                                               G_CALLBACK (window_button_proxy), NULL);
+               }
+
+               g_signal_connect (G_OBJECT (window), "key-press-event",
+                                       G_CALLBACK (window_key_proxy), NULL);
+               g_signal_connect (G_OBJECT (window), "key-release-event",
+                                       G_CALLBACK (window_key_proxy), NULL);
+               g_signal_connect (G_OBJECT (window), "delete_event",
+                                       G_CALLBACK (hildon_quit), NULL);
+               gtk_widget_add_events (window,
+                                       GDK_BUTTON_PRESS_MASK | GDK_BUTTON_RELEASE_MASK);
+
+               /* g_maemo_opts & 2: picture anchored at the origin instead of centred */
+               actor = HILDON_ANIMATION_ACTOR (hildon_animation_actor_new());
+               if (g_maemo_opts & 2)
+                       hildon_animation_actor_set_position (actor, 0, 0 );
+               else
+                       hildon_animation_actor_set_position (actor, (X_RES - D_WIDTH)/2, (Y_RES - D_HEIGHT)/2 );
+               hildon_animation_actor_set_parent (actor, GTK_WINDOW (window));
+
+               drawing = gtk_image_new ();
+
+               gtk_container_add (GTK_CONTAINER (actor), drawing);
+
+               gtk_widget_show_all (GTK_WIDGET (actor));
+               gtk_widget_show_all (GTK_WIDGET (window));
+       }else{
+               /* no window is created in this mode; `window` stays NULL */
+               gtk_init (argc, argv);
+       }
+
+       set_accel_multipliers();
+
+       if (bKeepDisplayOn){
+               if (pthread_create(&display_thread, NULL, displayThread, NULL))
+                       printf("Failed to create display thread.\n");
+       }
+
+       pl_rearmed_cbs.only_16bpp = 1;
+       return 0;
+}
+
+/*
+ * Release what maemo_init() set up: stop and join the display thread (if it
+ * was started), then free the osso context.
+ *
+ * Both handles are reset afterwards, so calling this function again is
+ * harmless; in particular the thread is never joined twice.
+ */
+void maemo_finish()
+{
+       if (display_thread > 0){
+               bRunning = FALSE;
+               pthread_join(display_thread, NULL);
+               display_thread = (pthread_t)0;
+       }
+
+       if (osso){
+               osso_deinitialize(osso);
+               osso = NULL;
+       }
+       printf("Exiting\n");
+}
+
+/* Intentionally empty: this frontend provides no menu behaviour here. */
+void menu_loop(void)
+{
+}
+
+/*
+ * Switch the output to a w x h frame buffer.
+ *
+ * w_, h_: requested size in pixels (read only); bpp_ is not used.
+ * Returns the buffer the emulator should draw into (also stored in
+ * pl_vout_buf). The current buffer is returned unchanged in windowless mode
+ * (g_maemo_opts & 8) and for a non-positive size.
+ *
+ * A new GdkImage is allocated for every mode change and the actor is scaled
+ * so the picture fills X_RES x Y_RES (g_maemo_opts & 2) or
+ * D_WIDTH x D_HEIGHT. If the image cannot be created, pl_vout_buf is cleared
+ * and NULL is returned: the previous buffer has already been released at
+ * that point and must not be handed out again.
+ */
+void *plat_gvideo_set_mode(int *w_, int *h_, int *bpp_)
+{
+       int w = *w_, h = *h_;
+
+       if (g_maemo_opts&8) return pl_vout_buf;
+
+       if (w <= 0 || h <= 0)
+               return pl_vout_buf;
+
+       if (image) gdk_image_destroy(image);
+       image = gdk_image_new( GDK_IMAGE_FASTEST, gdk_visual_get_system(), w, h );
+       if (!image) {
+               pl_vout_buf = NULL;
+               return NULL;
+       }
+
+       pl_vout_buf = (void *) image->mem;
+
+       gtk_image_set_from_image (GTK_IMAGE(drawing), image, NULL);
+
+       gtk_window_resize (GTK_WINDOW (actor), w, h);
+       if (g_maemo_opts & 2)
+               hildon_animation_actor_set_scale (actor,
+                               (gdouble)X_RES / (gdouble)w,
+                               (gdouble)Y_RES / (gdouble)h
+                               );
+       else
+               hildon_animation_actor_set_scale (actor,
+                               (gdouble)D_WIDTH / (gdouble)w,
+                               (gdouble)D_HEIGHT / (gdouble)h
+                               );
+       pl_buf_w=w;pl_buf_h=h;
+       return pl_vout_buf;
+}
+
+/*
+ * Queue a redraw of the output widget (unless flag 8 of g_maemo_opts is set)
+ * and, when flag 4 is set, turn one accelerometer sample into pad input:
+ * axis values in in_a1[] for PSE_PAD_TYPE_ANALOGPAD, direction bits in
+ * in_keystate otherwise.
+ *
+ * Exits the process when the accelerometer file cannot be opened (as
+ * before). A sample that cannot be parsed is ignored for this frame.
+ *
+ * Returns pl_vout_buf.
+ */
+void *plat_gvideo_flip(void)
+{
+       if (!(g_maemo_opts&8))
+               gtk_widget_queue_draw(drawing);
+
+       // process accelerometer
+       if (g_maemo_opts & 4) {
+               float x, y, z;
+               int fields;
+               FILE* f = fopen( "/sys/class/i2c-adapter/i2c-3/3-001d/coord", "r" );
+               if( !f ) {printf ("err in accel"); exit(1);}
+               fields = fscanf( f, "%f %f %f", &x, &y, &z );
+               fclose( f );
+
+               // fscanf() leaves x/y/z unset on a short or failed read; keep the
+               // previous input state rather than feed uninitialised values in.
+               if (fields != 3)
+                       return pl_vout_buf;
+
+               if (in_type1 == PSE_PAD_TYPE_ANALOGPAD){
+                       // Clamp to +/-maxValue; readings inside the +/-sens dead
+                       // zone give 127, anything beyond it is scaled.
+                       if (x > accelOptions.maxValue) x = accelOptions.maxValue;
+                       else if (x < -accelOptions.maxValue) x = -accelOptions.maxValue;
+
+                       const int maxValue = accelOptions.maxValue - accelOptions.sens;
+                       if(x > accelOptions.sens){
+                               x -= accelOptions.sens;
+                               in_a1[0] = (-x + maxValue ) *  accelOptions.xMultiplier;
+                       }else if (x < -accelOptions.sens){
+                               x += accelOptions.sens;
+                               in_a1[0] = (-x + maxValue ) *  accelOptions.xMultiplier;
+                       }else in_a1[0] = 127;
+
+                       y += accelOptions.y_def;
+                       if (y > accelOptions.maxValue) y = accelOptions.maxValue;
+                       else if (y < -accelOptions.maxValue) y = -accelOptions.maxValue;
+
+                       if(y > accelOptions.sens){
+                               y -= accelOptions.sens;
+                               in_a1[1] = (-y + maxValue ) *  accelOptions.yMultiplier;
+                       }else if (y < -accelOptions.sens){
+                               y += accelOptions.sens;
+                               in_a1[1] = (-y + maxValue ) *  accelOptions.yMultiplier;
+                       }else in_a1[1] = 127;
+
+                       //printf("x: %i y: %i\n", in_a1[0], in_a1[1]);
+               }else{
+                       // Digital pad: a reading beyond the dead zone sets one
+                       // direction bit, a reading inside it clears both.
+                       if( x > accelOptions.sens ) in_keystate |= 1 << DKEY_LEFT;
+                       else if( x < -accelOptions.sens ) in_keystate |= 1 << DKEY_RIGHT;
+                       else {in_keystate &= ~(1 << DKEY_LEFT);in_keystate &= ~(1 << DKEY_RIGHT);}
+
+                       y += accelOptions.y_def;
+                       if( y > accelOptions.sens )in_keystate |= 1 << DKEY_UP;
+                       else if( y < -accelOptions.sens ) in_keystate |= 1 << DKEY_DOWN;
+                       else {in_keystate &= ~(1 << DKEY_DOWN);in_keystate &= ~(1 << DKEY_UP);}
+               }
+       }
+
+       return pl_vout_buf;
+}
+
+// for frontend/plugin_lib.c
+/* Poll input once: X11 key events when flag 8 of g_maemo_opts is set,
+ * otherwise every pending GTK+ event. */
+void update_input(void)
+{
+       if (g_maemo_opts & 8) {
+               maemo_x11_update_keys();
+               return;
+       }
+
+       /* process GTK+ events */
+       while (gtk_events_pending()) {
+               gtk_main_iteration();
+       }
+}
+
+int omap_enable_layer(int enabled) /* Stub: ignores 'enabled' and always returns 0. */
+{
+       return 0;
+}
+
+void menu_notify_mode_change(int w, int h, int bpp) /* Empty stub: all three arguments are ignored. */
+{
+}
+
+void *plat_prepare_screenshot(int *w, int *h, int *bpp) /* Stub: always returns NULL; *w, *h and *bpp are not written. */
+{
+       return NULL;
+}
+
+void plat_step_volume(int is_up) /* Empty stub: 'is_up' is ignored. */
+{
+}
+
+/*
+ * Send a "req_start_manual_vibration" request to com.nokia.mce over the
+ * system D-Bus connection obtained from osso.
+ *
+ * pad, low: not used.
+ * high:     sent as the first INT32 argument (speed); the second argument
+ *           (duration) is the fixed vDuration.
+ *
+ * Best effort: returns without doing anything when osso is NULL, when no
+ * bus connection is available, or when the message cannot be allocated.
+ */
+void plat_trigger_vibrate(int pad, int low, int high)
+{
+       const int vDuration = 10;
+       DBusConnection *system_bus;
+       DBusMessage *msg;
+       DBusMessageIter args;
+
+       // osso can be NULL here: maemo_finish() resets it after deinitialising
+       if (!osso)
+               return;
+
+       system_bus = (DBusConnection*)osso_get_sys_dbus_connection(osso);
+       if (!system_bus)
+               return;
+
+       msg = dbus_message_new_method_call("com.nokia.mce",
+                                          "/com/nokia/mce/request",
+                                          "com.nokia.mce.request",
+                                          "req_start_manual_vibration");
+       if (!msg)
+               return;
+
+       dbus_message_iter_init_append(msg, &args);
+       // FIXME: somebody with hardware should tune this
+       int speed = high; // is_strong ? 200 : 150;
+       int duration = vDuration;
+       if (dbus_message_iter_append_basic(&args, DBUS_TYPE_INT32, &speed) &&
+           dbus_message_iter_append_basic(&args, DBUS_TYPE_INT32, &duration)) {
+               dbus_connection_send(system_bus, msg, NULL);
+               //dbus_connection_flush(system_bus);
+       }
+       dbus_message_unref(msg);
+}
+
+void plat_minimize(void) /* Empty stub: returns immediately. */
+{
+}
+
+void plat_gvideo_close(void) /* Empty stub: nothing is released here. */
+{
+}
+
+void plat_gvideo_open(int is_pal) /* Empty stub: 'is_pal' is ignored. */
+{
+}
diff --git a/maemo/maemo_common.h b/maemo/maemo_common.h
new file mode 100644 (file)
index 0000000..ace0bfd
--- /dev/null
@@ -0,0 +1,18 @@
+int maemo_init(int *argc, char ***argv);
+void maemo_finish();
+
+extern char file_name[MAXPATHLEN];
+extern int g_maemo_opts;
+
+extern inline void key_press_event(int key,int type);
+
+typedef struct
+{ 
+       int sens;
+       int y_def;
+       float maxValue;
+       float xMultiplier;
+       float yMultiplier;
+} accel_option;
+
+extern accel_option accelOptions;
diff --git a/maemo/maemo_xkb.c b/maemo/maemo_xkb.c
new file mode 100644 (file)
index 0000000..52af2ca
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2009, Wei Mingzhi <whistler@openoffice.org>.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+#include <X11/keysym.h>
+#include <X11/XKBlib.h>
+
+#include "../frontend/main.h"
+#include "../frontend/plugin_lib.h"
+
+static Atom wmprotocols, wmdelwindow;
+static int initialized;
+
+
+
+/* Look up the WM_PROTOCOLS / WM_DELETE_WINDOW atoms and switch detectable
+ * auto-repeat on for the display in gpuDisp; no-op when gpuDisp is NULL. */
+static void InitKeyboard(void) {
+       Display *dpy = (Display *)gpuDisp;
+
+       if (dpy == NULL)
+               return;
+
+       wmprotocols = XInternAtom(dpy, "WM_PROTOCOLS", 0);
+       wmdelwindow = XInternAtom(dpy, "WM_DELETE_WINDOW", 0);
+       XkbSetDetectableAutoRepeat(dpy, 1, NULL);
+}
+
+/* Switch detectable auto-repeat back off for the display in gpuDisp;
+ * no-op when gpuDisp is NULL. */
+static void DestroyKeyboard(void) {
+       Display *dpy = (Display *)gpuDisp;
+
+       if (dpy == NULL)
+               return;
+
+       XkbSetDetectableAutoRepeat(dpy, 0, NULL);
+}
+#include "maemo_common.h"
+
+/*
+ * Drain the pending X11 events of the display in gpuDisp.
+ *
+ * Key presses and releases are forwarded to key_press_event() with type 1
+ * (press) or 2 (release). A WM_PROTOCOLS / WM_DELETE_WINDOW client message
+ * makes the function call DestroyKeyboard() and exit(1) once the queue has
+ * been emptied. The keyboard is set up lazily on the first call.
+ *
+ * Always returns 0 (also when gpuDisp is NULL).
+ */
+int maemo_x11_update_keys() {
+       Display *dpy = (Display *)gpuDisp;
+       int close_requested = 0;
+       XEvent ev;
+
+       if (dpy == NULL)
+               return 0;
+
+       if (!initialized) {
+               initialized++;
+               InitKeyboard();
+       }
+
+       while (XPending(dpy) > 0) {
+               XNextEvent(dpy, &ev);
+
+               if (ev.type == KeyPress) {
+                       key_press_event(ev.xkey.keycode, 1);
+               } else if (ev.type == KeyRelease) {
+                       key_press_event(ev.xkey.keycode, 2);
+               } else if (ev.type == ClientMessage) {
+                       XClientMessageEvent *msg = (XClientMessageEvent *)&ev;
+
+                       if (msg->message_type == wmprotocols && (Atom)msg->data.l[0] == wmdelwindow)
+                               close_requested = 1;
+               }
+       }
+
+       if (close_requested) {
+               DestroyKeyboard();
+               exit(1);
+       }
+
+       return 0;
+}
index 1075ee5..756d19a 100644 (file)
@@ -1,6 +1,9 @@
 CFLAGS += -ggdb -Wall -O3 -ffast-math
 CFLAGS += -DREARMED
 CFLAGS += -I../../include
+#CFLAGS += -DINLINE="static __inline__"
+#CFLAGS += -Dasm="__asm__ __volatile__"
+CFLAGS += -DUSE_GPULIB=1
 
 include ../../config.mak
 
@@ -8,7 +11,7 @@ SRC_STANDALONE += gpu.cpp
 SRC_GPULIB += gpulib_if.cpp
 
 ifeq "$(ARCH)" "arm"
-SRC += gpu_arm.s
+SRC += gpu_arm.S
 endif
 
 #BIN_STANDALONE = gpuPCSX4ALL.so
diff --git a/plugins/gpu_unai/README_senquack.txt b/plugins/gpu_unai/README_senquack.txt
new file mode 100644 (file)
index 0000000..cda17fc
--- /dev/null
@@ -0,0 +1,956 @@
+//NOTE: You can find the set of original Unai poly routines (disabled now)
+// at the bottom end of this file.
+
+//senquack - Original Unai GPU poly routines have been replaced with new
+// ones based on DrHell routines. The original routines suffered from
+// shifted rows, causing many quads to have their first triangle drawn
+// correctly, but the second triangle would randomly have pixels shifted
+// either left or right or entire rows not drawn at all. Furthermore,
+// sometimes entire triangles seemed to be either missing or only
+// partially drawn (most clearly seen in sky/road textures in NFS3,
+// clock tower in beginning of Castlevania SOTN). Pixel gaps were
+// prevalent.
+//
+// Since DrHell GPU didn't seem to exhibit these artifacts at all, I adapted
+// its routines to GPU Unai (Unai was probably already originally based on it).
+// DrHell uses 22.10 fixed point instead of Unai's 16.16, so gpu_fixedpoint.h
+// required modification as well as gpu_inner.h (where gpuPolySpanFn driver
+// functions are).
+//
+// Originally, I tried to patch up original Unai routines and got as far
+// as fixing the shifted rows, but still had other problem of triangles rendered
+// wrong (black triangular gaps in NFS3 sky, clock tower in Castlevania SOTN).
+// I eventually gave up. Even after rewriting/adapting the routines,
+// however, I still had some random pixel dropouts, specifically in
+// NFS3 sky texture. I discovered that gpu_inner.h gpuPolySpanFn function
+// was taking optimizations to an extreme and packing u/v texture coords
+// into one 32-bit word, reducing their accuracy. Only once they were
+// handled in full-accuracy individual words was that problem fixed.
+//
+// NOTE: I also added support for doing divisions using the FPU, either
+//  with normal division or multiplication-by-reciprocal.
+//  To use float division, GPU_UNAI_USE_FLOATMATH should be defined.
+//  To use float mult-by-reciprocal, GPU_UNAI_USE_FLOAT_DIV_MULTINV
+//   can be specified (GPU_UNAI_USE_FLOATMATH must also be specified)
+//  To use inaccurate fixed-point mult-by-reciprocal, define
+//   GPU_UNAI_USE_INT_DIV_MULTINV. This is the default on older
+//   ARM devices like Wiz/Caanoo that have neither integer division
+//   in hardware or an FPU. It results in some pixel dropouts,
+//   texture glitches, but less than the original GPU UNAI code.
+//
+//  If nothing is specified, integer division will be used.
+//
+// NOTE 2: Even with MIPS32R2 having FPU recip.s instruction, and it is
+//  used when this platform is detected, I found it not to give any
+//  noticeable speedup over normal float division (in fact seemed a tiny
+//  tiny bit slower). I also found float division to not provide any
+//  noticeable speedups versus integer division on MIPS32R2 platform.
+//  Granted, the differences were all around .5 FPS or less.
+//
+// TODO:
+// * See if anything can be done about remaining pixel gaps in Gran
+//   Turismo car models, track.
+// * Find better way of passing parameters to gpuPolySpanFn functions than
+//   through original Unai method of using global variables u4,v4,du4 etc.
+// * Come up with some newer way of drawing rows of pixels than by calling
+//   gpuPolySpanFn through function pointer. For every row, at least on
+//   MIPS platforms, many registers are having to be pushed/popped from stack
+//   on each call, which is strange since MIPS has so many registers.
+// * MIPS MXU/ASM optimized gpuPolySpanFn ?
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Disabled original Unai poly routines left here for reference:
+// ( from gpu_raster_polygon.h )
+//////////////////////////////////////////////////////////////////////////
+#define GPU_TESTRANGE3() /* Vertex range guard; expects x0..x2 and y0..y2 in
+ * the scope where it is expanded. For each vertex whose x (or y) is negative,
+ * returns from the enclosing function when another vertex lies more than
+ * CHKMAX_X (or CHKMAX_Y) beyond it. It expands to bare `return;` statements,
+ * so it can only be used inside functions returning void. */ \
+{ \
+       if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } \
+       if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \
+       if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \
+       if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } \
+       if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \
+       if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \
+}
+
+/*----------------------------------------------------------------------
+F3
+
+Draws one single-colour triangle. The three vertices are read from
+PacketBuffer.S2[2..7] and PixelData is set from PacketBuffer.U4[0]; every
+visible scanline span is handed to gpuPolySpanDriver(first pixel, width).
+Returns without drawing when GPU_TESTRANGE3 rejects the vertices or when
+the bounding box, clipped to DrawingArea, is empty.
+----------------------------------------------------------------------*/
+
+void gpuDrawF3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace; // rows with (y & li) != 0 are skipped
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // rows with (y & pi) == pif are skipped
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       // vertex coordinates from the command packet
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]);
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]);
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]);
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]);
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]);
+
+       GPU_TESTRANGE3();
+       // translate by the drawing offset
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+       // clip rectangle
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+       // reject when the bounding box clipped to the drawing area is empty
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+       // PixelData is not declared in this function; set once per triangle
+       PixelData = GPU_RGB16(PacketBuffer.U4[0]);
+       // three compare-and-swap steps order the vertices by y, ties by x
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);
+                       GPU_SWAP(y1, y2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+               }
+       }
+       // the sign of dx selects below which edge drives x3 (span start) and which x4 (span end)
+       ya = y2 - y0;
+       yb = y2 - y1;
+       dx =(x2 - x1) * ya - (x2 - x0) * yb;
+       // two passes: loop0==2 covers rows y0..y1, loop0==1 covers rows y1..y2
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               dx3 = xLoDivx((x2 - x0), (y2 - y0));
+                               dx4 = xLoDivx((x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               dx3 = xLoDivx((x1 - x0), (y1 - y0));
+                               dx4 = xLoDivx((x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               x4  = i2x(x1);
+                               x3  = i2x(x0) + (dx3 * (y1 - y0));
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               x3  = i2x(x1);
+                               x4  = i2x(x0) + (dx4 * (y1 - y0));
+                               dx3 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+               }
+               // clip the pass to rows [ymin, ymax), stepping both edges past the rows cut off at the top
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;
+                       x4 += dx4*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+               // bias both edges by fixed_HALF before they are converted with x2i()
+               x3+= fixed_HALF;
+               x4+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               // one span per row; rows masked out by li/pi/pif are skipped but still advance the edges
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4)
+               {
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+                       if(xa < xmin) xa = xmin;
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa; // xb is now the span width
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+FT3
+----------------------------------------------------------------------*/
+
+void gpuDrawFT3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 u0, u1, u2, u3, du3=0;
+       s32 v0, v1, v2, v3, dv3=0;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+       GPU_TESTRANGE3();
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+       
+       u0 = PacketBuffer.U1[8];  v0 = PacketBuffer.U1[9];
+       u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17];
+       u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25];
+
+       r4 = s32(PacketBuffer.U1[0]);
+       g4 = s32(PacketBuffer.U1[1]);
+       b4 = s32(PacketBuffer.U1[2]);
+       dr4 = dg4 = db4 = 0;
+
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);
+                       GPU_SWAP(v0, v1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);
+                       GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(u1, u2, temp);
+                       GPU_SWAP(v1, v2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);
+                       GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);
+                       GPU_SWAP(v0, v1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
+       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+
+       s32 iF,iS;
+       xInv( dx, iF, iS);
+       du4 = xInvMulx( du4, iF, iS);
+       dv4 = xInvMulx( dv4, iF, iS);
+       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
+       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       u3 = i2x(u0);
+                       v3 = i2x(v0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv( (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               du3 = xInvMulx( (u2 - u0), iF, iS);
+                               dv3 = xInvMulx( (v2 - v0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv( (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               du3 = xInvMulx( (u1 - u0), iF, iS);
+                               dv3 = xInvMulx( (v1 - v0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               u3 = i2x(u0) + (du3 * temp);
+                               v3 = i2x(v0) + (dv3 * temp);
+                               x3 = i2x(x0) + (dx3 * temp);
+                               x4 = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               u3 = i2x(u1);
+                               v3 = i2x(v1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+                               xInv( (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               du3 = xInvMulx( (u2 - u1), iF, iS);
+                               dv3 = xInvMulx( (v2 - v1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;
+                       x4 += dx4*temp;
+                       u3 += du3*temp;
+                       v3 += dv3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;
+               x4+= fixed_HALF;
+               u3+= fixed_HALF;
+               v4+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3)
+               {
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+
+                       temp = xmin - xa;
+                       if(temp > 0)
+                       {
+                               xa  = xmin;
+                               u4 = u3 + du4*temp;
+                               v4 = v3 + dv4*temp;
+                       }
+                       else
+                       {
+                               u4 = u3;
+                               v4 = v3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+G3
+
+Draws one colour-interpolated triangle. Vertices come from
+PacketBuffer.S2[2,3], [6,7], [10,11] and the per-vertex r/g/b values from
+PacketBuffer.U1[0..2], [8..10], [16..18]. Per-span colour state is passed
+to gpuPolySpanDriver through r4, g4, b4, dr4, dg4, db4 and lInc, none of
+which are declared in this function.
+Returns without drawing when GPU_TESTRANGE3 rejects the vertices or when
+the bounding box, clipped to DrawingArea, is empty.
+----------------------------------------------------------------------*/
+
+void gpuDrawG3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace; // rows with (y & li) != 0 are skipped
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1); // rows with (y & pi) == pif are skipped
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 r0, r1, r2, r3, dr3=0;
+       s32 g0, g1, g2, g3, dg3=0;
+       s32 b0, b1, b2, b3, db3=0;
+       // vertex coordinates from the command packet
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
+
+       GPU_TESTRANGE3();
+       // translate by the drawing offset
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+       // clip rectangle
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+       // reject when the bounding box clipped to the drawing area is empty
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+       // per-vertex colour components
+       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
+       r1 = PacketBuffer.U1[8];        g1 = PacketBuffer.U1[9];        b1 = PacketBuffer.U1[10];
+       r2 = PacketBuffer.U1[16];       g2 = PacketBuffer.U1[17];       b2 = PacketBuffer.U1[18];
+       // three compare-and-swap steps order the vertices by y, ties by x; colours travel with their vertex
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(r1, r2, temp);         GPU_SWAP(g1, g2, temp);   GPU_SWAP(b1, b2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(r0, r1, temp);   GPU_SWAP(g0, g1, temp);               GPU_SWAP(b0, b1, temp);
+               }
+       }
+       // the sign of dx selects below which edge drives x3 (span start); dr4/dg4/db4 become per-pixel steps after division by dx
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
+       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+       s32 iF,iS;
+       xInv(            dx, iF, iS);
+       dr4 = xInvMulx( dr4, iF, iS);
+       dg4 = xInvMulx( dg4, iF, iS);
+       db4 = xInvMulx( db4, iF, iS);
+       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
+       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
+       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
+       lInc = db + dg + dr; // the three steps combined in one word: dr from bit 21 up, dg from bit 10 up, db from bit 0
+       // two passes: loop0==2 covers rows y0..y1, loop0==1 covers rows y1..y2
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       r3 = i2x(r0);
+                       g3 = i2x(g0);
+                       b3 = i2x(b0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv(           (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               dr3 = xInvMulx( (r2 - r0), iF, iS);
+                               dg3 = xInvMulx( (g2 - g0), iF, iS);
+                               db3 = xInvMulx( (b2 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv(           (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               dr3 = xInvMulx( (r1 - r0), iF, iS);
+                               dg3 = xInvMulx( (g1 - g0), iF, iS);
+                               db3 = xInvMulx( (b1 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               r3  = i2x(r0) + (dr3 * temp);
+                               g3  = i2x(g0) + (dg3 * temp);
+                               b3  = i2x(b0) + (db3 * temp);
+                               x3  = i2x(x0) + (dx3 * temp);
+                               x4  = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               r3 = i2x(r1);
+                               g3 = i2x(g1);
+                               b3 = i2x(b1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                               xInv(           (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               dr3 = xInvMulx( (r2 - r1), iF, iS);
+                               dg3 = xInvMulx( (g2 - g1), iF, iS);
+                               db3 = xInvMulx( (b2 - b1), iF, iS);
+                       }
+               }
+               // clip the pass to rows [ymin, ymax), stepping every interpolant past the rows cut off at the top
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;   x4 += dx4*temp;
+                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+               // bias every per-row value by fixed_HALF
+               x3+= fixed_HALF;  x4+= fixed_HALF;
+               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               // one span per row; rows masked out by li/pi/pif are skipped but still advance the interpolants
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3)
+               {
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin) ) continue;
+                       // when the span starts left of xmin, advance the colours by the clipped pixel count
+                       temp = xmin - xa;
+                       if(temp > 0)
+                       {
+                               xa  = xmin;
+                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       }
+                       else
+                       {
+                               r4 = r3;  g4 = g3;  b4 = b3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa; // xb is now the span width
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+/*----------------------------------------------------------------------
+GT3
+----------------------------------------------------------------------*/
+
+void gpuDrawGT3(const PP gpuPolySpanDriver)
+{
+       const int li=linesInterlace;
+       const int pi=(progressInterlace?(linesInterlace+1):0);
+       const int pif=(progressInterlace?(progressInterlace_flag?(linesInterlace+1):0):1);
+       s32 temp;
+       s32 xa, xb, xmin, xmax;
+       s32 ya, yb, ymin, ymax;
+       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
+       s32 y0, y1, y2;
+       s32 u0, u1, u2, u3, du3=0;
+       s32 v0, v1, v2, v3, dv3=0;
+       s32 r0, r1, r2, r3, dr3=0;
+       s32 g0, g1, g2, g3, dg3=0;
+       s32 b0, b1, b2, b3, db3=0;
+
+       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
+       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
+       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] );
+       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] );
+       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]);
+       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]);
+
+       GPU_TESTRANGE3();
+       
+       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
+       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+
+       xmin = DrawingArea[0];  xmax = DrawingArea[2];
+       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+
+       {
+               int rx0 = Max2(xmin,Min3(x0,x1,x2));
+               int ry0 = Max2(ymin,Min3(y0,y1,y2));
+               int rx1 = Min2(xmax,Max3(x0,x1,x2));
+               int ry1 = Min2(ymax,Max3(y0,y1,y2));
+               if( rx0>=rx1 || ry0>=ry1) return;
+       }
+
+       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
+       u0 = PacketBuffer.U1[8];        v0 = PacketBuffer.U1[9];
+       r1 = PacketBuffer.U1[12];       g1 = PacketBuffer.U1[13];       b1 = PacketBuffer.U1[14];
+       u1 = PacketBuffer.U1[20];       v1 = PacketBuffer.U1[21];
+       r2 = PacketBuffer.U1[24];       g2 = PacketBuffer.U1[25];       b2 = PacketBuffer.U1[26];
+       u2 = PacketBuffer.U1[32];       v2 = PacketBuffer.U1[33];
+
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);   GPU_SWAP(b0, b1, temp);
+               }
+       }
+       if (y1 >= y2)
+       {
+               if( y1!=y2 || x1>x2 )
+               {
+                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
+                       GPU_SWAP(u1, u2, temp);         GPU_SWAP(v1, v2, temp);
+                       GPU_SWAP(r1, r2, temp);   GPU_SWAP(g1, g2, temp);               GPU_SWAP(b1, b2, temp);
+               }
+       }
+       if (y0 >= y1)
+       {
+               if( y0!=y1 || x0>x1 )
+               {
+                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
+                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
+                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+               }
+       }
+
+       ya  = y2 - y0;
+       yb  = y2 - y1;
+       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
+       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+
+       s32 iF,iS;
+
+       xInv(            dx, iF, iS);
+       du4 = xInvMulx( du4, iF, iS);
+       dv4 = xInvMulx( dv4, iF, iS);
+       dr4 = xInvMulx( dr4, iF, iS);
+       dg4 = xInvMulx( dg4, iF, iS);
+       db4 = xInvMulx( db4, iF, iS);
+       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
+       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
+       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
+       lInc = db + dg + dr;
+       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
+       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+
+       for (s32 loop0 = 2; loop0; --loop0)
+       {
+               if (loop0 == 2)
+               {
+                       ya = y0;
+                       yb = y1;
+                       u3 = i2x(u0);
+                       v3 = i2x(v0);
+                       r3 = i2x(r0);
+                       g3 = i2x(g0);
+                       b3 = i2x(b0);
+                       x3 = i2x(x0);
+                       x4 = y0!=y1 ? x3 : i2x(x1);
+                       if (dx < 0)
+                       {
+                               xInv(           (y2 - y0), iF, iS);
+                               dx3 = xInvMulx( (x2 - x0), iF, iS);
+                               du3 = xInvMulx( (u2 - u0), iF, iS);
+                               dv3 = xInvMulx( (v2 - v0), iF, iS);
+                               dr3 = xInvMulx( (r2 - r0), iF, iS);
+                               dg3 = xInvMulx( (g2 - g0), iF, iS);
+                               db3 = xInvMulx( (b2 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
+                       }
+                       else
+                       {
+                               xInv(           (y1 - y0), iF, iS);
+                               dx3 = xInvMulx( (x1 - x0), iF, iS);
+                               du3 = xInvMulx( (u1 - u0), iF, iS);
+                               dv3 = xInvMulx( (v1 - v0), iF, iS);
+                               dr3 = xInvMulx( (r1 - r0), iF, iS);
+                               dg3 = xInvMulx( (g1 - g0), iF, iS);
+                               db3 = xInvMulx( (b1 - b0), iF, iS);
+                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       }
+               }
+               else
+               {
+                       ya = y1;
+                       yb = y2;
+                       if (dx < 0)
+                       {
+                               temp = y1 - y0;
+                               u3  = i2x(u0) + (du3 * temp);
+                               v3  = i2x(v0) + (dv3 * temp);
+                               r3  = i2x(r0) + (dr3 * temp);
+                               g3  = i2x(g0) + (dg3 * temp);
+                               b3  = i2x(b0) + (db3 * temp);
+                               x3  = i2x(x0) + (dx3 * temp);
+                               x4  = i2x(x1);
+                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+                       }
+                       else
+                       {
+                               u3 = i2x(u1);
+                               v3 = i2x(v1);
+                               r3 = i2x(r1);
+                               g3 = i2x(g1);
+                               b3 = i2x(b1);
+                               x3 = i2x(x1);
+                               x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                               xInv(           (y2 - y1), iF, iS);
+                               dx3 = xInvMulx( (x2 - x1), iF, iS);
+                               du3 = xInvMulx( (u2 - u1), iF, iS);
+                               dv3 = xInvMulx( (v2 - v1), iF, iS);
+                               dr3 = xInvMulx( (r2 - r1), iF, iS);
+                               dg3 = xInvMulx( (g2 - g1), iF, iS);
+                               db3 = xInvMulx( (b2 - b1), iF, iS);
+                       }
+               }
+
+               temp = ymin - ya;
+               if (temp > 0)
+               {
+                       ya  = ymin;
+                       x3 += dx3*temp;   x4 += dx4*temp;
+                       u3 += du3*temp;   v3 += dv3*temp;
+                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
+               }
+               if (yb > ymax) yb = ymax;
+               if (ya>=yb) continue;
+
+               x3+= fixed_HALF;  x4+= fixed_HALF;
+               u3+= fixed_HALF;  v4+= fixed_HALF;
+               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
+               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+               
+               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3,        b3+=db3)
+               {
+                       if (ya&li) continue;
+                       if ((ya&pi)==pif) continue;
+                       xa = x2i(x3);
+                       xb = x2i(x4);
+                       if( (xa>xmax) || (xb<xmin))     continue;
+
+                       temp = xmin - xa;
+                       if(temp > 0)
+                       {
+                               xa  = xmin;
+                               u4 = u3 + du4*temp;   v4 = v3 + dv4*temp;
+                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       }
+                       else
+                       {
+                               u4 = u3;  v4 = v3;
+                               r4 = r3;  g4 = g3;  b4 = b3;
+                       }
+                       if(xb > xmax) xb = xmax;
+                       xb-=xa;
+                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
+               }
+       }
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+//senquack - Original Unai poly routines left here for reference:
+// ( from gpu_inner.h ) NOTE: this uses 16.16, not 22.10 fixed point
+//////////////////////////////////////////////////////////////////////////
+template<const int CF> // CF: compile-time mode value; TM/G/L/M/B/BM/BI/MB used below are presumably derived from it (definitions not visible here)
+INLINE void  gpuPolySpanFn(u16 *pDst, u32 count) // Draws one horizontal span of 'count' pixels starting at pDst; count must be > 0 because every loop is do { } while (--count)
+{
+       if (!TM)
+       {
+               // NO TEXTURE
+               if (!G)
+               {
+                       // NO GOURAUD: one colour for the whole span
+                       u16 data;
+                       if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); } // lCol packs b in bits 0-9, g in bits 10-20, r in bits 21-31
+                       else data=PixelData;
+                       if ((!M)&&(!B))
+                       {
+                               if (MB) { data = data | 0x8000; } // MB: force bit 15 in every written pixel
+                               do { *pDst++ = data; } while (--count);
+                       }
+                       else if ((M)&&(!B))
+                       {
+                               if (MB) { data = data | 0x8000; }
+                               do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count); // M: leave destination pixels that already have bit 15 set
+                       }
+                       else
+                       {
+                               u16 uSrc;
+                               u16 uDst;
+                               u32 uMsk; if (BM==0) uMsk=0x7BDE; // not referenced directly in this function; presumably consumed by the gpuBlending00 macro - TODO confirm
+                               u32 bMsk; if (BI) bMsk=blit_mask;
+                               do
+                               {
+                                       // blit-mask: skip the pixel when bit ((address/2) & 7) of bMsk is set. NOTE(review): (u32)pDst narrows the pointer on 64-bit targets; only the low bits are used
+                                       if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endtile; }
+                                       //  masking: skip destination pixels with bit 15 set
+                                       uDst = *pDst;
+                                       if(M) { if (uDst&0x8000) goto endtile;  }
+                                       uSrc = data;
+                                       //  blend: BM selects which of the four blending macros is applied
+                                       if (BM==0) gpuBlending00(uSrc, uDst);
+                                       if (BM==1) gpuBlending01(uSrc, uDst);
+                                       if (BM==2) gpuBlending02(uSrc, uDst);
+                                       if (BM==3) gpuBlending03(uSrc, uDst);
+                                       if (MB) { *pDst = uSrc | 0x8000; }
+                                       else    { *pDst = uSrc; }
+                                       endtile: pDst++;
+                               }
+                               while (--count);
+                       }
+               }
+               else
+               {
+                       // GOURAUD: colour is stepped by lInc for every pixel
+                       u16 uDst;
+                       u16 uSrc;
+                       u32 linc=lInc;
+                       u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21)); // same b/g/r field layout as above, different shifts from r4/g4/b4
+                       u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+                       u32 bMsk; if (BI) bMsk=blit_mask;
+                       do
+                       {
+                               // blit-mask
+                               if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endgou; }
+                               //  masking
+                               if(M) { uDst = *pDst;  if (uDst&0x8000) goto endgou;  }
+                               //  blend
+                               if(B)
+                               {
+                                       //  light
+                                       gpuLightingRGB(uSrc,lCol);
+                                       if(!M)    { uDst = *pDst; } // with M set, uDst was already loaded by the masking step
+                                       if (BM==0) gpuBlending00(uSrc, uDst);
+                                       if (BM==1) gpuBlending01(uSrc, uDst);
+                                       if (BM==2) gpuBlending02(uSrc, uDst);
+                                       if (BM==3) gpuBlending03(uSrc, uDst);
+                               }
+                               else
+                               {
+                                       //  light
+                                       gpuLightingRGB(uSrc,lCol);
+                               }
+                               if (MB) { *pDst = uSrc | 0x8000; }
+                               else    { *pDst = uSrc; }
+                               endgou: pDst++; lCol=(lCol+linc); // the colour advances even for skipped pixels
+                       }
+                       while (--count);
+               }
+       }
+       else
+       {
+               // TEXTURE
+               u16 uDst;
+               u16 uSrc;
+               u32 linc; if (L&&G) linc=lInc;
+               u32 tinc=tInc;
+               u32 tmsk=tMsk;
+               u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk; // packed texture coordinate: u in the high half, v in the low half, wrapped by tmsk
+               const u16* _TBA=TBA;
+               const u16* _CBA; if (TM!=3) _CBA=CBA; // _CBA is only read by the TM==1 and TM==2 paths
+               u32 lCol;
+               if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); }
+               else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));  }
+               u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+               u32 bMsk; if (BI) bMsk=blit_mask;
+               do
+               {
+                       // blit-mask
+                       if (BI) { if((bMsk>>((((u32)pDst)>>1)&7))&1) goto endpoly; }
+                       //  masking
+                       if(M) { uDst = *pDst;  if (uDst&0x8000) goto endpoly;  }
+                       //  texture: TM==1 reads a 4-bit index (two per byte) and TM==2 an 8-bit index, both looked up in _CBA; TM==3 reads a 16-bit texel directly from _TBA. A zero texel skips the pixel.
+                       if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; }
+                       if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc)  goto endpoly; }
+                       if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc)  goto endpoly; }
+                       //  blend
+                       if(B)
+                       {
+                               if (uSrc&0x8000) // only texels with bit 15 set are blended with the destination
+                               {
+                                       //  light
+                                       if(L) gpuLightingTXT(uSrc, lCol);
+                                       if(!M)    { uDst = *pDst; }
+                                       if (BM==0) gpuBlending00(uSrc, uDst);
+                                       if (BM==1) gpuBlending01(uSrc, uDst);
+                                       if (BM==2) gpuBlending02(uSrc, uDst);
+                                       if (BM==3) gpuBlending03(uSrc, uDst);
+                               }
+                               else
+                               {
+                                       // light
+                                       if(L) gpuLightingTXT(uSrc, lCol);
+                               }
+                       }
+                       else
+                       {
+                               //  light; without lighting and without MB, bit 15 of the texel is cleared before the write
+                               if(L)  { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; }
+                       }
+                       if (MB) { *pDst = uSrc | 0x8000; }
+                       else    { *pDst = uSrc; }
+                       endpoly: pDst++;
+                       tCor=(tCor+tinc)&tmsk; // texture coordinate and colour advance even for skipped pixels
+                       if (L&&G) lCol=(lCol+linc);
+               }
+               while (--count);
+       }
+}
index 1552bed..c3f7095 100644 (file)
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#include "port.h"
-#include "gpu.h"
-#include "profiler.h"
-#include "debug.h"
+#include <stddef.h>
+#include "plugins.h"
+#include "psxcommon.h"
+//#include "port.h"
+#include "gpu_unai.h"
 
-int skipCount = 2; /* frame skip (0,1,2,3...) */
-int skCount = 0; /* internal frame skip */
-int linesInterlace = 0;  /* internal lines interlace */
-int linesInterlace_user = 0; /* Lines interlace */
+#define VIDEO_WIDTH 320
 
-bool isSkip = false; /* skip frame (info coming from GPU) */
-bool wasSkip = false;
-bool skipFrame = false; /* skip frame (according to frame skip) */
-bool alt_fps = false; /* Alternative FPS algorithm */
-bool show_fps = false; /* Show FPS statistics */
-
-bool isPAL = false; /* PAL video timing */
-bool progressInterlace_flag = false; /* Progressive interlace flag */
-bool progressInterlace = false; /* Progressive interlace option*/
-bool frameLimit = false; /* frames to wait */
-
-bool light = true; /* lighting */
-bool blend = true; /* blending */
-bool FrameToRead = false; /* load image in progress */
-bool FrameToWrite = false; /* store image in progress */
-bool fb_dirty = false;
-
-bool enableAbbeyHack = false; /* Abe's Odyssey hack */
-
-u8 BLEND_MODE;
-u8 TEXT_MODE;
-u8 Masking;
-
-u16 PixelMSB;
-u16 PixelData;
-
-///////////////////////////////////////////////////////////////////////////////
-//  GPU Global data
-///////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////////
-//  Dma Transfers info
-s32            px,py;
-s32            x_end,y_end;
-u16*  pvram;
-
-u32 GP0;
-s32 PacketCount;
-s32 PacketIndex;
-
-///////////////////////////////////////////////////////////////////////////////
-//  Display status
-u32 DisplayArea   [6];
-
-///////////////////////////////////////////////////////////////////////////////
-//  Rasterizer status
-u32 TextureWindow [4];
-u32 DrawingArea   [4];
-u32 DrawingOffset [2];
+#ifdef TIME_IN_MSEC
+#define TPS 1000
+#else
+#define TPS 1000000
+#endif
 
-///////////////////////////////////////////////////////////////////////////////
-//  Rasterizer status
+#define IS_PAL (gpu_unai.GPU_GP1&(0x08<<17))
 
-u16* TBA;
-u16* CBA;
+//senquack - Original 512KB of guard space seems not to be enough, as Xenogears
+// accesses outside this range and crashes in town intro fight sequence.
+// Increased to 2MB total (double PSX VRAM) and Xenogears no longer
+// crashes, but some textures are still messed up. Also note that alignment min
+// is 16 bytes, needed for pixel-skipping rendering/blitting in high horiz res.
+// Extra 4KB is for guard room at beginning.
+// TODO: Determine cause of out-of-bounds write/reads. <-- Note: this is largely
+//  solved by adoption of PCSX Rearmed's 'gpulib' in gpulib_if.cpp, which
+//  replaces this file (gpu.cpp)
+//u16   GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(32)));
+static u16 GPU_FrameBuffer[(FRAME_BUFFER_SIZE*2 + 4096)/2] __attribute__((aligned(32)));
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Inner Loops
-s32   u4, du4;
-s32   v4, dv4;
-s32   r4, dr4;
-s32   g4, dg4;
-s32   b4, db4;
-u32   lInc;
-u32   tInc, tMsk;
-
-GPUPacket PacketBuffer;
-// FRAME_BUFFER_SIZE is defined in bytes; 512K is guard memory for out of range reads
-u16   GPU_FrameBuffer[(FRAME_BUFFER_SIZE+512*1024)/2] __attribute__((aligned(2048)));
-u32   GPU_GP1;
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Inner loop driver instanciation file
+// Inner loop driver instantiation file
 #include "gpu_inner.h"
 
-///////////////////////////////////////////////////////////////////////////////
-//  GPU Raster Macros
-#define        GPU_RGB16(rgb)        ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
-
-#define GPU_EXPANDSIGN(x)  (((s32)(x)<<21)>>21)
-
-#define CHKMAX_X 1024
-#define CHKMAX_Y 512
-
-#define        GPU_SWAP(a,b,t) {(t)=(a);(a)=(b);(b)=(t);}
-
 ///////////////////////////////////////////////////////////////////////////////
 // GPU internal image drawing functions
 #include "gpu_raster_image.h"
@@ -135,72 +76,88 @@ u32   GPU_GP1;
 #include "gpu_command.h"
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuReset(void)
+static void gpuReset(void) // Puts gpu_unai back into its initial state; called from GPU_init()
 {
-       GPU_GP1 = 0x14802000;
-       TextureWindow[0] = 0;
-       TextureWindow[1] = 0;
-       TextureWindow[2] = 255;
-       TextureWindow[3] = 255;
-       DrawingArea[2] = 256;
-       DrawingArea[3] = 240;
-       DisplayArea[2] = 256;
-       DisplayArea[3] = 240;
-       DisplayArea[5] = 240;
+       memset((void*)&gpu_unai, 0, sizeof(gpu_unai)); // everything not assigned below starts out as zero
+       gpu_unai.vram = (u16*)GPU_FrameBuffer + (4096/2); //4kb guard room in front (4096/2 u16 entries = 4096 bytes)
+       gpu_unai.GPU_GP1 = 0x14802000;
+       gpu_unai.DrawingArea[2] = 256;
+       gpu_unai.DrawingArea[3] = 240;
+       gpu_unai.DisplayArea[2] = 256;
+       gpu_unai.DisplayArea[3] = 240;
+       gpu_unai.DisplayArea[5] = 240;
+       gpu_unai.TextureWindow[0] = 0; // already zero after the memset; kept explicit
+       gpu_unai.TextureWindow[1] = 0;
+       gpu_unai.TextureWindow[2] = 255;
+       gpu_unai.TextureWindow[3] = 255;
+       //senquack - new vars must be updated whenever texture window is changed:
+       //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+       const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+       gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1); // window value in the integer part, all fractional bits set
+       gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+       // Configuration options (copied from gpu_unai_config_ext on every reset)
+       gpu_unai.config = gpu_unai_config_ext;
+       gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+       gpu_unai.frameskip.skipCount = gpu_unai.config.frameskip_count;
+
+       SetupLightLUT();
+       SetupDitheringConstants();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-bool  GPU_init(void)
+long GPU_init(void) // Resets the GPU state and fills the reciprocal table when enabled; always returns 0
 {
        gpuReset();
-       
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
        // s_invTable
-       for(int i=1;i<=(1<<TABLE_BITS);++i)
+       for(unsigned int i=1;i<=(1<<TABLE_BITS);++i) // fills entries 0 .. (1<<TABLE_BITS)-1
        {
-               double v = 1.0 / double(i);
-               #ifdef GPU_TABLE_10_BITS
-               v *= double(0xffffffff>>1);
-               #else
-               v *= double(0x80000000);
-               #endif
-               s_invTable[i-1]=s32(v);
+               s_invTable[i-1]=0x7fffffff/i; // entry i-1 holds 0x7fffffff / i, computed with integer division
        }
+#endif
+
+       gpu_unai.fb_dirty = true;
+       gpu_unai.dma.last_dma = NULL;
        return (0);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_shutdown(void)
+long GPU_shutdown(void) // No teardown work is done here
 {
+       return 0; // always reports 0
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-long  GPU_freeze(unsigned int bWrite, GPUFreeze_t* p2)
+long GPU_freeze(u32 bWrite, GPUFreeze_t* p2)
 {
        if (!p2) return (0);
-       if (p2->Version != 1) return (0);
+       if (p2->ulFreezeVersion != 1) return (0);
 
        if (bWrite)
        {
-               p2->GPU_gp1 = GPU_GP1;
-               memset(p2->Control, 0, sizeof(p2->Control));
+               p2->ulStatus = gpu_unai.GPU_GP1;
+               memset(p2->ulControl, 0, sizeof(p2->ulControl));
                // save resolution and registers for P.E.Op.S. compatibility
-               p2->Control[3] = (3 << 24) | ((GPU_GP1 >> 23) & 1);
-               p2->Control[4] = (4 << 24) | ((GPU_GP1 >> 29) & 3);
-               p2->Control[5] = (5 << 24) | (DisplayArea[0] | (DisplayArea[1] << 10));
-               p2->Control[6] = (6 << 24) | (2560 << 12);
-               p2->Control[7] = (7 << 24) | (DisplayArea[4] | (DisplayArea[5] << 10));
-               p2->Control[8] = (8 << 24) | ((GPU_GP1 >> 17) & 0x3f) | ((GPU_GP1 >> 10) & 0x40);
-               memcpy(p2->FrameBuffer, (u16*)GPU_FrameBuffer, FRAME_BUFFER_SIZE);
+               p2->ulControl[3] = (3 << 24) | ((gpu_unai.GPU_GP1 >> 23) & 1);
+               p2->ulControl[4] = (4 << 24) | ((gpu_unai.GPU_GP1 >> 29) & 3);
+               p2->ulControl[5] = (5 << 24) | (gpu_unai.DisplayArea[0] | (gpu_unai.DisplayArea[1] << 10));
+               p2->ulControl[6] = (6 << 24) | (2560 << 12);
+               p2->ulControl[7] = (7 << 24) | (gpu_unai.DisplayArea[4] | (gpu_unai.DisplayArea[5] << 10));
+               p2->ulControl[8] = (8 << 24) | ((gpu_unai.GPU_GP1 >> 17) & 0x3f) | ((gpu_unai.GPU_GP1 >> 10) & 0x40);
+               memcpy((void*)p2->psxVRam, (void*)gpu_unai.vram, FRAME_BUFFER_SIZE);
                return (1);
        }
        else
        {
-               GPU_GP1 = p2->GPU_gp1;
-               memcpy((u16*)GPU_FrameBuffer, p2->FrameBuffer, FRAME_BUFFER_SIZE);
-               GPU_writeStatus((5 << 24) | p2->Control[5]);
-               GPU_writeStatus((7 << 24) | p2->Control[7]);
-               GPU_writeStatus((8 << 24) | p2->Control[8]);
-               gpuSetTexture(GPU_GP1);
+               extern void GPU_writeStatus(u32 data);
+               gpu_unai.GPU_GP1 = p2->ulStatus;
+               memcpy((void*)gpu_unai.vram, (void*)p2->psxVRam, FRAME_BUFFER_SIZE);
+               GPU_writeStatus((5 << 24) | p2->ulControl[5]);
+               GPU_writeStatus((7 << 24) | p2->ulControl[7]);
+               GPU_writeStatus((8 << 24) | p2->ulControl[8]);
+               gpuSetTexture(gpu_unai.GPU_GP1);
                return (1);
        }
        return (0);
@@ -233,72 +190,69 @@ u8 PacketSize[256] =
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuSendPacket()
 {
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_sendPacket++;
-#endif
-       gpuSendPacketFunction(PacketBuffer.U4[0]>>24);
+       gpuSendPacketFunction(gpu_unai.PacketBuffer.U4[0]>>24); // dispatch on the top byte of the first packet word
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuCheckPacket(u32 uData)
 {
-       if (PacketCount)
+       if (gpu_unai.PacketCount) // a packet is in progress: uData is one of its remaining words
        {
-               PacketBuffer.U4[PacketIndex++] = uData;
-               --PacketCount;
+               gpu_unai.PacketBuffer.U4[gpu_unai.PacketIndex++] = uData; // append the word to the packet buffer
+               --gpu_unai.PacketCount; // one fewer word outstanding
        }
        else
        {
-               PacketBuffer.U4[0] = uData;
-               PacketCount = PacketSize[uData >> 24];
-               PacketIndex = 1;
+               gpu_unai.PacketBuffer.U4[0] = uData; // uData starts a new packet
+               gpu_unai.PacketCount = PacketSize[uData >> 24]; // number of further words, looked up by the top byte of uData
+               gpu_unai.PacketIndex = 1; // next word goes into slot 1
        }
-       if (!PacketCount) gpuSendPacket();
+       if (!gpu_unai.PacketCount) gpuSendPacket(); // packet complete (or it had no further words): execute it
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_writeDataMem(u32* dmaAddress, s32 dmaCount)
+void GPU_writeDataMem(u32* dmaAddress, int dmaCount)
 {
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_writeDataMem++;
-#endif
-       pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeDataMem(%d)\n",dmaCount);
+       #endif
        u32 data;
-       const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-       GPU_GP1 &= ~0x14000000;
+       const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1;
+       gpu_unai.GPU_GP1 &= ~0x14000000;
 
        while (dmaCount) 
        {
-               if (FrameToWrite) 
+               if (gpu_unai.dma.FrameToWrite)
                {
                        while (dmaCount)
                        {
                                dmaCount--;
                                data = *dmaAddress++;
-                               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-                               pvram[px] = data;
-                               if (++px>=x_end) 
+                               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+                               gpu_unai.dma.pvram[gpu_unai.dma.px] = data;
+                               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
                                {
-                                       px = 0;
-                                       pvram += 1024;
-                                       if (++py>=y_end) 
+                                       gpu_unai.dma.px = 0;
+                                       gpu_unai.dma.pvram += 1024;
+                                       if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
                                        {
-                                               FrameToWrite = false;
-                                               GPU_GP1 &= ~0x08000000;
+                                               gpu_unai.dma.FrameToWrite = false;
+                                               gpu_unai.GPU_GP1 &= ~0x08000000;
+                                               gpu_unai.fb_dirty = true;
                                                break;
                                        }
                                }
-                               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-                               pvram[px] = data>>16;
-                               if (++px>=x_end) 
+                               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024;
+                               gpu_unai.dma.pvram[gpu_unai.dma.px] = data>>16;
+                               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
                                {
-                                       px = 0;
-                                       pvram += 1024;
-                                       if (++py>=y_end) 
+                                       gpu_unai.dma.px = 0;
+                                       gpu_unai.dma.pvram += 1024;
+                                       if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
                                        {
-                                               FrameToWrite = false;
-                                               GPU_GP1 &= ~0x08000000;
+                                               gpu_unai.dma.FrameToWrite = false;
+                                               gpu_unai.GPU_GP1 &= ~0x08000000;
+                                               gpu_unai.fb_dirty = true;
                                                break;
                                        }
                                }
@@ -312,95 +266,100 @@ void  GPU_writeDataMem(u32* dmaAddress, s32 dmaCount)
                }
        }
 
-       GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000;
-       fb_dirty = true;
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-       pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
+       gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000;
 }
 
-u32 *lUsedAddr[3];
-INLINE int CheckForEndlessLoop(u32 *laddr)
+long GPU_dmaChain(u32 *rambase, u32 start_addr) // Walks the linked list of GPU packets starting at start_addr (byte offset into rambase); returns the number of words traversed (headers + payloads).
 {
-       if(laddr==lUsedAddr[1]) return 1;
-       if(laddr==lUsedAddr[2]) return 1;
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_dmaChain(0x%x)\n",start_addr);
+       #endif
 
-       if(laddr<lUsedAddr[0]) lUsedAddr[1]=laddr;
-       else                   lUsedAddr[2]=laddr;
-       lUsedAddr[0]=laddr;
-       return 0;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-long GPU_dmaChain(u32* baseAddr, u32 dmaVAddr)
-{
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_dmaChain++;
-#endif
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-       u32 data, *address, count, offset;
-       unsigned int DMACommandCounter = 0;
+       u32 addr, *list; // addr: link to the next node; list: pointer to the current node's header word
+       u32 len, count; // len: payload words of the current node; count: number of completed loop iterations
        long dma_words = 0;
 
-       GPU_GP1 &= ~0x14000000;
-       lUsedAddr[0]=lUsedAddr[1]=lUsedAddr[2]=(u32*)0x1fffff;
-       dmaVAddr &= 0x001FFFFF;
-       while (dmaVAddr != 0x1FFFFF)
+       if (gpu_unai.dma.last_dma) *gpu_unai.dma.last_dma |= 0x800000; // pre-mark the first node of the previous chain (if one was recorded) so reaching it ends this walk
+       
+       gpu_unai.GPU_GP1 &= ~0x14000000; // clear GP1 bits 26 and 28 for the duration of the walk
+       
+       addr = start_addr & 0xffffff; // keep 24 bits so the 0xffffff end-of-list value stays recognisable
+       for (count = 0; addr != 0xffffff; count++)
        {
-               address = (baseAddr + (dmaVAddr >> 2));
-               if(DMACommandCounter++ > 2000000) break;
-               if(CheckForEndlessLoop(address)) break;
-               data = *address++;
-               count = (data >> 24);
-               offset = data & 0x001FFFFF;
-               if (dmaVAddr != offset) dmaVAddr = offset;
-               else dmaVAddr = 0x1FFFFF;
-
-               if(count>0) GPU_writeDataMem(address,count);
-               dma_words += 1 + count;
+               list = rambase + (addr & 0x1fffff) / 4; // only the low 21 bits of the link index rambase; /4 turns the byte offset into a u32 index
+               len = list[0] >> 24; // header bits 24-31: payload length in words
+               addr = list[0] & 0xffffff; // header bits 0-23: link to the next node (read BEFORE the marker is OR-ed in below)
+
+               dma_words += 1 + len; // header word plus payload
+
+               // add loop detection marker
+               list[0] |= 0x800000;
+
+               if (len) GPU_writeDataMem(list + 1, len); // payload starts right after the header word
+
+               if (addr & 0x800000) // the header read above already had bit 23 set: node was marked earlier, or the link itself has bit 23 set (0xffffff does)
+               {
+                       #ifdef ENABLE_GPU_LOG_SUPPORT
+                               fprintf(stdout,"GPU_dmaChain(LOOP)\n");
+                       #endif
+                       break;
+               }
        }
-       GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000;
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+
+       // remove loop detection markers
+       addr = start_addr & 0x1fffff; // restart from the head of the chain
+       while (count-- > 0) // only nodes whose iteration completed; their headers had bit 23 clear before marking, otherwise the walk would have stopped there
+       {
+               list = rambase + addr / 4;
+               addr = list[0] & 0x1fffff; // fetch the link first; the marker bit is outside this mask
+               list[0] &= ~0x800000;
+       }
+       
+       if (gpu_unai.dma.last_dma) *gpu_unai.dma.last_dma &= ~0x800000; // undo the pre-mark applied at the top
+       gpu_unai.dma.last_dma = rambase + (start_addr & 0x1fffff) / 4; // remember this chain's first node for the next call
+
+       gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000; // set bits 26 and 28 again and clear bits 29-30
 
        return dma_words;
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_writeData(u32 data)
+void GPU_writeData(u32 data) // Single-word write: while dma.FrameToWrite is set the word supplies up to two 16-bit pixels of a pending VRAM upload; gpuCheckPacket(data) is the other path.
 {
-       const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_writeData++;
-#endif
-       pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-       GPU_GP1 &= ~0x14000000;
+       const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; // last u16 of VRAM, assuming FRAME_BUFFER_SIZE is a byte count - TODO confirm
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeData()\n");
+       #endif
+       gpu_unai.GPU_GP1 &= ~0x14000000; // clear GP1 bits 26 and 28 while the word is processed
 
-       if (FrameToWrite)
+       if (gpu_unai.dma.FrameToWrite)
        {
-               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-               pvram[px]=(u16)data;
-               if (++px>=x_end)
+               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // target is past VIDEO_END: pull the row pointer back by 512*1024 pixels
+               gpu_unai.dma.pvram[gpu_unai.dma.px]=(u16)data; // low halfword is the first pixel
+               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) // end of the current row of the transfer rectangle
                {
-                       px = 0;
-                       pvram += 1024;
-                       if (++py>=y_end) 
+                       gpu_unai.dma.px = 0;
+                       gpu_unai.dma.pvram += 1024; // advance one VRAM row (1024 pixels)
+                       if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) // last row done
                        {
-                               FrameToWrite = false;
-                               GPU_GP1 &= ~0x08000000;
+                               gpu_unai.dma.FrameToWrite = false;
+                               gpu_unai.GPU_GP1 &= ~0x08000000; // clear GP1 bit 27
+                               gpu_unai.fb_dirty = true; // now flagged only when an upload completes (the old code set it on every call)
                        }
                }
-               if (FrameToWrite)
+               if (gpu_unai.dma.FrameToWrite) // upload may have finished on the first pixel; the high halfword is then dropped
                {
-                       if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-                       pvram[px]=data>>16;
-                       if (++px>=x_end)
+                       if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // same wrap check as for the first pixel
+                       gpu_unai.dma.pvram[gpu_unai.dma.px]=data>>16; // high halfword is the second pixel
+                       if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
                        {
-                               px = 0;
-                               pvram += 1024;
-                               if (++py>=y_end) 
+                               gpu_unai.dma.px = 0;
+                               gpu_unai.dma.pvram += 1024;
+                               if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
                                {
-                                       FrameToWrite = false;
-                                       GPU_GP1 &= ~0x08000000;
+                                       gpu_unai.dma.FrameToWrite = false;
+                                       gpu_unai.GPU_GP1 &= ~0x08000000;
+                                       gpu_unai.fb_dirty = true;
                                }
                        }
                }
@@ -409,507 +368,463 @@ void  GPU_writeData(u32 data)
        {
                gpuCheckPacket(data);
        }
-       GPU_GP1 |= 0x14000000;
-       fb_dirty = true;
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-       pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
-
+       gpu_unai.GPU_GP1 |= 0x14000000; // set GP1 bits 26 and 28 again
 }
 
 
 ///////////////////////////////////////////////////////////////////////////////
-void  GPU_readDataMem(u32* dmaAddress, s32 dmaCount)
+void GPU_readDataMem(u32* dmaAddress, int dmaCount) // Copies the pending VRAM read-back into dmaAddress, two 16-bit pixels per u32; no-op unless dma.FrameToRead is set. NOTE(review): the do/while below runs at least once and never hits zero if dmaCount <= 0 on entry.
 {
-       const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_readDataMem++;
-#endif
-       if(!FrameToRead) return;
+       const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; // last u16 of VRAM, assuming FRAME_BUFFER_SIZE is a byte count - TODO confirm
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_readDataMem(%d)\n",dmaCount);
+       #endif
+       if(!gpu_unai.dma.FrameToRead) return; // no read-back in progress: leave the destination buffer untouched
 
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-       GPU_GP1 &= ~0x14000000;
+       gpu_unai.GPU_GP1 &= ~0x14000000; // clear GP1 bits 26 and 28 during the copy
        do 
        {
-               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
+               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // source is past VIDEO_END: pull the row pointer back by 512*1024 pixels
                // lower 16 bit
-               u32 data = pvram[px];
+               //senquack - 64-bit fix (from notaz)
+               //u32 data = (unsigned long)gpu_unai.dma.pvram[gpu_unai.dma.px];
+               u32 data = (u32)gpu_unai.dma.pvram[gpu_unai.dma.px];
 
-               if (++px>=x_end) 
+               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) // NOTE(review): this row wrap does not advance dma.py, unlike the second-pixel wrap below - confirm intended
                {
-                       px = 0;
-                       pvram += 1024;
+                       gpu_unai.dma.px = 0;
+                       gpu_unai.dma.pvram += 1024; // advance one VRAM row (1024 pixels)
                }
 
-               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
+               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // same wrap check for the second pixel
                // higher 16 bit (always, even if it's an odd width)
-               data |= (u32)(pvram[px])<<16;
+               //senquack - 64-bit fix (from notaz)
+               //data |= (unsigned long)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16;
+               data |= (u32)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16; // widen to u32 before shifting so the shift is done unsigned
                 
                *dmaAddress++ = data;
 
-               if (++px>=x_end) 
+               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
                {
-                       px = 0;
-                       pvram += 1024;
-                       if (++py>=y_end) 
+                       gpu_unai.dma.px = 0;
+                       gpu_unai.dma.pvram += 1024;
+                       if (++gpu_unai.dma.py >= gpu_unai.dma.y_end) // last row of the rectangle consumed
                        {
-                               FrameToRead = false;
-                               GPU_GP1 &= ~0x08000000;
+                               gpu_unai.dma.FrameToRead = false;
+                               gpu_unai.GPU_GP1 &= ~0x08000000; // clear GP1 bit 27
                                break;
                        }
                }
        } while (--dmaCount);
 
-       GPU_GP1 = (GPU_GP1 | 0x14000000) & ~0x60000000;
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+       gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 | 0x14000000) & ~0x60000000; // set bits 26 and 28 again and clear bits 29-30
 }
 
 
 
 ///////////////////////////////////////////////////////////////////////////////
-u32  GPU_readData(void)
+u32 GPU_readData(void) // Single-word read: while dma.FrameToRead is set, packs the next two VRAM pixels into GPU_GP0 (low halfword first); always returns GPU_GP0.
 {
-       const u16 *VIDEO_END=(GPU_FrameBuffer+(FRAME_BUFFER_SIZE/2)-1);
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_readData++;
-#endif
-       pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_READ);
-       GPU_GP1 &= ~0x14000000;
-       if (FrameToRead)
+       const u16 *VIDEO_END = (u16*)gpu_unai.vram+(FRAME_BUFFER_SIZE/2)-1; // last u16 of VRAM, assuming FRAME_BUFFER_SIZE is a byte count - TODO confirm
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_readData()\n");
+       #endif
+       gpu_unai.GPU_GP1 &= ~0x14000000; // clear GP1 bits 26 and 28 during the read
+       if (gpu_unai.dma.FrameToRead)
        {
-               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-               GP0 = pvram[px];
-               if (++px>=x_end)
+               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // source is past VIDEO_END: pull the row pointer back by 512*1024 pixels
+               gpu_unai.GPU_GP0 = gpu_unai.dma.pvram[gpu_unai.dma.px]; // first pixel goes into the low halfword
+               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end) // end of the current row of the transfer rectangle
                {
-                       px = 0;
-                       pvram += 1024;
-                       if (++py>=y_end) 
+                       gpu_unai.dma.px = 0;
+                       gpu_unai.dma.pvram += 1024; // advance one VRAM row (1024 pixels)
+                       if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
                        {
-                               FrameToRead = false;
-                               GPU_GP1 &= ~0x08000000;
+                               gpu_unai.dma.FrameToRead = false;
+                               gpu_unai.GPU_GP1 &= ~0x08000000; // clear GP1 bit 27
                        }
                }
-               if ((&pvram[px])>(VIDEO_END)) pvram-=512*1024;
-               GP0 |= pvram[px]<<16;
-               if (++px>=x_end)
+               if ((&gpu_unai.dma.pvram[gpu_unai.dma.px])>(VIDEO_END)) gpu_unai.dma.pvram-=512*1024; // same wrap check for the second pixel
+               gpu_unai.GPU_GP0 |= (u32)(gpu_unai.dma.pvram[gpu_unai.dma.px])<<16; // widen to u32 before shifting (as GPU_readDataMem does): a u16 promotes to int, and pixels >= 0x8000 would shift into the sign bit
+               if (++gpu_unai.dma.px >= gpu_unai.dma.x_end)
                {
-                       px = 0;
-                       pvram +=1024;
-                       if (++py>=y_end) 
+                       gpu_unai.dma.px = 0;
+                       gpu_unai.dma.pvram += 1024;
+                       if (++gpu_unai.dma.py >= gpu_unai.dma.y_end)
                        {
-                               FrameToRead = false;
-                               GPU_GP1 &= ~0x08000000;
+                               gpu_unai.dma.FrameToRead = false;
+                               gpu_unai.GPU_GP1 &= ~0x08000000;
                        }
                }
 
        }
-       GPU_GP1 |= 0x14000000;
+       gpu_unai.GPU_GP1 |= 0x14000000; // set GP1 bits 26 and 28 again
 
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_READ);
-       pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
-       return (GP0);
+       return (gpu_unai.GPU_GP0); // unchanged from the previous value when no read-back is pending
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-u32     GPU_readStatus(void)
+u32 GPU_readStatus(void) // Returns the current GP1 status word; no side effects.
 {
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_readStatus++;
-#endif
-       return GPU_GP1;
+       return gpu_unai.GPU_GP1;
+}
+
+INLINE void GPU_NoSkip(void) // Advances the frameskip state; called from GPU_writeStatus case 0x05. A skipped frame is never followed by another skipped frame.
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_NoSkip()\n");
+       #endif
+       gpu_unai.frameskip.wasSkip = gpu_unai.frameskip.isSkip; // remember the state of the frame that just ended
+       if (gpu_unai.frameskip.isSkip)
+       {
+               gpu_unai.frameskip.isSkip = false; // previous frame was skipped: this one is not
+               gpu_unai.frameskip.skipGPU = false;
+       }
+       else
+       {
+               gpu_unai.frameskip.isSkip = gpu_unai.frameskip.skipFrame; // previous frame was not skipped: follow the skipFrame request
+               gpu_unai.frameskip.skipGPU = gpu_unai.frameskip.skipFrame;
+       }
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 void  GPU_writeStatus(u32 data)
 {
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_writeStatus++;
-#endif
-       pcsx4all_prof_pause(PCSX4ALL_PROF_CPU);
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"GPU_writeStatus(%d,%d)\n",data>>24,data & 0xff);
+       #endif
        switch (data >> 24) {
        case 0x00:
                gpuReset();
                break;
        case 0x01:
-               GPU_GP1 &= ~0x08000000;
-               PacketCount = 0; FrameToRead = FrameToWrite = false;
+               gpu_unai.GPU_GP1 &= ~0x08000000; // clear GP1 bit 27
+               gpu_unai.PacketCount = 0; // drop any partially received packet
+               gpu_unai.dma.FrameToRead = gpu_unai.dma.FrameToWrite = false; // cancel any pending VRAM transfer
                break;
        case 0x02:
-               GPU_GP1 &= ~0x08000000;
-               PacketCount = 0; FrameToRead = FrameToWrite = false;
+               gpu_unai.GPU_GP1 &= ~0x08000000; // handled exactly like command 0x01
+               gpu_unai.PacketCount = 0;
+               gpu_unai.dma.FrameToRead = gpu_unai.dma.FrameToWrite = false;
                break;
        case 0x03:
-               GPU_GP1 = (GPU_GP1 & ~0x00800000) | ((data & 1) << 23);
+               gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x00800000) | ((data & 1) << 23); // data bit 0 -> GP1 bit 23
                break;
        case 0x04:
-               if (data == 0x04000000)
-               PacketCount = 0;
-               GPU_GP1 = (GPU_GP1 & ~0x60000000) | ((data & 3) << 29);
+               if (data == 0x04000000) gpu_unai.PacketCount = 0; // parameter 0 also resets the packet counter
+               gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x60000000) | ((data & 3) << 29); // data bits 0-1 -> GP1 bits 29-30
                break;
        case 0x05:
-               DisplayArea[0] = (data & 0x000003FF); //(short)(data & 0x3ff);
-               DisplayArea[1] = ((data & 0x0007FC00)>>10); //(data & 0x000FFC00) >> 10; //(short)((data>>10)&0x1ff);
-               fb_dirty = true;
-               wasSkip = isSkip;
-               if (isSkip)
-                       isSkip = false;
-               else
-                       isSkip = skipFrame;
+               // Start of Display Area in VRAM
+               gpu_unai.DisplayArea[0] = data & 0x3ff;         // X (0..1023)
+               gpu_unai.DisplayArea[1] = (data >> 10) & 0x1ff; // Y (0..511)
+               GPU_NoSkip(); // replaces the old inline isSkip/wasSkip toggle; additionally keeps frameskip.skipGPU in step
+               break;
+       case 0x06:
+               // GP1(06h) - Horizontal Display range (on Screen)
+               // 0-11   X1 (260h+0)       ;12bit       ;\counted in 53.222400MHz units,
+               // 12-23  X2 (260h+320*8)   ;12bit       ;/relative to HSYNC
+
+               // senquack - gpu_unai completely ignores GP1(0x06) command and
+               // lacks even a place in DisplayArea[] array to store the values.
+               // It seems to have been concerned only with vertical display range
+               // and centering top/bottom. I will not add support here, and
+               // focus instead on the gpulib version (gpulib_if.cpp) which uses
+               // gpulib for its PS1->host framebuffer blitting.
                break;
        case 0x07:
-               DisplayArea[4] = data & 0x000003FF; //(short)(data & 0x3ff);
-               DisplayArea[5] = (data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff);
-               fb_dirty = true;
+               // GP1(07h) - Vertical Display range (on Screen)
+               // 0-9   Y1 (NTSC=88h-(224/2), (PAL=A3h-(264/2))  ;\scanline numbers on screen,
+               // 10-19 Y2 (NTSC=88h+(224/2), (PAL=A3h+(264/2))  ;/relative to VSYNC
+               // 20-23 Not used (zero)
+               {
+                       u32 v1=data & 0x000003FF; //(short)(data & 0x3ff);
+                       u32 v2=(data & 0x000FFC00) >> 10; //(short)((data>>10) & 0x3ff);
+                       if ((gpu_unai.DisplayArea[4]!=v1)||(gpu_unai.DisplayArea[5]!=v2)) // act only when the range really changes
+                       {
+                               gpu_unai.DisplayArea[4] = v1;
+                               gpu_unai.DisplayArea[5] = v2;
+                               #ifdef ENABLE_GPU_LOG_SUPPORT
+                                       fprintf(stdout,"video_clear(CHANGE_Y)\n");
+                               #endif
+                               video_clear(); // previously done lazily inside gpuVideoOutput() via its old_res_* statics
+                       }
+               }
                break;
        case 0x08:
                {
-                       GPU_GP1 = (GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10);
-                       static u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 };
-                       DisplayArea[2] = HorizontalResolution[(GPU_GP1 >> 16) & 7];
-                       static u32 VerticalResolution[4] = { 240, 480, 256, 480 };
-                       DisplayArea[3] = VerticalResolution[(GPU_GP1 >> 19) & 3];
-                       isPAL = (data & 0x08) ? true : false; // if 1 - PAL mode, else NTSC
+                       static const u32 HorizontalResolution[8] = { 256, 368, 320, 384, 512, 512, 640, 640 }; // indexed by GP1 bits 16-18
+                       static const u32 VerticalResolution[4] = { 240, 480, 256, 480 }; // indexed by GP1 bits 19-20
+                       gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x007F0000) | ((data & 0x3F) << 17) | ((data & 0x40) << 10); // data bits 0-5 -> GP1 bits 17-22, data bit 6 -> GP1 bit 16
+                       #ifdef ENABLE_GPU_LOG_SUPPORT
+                               fprintf(stdout,"GPU_writeStatus(RES=%dx%d,BITS=%d,PAL=%d)\n",HorizontalResolution[(gpu_unai.GPU_GP1 >> 16) & 7],
+                                               VerticalResolution[(gpu_unai.GPU_GP1 >> 19) & 3],(gpu_unai.GPU_GP1&0x00200000?24:15),(IS_PAL?1:0));
+                       #endif
+                       // Video mode change
+                       u32 new_width = HorizontalResolution[(gpu_unai.GPU_GP1 >> 16) & 7];
+                       u32 new_height = VerticalResolution[(gpu_unai.GPU_GP1 >> 19) & 3];
+
+                       if (gpu_unai.DisplayArea[2] != new_width || gpu_unai.DisplayArea[3] != new_height) // skip all of the below when the mode is unchanged
+                       {
+                               // Update width
+                               gpu_unai.DisplayArea[2] = new_width;
+
+                               if (PixelSkipEnabled()) {
+                                       // Set blit_mask for high horizontal resolutions. This allows skipping
+                                       //  rendering pixels that would never get displayed on low-resolution
+                                       //  platforms that use simple pixel-dropping scaler.
+                                       switch (gpu_unai.DisplayArea[2])
+                                       {
+                                               case 512: gpu_unai.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+                                               case 640: gpu_unai.blit_mask = 0xaa; break; // GPU_BlitWS
+                                               default:  gpu_unai.blit_mask = 0;    break;
+                                       }
+                               } else {
+                                       gpu_unai.blit_mask = 0;
+                               }
+
+                               // Update height
+                               gpu_unai.DisplayArea[3] = new_height;
+
+                               if (LineSkipEnabled()) {
+                                       // Set rendering line-skip (only render every other line in high-res
+                                       //  480 vertical mode, or, optionally, force it for all video modes)
+
+                                       if (gpu_unai.DisplayArea[3] == 480) {
+                                               if (gpu_unai.config.ilace_force) {
+                                                       gpu_unai.ilace_mask = 3; // Only need 1/4 of lines
+                                               } else {
+                                                       gpu_unai.ilace_mask = 1; // Only need 1/2 of lines
+                                               }
+                                       } else {
+                                               // Vert resolution changed from 480 to lower one
+                                               gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+                                       }
+                               } else {
+                                       gpu_unai.ilace_mask = 0;
+                               }
+
+                               #ifdef ENABLE_GPU_LOG_SUPPORT
+                                       fprintf(stdout,"video_clear(CHANGE_RES)\n");
+                               #endif
+                               video_clear(); // mode changed: wipe the host screen once, here
+                       }
+
                }
-               fb_dirty = true;
                break;
        case 0x10:
-               switch (data & 0xffff) {
-               case 0:
-               case 1:
-               case 3:
-                       GP0 = (DrawingArea[1] << 10) | DrawingArea[0];
-                       break;
-               case 4:
-                       GP0 = ((DrawingArea[3]-1) << 10) | (DrawingArea[2]-1);
-                       break;
-               case 6:
-               case 5:
-                       GP0 = (DrawingOffset[1] << 11) | DrawingOffset[0];
-                       break;
-               case 7:
-                       GP0 = 2;
-                       break;
-               default:
-                       GP0 = 0;
+               switch (data & 0xff) { // selectors without a case leave GPU_GP0 untouched (the old code had a default that wrote 0)
+                       case 2: gpu_unai.GPU_GP0 = gpu_unai.tex_window; break;
+                       case 3: gpu_unai.GPU_GP0 = (gpu_unai.DrawingArea[1] << 10) | gpu_unai.DrawingArea[0]; break;
+                       case 4: gpu_unai.GPU_GP0 = ((gpu_unai.DrawingArea[3]-1) << 10) | (gpu_unai.DrawingArea[2]-1); break;
+                       case 5: case 6: gpu_unai.GPU_GP0 = (((u32)gpu_unai.DrawingOffset[1] & 0x7ff) << 11) | ((u32)gpu_unai.DrawingOffset[0] & 0x7ff); break; // each offset is masked to 11 bits before packing
+                       case 7: gpu_unai.GPU_GP0 = 2; break;
+                       case 8: case 15: gpu_unai.GPU_GP0 = 0xBFC03720; break;
                }
                break;
        }
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_HW_WRITE);
-       pcsx4all_prof_resume(PCSX4ALL_PROF_CPU);
 }
 
-#ifndef REARMED
-
 // Blitting functions
 #include "gpu_blit.h"
 
-INLINE void gpuVideoOutput(void)
+static void gpuVideoOutput(void) // Blits the current display area from VRAM to SCREEN one line at a time with the width-specific blitter, then calls video_flip().
 {
-       static s16 old_res_horz, old_res_vert, old_rgb24;
-       s16 h0, x0, y0, w0, h1;
+       int h0, x0, y0, w0, h1; // x0,y0: display start in VRAM; w0,h0: video mode size; h1: number of lines to show
 
-       x0 = DisplayArea[0];
-       y0 = DisplayArea[1];
+       x0 = gpu_unai.DisplayArea[0];
+       y0 = gpu_unai.DisplayArea[1];
 
-       w0 = DisplayArea[2];
-       h0 = DisplayArea[3];  // video mode
+       w0 = gpu_unai.DisplayArea[2];
+       h0 = gpu_unai.DisplayArea[3];  // video mode
 
-       h1 = DisplayArea[5] - DisplayArea[4]; // display needed
+       h1 = gpu_unai.DisplayArea[5] - gpu_unai.DisplayArea[4]; // display needed
        if (h0 == 480) h1 = Min2(h1*2,480);
 
-       u16* dest_screen16 = SCREEN;
-       u16* src_screen16  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0,y0)];
-       u32 isRGB24 = (GPU_GP1 & 0x00200000 ? 32 : 0);
+       bool isRGB24 = (gpu_unai.GPU_GP1 & 0x00200000 ? true : false); // GP1 bit 21
+       u16* dst16 = SCREEN;
+       u16* src16 = (u16*)gpu_unai.vram; // base of VRAM; the read position is kept separately in src16_offs
 
-       /* Clear the screen if resolution changed to prevent interlacing and clipping to clash */
-       if( (w0 != old_res_horz || h1 != old_res_vert || (s16)isRGB24 != old_rgb24) )
-       {
-               // Update old resolution
-               old_res_horz = w0;
-               old_res_vert = h1;
-               old_rgb24 = (s16)isRGB24;
-               // Finally, clear the screen for this special case
-               video_clear();
-       }
+       // PS1 fb read wraps around (fixes black screen in 'Tobal no. 1')
+       unsigned int src16_offs_msk = 1024*512-1; // offsets are taken modulo 1024*512 pixels
+       unsigned int src16_offs = (x0 + y0*1024) & src16_offs_msk; // pixel offset of the display start, 1024 pixels per row
 
        //  Height centering
        int sizeShift = 1;
-       if(h0==256) h0 = 240; else if(h0==480) sizeShift = 2;
-       if(h1>h0) { src_screen16 += ((h1-h0)>>sizeShift)*1024; h1 = h0; }
-       else if(h1<h0) dest_screen16 += ((h0-h1)>>sizeShift)*VIDEO_WIDTH;
+       if (h0 == 256) { // a 256-line mode is output as 240 lines
+               h0 = 240;
+       } else if (h0 == 480) {
+               sizeShift = 2;
+       }
+       if (h1 > h0) { // more lines requested than the mode has: start reading half the excess further down
+               src16_offs = (src16_offs + (((h1-h0) / 2) * 1024)) & src16_offs_msk; // note: divides by 2 where the old code shifted by sizeShift
+               h1 = h0;
+       } else if (h1<h0) { // fewer lines than the mode has: start writing lower on the host screen
+               dst16 += ((h0-h1) >> sizeShift) * VIDEO_WIDTH;
+       }
+
 
        /* Main blitter */
        int incY = (h0==480) ? 2 : 1;
        h0=(h0==480 ? 2048 : 1024);
 
        {
-               const int li=linesInterlace;
-               bool pi=progressInterlace;
-               bool pif=progressInterlace_flag;
+               const int li=gpu_unai.ilace_mask; // a line is blitted only when (y0 & li) == 0
+               bool pi = ProgressiveInterlaceEnabled();
+               bool pif = gpu_unai.prog_ilace_flag; // local copy; toggled per eligible line when pi is set
                switch ( w0 )
                {
                        case 256:
                                for(int y1=y0+h1; y0<y1; y0+=incY)
                                {
-                                       if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWDWW(    src_screen16,   dest_screen16, isRGB24);
-                                       dest_screen16 += VIDEO_WIDTH;
-                                       src_screen16  += h0;
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) // with pi set, pif flips on every eligible line and the line is blitted only when it becomes true
+                                               GPU_BlitWWDWW(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk; // h0 now holds the source step (1024 or 2048 pixels); wraps around
                                }
                                break;
                        case 368:
                                for(int y1=y0+h1; y0<y1; y0+=incY)
                                {
-                                       if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWWWWWWWS(        src_screen16,   dest_screen16, isRGB24, 4);
-                                       dest_screen16 += VIDEO_WIDTH;
-                                       src_screen16  += h0;
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWWWWWWWS(src16 + src16_offs, dst16, isRGB24, 4);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
                                }
                                break;
                        case 320:
+                               // Ensure 32-bit alignment for GPU_BlitWW() blitter:
+                               src16_offs &= ~1;
                                for(int y1=y0+h1; y0<y1; y0+=incY)
                                {
-                                       if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWW(       src_screen16,   dest_screen16, isRGB24);
-                                       dest_screen16 += VIDEO_WIDTH;
-                                       src_screen16  += h0;
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWW(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
                                }
                                break;
                        case 384:
                                for(int y1=y0+h1; y0<y1; y0+=incY)
                                {
-                                       if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWWWWS(   src_screen16,   dest_screen16, isRGB24);
-                                       dest_screen16 += VIDEO_WIDTH;
-                                       src_screen16  += h0;
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWWWWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
                                }
                                break;
                        case 512:
                                for(int y1=y0+h1; y0<y1; y0+=incY)
                                {
-                                       if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWWSWWSWS( src_screen16, dest_screen16, isRGB24);
-                                       dest_screen16 += VIDEO_WIDTH;
-                                       src_screen16  += h0;
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWWSWWSWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
                                }
                                break;
                        case 640:
                                for(int y1=y0+h1; y0<y1; y0+=incY)
                                {
-                                       if(( 0 == (y0&li) ) && ((!pi) || (pif=!pif))) GPU_BlitWS(       src_screen16, dest_screen16, isRGB24);
-                                       dest_screen16 += VIDEO_WIDTH;
-                                       src_screen16  += h0;
+                                       if (( 0 == (y0&li) ) && ((!pi) || (pif=!pif)))
+                                               GPU_BlitWS(src16 + src16_offs, dst16, isRGB24);
+                                       dst16 += VIDEO_WIDTH;
+                                       src16_offs = (src16_offs + h0) & src16_offs_msk;
                                }
                                break;
                }
-               progressInterlace_flag=!progressInterlace_flag;
+               gpu_unai.prog_ilace_flag = !gpu_unai.prog_ilace_flag; // flip the stored flag once per output frame; the local pif is discarded
        }
        video_flip();
 }
 
-///////////////////////////////////////////////////////////////////////////////
-void  GPU_updateLace(void)
-{
-#ifdef  ENABLE_GPU_LOG_SUPPORT
-       fprintf(stdout,"GPU_updateLace()\n");
-#endif
-#ifdef DEBUG_ANALYSIS
-       dbg_anacnt_GPU_updateLace++;
-#endif
-       pcsx4all_prof_start_with_pause(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_COUNTERS);
-#ifdef PROFILER_PCSX4ALL
-       pcsx4all_prof_frames++;
-#endif
-#ifdef DEBUG_FRAME
-       if(isdbg_frame())
-       {
-               static int passed=0;
-               if (!passed) dbg_enable();
-               else pcsx4all_exit();
-               passed++;
-       }
-#endif
-
-       // Frame skip table
-       static const unsigned char skipTable[12][12] =
-       {
-               { 0,0,0,0,0,0,0,0,0,0,0,0 },
-               { 0,0,0,0,0,0,0,0,0,0,0,1 },
-               { 0,0,0,0,0,1,0,0,0,0,0,1 },
-               { 0,0,0,1,0,0,0,1,0,0,0,1 },
-               { 0,0,1,0,0,1,0,0,1,0,0,1 },
-               { 0,1,0,0,1,0,1,0,0,1,0,1 },
-               { 0,1,0,1,0,1,0,1,0,1,0,1 },
-               { 0,1,0,1,1,0,1,0,1,1,0,1 },
-               { 0,1,1,0,1,1,0,1,1,0,1,1 },
-               { 0,1,1,1,0,1,1,1,0,1,1,1 },
-               { 0,1,1,1,1,1,0,1,1,1,1,1 },
-               { 0,1,1,1,1,1,1,1,1,1,1,1 }
-       };
-       
-       // Interlace bit toggle
-       GPU_GP1 ^= 0x80000000;
-
-       // Update display
-       if ((!skipFrame) && (!isSkip) && (fb_dirty) && (!(((GPU_GP1&0x08000000))||((GPU_GP1&0x00800000)))))
-       {
-               gpuVideoOutput(); // Display updated
-
-               if (DisplayArea[3] == 480)
-               {
-                       if (linesInterlace_user) linesInterlace = 3; // 1/4 of lines
-                       else linesInterlace = 1; // if 480 we only need half of lines
-               }
-               else if (linesInterlace != linesInterlace_user)
-               {
-                       linesInterlace = linesInterlace_user; // resolution changed from 480 to lower one
-                       video_clear();
-               }
-       }
+// Re-evaluate frameskip every (TPS >> 3) ticks, i.e. 8 times per second
+#define GPU_FRAMESKIP_UPDATE 3
 
-       // Limit FPS
-       if (frameLimit)
-       {
-               static unsigned next=get_ticks();
-               if (!skipFrame)
-               {
-                       unsigned now=get_ticks();
-                       if (now<next) wait_ticks(next-now);
-               }
-               next+=(isPAL?(1000000/50):((unsigned)(1000000.0/59.94)));
-       }
+static void GPU_frameskip (bool show)
+{
+       u32 now=get_ticks(); // current frame
 
-       // Show FPS statistics
-       if (show_fps)
+       // Update frameskip
+       if (gpu_unai.frameskip.skipCount==0) gpu_unai.frameskip.skipFrame=false; // frameskip off
+       else if (gpu_unai.frameskip.skipCount==7) { if (show) gpu_unai.frameskip.skipFrame=!gpu_unai.frameskip.skipFrame; } // frameskip medium
+       else if (gpu_unai.frameskip.skipCount==8) gpu_unai.frameskip.skipFrame=true; // frameskip maximum
+       else
        {
-               static u32 real_fps=0;
-               static u32 prev=get_ticks();
-               static char msg[32]="FPS=000/00 SPD=000%";
-               u32 now=get_ticks();
-               real_fps++;
-               if ((now-prev)>=1000000)
+               static u32 spd=100; // speed %
+               static u32 frames=0; // frames counter
+               static u32 prev=now; // previous fps calculation
+               frames++;
+               if ((now-prev)>=(TPS>>GPU_FRAMESKIP_UPDATE))
                {
-                       u32 expected_fps=(isPAL?50:60);
-                       sprintf(msg,"FPS=%3d/%2d SPD=%3d%%",((real_fps*(12-skipCount))/12),((expected_fps*(12-skipCount))/12),((real_fps*100)/expected_fps));
+                       if (IS_PAL) spd=(frames<<1);
+                       else spd=((frames*1001)/600);
+                       spd<<=GPU_FRAMESKIP_UPDATE;
+                       frames=0;
                        prev=now;
-                       real_fps=0;
                }
-               port_printf(5,5,msg);
-       }
-
-       // Update frame-skip
-       if (!alt_fps)
-       {
-               // Video frame-skip
-               skipFrame=skipTable[skipCount][skCount];
-               skCount--; if (skCount<0) skCount=11;
-               isSkip=skipFrame;
-       }
-       else
-       {
-               // Game frame-skip
-               if (!isSkip)
+               switch(gpu_unai.frameskip.skipCount)
                {
-                       skipFrame=skipTable[skipCount][skCount];
-                       skCount--; if (skCount<0) skCount=11;
-                       isSkip=true;
+                       case 1: if (spd<50) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<50%)
+                       case 2: if (spd<60) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<60%)
+                       case 3: if (spd<70) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<70%)
+                       case 4: if (spd<80) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<80%)
+                       case 5: if (spd<90) gpu_unai.frameskip.skipFrame=true; else gpu_unai.frameskip.skipFrame=false; break; // frameskip on (spd<90%)
                }
        }
-       fb_dirty=false;
-
-       pcsx4all_prof_end_with_resume(PCSX4ALL_PROF_GPU,PCSX4ALL_PROF_COUNTERS);
-}
-
-#else
-
-#include "../../frontend/plugin_lib.h"
-
-extern "C" {
-
-static const struct rearmed_cbs *cbs;
-static s16 old_res_horz, old_res_vert, old_rgb24;
-
-static void blit(void)
-{
-       u16 *base = (u16 *)GPU_FrameBuffer;
-       s16 isRGB24 = (GPU_GP1 & 0x00200000) ? 1 : 0;
-       s16 h0, x0, y0, w0, h1;
-
-       x0 = DisplayArea[0] & ~1; // alignment needed by blitter
-       y0 = DisplayArea[1];
-       base += FRAME_OFFSET(x0, y0);
-
-       w0 = DisplayArea[2];
-       h0 = DisplayArea[3];  // video mode
-
-       h1 = DisplayArea[5] - DisplayArea[4]; // display needed
-       if (h0 == 480) h1 = Min2(h1*2,480);
-
-       if (h1 <= 0)
-               return;
-
-       if (w0 != old_res_horz || h1 != old_res_vert || isRGB24 != old_rgb24)
-       {
-               old_res_horz = w0;
-               old_res_vert = h1;
-               old_rgb24 = (s16)isRGB24;
-               cbs->pl_vout_set_mode(w0, h1, w0, h1, isRGB24 ? 24 : 16);
-       }
-
-       cbs->pl_vout_flip(base, 1024, isRGB24, w0, h1);
 }
 
+///////////////////////////////////////////////////////////////////////////////
 void GPU_updateLace(void)
 {
        // Interlace bit toggle
-       GPU_GP1 ^= 0x80000000;
+       gpu_unai.GPU_GP1 ^= 0x80000000;
 
-       if (!fb_dirty || (GPU_GP1&0x08800000))
-               return;
-
-       if (!wasSkip) {
-               blit();
-               fb_dirty = false;
-               skCount = 0;
-       }
-       else {
-               skCount++;
-               if (skCount >= 8)
-                       wasSkip = isSkip = 0;
+       // Update display?
+       if ((gpu_unai.fb_dirty) && (!gpu_unai.frameskip.wasSkip) && (!(gpu_unai.GPU_GP1&0x00800000)))
+       {
+               // Display updated
+               gpuVideoOutput();
+               GPU_frameskip(true);
+               #ifdef ENABLE_GPU_LOG_SUPPORT
+                       fprintf(stdout,"GPU_updateLace(UPDATE)\n");
+               #endif
+       } else {
+               GPU_frameskip(false);
+               #ifdef ENABLE_GPU_LOG_SUPPORT
+                       fprintf(stdout,"GPU_updateLace(SKIP)\n");
+               #endif
        }
 
-       skipFrame = cbs->fskip_advice || cbs->frameskip == 1;
-}
+       if ((!gpu_unai.frameskip.skipCount) && (gpu_unai.DisplayArea[3] == 480)) gpu_unai.frameskip.skipGPU=true; // Tekken 3 hack
 
-long GPUopen(unsigned long *, char *, char *)
-{
-       cbs->pl_vout_open();
-       return 0;
+       gpu_unai.fb_dirty=false;
+       gpu_unai.dma.last_dma = NULL;
 }
 
-long GPUclose(void)
+// Allows frontend to signal plugin to redraw screen after returning to emu
+void GPU_requestScreenRedraw()
 {
-       cbs->pl_vout_close();
-       return 0;
+       gpu_unai.fb_dirty = true;
 }
 
-long GPUfreeze(unsigned int ulGetFreezeData, GPUFreeze_t* p2)
+void GPU_getScreenInfo(GPUScreenInfo_t *sinfo)
 {
-       if (ulGetFreezeData > 1)
-               return 0;
-
-       return GPU_freeze(ulGetFreezeData, p2);
+       bool depth24 = (gpu_unai.GPU_GP1 & 0x00200000 ? true : false);
+       int16_t hres = (uint16_t)gpu_unai.DisplayArea[2];
+       int16_t vres = (uint16_t)gpu_unai.DisplayArea[3];
+       int16_t w = hres; // Original gpu_unai doesn't support width < 100%
+       int16_t h = gpu_unai.DisplayArea[5] - gpu_unai.DisplayArea[4];
+       if (vres == 480)
+               h *= 2;
+       if (h <= 0 || h > vres)
+               h = vres;
+
+       sinfo->vram    = (uint8_t*)gpu_unai.vram;
+       sinfo->x       = (uint16_t)gpu_unai.DisplayArea[0];
+       sinfo->y       = (uint16_t)gpu_unai.DisplayArea[1];
+       sinfo->w       = w;
+       sinfo->h       = h;
+       sinfo->hres    = hres;
+       sinfo->vres    = vres;
+       sinfo->depth24 = depth24;
+       sinfo->pal     = IS_PAL;
 }
-
-void GPUrearmedCallbacks(const struct rearmed_cbs *cbs_)
-{
-       enableAbbeyHack = cbs_->gpu_unai.abe_hack;
-       light = !cbs_->gpu_unai.no_light;
-       blend = !cbs_->gpu_unai.no_blend;
-       if (cbs_->pl_vout_set_raw_vram)
-               cbs_->pl_vout_set_raw_vram((void *)GPU_FrameBuffer);
-
-       cbs = cbs_;
-       if (cbs->pl_set_gpu_caps)
-               cbs->pl_set_gpu_caps(0);
-}
-
-} /* extern "C" */
-
-#endif
index 1811630..eade2a8 100644 (file)
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#ifndef NEW_GPU_H
-#define NEW_GPU_H
+#ifndef GPU_UNAI_GPU_H
+#define GPU_UNAI_GPU_H
 
-///////////////////////////////////////////////////////////////////////////////
-//  GPU global definitions
-#define        FRAME_BUFFER_SIZE       (1024*512*2)
-#define        FRAME_WIDTH                       1024
-#define        FRAME_HEIGHT              512
-#define        FRAME_OFFSET(x,y)       (((y)<<10)+(x))
+struct gpu_unai_config_t {
+       uint8_t pixel_skip:1;     // If 1, allows skipping rendering pixels that
+                                 //  would not be visible when a high horizontal
+                                 //  resolution PS1 video mode is set.
+                                 //  Only applies to devices with low resolutions
+                                 //  like 320x240. Should not be used if a
+                                 //  down-scaling framebuffer blitter is in use.
+                                 //  Can cause gfx artifacts if game reads VRAM
+                                 //  to do framebuffer effects.
 
-#define VIDEO_WIDTH 320
+       uint8_t ilace_force:3;    // Option to force skipping rendering of lines,
+                                 //  for very slow platforms. Value will be
+                                 //  assigned to 'ilace_mask' in gpu_unai struct.
+                                 //  Normally 0. Value '1' will skip rendering
+                                 //  odd lines.
 
-typedef char                           s8;
-typedef signed short           s16;
-typedef signed int                     s32;
-typedef signed long long       s64;
+       uint8_t lighting:1;
+       uint8_t fast_lighting:1;
+       uint8_t blending:1;
+       uint8_t dithering:1;
 
-typedef unsigned char          u8;
-typedef unsigned short         u16;
-typedef unsigned int           u32;
-typedef unsigned long long     u64;
+       //senquack Only PCSX Rearmed's version of gpu_unai had this, and I
+       // don't think it's necessary. It would require adding 'AH' flag to
+       // gpuSpriteSpanFn() increasing size of sprite span function array.
+       //uint8_t enableAbbeyHack:1;  // Abe's Odyssey hack
 
-#include "gpu_fixedpoint.h"
-
-///////////////////////////////////////////////////////////////////////////////
-//  Tweaks and Hacks
-extern  int  skipCount;
-extern  bool enableAbbeyHack;
-extern  bool show_fps;
-extern  bool alt_fps;
-
-///////////////////////////////////////////////////////////////////////////////
-//  interlaced rendering
-extern  int linesInterlace_user;
-extern  bool progressInterlace;
-
-extern  bool light;
-extern  bool blend;
-
-typedef struct {
-       u32 Version;
-       u32 GPU_gp1;
-       u32 Control[256];
-       unsigned char FrameBuffer[1024*512*2];
-} GPUFreeze_t;
-
-struct  GPUPacket
-{
-       union
-       {
-               u32 U4[16];
-               s32 S4[16];
-               u16 U2[32];
-               s16 S2[32];
-               u8  U1[64];
-               s8  S1[64];
-       };
+       ////////////////////////////////////////////////////////////////////////////
+       // Variables used only by older standalone version of gpu_unai (gpu.cpp)
+#ifndef USE_GPULIB
+       uint8_t prog_ilace:1;         // Progressive interlace option (old option)
+                                     //  This option was somewhat oddly named:
+                                     //  When in interlaced video mode, on a low-res
+                                     //  320x240 device, only the even lines are
+                                     //  rendered. This option will take that one
+                                     //  step further and only render half of the
+                                     //  even lines one frame, and then the other half.
+       uint8_t frameskip_count:3;    // Frame skip (0..7)
+#endif
 };
 
-///////////////////////////////////////////////////////////////////////////////
-//  Compile Options
+extern gpu_unai_config_t gpu_unai_config_ext;
 
-//#define ENABLE_GPU_NULL_SUPPORT   // Enables NullGPU support
-//#define ENABLE_GPU_LOG_SUPPORT    // Enables gpu logger, very slow only for windows debugging
+// TODO: clean up show_fps frontend option
+extern  bool show_fps;
 
-///////////////////////////////////////////////////////////////////////////////
-#endif  // NEW_GPU_H
+#endif // GPU_UNAI_GPU_H
index 35cd056..e93f12f 100644 (file)
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Blitting code with rescale and interlace support.
 
-INLINE void GPU_BlitWW(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWW(const void* src, u16* dst16, bool isRGB24)
 {
        u32 uCount;
-       if(isRGB24 == 0)
+       if(!isRGB24)
        {
                #ifndef USE_BGR15
                        uCount = 20;
@@ -85,10 +85,10 @@ INLINE void GPU_BlitWW(const void* src, u16* dst16, u32 isRGB24)
        }
 }
 
-INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, bool isRGB24)
 {
        u32 uCount;
-       if(isRGB24 == 0)
+       if(!isRGB24)
        {
                #ifndef USE_BGR15
                        uCount = 32;
@@ -145,10 +145,10 @@ INLINE void GPU_BlitWWSWWSWS(const void* src, u16* dst16, u32 isRGB24)
        }
 }
 
-INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, bool isRGB24)
 {
        u32 uCount;
-       if(isRGB24 == 0)
+       if(!isRGB24)
        {
                #ifndef USE_BGR15
                        uCount = 32;
@@ -201,10 +201,10 @@ INLINE void GPU_BlitWWWWWS(const void* src, u16* dst16, u32 isRGB24)
        }
 }
 
-INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, u32 isRGB24, u32 uClip_src)
+INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, bool isRGB24, u32 uClip_src)
 {
        u32 uCount;
-       if(isRGB24 == 0)
+       if(!isRGB24)
        {
                #ifndef USE_BGR15
                        uCount = 20;
@@ -274,10 +274,10 @@ INLINE void GPU_BlitWWWWWWWWS(const void* src, u16* dst16, u32 isRGB24, u32 uCli
        }
 }
 
-INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, bool isRGB24)
 {
        u32 uCount;
-       if(isRGB24 == 0)
+       if(!isRGB24)
        {
                #ifndef USE_BGR15
                        uCount = 32;
@@ -331,10 +331,10 @@ INLINE void GPU_BlitWWDWW(const void* src, u16* dst16, u32 isRGB24)
 }
 
 
-INLINE void GPU_BlitWS(const void* src, u16* dst16, u32 isRGB24)
+INLINE void GPU_BlitWS(const void* src, u16* dst16, bool isRGB24)
 {
        u32 uCount;
-       if(isRGB24 == 0)
+       if(!isRGB24)
        {
                #ifndef USE_BGR15
                        uCount = 20;
index d6e7a74..7096b75 100644 (file)
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 ***************************************************************************/
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuSetTexture(u16 tpage)
+void gpuSetTexture(u16 tpage)
 {
-       u32 tp;
-       u32 tx, ty;
-       GPU_GP1 = (GPU_GP1 & ~0x1FF) | (tpage & 0x1FF);
+       u32 tmode, tx, ty;
+       gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x1FF) | (tpage & 0x1FF);
+       gpu_unai.TextureWindow[0]&= ~gpu_unai.TextureWindow[2];
+       gpu_unai.TextureWindow[1]&= ~gpu_unai.TextureWindow[3];
 
-       TextureWindow[0]&= ~TextureWindow[2];
-       TextureWindow[1]&= ~TextureWindow[3];
+       tmode = (tpage >> 7) & 3;  // 16bpp, 8bpp, or 4bpp texture colors?
+                                  // 0: 4bpp     1: 8bpp     2/3: 16bpp
+
+       // Nocash PSX docs state setting of 3 is same as setting of 2 (16bpp):
+       // Note: DrHell assumes 3 is same as 0.. TODO: verify which is correct?
+       if (tmode == 3) tmode = 2;
 
-       tp = (tpage >> 7) & 3;
        tx = (tpage & 0x0F) << 6;
        ty = (tpage & 0x10) << 4;
-       if (tp == 3) tp = 2;
 
-       tx += (TextureWindow[0] >> (2 - tp));
-       ty += TextureWindow[1];
+       tx += (gpu_unai.TextureWindow[0] >> (2 - tmode));
+       ty += gpu_unai.TextureWindow[1];
        
-       BLEND_MODE  = (((tpage>>5)&0x3)     ) << 3;
-       TEXT_MODE   = (((tpage>>7)&0x3) + 1 ) << 5; // +1 el cero no lo usamos
-
-       TBA = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(tx, ty)];
-
+       gpu_unai.BLEND_MODE  = ((tpage>>5) & 3) << 3;
+       gpu_unai.TEXT_MODE   = (tmode + 1) << 5; // gpu_unai.TEXT_MODE should be values 1..3, so add one
+       gpu_unai.TBA = &((u16*)gpu_unai.vram)[FRAME_OFFSET(tx, ty)];
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 INLINE void gpuSetCLUT(u16 clut)
 {
-       CBA = &((u16*)GPU_FrameBuffer)[(clut & 0x7FFF) << 4];
+       gpu_unai.CBA = &((u16*)gpu_unai.vram)[(clut & 0x7FFF) << 4];
 }
 
 #ifdef  ENABLE_GPU_NULL_SUPPORT
@@ -61,159 +63,305 @@ INLINE void gpuSetCLUT(u16 clut)
 #define DO_LOG(expr) {}
 #endif
 
-#define Blending (((PRIM&0x2)&&(blend))?(PRIM&0x2):0)
-#define Blending_Mode (((PRIM&0x2)&&(blend))?BLEND_MODE:0)
-#define Lighting (((~PRIM)&0x1)&&(light))
+#define Blending      (((PRIM&0x2) && BlendingEnabled()) ? (PRIM&0x2) : 0)
+#define Blending_Mode (((PRIM&0x2) && BlendingEnabled()) ? gpu_unai.BLEND_MODE : 0)
+#define Lighting      (((~PRIM)&0x1) && LightingEnabled())
+// Dithering applies only to Gouraud-shaded polys or texture-blended polys:
+#define Dithering     (((((~PRIM)&0x1) || (PRIM&0x10)) && DitheringEnabled()) ?            \
+                       (ForcedDitheringEnabled() ? (1<<9) : (gpu_unai.GPU_GP1 & (1 << 9))) \
+                       : 0)
+
+///////////////////////////////////////////////////////////////////////////////
+//Now handled by Rearmed's gpulib and gpu_unai/gpulib_if.cpp:
+///////////////////////////////////////////////////////////////////////////////
+#ifndef USE_GPULIB
+
+// Handles GP0 draw settings commands 0xE1...0xE6
+static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
+{
+       // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+       u8 num = (cmd_word >> 24) & 7;
+       switch (num) {
+               case 1: {
+                       // GP0(E1h) - Draw Mode setting (aka "Texpage")
+                       DO_LOG(("GP0(0xE1) DrawMode TexPage(0x%x)\n", cmd_word));
+                       u32 cur_texpage = gpu_unai.GPU_GP1 & 0x7FF;
+                       u32 new_texpage = cmd_word & 0x7FF;
+                       if (cur_texpage != new_texpage) {
+                               gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x7FF) | new_texpage;
+                               gpuSetTexture(gpu_unai.GPU_GP1);
+                       }
+               } break;
+
+               case 2: {
+                       // GP0(E2h) - Texture Window setting
+                       DO_LOG(("GP0(0xE2) TextureWindow(0x%x)\n", cmd_word));
+                       if (cmd_word != gpu_unai.TextureWindowCur) {
+                               static const u8 TextureMask[32] = {
+                                       255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+                                       127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+                               };
+                               gpu_unai.TextureWindowCur = cmd_word;
+                               gpu_unai.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+                               gpu_unai.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+                               gpu_unai.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+                               gpu_unai.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+                               gpu_unai.TextureWindow[0] &= ~gpu_unai.TextureWindow[2];
+                               gpu_unai.TextureWindow[1] &= ~gpu_unai.TextureWindow[3];
+
+                               // Inner loop vars must be updated whenever texture window is changed:
+                               const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+                               gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+                               gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+                               gpuSetTexture(gpu_unai.GPU_GP1);
+                       }
+               } break;
+
+               case 3: {
+                       // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+                       DO_LOG(("GP0(0xE3) DrawingArea Pos(0x%x)\n", cmd_word));
+                       gpu_unai.DrawingArea[0] = cmd_word         & 0x3FF;
+                       gpu_unai.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+               } break;
+
+               case 4: {
+                       // GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+                       DO_LOG(("GP0(0xE4) DrawingArea Size(0x%x)\n", cmd_word));
+                       gpu_unai.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+                       gpu_unai.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+               } break;
+
+               case 5: {
+                       // GP0(E5h) - Set Drawing Offset (X,Y)
+                       DO_LOG(("GP0(0xE5) DrawingOffset(0x%x)\n", cmd_word));
+                       gpu_unai.DrawingOffset[0] = ((s32)cmd_word<<(32-11))>>(32-11);
+                       gpu_unai.DrawingOffset[1] = ((s32)cmd_word<<(32-22))>>(32-11);
+               } break;
+
+               case 6: {
+                       // GP0(E6h) - Mask Bit Setting
+                       DO_LOG(("GP0(0xE6) SetMask(0x%x)\n", cmd_word));
+                       gpu_unai.Masking  = (cmd_word & 0x2) <<  1;
+                       gpu_unai.PixelMSB = (cmd_word & 0x1) <<  8;
+               } break;
+       }
+}
 
 void gpuSendPacketFunction(const int PRIM)
 {
        //printf("0x%x\n",PRIM);
 
+       //senquack - TODO: optimize this (packet pointer union as prim draw parameter
+       // introduced as optimization for gpulib command-list processing)
+       PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer };
+
        switch (PRIM)
        {
-               case 0x02:
+               case 0x02: {
                        NULL_GPU();
-                       gpuClearImage();    //  prim handles updateLace && skip
+                       gpuClearImage(packet);    //  prim handles updateLace && skip
+                       gpu_unai.fb_dirty = true;
                        DO_LOG(("gpuClearImage(0x%x)\n",PRIM));
-                       break;
+               } break;
+
                case 0x20:
                case 0x21:
                case 0x22:
-               case 0x23:
-                       if (!isSkip)
+               case 0x23: {          // Monochrome 3-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawF3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]);
-                               DO_LOG(("gpuDrawF3(0x%x)\n",PRIM));
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Blending_Mode |
+                                       gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+                               ];
+                               gpuDrawPolyF(packet, driver, false);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyF(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x24:
                case 0x25:
                case 0x26:
-               case 0x27:
-                       if (!isSkip)
+               case 0x27: {          // Textured 3-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (PacketBuffer.U4[4] >> 16);
-                               if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-                                       gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]);
-                               else
-                                       gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]);
-                               DO_LOG(("gpuDrawFT3(0x%x)\n",PRIM));
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_unai.PacketBuffer.U4[4] >> 16);
+
+                               u32 driver_idx =
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_unai.TEXT_MODE |
+                                       gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+                               if (!FastLightingEnabled()) {
+                                       driver_idx |= Lighting;
+                               } else {
+                                       if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+                                               driver_idx |= Lighting;
+                               }
+
+                               PP driver = gpuPolySpanDrivers[driver_idx];
+                               gpuDrawPolyFT(packet, driver, false);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyFT(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x28:
                case 0x29:
                case 0x2A:
-               case 0x2B:
-                       if (!isSkip)
+               case 0x2B: {          // Monochrome 4-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB];
-                               //--PacketBuffer.S2[6];
-                               gpuDrawF3(gpuPolySpanDriver);
-                               PacketBuffer.U4[1] = PacketBuffer.U4[4];
-                               //--PacketBuffer.S2[2];
-                               gpuDrawF3(gpuPolySpanDriver);
-                               DO_LOG(("gpuDrawF4(0x%x)\n",PRIM));
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Blending_Mode |
+                                       gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+                               ];
+                               gpuDrawPolyF(packet, driver, true); // is_quad = true
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyF(0x%x) (4-pt QUAD)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x2C:
                case 0x2D:
                case 0x2E:
-               case 0x2F:
-                       if (!isSkip)
+               case 0x2F: {          // Textured 4-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (PacketBuffer.U4[4] >> 16);
-                               PP gpuPolySpanDriver;
-                               if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-                                       gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB];
-                               else
-                                       gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB];
-                               //--PacketBuffer.S2[6];
-                               gpuDrawFT3(gpuPolySpanDriver);
-                               PacketBuffer.U4[1] = PacketBuffer.U4[7];
-                               PacketBuffer.U4[2] = PacketBuffer.U4[8];
-                               //--PacketBuffer.S2[2];
-                               gpuDrawFT3(gpuPolySpanDriver);
-                               DO_LOG(("gpuDrawFT4(0x%x)\n",PRIM));
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_unai.PacketBuffer.U4[4] >> 16);
+
+                               u32 driver_idx =
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_unai.TEXT_MODE |
+                                       gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+                               if (!FastLightingEnabled()) {
+                                       driver_idx |= Lighting;
+                               } else {
+                                       if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+                                               driver_idx |= Lighting;
+                               }
+
+                               PP driver = gpuPolySpanDrivers[driver_idx];
+                               gpuDrawPolyFT(packet, driver, true); // is_quad = true
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyFT(0x%x) (4-pt QUAD)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x30:
                case 0x31:
                case 0x32:
-               case 0x33:
-                       if (!isSkip)
+               case 0x33: {          // Gouraud-shaded 3-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawG3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]);
-                               DO_LOG(("gpuDrawG3(0x%x)\n",PRIM));
+                               //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+                               // this is an untextured poly, so CF_LIGHT (texture blend)
+                               // shouldn't apply. Until the original array of template
+                               // instantiation ptrs is fixed, we're stuck with this. (TODO)
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode |
+                                       gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+                               ];
+                               gpuDrawPolyG(packet, driver, false);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyG(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x34:
                case 0x35:
                case 0x36:
-               case 0x37:
-                       if (!isSkip)
+               case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (PacketBuffer.U4[5] >> 16);
-                               gpuDrawGT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]);
-                               DO_LOG(("gpuDrawGT3(0x%x)\n",PRIM));
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_unai.TEXT_MODE |
+                                       gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+                               ];
+                               gpuDrawPolyGT(packet, driver, false);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyGT(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x38:
                case 0x39:
                case 0x3A:
-               case 0x3B:
-                       if (!isSkip)
+               case 0x3B: {          // Gouraud-shaded 4-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB];
-                               //--PacketBuffer.S2[6];
-                               gpuDrawG3(gpuPolySpanDriver);
-                               PacketBuffer.U4[0] = PacketBuffer.U4[6];
-                               PacketBuffer.U4[1] = PacketBuffer.U4[7];
-                               //--PacketBuffer.S2[2];
-                               gpuDrawG3(gpuPolySpanDriver);
-                               DO_LOG(("gpuDrawG4(0x%x)\n",PRIM));
+                               // See notes regarding '129' for 0x30..0x33 further above -senquack
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode |
+                                       gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+                               ];
+                               gpuDrawPolyG(packet, driver, true); // is_quad = true
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyG(0x%x) (4-pt QUAD)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x3C:
                case 0x3D:
                case 0x3E:
-               case 0x3F:
-                       if (!isSkip)
+               case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (PacketBuffer.U4[5] >> 16);
-                               const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB];
-                               //--PacketBuffer.S2[6];
-                               gpuDrawGT3(gpuPolySpanDriver);
-                               PacketBuffer.U4[0] = PacketBuffer.U4[9];
-                               PacketBuffer.U4[1] = PacketBuffer.U4[10];
-                               PacketBuffer.U4[2] = PacketBuffer.U4[11];
-                               //--PacketBuffer.S2[2];
-                               gpuDrawGT3(gpuPolySpanDriver);
-                               DO_LOG(("gpuDrawGT4(0x%x)\n",PRIM));
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+                               PP driver = gpuPolySpanDrivers[
+                                       (gpu_unai.blit_mask?1024:0) |
+                                       Dithering |
+                                       Blending_Mode | gpu_unai.TEXT_MODE |
+                                       gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+                               ];
+                               gpuDrawPolyGT(packet, driver, true); // is_quad = true
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawPolyGT(0x%x) (4-pt QUAD)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x40:
                case 0x41:
                case 0x42:
-               case 0x43:
-                       if (!isSkip)
+               case 0x43: {          // Monochrome line
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-                               DO_LOG(("gpuDrawLF(0x%x)\n",PRIM));
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineF(packet, driver);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x48:
                case 0x49:
                case 0x4A:
@@ -221,32 +369,44 @@ void gpuSendPacketFunction(const int PRIM)
                case 0x4C:
                case 0x4D:
                case 0x4E:
-               case 0x4F:
-                       if (!isSkip)
+               case 0x4F: { // Monochrome line strip
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-                               DO_LOG(("gpuDrawLF(0x%x)\n",PRIM));
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineF(packet, driver);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineF(0x%x)\n",PRIM));
                        }
-                       if ((PacketBuffer.U4[3] & 0xF000F000) != 0x50005000)
+                       if ((gpu_unai.PacketBuffer.U4[3] & 0xF000F000) != 0x50005000)
                        {
-                               PacketBuffer.U4[1] = PacketBuffer.U4[2];
-                               PacketBuffer.U4[2] = PacketBuffer.U4[3];
-                               PacketCount = 1;
-                               PacketIndex = 3;
+                               gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[2];
+                               gpu_unai.PacketBuffer.U4[2] = gpu_unai.PacketBuffer.U4[3];
+                               gpu_unai.PacketCount = 1;
+                               gpu_unai.PacketIndex = 3;
                        }
-                       break;
+               } break;
+
                case 0x50:
                case 0x51:
                case 0x52:
-               case 0x53:
-                       if (!isSkip)
+               case 0x53: {          // Gouraud-shaded line
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-                               DO_LOG(("gpuDrawLG(0x%x)\n",PRIM));
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+                               // Index MSB selects Gouraud-shaded PixelSpanDriver:
+                               driver_idx |= (1 << 5);
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineG(packet, driver);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x58:
                case 0x59:
                case 0x5A:
@@ -254,204 +414,203 @@ void gpuSendPacketFunction(const int PRIM)
                case 0x5C:
                case 0x5D:
                case 0x5E:
-               case 0x5F:
-                       if (!isSkip)
+               case 0x5F: { // Gouraud-shaded line strip
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-                               DO_LOG(("gpuDrawLG(0x%x)\n",PRIM));
+                               // Shift index right by one, as untextured prims don't use lighting
+                               u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+                               // Index MSB selects Gouraud-shaded PixelSpanDriver:
+                               driver_idx |= (1 << 5);
+                               PSD driver = gpuPixelSpanDrivers[driver_idx];
+                               gpuDrawLineG(packet, driver);
+                               gpu_unai.fb_dirty = true;
+                               DO_LOG(("gpuDrawLineG(0x%x)\n",PRIM));
                        }
-                       if ((PacketBuffer.U4[4] & 0xF000F000) != 0x50005000)
+                       if ((gpu_unai.PacketBuffer.U4[4] & 0xF000F000) != 0x50005000)
                        {
-                               PacketBuffer.U1[3 + (2 * 4)] = PacketBuffer.U1[3 + (0 * 4)];
-                               PacketBuffer.U4[0] = PacketBuffer.U4[2];
-                               PacketBuffer.U4[1] = PacketBuffer.U4[3];
-                               PacketBuffer.U4[2] = PacketBuffer.U4[4];
-                               PacketCount = 2;
-                               PacketIndex = 3;
+                               gpu_unai.PacketBuffer.U1[3 + (2 * 4)] = gpu_unai.PacketBuffer.U1[3 + (0 * 4)];
+                               gpu_unai.PacketBuffer.U4[0] = gpu_unai.PacketBuffer.U4[2];
+                               gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[3];
+                               gpu_unai.PacketBuffer.U4[2] = gpu_unai.PacketBuffer.U4[4];
+                               gpu_unai.PacketCount = 2;
+                               gpu_unai.PacketIndex = 3;
                        }
-                       break;
+               } break;
+
                case 0x60:
                case 0x61:
                case 0x62:
-               case 0x63:
-                       if (!isSkip)
+               case 0x63: {          // Monochrome rectangle (variable size)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x64:
                case 0x65:
                case 0x66:
-               case 0x67:
-                       if (!isSkip)
+               case 0x67: {          // Textured rectangle (variable size)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (GPU_GP1);
-                               if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-                                       gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-                               else
-                                       gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+                               // This fixes Silent Hill running animation on loading screens:
+                               // On PSX, color values 0x00-0x7F darken the source texture's color,
+                               //  0x81-0xFF lighten textures (ultimately clamped to 0x1F), and
+                               //  0x80 leaves source texture color unchanged. HOWEVER,
+                               //  gpu_unai uses a simple lighting LUT whereby only the upper
+                               //  5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+                               //  0x80.
+                               // 
+                               // NOTE: I've changed all textured sprite draw commands here and
+                               //  elsewhere to use proper behavior, but left poly commands
+                               //  alone, I don't want to slow rendering down too much. (TODO)
+                               //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x68:
                case 0x69:
                case 0x6A:
-               case 0x6B:
-                       if (!isSkip)
+               case 0x6B: {          // Monochrome rectangle (1x1 dot)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               PacketBuffer.U4[2] = 0x00010001;
-                               gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+                               gpu_unai.PacketBuffer.U4[2] = 0x00010001;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x70:
                case 0x71:
                case 0x72:
-               case 0x73:
-                       if (!isSkip)
+               case 0x73: {          // Monochrome rectangle (8x8)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               PacketBuffer.U4[2] = 0x00080008;
-                               gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+                               gpu_unai.PacketBuffer.U4[2] = 0x00080008;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x74:
                case 0x75:
                case 0x76:
-               case 0x77:
-                       if (!isSkip)
+               case 0x77: {          // Textured rectangle (8x8)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               PacketBuffer.U4[3] = 0x00080008;
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (GPU_GP1);
-                               if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-                                       gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-                               else
-                                       gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
+                               gpu_unai.PacketBuffer.U4[3] = 0x00080008;
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+                               //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+                               //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x78:
                case 0x79:
                case 0x7A:
-               case 0x7B:
-                       if (!isSkip)
+               case 0x7B: {          // Monochrome rectangle (16x16)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               PacketBuffer.U4[2] = 0x00100010;
-                               gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
+                               gpu_unai.PacketBuffer.U4[2] = 0x00100010;
+                               PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+                               gpuDrawT(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawT(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x7C:
                case 0x7D:
-#ifdef __arm__
-                       if ((GPU_GP1 & 0x180) == 0 && (Masking | PixelMSB) == 0)
+                       #ifdef __arm__
+                       /* Notaz 4bit sprites optimization */
+                       if ((!gpu_unai.frameskip.skipGPU) && (!(gpu_unai.GPU_GP1&0x180)) && (!(gpu_unai.Masking|gpu_unai.PixelMSB)))
                        {
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (GPU_GP1);
-                               gpuDrawS16();
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               gpuDrawS16(packet);
+                               gpu_unai.fb_dirty = true;
                                break;
                        }
-                       // fallthrough
-#endif
+                       #endif
                case 0x7E:
-               case 0x7F:
-                       if (!isSkip)
+               case 0x7F: {          // Textured rectangle (16x16)
+                       if (!gpu_unai.frameskip.skipGPU)
                        {
                                NULL_GPU();
-                               PacketBuffer.U4[3] = 0x00100010;
-                               gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-                               gpuSetTexture (GPU_GP1);
-                               if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-                                       gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-                               else
-                                       gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
+                               gpu_unai.PacketBuffer.U4[3] = 0x00100010;
+                               gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+                               u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+                               //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+                               //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+                               // Strip lower 3 bits of each color and determine if lighting should be used:
+                               if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+                                       driver_idx |= Lighting;
+                               PS driver = gpuSpriteSpanDrivers[driver_idx];
+                               gpuDrawS(packet, driver);
+                               gpu_unai.fb_dirty = true;
                                DO_LOG(("gpuDrawS(0x%x)\n",PRIM));
                        }
-                       break;
+               } break;
+
                case 0x80:          //  vid -> vid
-                       gpuMoveImage();   //  prim handles updateLace && skip
+                       gpuMoveImage(packet);   //  prim handles updateLace && skip
+                       if ((!gpu_unai.frameskip.skipCount) && (gpu_unai.DisplayArea[3] == 480)) // Tekken 3 hack
+                       {
+                               if (!gpu_unai.frameskip.skipGPU) gpu_unai.fb_dirty = true;
+                       }
+                       else
+                       {
+                               gpu_unai.fb_dirty = true;
+                       }
                        DO_LOG(("gpuMoveImage(0x%x)\n",PRIM));
                        break;
                case 0xA0:          //  sys ->vid
-                       gpuLoadImage();   //  prim handles updateLace && skip
-#ifndef isSkip // not a define
-                       if (alt_fps) isSkip=false;
-#endif
+                       gpuLoadImage(packet);   //  prim handles updateLace && skip
                        DO_LOG(("gpuLoadImage(0x%x)\n",PRIM));
                        break;
                case 0xC0:          //  vid -> sys
-                       gpuStoreImage();  //  prim handles updateLace && skip
+                       gpuStoreImage(packet);  //  prim handles updateLace && skip
                        DO_LOG(("gpuStoreImage(0x%x)\n",PRIM));
                        break;
-               case 0xE1:
-                       {
-                               const u32 temp = PacketBuffer.U4[0];
-                               GPU_GP1 = (GPU_GP1 & ~0x000007FF) | (temp & 0x000007FF);
-                               gpuSetTexture(temp);
-                               DO_LOG(("gpuSetTexture(0x%x)\n",PRIM));
-                       }
-                       break;
-               case 0xE2:        
-                       {
-                               static const u8  TextureMask[32] = {
-                                       255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,        //
-                                       127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7   //
-                               };
-                               const u32 temp = PacketBuffer.U4[0];
-                               TextureWindow[0] = ((temp >> 10) & 0x1F) << 3;
-                               TextureWindow[1] = ((temp >> 15) & 0x1F) << 3;
-                               TextureWindow[2] = TextureMask[(temp >> 0) & 0x1F];
-                               TextureWindow[3] = TextureMask[(temp >> 5) & 0x1F];
-                               gpuSetTexture(GPU_GP1);
-                               //isSkip = false;
-                               DO_LOG(("TextureWindow(0x%x)\n",PRIM));
-                       }
-                       break;
-               case 0xE3:
-                       {
-                               const u32 temp = PacketBuffer.U4[0];
-                               DrawingArea[0] = temp         & 0x3FF;
-                               DrawingArea[1] = (temp >> 10) & 0x3FF;
-                               //isSkip = false;
-                               DO_LOG(("DrawingArea_Pos(0x%x)\n",PRIM));
-                       }
-                       break;
-               case 0xE4:
-                       {
-                               const u32 temp = PacketBuffer.U4[0];
-                               DrawingArea[2] = (temp         & 0x3FF) + 1;
-                               DrawingArea[3] = ((temp >> 10) & 0x3FF) + 1;
-                               //isSkip = false;
-                               DO_LOG(("DrawingArea_Size(0x%x)\n",PRIM));
-                       }
-                       break;
-               case 0xE5:
-                       {
-                               const u32 temp = PacketBuffer.U4[0];
-                               DrawingOffset[0] = ((s32)temp<<(32-11))>>(32-11);
-                               DrawingOffset[1] = ((s32)temp<<(32-22))>>(32-11);
-                               //isSkip = false;
-                               DO_LOG(("DrawingOffset(0x%x)\n",PRIM));
-                       }
-                       break;
-               case 0xE6:
-                       {
-                               const u32 temp = PacketBuffer.U4[0];
-                               //GPU_GP1 = (GPU_GP1 & ~0x00001800) | ((temp&3) << 11);
-                               Masking = (temp & 0x2) <<  1;
-                               PixelMSB =(temp & 0x1) <<  8;
-                               DO_LOG(("SetMask(0x%x)\n",PRIM));
-                       }
-                       break;
+               case 0xE1 ... 0xE6: { // Draw settings
+                       gpuGP0Cmd_0xEx(gpu_unai, gpu_unai.PacketBuffer.U4[0]);
+               } break;
        }
 }
+#endif //!USE_GPULIB
+///////////////////////////////////////////////////////////////////////////////
+// End of code specific to non-gpulib standalone version of gpu_unai
+///////////////////////////////////////////////////////////////////////////////
index e72fda1..5df42cf 100644 (file)
 #ifndef FIXED_H
 #define FIXED_H
 
-#include "arm_features.h"
-
 typedef s32 fixed;
 
-#ifdef GPU_TABLE_10_BITS
-#define TABLE_BITS 10
-#else
-#define TABLE_BITS 16
-#endif
-
-#define FIXED_BITS 16
+//senquack - The gpu_drhell poly routines I adapted use 22.10 fixed point,
+//           while original Unai used 16.16: (see README_senquack.txt)
+//#define FIXED_BITS 16
+#define FIXED_BITS 10
 
 #define fixed_ZERO ((fixed)0)
 #define fixed_ONE  ((fixed)1<<FIXED_BITS)
 #define fixed_TWO  ((fixed)2<<FIXED_BITS)
 #define fixed_HALF ((fixed)((1<<FIXED_BITS)>>1))
 
-//  big precision inverse table.
-s32 s_invTable[(1<<TABLE_BITS)];
+#define fixed_LOMASK ((fixed)((1<<FIXED_BITS)-1))
+#define fixed_HIMASK ((fixed)(~fixed_LOMASK))
+
+// int<->fixed conversions:
+#define i2x(x) ((x)<<FIXED_BITS)
+#define x2i(x) ((x)>>FIXED_BITS)
+
+INLINE fixed FixedCeil(const fixed x)
+{
+       return (x + (fixed_ONE - 1)) & fixed_HIMASK;
+}
 
-INLINE  fixed i2x(const int   _x) { return  ((_x)<<FIXED_BITS); }
-INLINE  fixed x2i(const fixed _x) { return  ((_x)>>FIXED_BITS); }
+INLINE s32 FixedCeilToInt(const fixed x)
+{
+       return (x + (fixed_ONE - 1)) >> FIXED_BITS;
+}
 
-/*
-INLINE u32 Log2(u32 _a)
+//senquack - float<->fixed conversions:
+#define f2x(x) ((s32)((x) * (float)(1<<FIXED_BITS)))
+#define x2f(x) ((float)(x) / (float)(1<<FIXED_BITS))
+
+//senquack - floating point reciprocal:
+//NOTE: These assume x is always != 0 !!!
+#ifdef GPU_UNAI_USE_FLOATMATH
+#if defined(_MIPS_ARCH_MIPS32R2) || (__mips == 64)
+INLINE float FloatInv(const float x)
+{
+       float res;
+       asm("recip.s %0,%1" : "=f" (res) : "f" (x));
+       return res;
+}
+#else
+INLINE float FloatInv(const float x)
 {
-  u32 c = 0; // result of log2(v) will go here
-  if (_a & 0xFFFF0000) { _a >>= 16; c |= 16;  }
-  if (_a & 0xFF00) { _a >>= 8; c |= 8;  }
-  if (_a & 0xF0) { _a >>= 4; c |= 4;  }
-  if (_a & 0xC) { _a >>= 2; c |= 2;  }
-  if (_a & 0x2) { _a >>= 1; c |= 1;  }
-  return c;
+       return (1.0f / x);
 }
-*/
+#endif
+#endif
 
-#ifdef HAVE_ARMV5
+///////////////////////////////////////////////////////////////////////////
+// --- BEGIN INVERSE APPROXIMATION SECTION ---
+///////////////////////////////////////////////////////////////////////////
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+
+//  big precision inverse table.
+#define TABLE_BITS 16
+s32 s_invTable[(1<<TABLE_BITS)];
+
+//senquack - MIPS32 happens to have same instruction/format:
+#if defined(__arm__) || (__mips == 32)
 INLINE u32 Log2(u32 x) { u32 res; asm("clz %0,%1" : "=r" (res) : "r" (x)); return 32-res; }
 #else
 INLINE u32 Log2(u32 x) { u32 i = 0; for ( ; x > 0; ++i, x >>= 1); return i - 1; }
 #endif
 
-#ifdef GPU_TABLE_10_BITS
-INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
-{
-    u32 uD   = (_b<0) ? -_b : _b ;
-    u32 uLog = Log2(uD);
-    uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0;
-    u32 uDen = uD>>uLog;
-    iFactor_ = s_invTable[uDen];
-    iFactor_ = (_b<0) ? -iFactor_ :iFactor_;
-    iShift_  = 15+uLog;
-}
-#else
 INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
 {
   u32 uD = (_b<0) ? -_b : _b;
@@ -82,10 +95,12 @@ INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
   {
        u32 uLog = Log2(uD);
     uLog = uLog>(TABLE_BITS-1) ? uLog-(TABLE_BITS-1) : 0;
-    u32 uDen = (uD>>uLog)-1;
+    u32 uDen = (uD>>uLog);
     iFactor_ = s_invTable[uDen];
     iFactor_ = (_b<0) ? -iFactor_ :iFactor_;
-    iShift_  = 15+uLog;
+    //senquack - Adapted to 22.10 fixed point (originally 16.16):
+    //iShift_  = 15+uLog;
+    iShift_  = 21+uLog;
   }
   else
   {
@@ -93,7 +108,6 @@ INLINE  void  xInv (const fixed _b, s32& iFactor_, s32& iShift_)
     iShift_ = 0;
   }
 }
-#endif
 
 INLINE  fixed xInvMulx  (const fixed _a, const s32 _iFact, const s32 _iShift)
 {
@@ -112,20 +126,9 @@ INLINE  fixed xLoDivx   (const fixed _a, const fixed _b)
   xInv(_b, iFact, iShift);
   return xInvMulx(_a, iFact, iShift);
 }
-
+#endif // GPU_UNAI_USE_INT_DIV_MULTINV
 ///////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE  T Min2 (const T _a, const T _b)             { return (_a<_b)?_a:_b; }
-
-template<typename T>
-INLINE  T Min3 (const T _a, const T _b, const T _c) { return  Min2(Min2(_a,_b),_c); }
-
+// --- END INVERSE APPROXIMATION SECTION ---
 ///////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE  T Max2 (const T _a, const T _b)             { return  (_a>_b)?_a:_b; }
 
-template<typename T>
-INLINE  T Max3 (const T _a, const T _b, const T _c) { return  Max2(Max2(_a,_b),_c); }
-
-///////////////////////////////////////////////////////////////////////////
 #endif  //FIXED_H
index 4cd7bff..723e09f 100644 (file)
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 ***************************************************************************/
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Inner loop driver instanciation file
+// Inner loop driver instantiation file
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Option Masks
-#define   L ((CF>>0)&1)
-#define   B ((CF>>1)&1)
-#define   M ((CF>>2)&1)
-#define  BM ((CF>>3)&3)
-#define  TM ((CF>>5)&3)
-#define   G ((CF>>7)&1)
+//  Option Masks (CF template parameter)
+#define  CF_LIGHT     ((CF>> 0)&1) // Lighting
+#define  CF_BLEND     ((CF>> 1)&1) // Blending
+#define  CF_MASKCHECK ((CF>> 2)&1) // Mask bit check
+#define  CF_BLENDMODE ((CF>> 3)&3) // Blend mode   0..3
+#define  CF_TEXTMODE  ((CF>> 5)&3) // Texture mode 1..3 (0: texturing disabled)
+#define  CF_GOURAUD   ((CF>> 7)&1) // Gouraud shading
+#define  CF_MASKSET   ((CF>> 8)&1) // Mask bit set
+#define  CF_DITHER    ((CF>> 9)&1) // Dithering
+#define  CF_BLITMASK  ((CF>>10)&1) // blit_mask check (skip rendering pixels
+                                   //  that wouldn't end up displayed on
+                                   //  low-res screen using simple downscaler)
 
-#define  AH ((CF>>7)&1)
-
-#define  MB ((CF>>8)&1)
+//#ifdef __arm__
+//#ifndef ENABLE_GPU_ARMV7
+/* ARMv5 */
+//#include "gpu_inner_blend_arm5.h"
+//#else
+/* ARMv7 optimized */
+//#include "gpu_inner_blend_arm7.h"
+//#endif
+//#else
+//#include "gpu_inner_blend.h"
+//#endif
 
+// TODO: use the arm-optimized gpu_inner_blends for arm builds
 #include "gpu_inner_blend.h"
+
+#include "gpu_inner_quantization.h"
 #include "gpu_inner_light.h"
 
+// If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+// This is only for debugging/verification of low-precision colors in C.
+// Low-precision Gouraud is intended for use by SIMD-optimized inner drivers
+// which get/use Gouraud colors in SIMD registers.
+//#define GPU_GOURAUD_LOW_PRECISION
+
+// How many bits of fixed-point precision GouraudColor uses
+#ifdef GPU_GOURAUD_LOW_PRECISION
+#define GPU_GOURAUD_FIXED_BITS 11
+#else
+#define GPU_GOURAUD_FIXED_BITS 16
+#endif
+
+// Used to pass Gouraud colors to gpuPixelSpanFn() (lines)
+struct GouraudColor {   // Gouraud state that gpuPixelSpanFn() receives (as a pointer) through its 'data' argument
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       u16 r, g, b;                  // current color components, fixed-point with GPU_GOURAUD_FIXED_BITS (11) fractional bits
+       s16 r_incr, g_incr, b_incr;   // signed per-pixel step added to r/g/b
+#else
+       u32 r, g, b;                  // current color components, fixed-point with GPU_GOURAUD_FIXED_BITS (16) fractional bits
+       s32 r_incr, g_incr, b_incr;   // signed per-pixel step added to r/g/b
+#endif
+};
+
+static inline u16 gpuGouraudColor15bpp(u32 r, u32 g, u32 b)   // Packs fixed-point Gouraud r/g/b components into a single 16-bit pixel value
+{
+       r >>= GPU_GOURAUD_FIXED_BITS;   // discard the fractional bits of each component
+       g >>= GPU_GOURAUD_FIXED_BITS;
+       b >>= GPU_GOURAUD_FIXED_BITS;
+
+#ifndef GPU_GOURAUD_LOW_PRECISION
+       // High-precision Gouraud colors are 8-bit + fractional
+       r >>= 3;  g >>= 3;  b >>= 3;   // reduce the 8-bit integer part to 5 bits
+#endif
+
+       return r | (g << 5) | (b << 10);   // r in the low bits, g shifted up by 5, b by 10. NOTE(review): nothing here masks the components to 5 bits - confirm callers keep inputs in range
+}
+
 ///////////////////////////////////////////////////////////////////////////////
-//  GPU Pixel opperations generator
-template<const int CF>
-INLINE void gpuPixelFn(u16 *pixel,const u16 data)
+//  GPU Pixel span operations generator gpuPixelSpanFn<>
+//  Oct 2016: Created/adapted from old gpuPixelFn by senquack:
+//  Original gpuPixelFn was used to draw lines one pixel at a time. I wrote
+//  new line algorithms that draw lines using horizontal/vertical/diagonal
+//  spans of pixels, necessitating new pixel-drawing function that could
+//  not only render spans of pixels, but gouraud-shade them as well.
+//  This speeds up line rendering and would allow tile-rendering (untextured
+//  rectangles) to use the same set of functions. Since tiles are always
+//  monochrome, they simply wouldn't use the extra set of 32 gouraud-shaded
+//  gpuPixelSpanFn functions (TODO?).
+//
+// NOTE: While the PS1 framebuffer is 16 bit, we use 8-bit pointers here,
+//       so that pDst can be incremented directly by 'incr' parameter
+//       without having to shift it before use.
+template<int CF>
+static u8* gpuPixelSpanFn(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)   // Draws a span of pixels starting at pDst, stepping 'incr' bytes per pixel; returns the advanced pDst
 {
-       if ((!M)&&(!B))
-       {
-               if(MB) { *pixel = data | 0x8000; }
-               else   { *pixel = data; }
+       // Blend func can save an operation if it knows uSrc MSB is
+       //  unset. For untextured prims, this is always true.
+       const bool skip_uSrc_mask = true;
+
+       u16 col;
+       struct GouraudColor * gcPtr;
+       u32 r, g, b;
+       s32 r_incr, g_incr, b_incr;
+
+       if (CF_GOURAUD) {
+               gcPtr = (GouraudColor*)data;   // Gouraud variants: 'data' carries a pointer to the caller's GouraudColor
+               r = gcPtr->r;  r_incr = gcPtr->r_incr;
+               g = gcPtr->g;  g_incr = gcPtr->g_incr;
+               b = gcPtr->b;  b_incr = gcPtr->b_incr;
+       } else {
+               col = (u16)data;   // flat variants: 'data' is the 16-bit pixel color itself
        }
-       else if ((M)&&(!B))
-       {
-               if (!(*pixel&0x8000))
-               {
-                       if(MB) { *pixel = data | 0x8000; }
-                       else   { *pixel = data; }
+
+       do {
+               if (!CF_GOURAUD)
+               {   // NO GOURAUD
+                       if (!CF_MASKCHECK && !CF_BLEND) {
+                               if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                               else            { *(u16*)pDst = col;          }
+                       } else if (CF_MASKCHECK && !CF_BLEND) {
+                               if (!(*(u16*)pDst & 0x8000)) {   // write only where the destination's bit 15 is clear
+                                       if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                                       else            { *(u16*)pDst = col;          }
+                               }
+                       } else {
+                               u16 uDst = *(u16*)pDst;
+                               if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }   // masked destination: skip the write but still advance
+
+                               u16 uSrc = col;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+                               else            { *(u16*)pDst = uSrc;          }
+                       }
+
+               } else
+               {   // GOURAUD
+
+                       if (!CF_MASKCHECK && !CF_BLEND) {
+                               col = gpuGouraudColor15bpp(r, g, b);   // convert the running fixed-point color to a pixel value
+                               if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                               else            { *(u16*)pDst = col;          }
+                       } else if (CF_MASKCHECK && !CF_BLEND) {
+                               col = gpuGouraudColor15bpp(r, g, b);
+                               if (!(*(u16*)pDst & 0x8000)) {
+                                       if (CF_MASKSET) { *(u16*)pDst = col | 0x8000; }
+                                       else            { *(u16*)pDst = col;          }
+                               }
+                       } else {
+                               u16 uDst = *(u16*)pDst;
+                               if (CF_MASKCHECK) { if (uDst & 0x8000) goto endpixel; }   // skipped pixels still get their color stepped at endpixel
+                               col = gpuGouraudColor15bpp(r, g, b);
+
+                               u16 uSrc = col;
+
+                               // Blend func can save an operation if it knows uSrc MSB is
+                               //  unset. For untextured prims, this is always true.
+                               const bool skip_uSrc_mask = true;   // shadows the identical constant declared at function scope
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *(u16*)pDst = uSrc | 0x8000; }
+                               else            { *(u16*)pDst = uSrc;          }
+                       }
                }
+
+endpixel:
+               if (CF_GOURAUD) {
+                       r += r_incr;   // step the interpolated color once per pixel, drawn or not
+                       g += g_incr;
+                       b += b_incr;
+               }
+               pDst += incr;   // pDst is a byte pointer, so incr is applied in bytes
+       } while (len-- > 1);   // the body runs once even when len is 0 or 1
+
+       // Note from senquack: Normally, I'd prefer to write a 'do {} while (--len)'
+       //  loop, or even a for() loop, however, on MIPS platforms anything but the
+       //  'do {} while (len-- > 1)' tends to generate very suboptimal asm, with
+       //  many unneeded MULs/ADDs/branches at the ends of these functions.
+       //  If you change the loop structure above, be sure to compare the quality
+       //  of the generated code!!
+
+       if (CF_GOURAUD) {
+               gcPtr->r = r;   // write the stepped color back into the caller's GouraudColor
+               gcPtr->g = g;
+               gcPtr->b = b;
        }
-       else
-       {
-               u16 uDst = *pixel;
-               if(M) { if (uDst&0x8000) return; }
-               u16 uSrc = data;
-               u32 uMsk; if (BM==0) uMsk=0x7BDE;
-               if (BM==0) gpuBlending00(uSrc, uDst);
-               if (BM==1) gpuBlending01(uSrc, uDst);
-               if (BM==2) gpuBlending02(uSrc, uDst);
-               if (BM==3) gpuBlending03(uSrc, uDst);
-               if(MB) { *pixel = uSrc | 0x8000; }
-               else   { *pixel = uSrc; }
-       }
+       return pDst;   // one 'incr' step beyond the last pixel processed
+}
+
+static u8* PixelSpanNULL(u8* pDst, uintptr_t data, ptrdiff_t incr, size_t len)   // Stand-in driver for the invalid gpuPixelSpanDrivers slots; draws nothing
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"PixelSpanNULL()\n");   // only compiled when ENABLE_GPU_LOG_SUPPORT is defined
+       #endif
+       return pDst;   // destination pointer is returned unadvanced
 }
-///////////////////////////////////////////////////////////////////////////////
 
 ///////////////////////////////////////////////////////////////////////////////
-//  Pixel drawing drivers, for lines (only blending)
-typedef void (*PD)(u16 *pixel,const u16 data);
-const PD  gpuPixelDrivers[32] =   //  We only generate pixel op for MASKING/BLEND_ENABLE/BLEND_MODE
+//  PixelSpan (lines) innerloops driver
+typedef u8* (*PSD)(u8* dst, uintptr_t data, ptrdiff_t incr, size_t len);
+
+const PSD gpuPixelSpanDrivers[64] =
 { 
-       gpuPixelFn<0x00<<1>,gpuPixelFn<0x01<<1>,gpuPixelFn<0x02<<1>,gpuPixelFn<0x03<<1>,  
-       NULL,gpuPixelFn<0x05<<1>,NULL,gpuPixelFn<0x07<<1>,
-       NULL,gpuPixelFn<0x09<<1>,NULL,gpuPixelFn<0x0B<<1>,
-       NULL,gpuPixelFn<0x0D<<1>,NULL,gpuPixelFn<0x0F<<1>,
-
-       gpuPixelFn<(0x00<<1)|256>,gpuPixelFn<(0x01<<1)|256>,gpuPixelFn<(0x02<<1)|256>,gpuPixelFn<(0x03<<1)|256>,  
-       NULL,gpuPixelFn<(0x05<<1)|256>,NULL,gpuPixelFn<(0x07<<1)|256>,
-       NULL,gpuPixelFn<(0x09<<1)|256>,NULL,gpuPixelFn<(0x0B<<1)|256>,
-       NULL,gpuPixelFn<(0x0D<<1)|256>,NULL,gpuPixelFn<(0x0F<<1)|256>
+       // Array index | 'CF' template field | Field value
+       // ------------+---------------------+----------------
+       // Bit 0       | CF_BLEND            | off (0), on (1)
+       // Bit 1       | CF_MASKCHECK        | off (0), on (1)
+       // Bit 3:2     | CF_BLENDMODE        | 0..3
+       // Bit 4       | CF_MASKSET          | off (0), on (1)
+       // Bit 5       | CF_GOURAUD          | off (0), on (1)
+       //
+       // NULL entries are ones for which blending is disabled and blend-mode
+       //  field is non-zero, which is obviously invalid.
+
+       // Flat-shaded
+       gpuPixelSpanFn<0x00<<1>,         gpuPixelSpanFn<0x01<<1>,         gpuPixelSpanFn<0x02<<1>,         gpuPixelSpanFn<0x03<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x05<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x07<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x09<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0B<<1>,
+       PixelSpanNULL,                   gpuPixelSpanFn<0x0D<<1>,         PixelSpanNULL,                   gpuPixelSpanFn<0x0F<<1>,
+
+       // Flat-shaded + PixelMSB (CF_MASKSET)
+       gpuPixelSpanFn<(0x00<<1)|0x100>, gpuPixelSpanFn<(0x01<<1)|0x100>, gpuPixelSpanFn<(0x02<<1)|0x100>, gpuPixelSpanFn<(0x03<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x100>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x100>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x100>,
+
+       // Gouraud-shaded (CF_GOURAUD)
+       gpuPixelSpanFn<(0x00<<1)|0x80>,  gpuPixelSpanFn<(0x01<<1)|0x80>,  gpuPixelSpanFn<(0x02<<1)|0x80>,  gpuPixelSpanFn<(0x03<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x80>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x80>,  PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x80>,
+
+       // Gouraud-shaded (CF_GOURAUD) + PixelMSB (CF_MASKSET)
+       gpuPixelSpanFn<(0x00<<1)|0x180>, gpuPixelSpanFn<(0x01<<1)|0x180>, gpuPixelSpanFn<(0x02<<1)|0x180>, gpuPixelSpanFn<(0x03<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x05<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x07<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x09<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0B<<1)|0x180>,
+       PixelSpanNULL,                   gpuPixelSpanFn<(0x0D<<1)|0x180>, PixelSpanNULL,                   gpuPixelSpanFn<(0x0F<<1)|0x180>
 };
 
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Tiles innerloops generator
 
-template<const int CF>
-INLINE void  gpuTileSpanFn(u16 *pDst, u32 count, u16 data)
+template<int CF>
+static void gpuTileSpanFn(u16 *pDst, u32 count, u16 data)   // Fills 'count' consecutive 16-bit pixels at pDst with color 'data'; count must be >= 1 because every loop is do/while(--count)
 {
-       if ((!M)&&(!B))
-       {
-               if (MB) { data = data | 0x8000; }
+       if (!CF_MASKCHECK && !CF_BLEND) {
+               if (CF_MASKSET) { data = data | 0x8000; }   // set bit 15 once, outside the fill loop
                do { *pDst++ = data; } while (--count);
-       }
-       else if ((M)&&(!B))
-       {
-               if (MB) { data = data | 0x8000; }
+       } else if (CF_MASKCHECK && !CF_BLEND) {
+               if (CF_MASKSET) { data = data | 0x8000; }   // set bit 15 once, outside the fill loop
                do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
-       }
-       else
+       } else
        {
-               u16 uSrc;
-               u16 uDst;
-               u32 uMsk; if (BM==0) uMsk=0x7BDE;
+               // Blend func can save an operation if it knows uSrc MSB is
+               //  unset. For untextured prims, this is always true.
+               const bool skip_uSrc_mask = true;
+
+               u16 uSrc, uDst;
                do
                {
-                       //  MASKING
-                       uDst = *pDst;
-                       if(M) { if (uDst&0x8000) goto endtile;  }
+                       if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }   // always taken here: this branch is only reached when CF_BLEND is set
+                       if (CF_MASKCHECK) { if (uDst&0x8000) goto endtile; }   // destination has bit 15 set: leave it untouched
+
                        uSrc = data;
 
-                       //  BLEND
-                       if (BM==0) gpuBlending00(uSrc, uDst);
-                       if (BM==1) gpuBlending01(uSrc, uDst);
-                       if (BM==2) gpuBlending02(uSrc, uDst);
-                       if (BM==3) gpuBlending03(uSrc, uDst);
+                       if (CF_BLEND)
+                               uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
 
-                       if (MB) { *pDst = uSrc | 0x8000; }
-                       else    { *pDst = uSrc; }
-                       endtile: pDst++;
+                       if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                       else            { *pDst = uSrc;          }
+
+                       //senquack - Did not apply "Silent Hill" mask-bit fix to here.
+                       // It is hard to tell from scarce documentation available and
+                       //  lack of comments in code, but I believe the tile-span
+                       //  functions here should not bother to preserve any source MSB,
+                       //  as they are not drawing from a texture.
+endtile:
+                       pDst++;   // advance whether or not the pixel was written
                }
                while (--count);
        }
 }
 
+static void TileNULL(u16 *pDst, u32 count, u16 data)   // Stand-in driver placed in the unused gpuTileSpanDrivers slots (via the TN macro); draws nothing
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"TileNULL()\n");   // only compiled when ENABLE_GPU_LOG_SUPPORT is defined
+       #endif
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //  Tiles innerloops driver
 typedef void (*PT)(u16 *pDst, u32 count, u16 data);
-const PT gpuTileSpanDrivers[64] = 
-{
-       gpuTileSpanFn<0x00>,NULL,gpuTileSpanFn<0x02>,NULL,  gpuTileSpanFn<0x04>,NULL,gpuTileSpanFn<0x06>,NULL,  NULL,NULL,gpuTileSpanFn<0x0A>,NULL,  NULL,NULL,gpuTileSpanFn<0x0E>,NULL,
-       NULL,NULL,gpuTileSpanFn<0x12>,NULL,  NULL,NULL,gpuTileSpanFn<0x16>,NULL,  NULL,NULL,gpuTileSpanFn<0x1A>,NULL,  NULL,NULL,gpuTileSpanFn<0x1E>,NULL,
 
-       gpuTileSpanFn<0x100>,NULL,gpuTileSpanFn<0x102>,NULL,  gpuTileSpanFn<0x104>,NULL,gpuTileSpanFn<0x106>,NULL,  NULL,NULL,gpuTileSpanFn<0x10A>,NULL,  NULL,NULL,gpuTileSpanFn<0x10E>,NULL,
-       NULL,NULL,gpuTileSpanFn<0x112>,NULL,  NULL,NULL,gpuTileSpanFn<0x116>,NULL,  NULL,NULL,gpuTileSpanFn<0x11A>,NULL,  NULL,NULL,gpuTileSpanFn<0x11E>,NULL,
+// Template instantiation helper macros
+#define TI(cf) gpuTileSpanFn<(cf)>
+#define TN     TileNULL
+#define TIBLOCK(ub) \
+       TI((ub)|0x00), TI((ub)|0x02), TI((ub)|0x04), TI((ub)|0x06), \
+       TN,            TI((ub)|0x0a), TN,            TI((ub)|0x0e), \
+       TN,            TI((ub)|0x12), TN,            TI((ub)|0x16), \
+       TN,            TI((ub)|0x1a), TN,            TI((ub)|0x1e)
+
+const PT gpuTileSpanDrivers[32] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8)
 };
 
+#undef TI
+#undef TN
+#undef TIBLOCK
+
+
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Sprites innerloops generator
 
-template<const int CF>
-INLINE void  gpuSpriteSpanFn(u16 *pDst, u32 count, u32 u0, const u32 mask)
+template<int CF>
+static void gpuSpriteSpanFn(u16 *pDst, u32 count, u8* pTxt, u32 u0)   // Draws 'count' textured pixels at pDst, fetching texels from pTxt starting at coordinate u0; count must be >= 1 (loop is do/while(--count))
 {
-       u16 uSrc;
-       u16 uDst;
-       const u16* pTxt = TBA+(u0&~0x1ff); u0=u0&0x1ff;
-       const u16 *_CBA; if(TM!=3) _CBA=CBA;
-       u32 lCol; if(L)  { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21));  }
-       u8 rgb; if (TM==1) rgb = ((u8*)pTxt)[u0>>1];
-       u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+       // Blend func can save an operation if it knows uSrc MSB is unset.
+       //  Untextured prims can always skip (source color always comes with MSB=0).
+       //  For textured prims, lighting funcs always return it unset. (bonus!)
+       const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT;
+
+       u16 uSrc, uDst, srcMSB;
+       u32 u0_mask = gpu_unai.TextureWindow[2];   // mask ANDed with u0 on every texel fetch
+
+       u8 r5, g5, b5;
+       if (CF_LIGHT) {
+               r5 = gpu_unai.r5;   // lighting color passed to gpuLightingTXT() for each texel
+               g5 = gpu_unai.g5;
+               b5 = gpu_unai.b5;
+       }
+
+       if (CF_TEXTMODE==3) {
+               // Texture is accessed byte-wise, so adjust mask if 16bpp
+               u0_mask <<= 1;
+       }
+
+       const u16 *CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;   // color lookup table, only used by the 4bpp/8bpp modes
 
        do
        {
-               //  MASKING
-               if(M)   { uDst = *pDst;   if (uDst&0x8000) { u0=(u0+1)&mask; goto endsprite; }  }
+               if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+               if (CF_MASKCHECK) if (uDst&0x8000) { goto endsprite; }   // destination has bit 15 set: leave it untouched
 
-               //  TEXTURE MAPPING
-               if (TM==1) { if (!(u0&1)) rgb = ((u8*)pTxt)[u0>>1]; uSrc = _CBA[(rgb>>((u0&1)<<2))&0xf]; u0=(u0+1)&mask; }
-               if (TM==2) { uSrc = _CBA[((u8*)pTxt)[u0]]; u0=(u0+1)&mask; }
-               if (TM==3) { uSrc = pTxt[u0]; u0=(u0+1)&mask; }
-               if(!AH) { if (!uSrc) goto endsprite; }
-
-               //  BLEND
-               if(B)
-               {
-                       if(uSrc&0x8000)
-                       {
-                               //  LIGHTING CALCULATIONS
-                               if(L)  { gpuLightingTXT(uSrc, lCol);   }
-
-                               if(!M)    { uDst = *pDst; }
-                               if (BM==0) gpuBlending00(uSrc, uDst);
-                               if (BM==1) gpuBlending01(uSrc, uDst);
-                               if (BM==2) gpuBlending02(uSrc, uDst);
-                               if (BM==3) gpuBlending03(uSrc, uDst);
-                       }
-                       else
-                       {
-                               //  LIGHTING CALCULATIONS
-                               if(L)  { gpuLightingTXT(uSrc, lCol); }
-                       }
+               if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+                       u8 rgb = pTxt[(u0 & u0_mask)>>1];   // two texels are packed in each byte
+                       uSrc = CBA_[(rgb>>((u0&1)<<2))&0xf];   // even u0 selects the low nibble, odd u0 the high nibble, as the table index
                }
-               else
-               {
-                       //  LIGHTING CALCULATIONS
-                       if(L)  { gpuLightingTXT(uSrc, lCol);   } else
-                       { if(!MB) uSrc&= 0x7fff;               }
+               if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+                       uSrc = CBA_[pTxt[u0 & u0_mask]];
+               }
+               if (CF_TEXTMODE==3) {  // 16bpp
+                       uSrc = *(u16*)(&pTxt[u0 & u0_mask]);   // u0 is a byte offset in this mode
                }
 
-               if (MB) { *pDst = uSrc | 0x8000; }
-               else    { *pDst = uSrc; }
+               if (!uSrc) goto endsprite;   // a texel value of 0 is not drawn
+
+               //senquack - save source MSB, as blending or lighting macros will not
+               //           (Silent Hill gray rectangles mask bit bug)
+               if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
                
-               endsprite: pDst++;
+               if (CF_LIGHT)
+                       uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+
+               if (CF_BLEND && srcMSB)   // only texels whose bit 15 was set are blended
+                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+               if (CF_MASKSET)                { *pDst = uSrc | 0x8000; }
+               else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; }   // restore the texel's saved bit 15
+               else                           { *pDst = uSrc;          }
+
+endsprite:
+               u0 += (CF_TEXTMODE==3) ? 2 : 1;   // 16bpp steps two bytes per texel; u0 advances even for skipped pixels
+               pDst++;
        }
        while (--count);
 }
+
+static void SpriteNULL(u16 *pDst, u32 count, u8* pTxt, u32 u0)   // Stand-in driver placed in the unused sprite driver table slots (via the TN macro); draws nothing
+{
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"SpriteNULL()\n");   // only compiled when ENABLE_GPU_LOG_SUPPORT is defined
+       #endif
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 ///////////////////////////////////////////////////////////////////////////////
 //  Sprite innerloops driver
-typedef void (*PS)(u16 *pDst, u32 count, u32 u0, const u32 mask);
-const PS gpuSpriteSpanDrivers[512] = 
-{
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       gpuSpriteSpanFn<0x20>,gpuSpriteSpanFn<0x21>,gpuSpriteSpanFn<0x22>,gpuSpriteSpanFn<0x23>,  gpuSpriteSpanFn<0x24>,gpuSpriteSpanFn<0x25>,gpuSpriteSpanFn<0x26>,gpuSpriteSpanFn<0x27>,  NULL,NULL,gpuSpriteSpanFn<0x2A>,gpuSpriteSpanFn<0x2B>,  NULL,NULL,gpuSpriteSpanFn<0x2E>,gpuSpriteSpanFn<0x2F>,
-       NULL,NULL,gpuSpriteSpanFn<0x32>,gpuSpriteSpanFn<0x33>,  NULL,NULL,gpuSpriteSpanFn<0x36>,gpuSpriteSpanFn<0x37>,  NULL,NULL,gpuSpriteSpanFn<0x3A>,gpuSpriteSpanFn<0x3B>,  NULL,NULL,gpuSpriteSpanFn<0x3E>,gpuSpriteSpanFn<0x3F>,
-       gpuSpriteSpanFn<0x40>,gpuSpriteSpanFn<0x41>,gpuSpriteSpanFn<0x42>,gpuSpriteSpanFn<0x43>,  gpuSpriteSpanFn<0x44>,gpuSpriteSpanFn<0x45>,gpuSpriteSpanFn<0x46>,gpuSpriteSpanFn<0x47>,  NULL,NULL,gpuSpriteSpanFn<0x4A>,gpuSpriteSpanFn<0x4B>,  NULL,NULL,gpuSpriteSpanFn<0x4E>,gpuSpriteSpanFn<0x4F>,
-       NULL,NULL,gpuSpriteSpanFn<0x52>,gpuSpriteSpanFn<0x53>,  NULL,NULL,gpuSpriteSpanFn<0x56>,gpuSpriteSpanFn<0x57>,  NULL,NULL,gpuSpriteSpanFn<0x5A>,gpuSpriteSpanFn<0x5B>,  NULL,NULL,gpuSpriteSpanFn<0x5E>,gpuSpriteSpanFn<0x5F>,
-       gpuSpriteSpanFn<0x60>,gpuSpriteSpanFn<0x61>,gpuSpriteSpanFn<0x62>,gpuSpriteSpanFn<0x63>,  gpuSpriteSpanFn<0x64>,gpuSpriteSpanFn<0x65>,gpuSpriteSpanFn<0x66>,gpuSpriteSpanFn<0x67>,  NULL,NULL,gpuSpriteSpanFn<0x6A>,gpuSpriteSpanFn<0x6B>,  NULL,NULL,gpuSpriteSpanFn<0x6E>,gpuSpriteSpanFn<0x6F>,
-       NULL,NULL,gpuSpriteSpanFn<0x72>,gpuSpriteSpanFn<0x73>,  NULL,NULL,gpuSpriteSpanFn<0x76>,gpuSpriteSpanFn<0x77>,  NULL,NULL,gpuSpriteSpanFn<0x7A>,gpuSpriteSpanFn<0x7B>,  NULL,NULL,gpuSpriteSpanFn<0x7E>,gpuSpriteSpanFn<0x7F>,
-
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       gpuSpriteSpanFn<0xa0>,gpuSpriteSpanFn<0xa1>,gpuSpriteSpanFn<0xa2>,gpuSpriteSpanFn<0xa3>,  gpuSpriteSpanFn<0xa4>,gpuSpriteSpanFn<0xa5>,gpuSpriteSpanFn<0xa6>,gpuSpriteSpanFn<0xa7>,  NULL,NULL,gpuSpriteSpanFn<0xaA>,gpuSpriteSpanFn<0xaB>,  NULL,NULL,gpuSpriteSpanFn<0xaE>,gpuSpriteSpanFn<0xaF>,
-       NULL,NULL,gpuSpriteSpanFn<0xb2>,gpuSpriteSpanFn<0xb3>,  NULL,NULL,gpuSpriteSpanFn<0xb6>,gpuSpriteSpanFn<0xb7>,  NULL,NULL,gpuSpriteSpanFn<0xbA>,gpuSpriteSpanFn<0xbB>,  NULL,NULL,gpuSpriteSpanFn<0xbE>,gpuSpriteSpanFn<0xbF>,
-       gpuSpriteSpanFn<0xc0>,gpuSpriteSpanFn<0xc1>,gpuSpriteSpanFn<0xc2>,gpuSpriteSpanFn<0xc3>,  gpuSpriteSpanFn<0xc4>,gpuSpriteSpanFn<0xc5>,gpuSpriteSpanFn<0xc6>,gpuSpriteSpanFn<0xc7>,  NULL,NULL,gpuSpriteSpanFn<0xcA>,gpuSpriteSpanFn<0xcB>,  NULL,NULL,gpuSpriteSpanFn<0xcE>,gpuSpriteSpanFn<0xcF>,
-       NULL,NULL,gpuSpriteSpanFn<0xd2>,gpuSpriteSpanFn<0xd3>,  NULL,NULL,gpuSpriteSpanFn<0xd6>,gpuSpriteSpanFn<0xd7>,  NULL,NULL,gpuSpriteSpanFn<0xdA>,gpuSpriteSpanFn<0xdB>,  NULL,NULL,gpuSpriteSpanFn<0xdE>,gpuSpriteSpanFn<0xdF>,
-       gpuSpriteSpanFn<0xe0>,gpuSpriteSpanFn<0xe1>,gpuSpriteSpanFn<0xe2>,gpuSpriteSpanFn<0xe3>,  gpuSpriteSpanFn<0xe4>,gpuSpriteSpanFn<0xe5>,gpuSpriteSpanFn<0xe6>,gpuSpriteSpanFn<0xe7>,  NULL,NULL,gpuSpriteSpanFn<0xeA>,gpuSpriteSpanFn<0xeB>,  NULL,NULL,gpuSpriteSpanFn<0xeE>,gpuSpriteSpanFn<0xeF>,
-       NULL,NULL,gpuSpriteSpanFn<0xf2>,gpuSpriteSpanFn<0xf3>,  NULL,NULL,gpuSpriteSpanFn<0xf6>,gpuSpriteSpanFn<0xf7>,  NULL,NULL,gpuSpriteSpanFn<0xfA>,gpuSpriteSpanFn<0xfB>,  NULL,NULL,gpuSpriteSpanFn<0xfE>,gpuSpriteSpanFn<0xfF>,
-
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       gpuSpriteSpanFn<0x120>,gpuSpriteSpanFn<0x121>,gpuSpriteSpanFn<0x122>,gpuSpriteSpanFn<0x123>,  gpuSpriteSpanFn<0x124>,gpuSpriteSpanFn<0x125>,gpuSpriteSpanFn<0x126>,gpuSpriteSpanFn<0x127>,  NULL,NULL,gpuSpriteSpanFn<0x12A>,gpuSpriteSpanFn<0x12B>,  NULL,NULL,gpuSpriteSpanFn<0x12E>,gpuSpriteSpanFn<0x12F>,
-       NULL,NULL,gpuSpriteSpanFn<0x132>,gpuSpriteSpanFn<0x133>,  NULL,NULL,gpuSpriteSpanFn<0x136>,gpuSpriteSpanFn<0x137>,  NULL,NULL,gpuSpriteSpanFn<0x13A>,gpuSpriteSpanFn<0x13B>,  NULL,NULL,gpuSpriteSpanFn<0x13E>,gpuSpriteSpanFn<0x13F>,
-       gpuSpriteSpanFn<0x140>,gpuSpriteSpanFn<0x141>,gpuSpriteSpanFn<0x142>,gpuSpriteSpanFn<0x143>,  gpuSpriteSpanFn<0x144>,gpuSpriteSpanFn<0x145>,gpuSpriteSpanFn<0x146>,gpuSpriteSpanFn<0x147>,  NULL,NULL,gpuSpriteSpanFn<0x14A>,gpuSpriteSpanFn<0x14B>,  NULL,NULL,gpuSpriteSpanFn<0x14E>,gpuSpriteSpanFn<0x14F>,
-       NULL,NULL,gpuSpriteSpanFn<0x152>,gpuSpriteSpanFn<0x153>,  NULL,NULL,gpuSpriteSpanFn<0x156>,gpuSpriteSpanFn<0x157>,  NULL,NULL,gpuSpriteSpanFn<0x15A>,gpuSpriteSpanFn<0x15B>,  NULL,NULL,gpuSpriteSpanFn<0x15E>,gpuSpriteSpanFn<0x15F>,
-       gpuSpriteSpanFn<0x160>,gpuSpriteSpanFn<0x161>,gpuSpriteSpanFn<0x162>,gpuSpriteSpanFn<0x163>,  gpuSpriteSpanFn<0x164>,gpuSpriteSpanFn<0x165>,gpuSpriteSpanFn<0x166>,gpuSpriteSpanFn<0x167>,  NULL,NULL,gpuSpriteSpanFn<0x16A>,gpuSpriteSpanFn<0x16B>,  NULL,NULL,gpuSpriteSpanFn<0x16E>,gpuSpriteSpanFn<0x16F>,
-       NULL,NULL,gpuSpriteSpanFn<0x172>,gpuSpriteSpanFn<0x173>,  NULL,NULL,gpuSpriteSpanFn<0x176>,gpuSpriteSpanFn<0x177>,  NULL,NULL,gpuSpriteSpanFn<0x17A>,gpuSpriteSpanFn<0x17B>,  NULL,NULL,gpuSpriteSpanFn<0x17E>,gpuSpriteSpanFn<0x17F>,
-                                                                                                                                                                                                                                                                                                                                                                                      
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
-       gpuSpriteSpanFn<0x1a0>,gpuSpriteSpanFn<0x1a1>,gpuSpriteSpanFn<0x1a2>,gpuSpriteSpanFn<0x1a3>,  gpuSpriteSpanFn<0x1a4>,gpuSpriteSpanFn<0x1a5>,gpuSpriteSpanFn<0x1a6>,gpuSpriteSpanFn<0x1a7>,  NULL,NULL,gpuSpriteSpanFn<0x1aA>,gpuSpriteSpanFn<0x1aB>,  NULL,NULL,gpuSpriteSpanFn<0x1aE>,gpuSpriteSpanFn<0x1aF>,
-       NULL,NULL,gpuSpriteSpanFn<0x1b2>,gpuSpriteSpanFn<0x1b3>,  NULL,NULL,gpuSpriteSpanFn<0x1b6>,gpuSpriteSpanFn<0x1b7>,  NULL,NULL,gpuSpriteSpanFn<0x1bA>,gpuSpriteSpanFn<0x1bB>,  NULL,NULL,gpuSpriteSpanFn<0x1bE>,gpuSpriteSpanFn<0x1bF>,
-       gpuSpriteSpanFn<0x1c0>,gpuSpriteSpanFn<0x1c1>,gpuSpriteSpanFn<0x1c2>,gpuSpriteSpanFn<0x1c3>,  gpuSpriteSpanFn<0x1c4>,gpuSpriteSpanFn<0x1c5>,gpuSpriteSpanFn<0x1c6>,gpuSpriteSpanFn<0x1c7>,  NULL,NULL,gpuSpriteSpanFn<0x1cA>,gpuSpriteSpanFn<0x1cB>,  NULL,NULL,gpuSpriteSpanFn<0x1cE>,gpuSpriteSpanFn<0x1cF>,
-       NULL,NULL,gpuSpriteSpanFn<0x1d2>,gpuSpriteSpanFn<0x1d3>,  NULL,NULL,gpuSpriteSpanFn<0x1d6>,gpuSpriteSpanFn<0x1d7>,  NULL,NULL,gpuSpriteSpanFn<0x1dA>,gpuSpriteSpanFn<0x1dB>,  NULL,NULL,gpuSpriteSpanFn<0x1dE>,gpuSpriteSpanFn<0x1dF>,
-       gpuSpriteSpanFn<0x1e0>,gpuSpriteSpanFn<0x1e1>,gpuSpriteSpanFn<0x1e2>,gpuSpriteSpanFn<0x1e3>,  gpuSpriteSpanFn<0x1e4>,gpuSpriteSpanFn<0x1e5>,gpuSpriteSpanFn<0x1e6>,gpuSpriteSpanFn<0x1e7>,  NULL,NULL,gpuSpriteSpanFn<0x1eA>,gpuSpriteSpanFn<0x1eB>,  NULL,NULL,gpuSpriteSpanFn<0x1eE>,gpuSpriteSpanFn<0x1eF>,
-       NULL,NULL,gpuSpriteSpanFn<0x1f2>,gpuSpriteSpanFn<0x1f3>,  NULL,NULL,gpuSpriteSpanFn<0x1f6>,gpuSpriteSpanFn<0x1f7>,  NULL,NULL,gpuSpriteSpanFn<0x1fA>,gpuSpriteSpanFn<0x1fB>,  NULL,NULL,gpuSpriteSpanFn<0x1fE>,gpuSpriteSpanFn<0x1fF>
+typedef void (*PS)(u16 *pDst, u32 count, u8* pTxt, u32 u0);  // element type of the sprite span driver table below
+
+// Template instantiation helper macros: TI(cf) names the gpuSpriteSpanFn instantiation for config value cf, TN is the SpriteNULL placeholder, and TIBLOCK(ub) expands to 128 table entries (16 rows of 8) with ub OR'd into every config value.
+#define TI(cf) gpuSpriteSpanFn<(cf)>
+#define TN     SpriteNULL
+#define TIBLOCK(ub) \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TN,            TN,            TN,            TN,            TN,            TN,            TN,            TN,            \
+       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f)
+
+const PS gpuSpriteSpanDrivers[256] = {  // two 128-entry TIBLOCKs: config bit 8 clear, then set
+       TIBLOCK(0<<8), TIBLOCK(1<<8)
 };
 
+#undef TI
+#undef TN
+#undef TIBLOCK
+
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU Polygon innerloops generator
-template<const int CF>
-INLINE void  gpuPolySpanFn(u16 *pDst, u32 count)
+
+//senquack - Newer version with following changes:
+//           * Adapted to work with new poly routines in gpu_raster_polygon.h
+//             adapted from DrHell GPU. They are less glitchy and use 22.10
+//             fixed-point instead of original UNAI's 16.16.
+//           * Texture coordinates are no longer packed together into one
+//             unsigned int. This seems to lose too much accuracy (they each
+//             end up being only 8.7 fixed-point that way) and pixel-dropouts
+//             were noticeable both with original code and current DrHell
+//             adaptations. An example would be the sky in NFS3. Now, they are
+//             stored in separate ints, using separate masks.
+//           * Function is no longer INLINE, as it was always called
+//             through a function pointer.
+//           * Function now ensures the mask bit of source texture is preserved
+//             across calls to blending functions (Silent Hill rectangles fix)
+//           * November 2016: Large refactoring of blending/lighting when
+//             JohnnyF added dithering. See gpu_inner_quantization.h and
+//             relevant blend/light headers.
+// (see README_senquack.txt)
+template<int CF>
+static void gpuPolySpanFn(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count)  // draws one span of 'count' pixels starting at pDst; the CF_* tests below presumably decode CF (macros defined elsewhere - confirm)
 {
-       if (!TM)
-       {       
-               // NO TEXTURE
-               if (!G)
+       // Blend func can save an operation if it knows uSrc MSB is unset.
+       //  Untextured prims can always skip this (src color MSB is always 0).
+       //  For textured prims, lighting funcs always return it unset. (bonus!)
+       const bool skip_uSrc_mask = (!CF_TEXTMODE) || CF_LIGHT;
+
+       u32 bMsk; if (CF_BLITMASK) bMsk = gpu_unai.blit_mask;  // only loaded (and only read) when CF_BLITMASK is set
+
+       if (!CF_TEXTMODE)
+       {
+               if (!CF_GOURAUD)
                 {
-                       // NO GOURAUD
-                       u16 data;
-                       if (L) { u32 lCol=((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); gpuLightingRGB(data,lCol); }
-                       else data=PixelData;
-                       if ((!M)&&(!B))
-                       {
-                               if (MB) { data = data | 0x8000; }
-                               do { *pDst++ = data; } while (--count);
-                       }
-                       else if ((M)&&(!B))
-                       {
-                               if (MB) { data = data | 0x8000; }
-                               do { if (!(*pDst&0x8000)) { *pDst = data; } pDst++; } while (--count);
-                       }
-                       else
-                       {
-                               u16 uSrc;
-                               u16 uDst;
-                               u32 uMsk; if (BM==0) uMsk=0x7BDE;
-                               do
-                               {
-                                       //  masking
-                                       uDst = *pDst;
-                                       if(M) { if (uDst&0x8000) goto endtile;  }
-                                       uSrc = data;
-                                       //  blend
-                                       if (BM==0) gpuBlending00(uSrc, uDst);
-                                       if (BM==1) gpuBlending01(uSrc, uDst);
-                                       if (BM==2) gpuBlending02(uSrc, uDst);
-                                       if (BM==3) gpuBlending03(uSrc, uDst);
-                                       if (MB) { *pDst = uSrc | 0x8000; }
-                                       else    { *pDst = uSrc; }
-                                       endtile: pDst++;
-                               }
-                               while (--count);
-                       }
+                       // UNTEXTURED, NO GOURAUD
+                       const u16 pix15 = gpu_unai.PixelData;
+                       do {
+                               u16 uSrc, uDst;
+
+                               // NOTE: Don't enable CF_BLITMASK pixel skipping (speed hack)
+                               //  on untextured polys. It seems to do more harm than good: see
+                               //  gravestone text at end of Medieval intro sequence. -senquack
+                               //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) { goto endpolynotextnogou; } }
+
+                               if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+                               if (CF_MASKCHECK) { if (uDst&0x8000) { goto endpolynotextnogou; } }  // destination bit 15 set: leave this pixel untouched
+
+                               uSrc = pix15;
+
+                               if (CF_BLEND)
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
+
+                               if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                               else            { *pDst = uSrc;          }
+
+endpolynotextnogou:
+                               pDst++;
+                       } while(--count);  // do/while form: count must be >= 1 on entry
                 }
                 else
                 {
-                       // GOURAUD
-                       u16 uDst;
-                       u16 uSrc;
-                       u32 linc=lInc;
-                       u32 lCol=((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));
-                       u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
-                       do
-                       {
-                               //  masking
-                               if(M) { uDst = *pDst;  if (uDst&0x8000) goto endgou;  }
-                               //  blend
-                               if(B)
-                               {
-                                       //  light
-                                       gpuLightingRGB(uSrc,lCol);
-                                       if(!M)    { uDst = *pDst; }
-                                       if (BM==0) gpuBlending00(uSrc, uDst);
-                                       if (BM==1) gpuBlending01(uSrc, uDst);
-                                       if (BM==2) gpuBlending02(uSrc, uDst);
-                                       if (BM==3) gpuBlending03(uSrc, uDst);
-                               }
-                               else
-                               {
-                                       //  light
-                                       gpuLightingRGB(uSrc,lCol);
+                       // UNTEXTURED, GOURAUD
+                       u32 l_gCol = gpu_unai.gCol;
+                       u32 l_gInc = gpu_unai.gInc;
+
+                       do {
+                               u16 uDst, uSrc;
+
+                               // See note in above loop regarding CF_BLITMASK
+                               //if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolynotextgou; }
+
+                               if (CF_BLEND || CF_MASKCHECK) uDst = *pDst;
+                               if (CF_MASKCHECK) { if (uDst&0x8000) goto endpolynotextgou; }
+
+                               if (CF_DITHER) {
+                                       // GOURAUD, DITHER
+
+                                       u32 uSrc24 = gpuLightingRGB24(l_gCol);
+                                       if (CF_BLEND)
+                                               uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+                                       uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+                               } else {
+                                       // GOURAUD, NO DITHER
+
+                                       uSrc = gpuLightingRGB(l_gCol);
+
+                                       if (CF_BLEND)
+                                               uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
                                 }
-                               if (MB) { *pDst = uSrc | 0x8000; }
-                               else    { *pDst = uSrc; }
-                               endgou: pDst++; lCol=(lCol+linc);
+
+                               if (CF_MASKSET) { *pDst = uSrc | 0x8000; }
+                               else            { *pDst = uSrc;          }
+
+endpolynotextgou:
+                               pDst++;
+                               l_gCol += l_gInc;  // color interpolator advances even for skipped pixels
                         }
                         while (--count);
                 }
         }
         else
         {
-               // TEXTURE
-               u16 uDst;
-               u16 uSrc;
-               u32 linc; if (L&&G) linc=lInc;
-               u32 tinc=tInc;
-               u32 tmsk=tMsk;
-               u32 tCor = ((u32)( u4<<7)&0x7fff0000) | ((u32)( v4>>9)&0x00007fff); tCor&= tmsk;
-               const u16* _TBA=TBA;
-               const u16* _CBA; if (TM!=3) _CBA=CBA;
-               u32 lCol;
-               if(L && !G) { lCol = ((u32)(b4<< 2)&(0x03ff)) | ((u32)(g4<<13)&(0x07ff<<10)) | ((u32)(r4<<24)&(0x07ff<<21)); }
-               else if(L && G) { lCol = ((u32)(b4>>14)&(0x03ff)) | ((u32)(g4>>3)&(0x07ff<<10)) | ((u32)(r4<<8)&(0x07ff<<21));  }
-               u32 uMsk; if ((B)&&(BM==0)) uMsk=0x7BDE;
+               // TEXTURED
+
+               u16 uDst, uSrc, srcMSB;
+
+               //senquack - note: original UNAI code had gpu_unai.{u4/v4} packed into
+               // one 32-bit unsigned int, but this proved to lose too much accuracy
+               // (pixel dropouts noticeable in NFS3 sky), so now are separate vars.
+               u32 l_u_msk = gpu_unai.u_msk;     u32 l_v_msk = gpu_unai.v_msk;
+               u32 l_u = gpu_unai.u & l_u_msk;   u32 l_v = gpu_unai.v & l_v_msk;
+               s32 l_u_inc = gpu_unai.u_inc;     s32 l_v_inc = gpu_unai.v_inc;
+
+               const u16* TBA_ = gpu_unai.TBA;
+               const u16* CBA_; if (CF_TEXTMODE!=3) CBA_ = gpu_unai.CBA;  // CBA_ is only read by the two CLUT modes (1 and 2)
+
+               u8 r5, g5, b5;
+               u8 r8, g8, b8;
+
+               u32 l_gInc, l_gCol;
+
+               if (CF_LIGHT) {
+                       if (CF_GOURAUD) {
+                               l_gInc = gpu_unai.gInc;
+                               l_gCol = gpu_unai.gCol;
+                       } else {
+                               if (CF_DITHER) {
+                                       r8 = gpu_unai.r8;
+                                       g8 = gpu_unai.g8;
+                                       b8 = gpu_unai.b8;
+                               } else {
+                                       r5 = gpu_unai.r5;
+                                       g5 = gpu_unai.g5;
+                                       b5 = gpu_unai.b5;
+                               }
+                       }
+               }
+
                 do
                 {
-                       //  masking
-                       if(M) { uDst = *pDst;  if (uDst&0x8000) goto endpoly;  }
-                       //  texture
-                       if (TM==1) { u32 tu=(tCor>>23); u32 tv=(tCor<<4)&(0xff<<11); u8 rgb=((u8*)_TBA)[tv+(tu>>1)]; uSrc=_CBA[(rgb>>((tu&1)<<2))&0xf]; if(!uSrc) goto endpoly; }
-                       if (TM==2) { uSrc = _CBA[(((u8*)_TBA)[(tCor>>23)+((tCor<<4)&(0xff<<11))])]; if(!uSrc)  goto endpoly; }
-                       if (TM==3) { uSrc = _TBA[(tCor>>23)+((tCor<<3)&(0xff<<10))]; if(!uSrc)  goto endpoly; }
-                       //  blend
-                       if(B)
-                       {
-                               if (uSrc&0x8000)
-                               {
-                                       //  light
-                                       if(L) gpuLightingTXT(uSrc, lCol);
-                                       if(!M)    { uDst = *pDst; }
-                                       if (BM==0) gpuBlending00(uSrc, uDst);
-                                       if (BM==1) gpuBlending01(uSrc, uDst);
-                                       if (BM==2) gpuBlending02(uSrc, uDst);
-                                       if (BM==3) gpuBlending03(uSrc, uDst);
-                               }
-                               else
-                               {
-                                       // light
-                                       if(L) gpuLightingTXT(uSrc, lCol);
-                               }
+                       if (CF_BLITMASK) { if ((bMsk>>((((uintptr_t)pDst)>>1)&7))&1) goto endpolytext; }  // skip pixel when the bMsk bit selected by destination address bits 1..3 is set
+                       if (CF_MASKCHECK || CF_BLEND) { uDst = *pDst; }
+                       if (CF_MASKCHECK) if (uDst&0x8000) { goto endpolytext; }
+
+                       //senquack - adapted to work with new 22.10 fixed point routines:
+                       //           (UNAI originally used 16.16)
+                       if (CF_TEXTMODE==1) {  //  4bpp (CLUT)
+                               u32 tu=(l_u>>10);
+                               u32 tv=(l_v<<1)&(0xff<<11);
+                               u8 rgb=((u8*)TBA_)[tv+(tu>>1)];
+                               uSrc=CBA_[(rgb>>((tu&1)<<2))&0xf];  // odd tu selects the high nibble, even tu the low nibble
+                               if (!uSrc) goto endpolytext;  // texel value 0: nothing is written
+                       }
+                       if (CF_TEXTMODE==2) {  //  8bpp (CLUT)
+                               uSrc = CBA_[(((u8*)TBA_)[(l_u>>10)+((l_v<<1)&(0xff<<11))])];
+                               if (!uSrc) goto endpolytext;
                         }
-                       else
+                       if (CF_TEXTMODE==3) {  // 16bpp
+                               uSrc = TBA_[(l_u>>10)+((l_v)&(0xff<<10))];
+                               if (!uSrc) goto endpolytext;
+                       }
+
+                       // Save source MSB, as blending or lighting will not (Silent Hill)
+                       if (CF_BLEND || CF_LIGHT) srcMSB = uSrc & 0x8000;
+
+                       // When textured, only dither when LIGHT (texture blend) is enabled
+                       // LIGHT &&  BLEND => dither
+                       // LIGHT && !BLEND => dither
+                       //!LIGHT &&  BLEND => no dither
+                       //!LIGHT && !BLEND => no dither
+
+                       if (CF_DITHER && CF_LIGHT) {
+                               u32 uSrc24;
+                               if ( CF_GOURAUD)
+                                       uSrc24 = gpuLightingTXT24Gouraud(uSrc, l_gCol);
+                               if (!CF_GOURAUD)
+                                       uSrc24 = gpuLightingTXT24(uSrc, r8, g8, b8);
+
+                               if (CF_BLEND && srcMSB)
+                                       uSrc24 = gpuBlending24<CF_BLENDMODE>(uSrc24, uDst);
+
+                               uSrc = gpuColorQuantization24<CF_DITHER>(uSrc24, pDst);
+                       } else
                         {
-                               //  light
-                               if(L)  { gpuLightingTXT(uSrc, lCol); } else if(!MB) { uSrc&= 0x7fff; }
+                               if (CF_LIGHT) {
+                                       if ( CF_GOURAUD)
+                                               uSrc = gpuLightingTXTGouraud(uSrc, l_gCol);
+                                       if (!CF_GOURAUD)
+                                               uSrc = gpuLightingTXT(uSrc, r5, g5, b5);
+                               }
+
+                               if (CF_BLEND && srcMSB)  // only texels whose bit 15 is set get blended
+                                       uSrc = gpuBlending<CF_BLENDMODE, skip_uSrc_mask>(uSrc, uDst);
                         }
-                       if (MB) { *pDst = uSrc | 0x8000; }
-                       else    { *pDst = uSrc; }
-                       endpoly: pDst++;
-                       tCor=(tCor+tinc)&tmsk;
-                       if (L&&G) lCol=(lCol+linc);
+
+                       if (CF_MASKSET)                { *pDst = uSrc | 0x8000; }
+                       else if (CF_BLEND || CF_LIGHT) { *pDst = uSrc | srcMSB; }  // restore the texel's own bit 15 saved above
+                       else                           { *pDst = uSrc;          }
+endpolytext:
+                       pDst++;
+                       l_u = (l_u + l_u_inc) & l_u_msk;
+                       l_v = (l_v + l_v_inc) & l_v_msk;
+                       if (CF_LIGHT && CF_GOURAUD) l_gCol += l_gInc;
                 }
                 while (--count);
         }
 }
 
-// supposedly shouldn't be called?
-static void gpuPolySpanFn_NULL_(u16 *pDst, u32 count)
+static void PolyNULL(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count)  // placeholder span routine (used as TN in the driver table): draws nothing, only logs when ENABLE_GPU_LOG_SUPPORT is defined
 {
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"PolyNULL()\n");
+       #endif
 }
 
-///////////////////////////////////////////////////////////////////////////////
-
 ///////////////////////////////////////////////////////////////////////////////
 //  Polygon innerloops driver
-typedef void (*PP)(u16 *pDst, u32 count);
-const PP gpuPolySpanDrivers[512] =
-{
-       gpuPolySpanFn<0x00>,gpuPolySpanFn<0x01>,gpuPolySpanFn<0x02>,gpuPolySpanFn<0x03>,  gpuPolySpanFn<0x04>,gpuPolySpanFn<0x05>,gpuPolySpanFn<0x06>,gpuPolySpanFn<0x07>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x0A>,gpuPolySpanFn<0x0B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x0E>,gpuPolySpanFn<0x0F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12>,gpuPolySpanFn<0x13>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16>,gpuPolySpanFn<0x17>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1A>,gpuPolySpanFn<0x1B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1E>,gpuPolySpanFn<0x1F>,
-       gpuPolySpanFn<0x20>,gpuPolySpanFn<0x21>,gpuPolySpanFn<0x22>,gpuPolySpanFn<0x23>,  gpuPolySpanFn<0x24>,gpuPolySpanFn<0x25>,gpuPolySpanFn<0x26>,gpuPolySpanFn<0x27>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x2A>,gpuPolySpanFn<0x2B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x2E>,gpuPolySpanFn<0x2F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x32>,gpuPolySpanFn<0x33>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x36>,gpuPolySpanFn<0x37>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x3A>,gpuPolySpanFn<0x3B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x3E>,gpuPolySpanFn<0x3F>,
-       gpuPolySpanFn<0x40>,gpuPolySpanFn<0x41>,gpuPolySpanFn<0x42>,gpuPolySpanFn<0x43>,  gpuPolySpanFn<0x44>,gpuPolySpanFn<0x45>,gpuPolySpanFn<0x46>,gpuPolySpanFn<0x47>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x4A>,gpuPolySpanFn<0x4B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x4E>,gpuPolySpanFn<0x4F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x52>,gpuPolySpanFn<0x53>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x56>,gpuPolySpanFn<0x57>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x5A>,gpuPolySpanFn<0x5B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x5E>,gpuPolySpanFn<0x5F>,
-       gpuPolySpanFn<0x60>,gpuPolySpanFn<0x61>,gpuPolySpanFn<0x62>,gpuPolySpanFn<0x63>,  gpuPolySpanFn<0x64>,gpuPolySpanFn<0x65>,gpuPolySpanFn<0x66>,gpuPolySpanFn<0x67>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x6A>,gpuPolySpanFn<0x6B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x6E>,gpuPolySpanFn<0x6F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x72>,gpuPolySpanFn<0x73>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x76>,gpuPolySpanFn<0x77>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x7A>,gpuPolySpanFn<0x7B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x7E>,gpuPolySpanFn<0x7F>,
-
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0x81>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x83>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x85>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x87>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x8B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x8F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x93>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x97>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x9B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x9F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xa7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xaB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xaF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xb3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xb7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xbB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xbF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xc7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xcB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xcF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xd3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xd7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xdB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xdF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xe7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xeB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xeF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xf3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xf7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xfB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0xfF>,
-
-       gpuPolySpanFn<0x100>,gpuPolySpanFn<0x101>,gpuPolySpanFn<0x102>,gpuPolySpanFn<0x103>,  gpuPolySpanFn<0x104>,gpuPolySpanFn<0x105>,gpuPolySpanFn<0x106>,gpuPolySpanFn<0x107>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x10A>,gpuPolySpanFn<0x10B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x10E>,gpuPolySpanFn<0x10F>,
-       gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x112>,gpuPolySpanFn<0x113>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x116>,gpuPolySpanFn<0x117>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x11A>,gpuPolySpanFn<0x11B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x11E>,gpuPolySpanFn<0x11F>,
-       gpuPolySpanFn<0x120>,gpuPolySpanFn<0x121>,gpuPolySpanFn<0x122>,gpuPolySpanFn<0x123>,  gpuPolySpanFn<0x124>,gpuPolySpanFn<0x125>,gpuPolySpanFn<0x126>,gpuPolySpanFn<0x127>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12A>,gpuPolySpanFn<0x12B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x12E>,gpuPolySpanFn<0x12F>,
-       gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x132>,gpuPolySpanFn<0x133>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x136>,gpuPolySpanFn<0x137>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x13A>,gpuPolySpanFn<0x13B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x13E>,gpuPolySpanFn<0x13F>,
-       gpuPolySpanFn<0x140>,gpuPolySpanFn<0x141>,gpuPolySpanFn<0x142>,gpuPolySpanFn<0x143>,  gpuPolySpanFn<0x144>,gpuPolySpanFn<0x145>,gpuPolySpanFn<0x146>,gpuPolySpanFn<0x147>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x14A>,gpuPolySpanFn<0x14B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x14E>,gpuPolySpanFn<0x14F>,
-       gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x152>,gpuPolySpanFn<0x153>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x156>,gpuPolySpanFn<0x157>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x15A>,gpuPolySpanFn<0x15B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x15E>,gpuPolySpanFn<0x15F>,
-       gpuPolySpanFn<0x160>,gpuPolySpanFn<0x161>,gpuPolySpanFn<0x162>,gpuPolySpanFn<0x163>,  gpuPolySpanFn<0x164>,gpuPolySpanFn<0x165>,gpuPolySpanFn<0x166>,gpuPolySpanFn<0x167>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16A>,gpuPolySpanFn<0x16B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x16E>,gpuPolySpanFn<0x16F>,
-       gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x172>,gpuPolySpanFn<0x173>,  gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_, gpuPolySpanFn<0x176>,gpuPolySpanFn<0x177>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x17A>,gpuPolySpanFn<0x17B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x17E>,gpuPolySpanFn<0x17F>,
-                                                                                                                                                                                                                                                                                                                                                                                      
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0x181>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x183>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x185>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x187>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x18B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x18F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x193>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x197>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x19B>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x19F>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1a7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1aB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1aF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1b3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1b7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1bB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1bF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1c7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1cB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1cF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1d3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1d7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1dB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1dF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e1>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e5>,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1e7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1eB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1eF>,
-       gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1f3>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_, gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1f7>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1fB>,  gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn_NULL_,gpuPolySpanFn<0x1fF>
+typedef void (*PP)(const gpu_unai_t &gpu_unai, u16 *pDst, u32 count);  // polygon span routine signature, matching gpuPolySpanFn<CF> and PolyNULL
+
+// Template instantiation helper macros
+#define TI(cf) gpuPolySpanFn<(cf)>
+#define TN     PolyNULL
+#define TIBLOCK(ub) \
+       TI((ub)|0x00), TI((ub)|0x01), TI((ub)|0x02), TI((ub)|0x03), TI((ub)|0x04), TI((ub)|0x05), TI((ub)|0x06), TI((ub)|0x07), \
+       TN,            TN,            TI((ub)|0x0a), TI((ub)|0x0b), TN,            TN,            TI((ub)|0x0e), TI((ub)|0x0f), \
+       TN,            TN,            TI((ub)|0x12), TI((ub)|0x13), TN,            TN,            TI((ub)|0x16), TI((ub)|0x17), \
+       TN,            TN,            TI((ub)|0x1a), TI((ub)|0x1b), TN,            TN,            TI((ub)|0x1e), TI((ub)|0x1f), \
+       TI((ub)|0x20), TI((ub)|0x21), TI((ub)|0x22), TI((ub)|0x23), TI((ub)|0x24), TI((ub)|0x25), TI((ub)|0x26), TI((ub)|0x27), \
+       TN,            TN,            TI((ub)|0x2a), TI((ub)|0x2b), TN,            TN,            TI((ub)|0x2e), TI((ub)|0x2f), \
+       TN,            TN,            TI((ub)|0x32), TI((ub)|0x33), TN,            TN,            TI((ub)|0x36), TI((ub)|0x37), \
+       TN,            TN,            TI((ub)|0x3a), TI((ub)|0x3b), TN,            TN,            TI((ub)|0x3e), TI((ub)|0x3f), \
+       TI((ub)|0x40), TI((ub)|0x41), TI((ub)|0x42), TI((ub)|0x43), TI((ub)|0x44), TI((ub)|0x45), TI((ub)|0x46), TI((ub)|0x47), \
+       TN,            TN,            TI((ub)|0x4a), TI((ub)|0x4b), TN,            TN,            TI((ub)|0x4e), TI((ub)|0x4f), \
+       TN,            TN,            TI((ub)|0x52), TI((ub)|0x53), TN,            TN,            TI((ub)|0x56), TI((ub)|0x57), \
+       TN,            TN,            TI((ub)|0x5a), TI((ub)|0x5b), TN,            TN,            TI((ub)|0x5e), TI((ub)|0x5f), \
+       TI((ub)|0x60), TI((ub)|0x61), TI((ub)|0x62), TI((ub)|0x63), TI((ub)|0x64), TI((ub)|0x65), TI((ub)|0x66), TI((ub)|0x67), \
+       TN,            TN,            TI((ub)|0x6a), TI((ub)|0x6b), TN,            TN,            TI((ub)|0x6e), TI((ub)|0x6f), \
+       TN,            TN,            TI((ub)|0x72), TI((ub)|0x73), TN,            TN,            TI((ub)|0x76), TI((ub)|0x77), \
+       TN,            TN,            TI((ub)|0x7a), TI((ub)|0x7b), TN,            TN,            TI((ub)|0x7e), TI((ub)|0x7f), \
+       TN,            TI((ub)|0x81), TN,            TI((ub)|0x83), TN,            TI((ub)|0x85), TN,            TI((ub)|0x87), \
+       TN,            TN,            TN,            TI((ub)|0x8b), TN,            TN,            TN,            TI((ub)|0x8f), \
+       TN,            TN,            TN,            TI((ub)|0x93), TN,            TN,            TN,            TI((ub)|0x97), \
+       TN,            TN,            TN,            TI((ub)|0x9b), TN,            TN,            TN,            TI((ub)|0x9f), \
+       TN,            TI((ub)|0xa1), TN,            TI((ub)|0xa3), TN,            TI((ub)|0xa5), TN,            TI((ub)|0xa7), \
+       TN,            TN,            TN,            TI((ub)|0xab), TN,            TN,            TN,            TI((ub)|0xaf), \
+       TN,            TN,            TN,            TI((ub)|0xb3), TN,            TN,            TN,            TI((ub)|0xb7), \
+       TN,            TN,            TN,            TI((ub)|0xbb), TN,            TN,            TN,            TI((ub)|0xbf), \
+       TN,            TI((ub)|0xc1), TN,            TI((ub)|0xc3), TN,            TI((ub)|0xc5), TN,            TI((ub)|0xc7), \
+       TN,            TN,            TN,            TI((ub)|0xcb), TN,            TN,            TN,            TI((ub)|0xcf), \
+       TN,            TN,            TN,            TI((ub)|0xd3), TN,            TN,            TN,            TI((ub)|0xd7), \
+       TN,            TN,            TN,            TI((ub)|0xdb), TN,            TN,            TN,            TI((ub)|0xdf), \
+       TN,            TI((ub)|0xe1), TN,            TI((ub)|0xe3), TN,            TI((ub)|0xe5), TN,            TI((ub)|0xe7), \
+       TN,            TN,            TN,            TI((ub)|0xeb), TN,            TN,            TN,            TI((ub)|0xef), \
+       TN,            TN,            TN,            TI((ub)|0xf3), TN,            TN,            TN,            TI((ub)|0xf7), \
+       TN,            TN,            TN,            TI((ub)|0xfb), TN,            TN,            TN,            TI((ub)|0xff)
+
+const PP gpuPolySpanDrivers[2048] = {
+       TIBLOCK(0<<8), TIBLOCK(1<<8), TIBLOCK(2<<8), TIBLOCK(3<<8),
+       TIBLOCK(4<<8), TIBLOCK(5<<8), TIBLOCK(6<<8), TIBLOCK(7<<8)
 };
+
+#undef TI
+#undef TN
+#undef TIBLOCK
index ce439d3..93c268b 100644 (file)
 
 //  GPU Blending operations functions
 
-#ifdef __arm__
-#define gpuBlending00(uSrc,uDst) \
-{ \
-       asm ("and  %[src], %[src], %[msk]\n" \
-            "and  %[dst], %[dst], %[msk]\n" \
-            "add  %[src], %[dst], %[src]\n" \
-            "mov  %[src], %[src], lsr #1\n" \
-        : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \
-}
-#else
-#define gpuBlending00(uSrc,uDst) \
-{ \
-       uSrc = (((uDst & uMsk) + (uSrc & uMsk)) >> 1); \
-}
-#endif
+////////////////////////////////////////////////////////////////////////////////
+// Blend bgr555 color in 'uSrc' (foreground) with bgr555 color
+//  in 'uDst' (background), returning resulting color.
+//
+// INPUT:
+//  'uSrc','uDst' input: -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//           u16 output: 0bbbbbgggggrrrrr
+//                       ^ bit 15
+//
+// Where '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE, bool SKIP_USRC_MSB_MASK>
+GPU_INLINE u16 gpuBlending(u16 uSrc, u16 uDst)
+{
+       // These use Blargg's bitwise modulo-clamping:
+       //  http://blargg.8bitalley.com/info/rgb_mixing.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_add.html
+       //  http://blargg.8bitalley.com/info/rgb_clamped_sub.html
 
-//     1.0 x Back + 1.0 x Forward
-#ifdef __arm__
-#define gpuBlending01(uSrc,uDst) \
-{ \
-       u32 st,dt,out; \
-       asm ("and    %[dt],  %[dst],   #0x7C00\n" \
-            "and    %[st],  %[src],   #0x7C00\n" \
-            "add    %[out], %[dt],    %[st]  \n" \
-            "cmp    %[out], #0x7C00          \n" \
-            "movhi  %[out], #0x7C00          \n" \
-            "and    %[dt],  %[dst],   #0x03E0\n" \
-            "and    %[st],  %[src],   #0x03E0\n" \
-            "add    %[dt],  %[dt],    %[st]  \n" \
-            "cmp    %[dt],  #0x03E0          \n" \
-            "movhi  %[dt],  #0x03E0          \n" \
-            "orr    %[out], %[out],   %[dt]  \n" \
-            "and    %[dt],  %[dst],   #0x001F\n" \
-            "and    %[st],  %[src],   #0x001F\n" \
-            "add    %[dt],  %[dt],    %[st]  \n" \
-            "cmp    %[dt],  #0x001F          \n" \
-            "movhi  %[dt],  #0x001F          \n" \
-            "orr    %[src], %[out],  %[dt]  \n" \
-        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
+       u16 mix;
+
+       // 0.5 x Back + 0.5 x Forward
+       if (BLENDMODE==0) {
+#ifdef GPU_UNAI_USE_ACCURATE_BLENDING
+               // Slower, but more accurate (doesn't lose LSB data)
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               mix = ((uSrc + uDst) - ((uSrc ^ uDst) & 0x0421)) >> 1;
 #else
-#define gpuBlending01(uSrc,uDst) \
-{ \
-       u16 rr, gg, bb; \
-       bb = (uDst & 0x7C00) + (uSrc & 0x7C00);   if (bb > 0x7C00)  bb = 0x7C00; \
-       gg = (uDst & 0x03E0) + (uSrc & 0x03E0);   if (gg > 0x03E0)  gg = 0x03E0;  bb |= gg; \
-       rr = (uDst & 0x001F) + (uSrc & 0x001F);   if (rr > 0x001F)  rr = 0x001F;  bb |= rr; \
-       uSrc = bb; \
-}
+               mix = ((uDst & 0x7bde) + (uSrc & 0x7bde)) >> 1;
 #endif
+       }
+
+       // 1.0 x Back + 1.0 x Forward
+       if (BLENDMODE==1) {
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               u32 sum      = uSrc + uDst;
+               u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               u32 carries  = (sum - low_bits) & 0x8420;
+               u32 modulo   = sum - carries;
+               u32 clamp    = carries - (carries >> 5);
+               mix = modulo | clamp;
+       }
+
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               uDst &= 0x7fff;
+               if (!SKIP_USRC_MSB_MASK)
+                       uSrc &= 0x7fff;
+               u32 diff     = uDst - uSrc + 0x8420;
+               u32 low_bits = (uDst ^ uSrc) & 0x8420;
+               u32 borrows  = (diff - low_bits) & 0x8420;
+               u32 modulo   = diff - borrows;
+               u32 clamp    = borrows - (borrows >> 5);
+               mix = modulo & clamp;
+       }
 
-//     1.0 x Back - 1.0 x Forward      */
-#ifdef __arm__
-#define gpuBlending02(uSrc,uDst) \
-{ \
-       u32 st,dt,out; \
-       asm ("and    %[dt],  %[dst],   #0x7C00\n" \
-            "and    %[st],  %[src],   #0x7C00\n" \
-            "subs   %[out], %[dt],    %[st]  \n" \
-            "movmi  %[out], #0x0000          \n" \
-            "and    %[dt],  %[dst],   #0x03E0\n" \
-            "and    %[st],  %[src],   #0x03E0\n" \
-            "subs   %[dt],  %[dt],    %[st]  \n" \
-            "orrpl  %[out], %[out],   %[dt]  \n" \
-            "and    %[dt],  %[dst],   #0x001F\n" \
-            "and    %[st],  %[src],   #0x001F\n" \
-            "subs   %[dt],  %[dt],    %[st]  \n" \
-            "orrpl  %[out], %[out],   %[dt]  \n" \
-            "mov    %[src], %[out]           \n" \
-        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+       // 1.0 x Back + 0.25 x Forward
+       if (BLENDMODE==3) {
+               uDst &= 0x7fff;
+               uSrc = ((uSrc >> 2) & 0x1ce7);
+               u32 sum      = uSrc + uDst;
+               u32 low_bits = (uSrc ^ uDst) & 0x0421;
+               u32 carries  = (sum - low_bits) & 0x8420;
+               u32 modulo   = sum - carries;
+               u32 clamp    = carries - (carries >> 5);
+               mix = modulo | clamp;
+       }
+
+       return mix;
 }
 
-int btest(int s, int d)
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert bgr555 color in uSrc to padded u32 5.4:5.4:5.4 bgr fixed-pt
+//  color triplet suitable for use with HQ 24-bit quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuGetRGB24(u16 uSrc)
 {
-       gpuBlending02(s, d);
-       return s;
-}
-#else
-#define gpuBlending02(uSrc,uDst) \
-{ \
-       s32 rr, gg, bb; \
-       bb = (uDst & 0x7C00) - (uSrc & 0x7C00);   if (bb < 0)  bb  =  0; \
-       gg = (uDst & 0x03E0) - (uSrc & 0x03E0);   if (gg > 0)  bb |= gg; \
-       rr = (uDst & 0x001F) - (uSrc & 0x001F);   if (rr > 0)  bb |= rr; \
-       uSrc = bb; \
+       return ((uSrc & 0x7C00)<<14)
+            | ((uSrc & 0x03E0)<< 9)
+            | ((uSrc & 0x001F)<< 4);
 }
-#endif
 
-//     1.0 x Back + 0.25 x Forward     */
-#ifdef __arm__
-#define gpuBlending03(uSrc,uDst) \
-{ \
-       u32 st,dt,out; \
-       asm ("mov    %[src], %[src],   lsr #2 \n" \
-            "and    %[dt],  %[dst],   #0x7C00\n" \
-            "and    %[st],  %[src],   #0x1C00\n" \
-            "add    %[out], %[dt],    %[st]  \n" \
-            "cmp    %[out], #0x7C00          \n" \
-            "movhi  %[out], #0x7C00          \n" \
-            "and    %[dt],  %[dst],   #0x03E0\n" \
-            "and    %[st],  %[src],   #0x00E0\n" \
-            "add    %[dt],  %[dt],    %[st]  \n" \
-            "cmp    %[dt],  #0x03E0          \n" \
-            "movhi  %[dt],  #0x03E0          \n" \
-            "orr    %[out], %[out],   %[dt]  \n" \
-            "and    %[dt],  %[dst],   #0x001F\n" \
-            "and    %[st],  %[src],   #0x0007\n" \
-            "add    %[dt],  %[dt],    %[st]  \n" \
-            "cmp    %[dt],  #0x001F          \n" \
-            "movhi  %[dt],  #0x001F          \n" \
-            "orr    %[src], %[out],   %[dt]  \n" \
-        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
-        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
-}
-#else
-#define gpuBlending03(uSrc,uDst) \
-{ \
-       u16 rr, gg, bb; \
-       uSrc >>= 2; \
-       bb = (uDst & 0x7C00) + (uSrc & 0x1C00);   if (bb > 0x7C00)  bb = 0x7C00; \
-       gg = (uDst & 0x03E0) + (uSrc & 0x00E0);   if (gg > 0x03E0)  gg = 0x03E0;  bb |= gg; \
-       rr = (uDst & 0x001F) + (uSrc & 0x0007);   if (rr > 0x001F)  rr = 0x001F;  bb |= rr; \
-       uSrc = bb; \
+
+////////////////////////////////////////////////////////////////////////////////
+// Blend padded u32 5.4:5.4:5.4 bgr fixed-pt color triplet in 'uSrc24'
+//  (foreground color) with bgr555 color in 'uDst' (background color),
+//  returning the resulting u32 5.4:5.4:5.4 color.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'uDst' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+template <int BLENDMODE>
+GPU_INLINE u32 gpuBlending24(u32 uSrc24, u16 uDst)
+{
+       // These use methods adapted from Blargg's techniques mentioned in
+       //  gpuBlending() comments above. Not as much bitwise trickery is
+       //  necessary because of presence of 0 padding in uSrc24 format.
+
+       u32 uDst24 = gpuGetRGB24(uDst);
+       u32 mix;  // set by the branch matching BLENDMODE (0..3); left unset for any other value
+
+       // 0.5 x Back + 0.5 x Forward
+       if (BLENDMODE==0) {
+               const u32 uMsk = 0x1FE7F9FE;
+               // Only need to mask LSBs of uSrc24, uDst24's LSBs are 0 already
+               mix = (uDst24 + (uSrc24 & uMsk)) >> 1;
+       }
+
+       // 1.0 x Back + 1.0 x Forward
+       if (BLENDMODE==1) {
+               u32 sum     = uSrc24 + uDst24;
+               u32 carries = sum & 0x20080200;  // padding bits 29, 19, 9 hold each channel's carry-out
+               u32 modulo  = sum - carries;
+               u32 clamp   = carries - (carries >> 9);  // all-ones 9-bit field for every channel that overflowed
+               mix = modulo | clamp;
+       }
+
+       // 1.0 x Back - 1.0 x Forward
+       if (BLENDMODE==2) {
+               // Insert ones in 0-padded borrow slot of color to be subtracted from
+               uDst24 |= 0x20080200;
+               u32 diff    = uDst24 - uSrc24;
+               u32 borrows = diff & 0x20080200;  // a slot bit still set means that channel did not borrow
+               u32 clamp   = borrows - (borrows >> 9);  // all-ones 9-bit field for channels that did not underflow
+               mix = diff & clamp;  // underflowed channels are forced to 0
+       }
+
+       // 1.0 x Back + 0.25 x Forward
+       if (BLENDMODE==3) {
+               uSrc24 = (uSrc24 & 0x1FC7F1FC) >> 2;  // clear each field's low 2 bits, then divide by 4
+               u32 sum     = uSrc24 + uDst24;
+               u32 carries = sum & 0x20080200;  // padding bits 29, 19, 9 hold each channel's carry-out
+               u32 modulo  = sum - carries;
+               u32 clamp   = carries - (carries >> 9);  // all-ones 9-bit field for every channel that overflowed
+               mix = modulo | clamp;
+       }
+
+       return mix;
 }
-#endif
 
 #endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm5.h b/plugins/gpu_unai/gpu_inner_blend_arm5.h
new file mode 100644 (file)
index 0000000..0e9b74f
--- /dev/null
@@ -0,0 +1,100 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) \
+{ \
+       asm ("and  %[src], %[src], %[msk]  " : [src] "=r" (uSrc) : "0" (uSrc), [msk] "r" (uMsk)                  ); \
+       asm ("and  %[dst], %[dst], %[msk]  " : [dst] "=r" (uDst) : "0" (uDst), [msk] "r" (uMsk)                  ); \
+       asm ("add  %[src], %[dst], %[src]  " : [src] "=r" (uSrc) :             [dst] "r" (uDst), "0" (uSrc)      ); \
+       asm ("mov  %[src], %[src], lsr #1  " : [src] "=r" (uSrc) : "0" (uSrc)                                    ); \
+}
+
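+//     1.0 x Back + 1.0 x Forward. NOTE(review): each cmp and its movhi sit in separate asm statements, and the compiler may place flag-changing code between them - confirm.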
+#define gpuBlending01(uSrc,uDst) \
+{ \
+       u16 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+       asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+       asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+       asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+       asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+       asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+       asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+       asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+       asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+       asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
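+//     1.0 x Back - 1.0 x Forward. NOTE(review): movmi/orrpl rely on flags set by subs in an earlier, separate asm statement - confirm they survive.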
+#define gpuBlending02(uSrc,uDst) \
+{ \
+       u16 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x7C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[out], %[dt],    %[st]    " : [out] "=r" (out)  : [dt]  "r" (dt),   [st]  "r" (st) : "cc"         ); \
+       asm ("movmi  %[out], #0x0000            " : [out] "=r" (out)  : "0" (out)                                       ); \
+       asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x03E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+       asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+       asm ("and    %[st],  %[src],   #0x001F  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+       asm ("subs   %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st) : "cc"               ); \
+       asm ("orrpl  %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+       asm ("mov %[uSrc], %[out]" : [uSrc] "=r" (uSrc) : [out] "r" (out) ); \
+}
+
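+//     1.0 x Back + 0.25 x Forward. NOTE(review): each cmp and its movhi sit in separate asm statements, and the compiler may place flag-changing code between them - confirm.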
+#define gpuBlending03(uSrc,uDst) \
+{ \
+               u16 st,dt,out; \
+               asm ("mov    %[src], %[src],   lsr #2   " : [src] "=r" (uSrc) : "0" (uSrc)                                      ); \
+               asm ("and    %[dt],  %[dst],   #0x7C00  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x1C00  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[out], %[dt],    %[st]    " : [out] "=r" (out)  :             [dt]  "r" (dt),   [st]  "r" (st)    ); \
+               asm ("cmp    %[out], #0x7C00            " :                   :             [out] "r" (out) : "cc"              ); \
+               asm ("movhi  %[out], #0x7C00            " : [out] "=r" (out)  : "0" (out)                                       ); \
+               asm ("and    %[dt],  %[dst],   #0x03E0  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x00E0  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+               asm ("cmp    %[dt],  #0x03E0            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+               asm ("movhi  %[dt],  #0x03E0            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+               asm ("orr    %[out], %[out],   %[dt]    " : [out] "=r" (out)  : "0" (out),  [dt]  "r" (dt)                      ); \
+               asm ("and    %[dt],  %[dst],   #0x001F  " : [dt]  "=r" (dt)   :             [dst] "r" (uDst)                    ); \
+               asm ("and    %[st],  %[src],   #0x0007  " : [st]  "=r" (st)   :             [src] "r" (uSrc)                    ); \
+               asm ("add    %[dt],  %[dt],    %[st]    " : [dt]  "=r" (dt)   : "0" (dt),   [st]  "r" (st)                      ); \
+               asm ("cmp    %[dt],  #0x001F            " :                   :             [dt]  "r" (dt) : "cc"               ); \
+               asm ("movhi  %[dt],  #0x001F            " : [dt]  "=r" (dt)   : "0" (dt)                                        ); \
+               asm ("orr    %[uSrc], %[out],   %[dt]   " : [uSrc] "=r" (uSrc)  : [out] "r" (out),  [dt]  "r" (dt)              ); \
+}
+
+#endif  //_OP_BLEND_H_
diff --git a/plugins/gpu_unai/gpu_inner_blend_arm7.h b/plugins/gpu_unai/gpu_inner_blend_arm7.h
new file mode 100644 (file)
index 0000000..083e62d
--- /dev/null
@@ -0,0 +1,107 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_BLEND_H_
+#define _OP_BLEND_H_
+
+//  GPU Blending operations functions
+
+#define gpuBlending00(uSrc,uDst) \
+{ \
+       asm ("and  %[src], %[src], %[msk]\n" \
+            "and  %[dst], %[dst], %[msk]\n" \
+            "add  %[src], %[dst], %[src]\n" \
+            "mov  %[src], %[src], lsr #1\n" \
+        : [src] "=&r" (uSrc), [dst] "=&r" (uDst) : "0" (uSrc), "1" (uDst), [msk] "r" (uMsk)); \
+}
+
+//     1.0 x Back + 1.0 x Forward
+#define gpuBlending01(uSrc,uDst) \
+{ \
+       u32 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00\n" \
+            "and    %[st],  %[src],   #0x7C00\n" \
+            "add    %[out], %[dt],    %[st]  \n" \
+            "cmp    %[out], #0x7C00          \n" \
+            "movhi  %[out], #0x7C00          \n" \
+            "and    %[dt],  %[dst],   #0x03E0\n" \
+            "and    %[st],  %[src],   #0x03E0\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x03E0          \n" \
+            "movhi  %[dt],  #0x03E0          \n" \
+            "orr    %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" \
+            "and    %[st],  %[src],   #0x001F\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x001F          \n" \
+            "movhi  %[dt],  #0x001F          \n" \
+            "orr    %[src], %[out],  %[dt]  \n" \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+//     1.0 x Back - 1.0 x Forward
+#define gpuBlending02(uSrc,uDst) \
+{ \
+       u32 st,dt,out; \
+       asm ("and    %[dt],  %[dst],   #0x7C00\n" \
+            "and    %[st],  %[src],   #0x7C00\n" \
+            "subs   %[out], %[dt],    %[st]  \n" \
+            "movmi  %[out], #0x0000          \n" \
+            "and    %[dt],  %[dst],   #0x03E0\n" \
+            "and    %[st],  %[src],   #0x03E0\n" \
+            "subs   %[dt],  %[dt],    %[st]  \n" \
+            "orrpl  %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" \
+            "and    %[st],  %[src],   #0x001F\n" \
+            "subs   %[dt],  %[dt],    %[st]  \n" \
+            "orrpl  %[out], %[out],   %[dt]  \n" \
+            "mov    %[src], %[out]           \n" \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+//     1.0 x Back + 0.25 x Forward
+#define gpuBlending03(uSrc,uDst) \
+{ \
+       u32 st,dt,out; \
+       asm ("mov    %[src], %[src],   lsr #2 \n" \
+            "and    %[dt],  %[dst],   #0x7C00\n" \
+            "and    %[st],  %[src],   #0x1C00\n" \
+            "add    %[out], %[dt],    %[st]  \n" \
+            "cmp    %[out], #0x7C00          \n" \
+            "movhi  %[out], #0x7C00          \n" \
+            "and    %[dt],  %[dst],   #0x03E0\n" \
+            "and    %[st],  %[src],   #0x00E0\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x03E0          \n" \
+            "movhi  %[dt],  #0x03E0          \n" \
+            "orr    %[out], %[out],   %[dt]  \n" \
+            "and    %[dt],  %[dst],   #0x001F\n" \
+            "and    %[st],  %[src],   #0x0007\n" \
+            "add    %[dt],  %[dt],    %[st]  \n" \
+            "cmp    %[dt],  #0x001F          \n" \
+            "movhi  %[dt],  #0x001F          \n" \
+            "orr    %[src], %[out],   %[dt]  \n" \
+        : [src] "=r" (uSrc), [st] "=&r" (st), [dt] "=&r" (dt), [out] "=&r" (out) \
+        : [dst] "r" (uDst), "0" (uSrc) : "cc"); \
+}
+
+#endif  //_OP_BLEND_H_
index d291418..b041dc3 100644 (file)
@@ -1,5 +1,5 @@
 /***************************************************************************
-*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 
 //  GPU color operations for lighting calculations
 
-#ifdef __arm__
-#define gpuLightingRGB(uSrc,lCol) \
-{ \
-       u32 cb,cg; \
-       asm ("and %[cb],  %[lCol], #0x7C00/32      \n" \
-            "and %[cg],  %[lCol], #0x03E0*2048    \n" \
-            "mov %[res], %[lCol],          lsr #27\n" \
-            "orr %[res], %[res], %[cb],    lsl #5 \n" \
-            "orr %[res], %[res], %[cg],    lsr #11\n" \
-        : [res] "=&r" (uSrc), [cb] "=&r" (cb), [cg] "=&r" (cg) \
-        : [lCol] "r" (lCol)); \
+static void SetupLightLUT()
+{
+       // 1024-entry lookup table that modulates 5-bit texture + 5-bit light value.
+       // A light value of 16 leaves the incoming texture color unchanged (entry = min(31, i*j/16)).
+       // LightLUT[32*32] array is initialized to following values (one row per j, one column per i):
+       //  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       //  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       //  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+       //  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
+       //  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+       //  0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
+       //  0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11,
+       //  0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13,
+       //  0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,
+       //  0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17,
+       //  0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19,
+       //  0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21,
+       //  0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23,
+       //  0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25,
+       //  0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27,
+       //  0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,
+       //  0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31,
+       //  0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31,
+       //  0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31,
+       //  0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,
+       //  0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
+       //  0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
+
+       for (int j=0; j < 32; ++j) {  // j: upper 5 bits of the table index (row)
+               for (int i=0; i < 32; ++i) {  // i: lower 5 bits of the table index (column)
+                       int val = i * j / 16;  // product scaled so that a factor of 16 acts as 1.0
+                       if (val > 31) val = 31;  // saturate to the 5-bit maximum
+                       gpu_unai.LightLUT[(j*32) + i] = val;
+               }
+       }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+// 'r','g','b' are 8.10 fixed-pt color components (r shown here)
+//     'r' input:  --------------rrrrrrrrXXXXXXXXXX
+//                 ^ bit 31
+// RETURNS:
+//    u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and '-' don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudCol(u32 r, u32 g, u32 b)
+{
+       return ((u32)(b>> 8)&(0x03ff    ))   // b: 8.2 field in bits  9..0
+            | ((u32)(g<< 3)&(0x07ff<<10))   // g: 8.3 field in bits 20..10
+            | ((u32)(r<<14)&(0x07ff<<21));  // r: 8.3 field in bits 31..21
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Create packed increment for Gouraud fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  Sign-extended 8.10 fixed-pt r,g,b color increment values (only dr is shown)
+//   'dr' input:  ssssssssssssssrrrrrrrrXXXXXXXXXX
+//                ^ bit 31
+// RETURNS:
+//   u32 output:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                ^ bit 31
+// Where 'r,g,b' are integer bits of colors, 'X' fixed-pt, and 's' sign bits
+//
+// NOTE: The correctness of this code/method has not been fully verified,
+//       having been merely factored out from original code in
+//       poly-drawing functions. Feel free to check/improve it -senquack
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuPackGouraudColInc(s32 dr, s32 dg, s32 db)
+{
+       u32 dr_tmp = (u32)(dr << 14)&(0xffffffff<<21);  if (dr < 0) dr_tmp += 1<<21;  // r: bits below 21 cleared; a negative dr gets +1 LSB of its field. NOTE(review): left-shifting a negative s32 - confirm the compiler/standard in use defines this
+       u32 dg_tmp = (u32)(dg <<  3)&(0xffffffff<<10);  if (dg < 0) dg_tmp += 1<<10;  // g: bits below 10 cleared; same +1 LSB adjustment for negative dg
+       u32 db_tmp = (u32)(db >>  8)&(0xffffffff    );  if (db < 0) db_tmp += 1<< 0;  // b: mask is all ones, so nothing is cleared; +1 for negative db
+       return db_tmp + dg_tmp + dr_tmp;  // the three parts are summed (not OR-ed) into one packed increment
 }
-#else
-#define gpuLightingRGB(uSrc,lCol) uSrc=((lCol<<5)&0x7C00) | ((lCol>>11)&0x3E0) | (lCol>>27)
-#endif
 
-INLINE void gpuLightingTXT(u16 &uSrc, u32 &lCol)
+
+////////////////////////////////////////////////////////////////////////////////
+// Extract bgr555 color from Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet
+//
+// INPUT:
+//  'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                 ^ bit 31
+// RETURNS:
+//    u16 output:  0bbbbbgggggrrrrr
+//                 ^ bit 15
+// Only the top 5 integer bits of each component survive. 'X' fixed-pt, '0' zero
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u16 gpuLightingRGB(u32 gCol)
+{
+       return ((gCol<< 5)&0x7C00) |  // b: gCol bits  9..5  -> output bits 14..10
+              ((gCol>>11)&0x03E0) |  // g: gCol bits 20..16 -> output bits  9..5
+               (gCol>>27);           // r: gCol bits 31..27 -> output bits  4..0
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet in 'gCol'
+//  to padded u32 5.4:5.4:5.4 bgr fixed-pt triplet, suitable for use
+//  with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'gCol' input:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                      ^ bit 31
+// RETURNS:
+//         u32 output:  000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                      ^ bit 31
+//  Where 'X' are fixed-pt bits and '0' is zero-padding
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingRGB24(u32 gCol)
+{
+       return ((gCol<<19) & (0x1FF<<20)) |  // b: gCol bits  9..1  -> output bits 28..20
+              ((gCol>> 2) & (0x1FF<<10)) |  // g: gCol bits 20..12 -> output bits 18..10
+               (gCol>>23);                  // r: gCol bits 31..23 -> output bits  8..0
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit lighting to bgr555 texture color:
+//
+// INPUT:
+//        'r5','g5','b5' are unsigned 5-bit color values, value of 16
+//          is the neutral level that doesn't modify that component of texture
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u16 gpuLightingTXT(u16 uSrc, u8 r5, u8 g5, u8 b5)
 {
-       //  Pixelops Table
-       static const u8 _gpuLitT[32*32] = {
-                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-                0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
-                0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
-                0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
-                0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,11,11,
-                0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9,10,10,10,11,11,12,12,13,13,
-                0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14,14,15,15,
-                0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9,10,10,11,11,12,12,13,14,14,15,15,16,16,17,
-                0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,10,10,11,11,12,13,13,14,15,15,16,16,17,18,18,19,
-                0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,19,20,21,
-                0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9,10,11,12,12,13,14,15,15,16,17,18,18,19,20,21,21,22,23,
-                0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9,10,11,12,13,13,14,15,16,17,17,18,19,20,21,21,22,23,24,25,
-                0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9,10,11,12,13,14,14,15,16,17,18,19,20,21,21,22,23,24,25,26,27,
-                0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
-                0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,
-                0, 1, 2, 3, 4, 5, 6, 7, 9,10,11,12,13,14,15,16,18,19,20,21,22,23,24,25,27,28,29,30,31,31,31,31,
-                0, 1, 2, 3, 4, 5, 7, 8, 9,10,11,13,14,15,16,17,19,20,21,22,23,24,26,27,28,29,30,31,31,31,31,31,
-                0, 1, 2, 3, 5, 6, 7, 8,10,11,12,13,15,16,17,18,20,21,22,23,25,26,27,28,30,31,31,31,31,31,31,31,
-                0, 1, 2, 3, 5, 6, 7, 9,10,11,13,14,15,17,18,19,21,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,
-                0, 1, 2, 4, 5, 6, 8, 9,11,12,13,15,16,17,19,20,22,23,24,26,27,28,30,31,31,31,31,31,31,31,31,31,
-                0, 1, 2, 4, 5, 7, 8,10,11,12,14,15,17,18,20,21,23,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 4, 6, 7, 9,10,12,13,15,16,18,19,21,22,24,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 4, 6, 7, 9,10,12,14,15,17,18,20,21,23,25,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 4, 6, 8, 9,11,13,14,16,17,19,21,22,24,26,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 5, 6, 8,10,11,13,15,16,18,20,21,23,25,27,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 5, 7, 8,10,12,14,15,17,19,21,22,24,26,28,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 5, 7, 9,10,12,14,16,18,19,21,23,25,27,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 5, 7, 9,11,13,15,16,18,20,22,24,26,28,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,
-                0, 1, 3, 5, 7, 9,11,13,15,17,19,21,23,25,27,29,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31
-       };
-       uSrc  = (_gpuLitT[((uSrc&0x7C00)>>5)|((lCol>>5)&0x1f)]<<10)|(_gpuLitT[(uSrc&0x03E0)|((lCol>>16)&0x1f)]<<5)|(_gpuLitT[((uSrc&0x001F)<<5)|(lCol>>27)]);
+       return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | b5] << 10) |
+              (gpu_unai.LightLUT[ (uSrc&0x03E0)     | g5] <<  5) |
+              (gpu_unai.LightLUT[((uSrc&0x001F)<<5) | r5]      );
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply fast (low-precision) 5-bit Gouraud lighting to bgr555 texture color:
+//
+// INPUT:
+//  'gCol' is a packed Gouraud u32 fixed-pt 8.3:8.3:8.2 rgb triplet; only the top
+//     5 bits of each component are used, where 16 does not modify color of texture
+//         gCol input :  rrrrrXXXXXXgggggXXXXXXbbbbbXXXXX
+//                       ^ bit 31
+//        'uSrc' input:  -bbbbbgggggrrrrr
+//                       ^ bit 15
+// RETURNS:
+//          u16 output:  0bbbbbgggggrrrrr
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u16 gpuLightingTXTGouraud(u16 uSrc, u32 gCol)
+{
+       return (gpu_unai.LightLUT[((uSrc&0x7C00)>>5) | ((gCol>> 5)&0x1F)]<<10) |  // b: texel b in index bits 9..5, light = gCol bits 9..5
+              (gpu_unai.LightLUT[ (uSrc&0x03E0)     | ((gCol>>16)&0x1F)]<< 5) |  // g: texel g already sits in index bits 9..5, light = gCol bits 20..16
+              (gpu_unai.LightLUT[((uSrc&0x001F)<<5) |  (gCol>>27)      ]    );   // r: texel r moved to index bits 9..5, light = gCol bits 31..27
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color,
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//        'r8','g8','b8' are unsigned 8-bit color component values, value of
+//          128 is the level that leaves that component of texture unchanged
+//
+//         uSrc input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24(u16 uSrc, u8 r8, u8 g8, u8 b8)
+{
+       u16 r1 = uSrc&0x001F;  // texel channels are kept at their bgr555 bit positions (not shifted down)
+       u16 g1 = uSrc&0x03E0;
+       u16 b1 = uSrc&0x7C00;
+
+       u16 r2 = r8;
+       u16 g2 = g8;
+       u16 b2 = b8;
+
+       u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;  // clamp to 12 bits (0x00000FFF)
+       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;  // clamp to 17 bits (0x0001FFFF)
+       u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;  // clamp to 22 bits (0x003FFFFF)
+
+       return ((r3>> 3)    ) |  // r: 9-bit 5.4 value in bits  8..0
+              ((g3>> 8)<<10) |  // g: 9-bit 5.4 value in bits 18..10
+              ((b3>>13)<<20);   // b: 9-bit 5.4 value in bits 28..20
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Apply high-precision 8-bit lighting to bgr555 texture color in 'uSrc',
+//  returning a padded u32 5.4:5.4:5.4 bgr fixed-pt triplet
+//  suitable for use with HQ 24-bit lighting/quantization.
+//
+// INPUT:
+//       'uSrc' input: -bbbbbgggggrrrrr
+//                     ^ bit 15
+//       'gCol' input: rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+//                     ^ bit 31
+// RETURNS:
+//         u32 output: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+// Where 'X' are fixed-pt bits, '0' is zero-padding, and '-' is don't care
+////////////////////////////////////////////////////////////////////////////////
+GPU_INLINE u32 gpuLightingTXT24Gouraud(u16 uSrc, u32 gCol)
+{
+       u16 r1 = uSrc&0x001F;  // texel channels are kept at their bgr555 bit positions (not shifted down)
+       u16 g1 = uSrc&0x03E0;
+       u16 b1 = uSrc&0x7C00;
+
+       u16 r2 = (gCol>>24) & 0xFF;  // 8 integer bits of r (gCol bits 31..24)
+       u16 g2 = (gCol>>13) & 0xFF;  // 8 integer bits of g (gCol bits 20..13)
+       u16 b2 = (gCol>> 2) & 0xFF;  // 8 integer bits of b (gCol bits  9..2)
+
+       u32 r3 = r1 * r2; if (r3 & 0xFFFFF000) r3 = ~0xFFFFF000;  // clamp to 12 bits (0x00000FFF)
+       u32 g3 = g1 * g2; if (g3 & 0xFFFE0000) g3 = ~0xFFFE0000;  // clamp to 17 bits (0x0001FFFF)
+       u32 b3 = b1 * b2; if (b3 & 0xFFC00000) b3 = ~0xFFC00000;  // clamp to 22 bits (0x003FFFFF)
+
+       return ((r3>> 3)    ) |  // r: 9-bit 5.4 value in bits  8..0
+              ((g3>> 8)<<10) |  // g: 9-bit 5.4 value in bits 18..10
+              ((b3>>13)<<20);   // b: 9-bit 5.4 value in bits 28..20
 }
 
 #endif  //_OP_LIGHT_H_
diff --git a/plugins/gpu_unai/gpu_inner_quantization.h b/plugins/gpu_unai/gpu_inner_quantization.h
new file mode 100644 (file)
index 0000000..0e7e3e8
--- /dev/null
@@ -0,0 +1,108 @@
+/***************************************************************************
+*   Copyright (C) 2016 PCSX4ALL Team                                      *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef _OP_DITHER_H_
+#define _OP_DITHER_H_
+
+static void SetupDitheringConstants()
+{
+       // Initialize Dithering Constants
+       // The screen is divided into 8x8 chunks and sub-unitary noise is applied
+       // using the following matrix. This ensures that data lost in color
+       // quantization will be added back to the image 'by chance' in predictable
+       // patterns that are naturally 'smoothed' by your sight when viewed from a
+       // certain distance.
+       //
+       // http://caca.zoy.org/study/index.html
+       //
+       // Shading colors are encoded in 5.4 fixed-point, and then are quantized to 5.0,
+       // DitherMatrix constants reflect that.
+
+       static const u8 DitherMatrix[] = {
+                0, 32,  8, 40,  2, 34, 10, 42,
+               48, 16, 56, 24, 50, 18, 58, 26,
+               12, 44,  4, 36, 14, 46,  6, 38,
+               60, 28, 52, 20, 62, 30, 54, 22,
+                3, 35, 11, 43,  1, 33,  9, 41,
+               51, 19, 59, 27, 49, 17, 57, 25,
+               15, 47,  7, 39, 13, 45,  5, 37,
+               63, 31, 55, 23, 61, 29, 53, 21
+       };
+
+       int i, j;
+       for (i = 0; i < 8; i++)
+       {
+               for (j = 0; j < 8; j++)
+               {
+                       u16 offset = (i << 3) | j;  // row-major index into the 8x8 matrix
+
+                       u32 component = ((DitherMatrix[offset] + 1) << 4) / 65; // scales 1..64 down to 0..15 (fits the 4 fractional bits)
+
+                       // XXX - senquack - hack Dec 2016
+                       //  Until JohnnyF gets the time to work further on dithering,
+                       //   force lower bit of component to 0. This fixes grid pattern
+                       //   affecting quality of dithered image, as well as loss of
+                       //   detail in dark areas. With lower bit unset like this, existing
+                       //   27-bit accuracy of dithering math is unneeded, could be 24-bit.
+                       //   Is 8x8 matrix overkill as a result, can we use 4x4?
+                       component &= ~1;
+
+                       gpu_unai.DitherMatrix[offset] = (component)  // same offset replicated at bits 0, 10 and 20,
+                                                     | (component << 10)  // i.e. once per 10-bit-spaced color field
+                                                     | (component << 20);
+               }
+       }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Convert padded u32 5.4:5.4:5.4 bgr fixed-pt triplet to final bgr555 color,
+//  applying dithering if specified by template parameter.
+//
+// INPUT:
+//     'uSrc24' input: 000bbbbbXXXX0gggggXXXX0rrrrrXXXX
+//                     ^ bit 31
+//       'pDst' is a pointer to destination framebuffer pixel, used
+//         to determine which DitherMatrix[] entry to apply.
+// RETURNS:
+//         u16 output: 0bbbbbgggggrrrrr
+//                     ^ bit 15
+// Where 'X' are fixed-pt bits and '0' is zero-padding
+////////////////////////////////////////////////////////////////////////////////
+template <int DITHER>
+GPU_INLINE u16 gpuColorQuantization24(u32 uSrc24, const u16 *pDst)
+{
+       if (DITHER)
+       {
+               u16 fbpos  = (u32)(pDst - gpu_unai.vram);  // pixel index of pDst within vram, truncated to 16 bits
+               u16 offset = ((fbpos & (0x7 << 10)) >> 7) | (fbpos & 0x7);  // index bits 12..10 -> matrix row, bits 2..0 -> matrix column
+
+               //clear the padding bits (9, 19, 29..31) so a carry out of any 9-bit field lands there, then add
+               uSrc24 = (uSrc24 & 0x1FF7FDFF) + gpu_unai.DitherMatrix[offset];
+
+               if (uSrc24 & (1<< 9)) uSrc24 |= (0x1FF    );  // r field overflowed: saturate it
+               if (uSrc24 & (1<<19)) uSrc24 |= (0x1FF<<10);  // g field overflowed: saturate it
+               if (uSrc24 & (1<<29)) uSrc24 |= (0x1FF<<20);  // b field overflowed: saturate it
+       }
+
+       return ((uSrc24>> 4) & (0x1F    ))   // r: integer bits  8..4  -> output bits  4..0
+            | ((uSrc24>> 9) & (0x1F<<5 ))   // g: integer bits 18..14 -> output bits  9..5
+            | ((uSrc24>>14) & (0x1F<<10));  // b: integer bits 28..24 -> output bits 14..10
+}
+
+#endif //_OP_DITHER_H_
index 0c82aa9..87d2151 100644 (file)
  ***************************************************************************/
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuLoadImage(void)
+#ifndef USE_GPULIB
+void gpuLoadImage(PtrUnion packet)
 {
        u16 x0, y0, w0, h0;
-       x0 = PacketBuffer.U2[2] & 1023;
-       y0 = PacketBuffer.U2[3] & 511;
-       w0 = PacketBuffer.U2[4];
-       h0 = PacketBuffer.U2[5];
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       w0 = packet.U2[4];
+       h0 = packet.U2[5];
 
        if ((y0 + h0) > FRAME_HEIGHT)
        {
                h0 = FRAME_HEIGHT - y0;
        }
 
-       FrameToWrite = ((w0)&&(h0));
+       gpu_unai.dma.FrameToWrite = ((w0)&&(h0));
 
-       px = 0;
-       py = 0;
-       x_end = w0;
-       y_end = h0;
-       pvram = &((u16*)GPU_FrameBuffer)[x0+(y0*1024)];
+       gpu_unai.dma.px = 0;
+       gpu_unai.dma.py = 0;
+       gpu_unai.dma.x_end = w0;
+       gpu_unai.dma.y_end = h0;
+       gpu_unai.dma.pvram = &((u16*)gpu_unai.vram)[x0+(y0*1024)];
 
-       GPU_GP1 |= 0x08000000;
+       gpu_unai.GPU_GP1 |= 0x08000000;
 }
+#endif // !USE_GPULIB
 
 ///////////////////////////////////////////////////////////////////////////////
-INLINE void gpuStoreImage(void)
+#ifndef USE_GPULIB
+void gpuStoreImage(PtrUnion packet)
 {
        u16 x0, y0, w0, h0;
-       x0 = PacketBuffer.U2[2] & 1023;
-       y0 = PacketBuffer.U2[3] & 511;
-       w0 = PacketBuffer.U2[4];
-       h0 = PacketBuffer.U2[5];
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       w0 = packet.U2[4];
+       h0 = packet.U2[5];
 
        if ((y0 + h0) > FRAME_HEIGHT)
        {
                h0 = FRAME_HEIGHT - y0;
        }
-       FrameToRead = ((w0)&&(h0));
+       gpu_unai.dma.FrameToRead = ((w0)&&(h0));
 
-       px = 0;
-       py = 0;
-       x_end = w0;
-       y_end = h0;
-       pvram = &((u16*)GPU_FrameBuffer)[x0+(y0*1024)];
+       gpu_unai.dma.px = 0;
+       gpu_unai.dma.py = 0;
+       gpu_unai.dma.x_end = w0;
+       gpu_unai.dma.y_end = h0;
+       gpu_unai.dma.pvram = &((u16*)gpu_unai.vram)[x0+(y0*1024)];
        
-       GPU_GP1 |= 0x08000000;
+       gpu_unai.GPU_GP1 |= 0x08000000;
 }
+#endif // !USE_GPULIB
 
-INLINE void gpuMoveImage(void)
+void gpuMoveImage(PtrUnion packet)
 {
        u32 x0, y0, x1, y1;
        s32 w0, h0;
-       x0 = PacketBuffer.U2[2] & 1023;
-       y0 = PacketBuffer.U2[3] & 511;
-       x1 = PacketBuffer.U2[4] & 1023;
-       y1 = PacketBuffer.U2[5] & 511;
-       w0 = PacketBuffer.U2[6];
-       h0 = PacketBuffer.U2[7];
+       x0 = packet.U2[2] & 1023;
+       y0 = packet.U2[3] & 511;
+       x1 = packet.U2[4] & 1023;
+       y1 = packet.U2[5] & 511;
+       w0 = packet.U2[6];
+       h0 = packet.U2[7];
 
        if( (x0==x1) && (y0==y1) ) return;
        if ((w0<=0) || (h0<=0)) return;
        
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"gpuMoveImage(x0=%u,y0=%u,x1=%u,y1=%u,w0=%d,h0=%d)\n",x0,y0,x1,y1,w0,h0);
+       #endif
+       
        if (((y0+h0)>512)||((x0+w0)>1024)||((y1+h0)>512)||((x1+w0)>1024))
        {
-               u16 *psxVuw=GPU_FrameBuffer;
+               u16 *psxVuw=gpu_unai.vram;
                s32 i,j;
            for(j=0;j<h0;j++)
                 for(i=0;i<w0;i++)
@@ -93,7 +101,7 @@ INLINE void gpuMoveImage(void)
        else if ((x0&1)||(x1&1))
        {
                u16 *lpDst, *lpSrc;
-               lpDst = lpSrc = (u16*)GPU_FrameBuffer;
+               lpDst = lpSrc = (u16*)gpu_unai.vram;
                lpSrc += FRAME_OFFSET(x0, y0);
                lpDst += FRAME_OFFSET(x1, y1);
                x1 = FRAME_WIDTH - w0;
@@ -107,7 +115,7 @@ INLINE void gpuMoveImage(void)
        else
        {
                u32 *lpDst, *lpSrc;
-               lpDst = lpSrc = (u32*)(void*)GPU_FrameBuffer;
+               lpDst = lpSrc = (u32*)(void*)gpu_unai.vram;
                lpSrc += ((FRAME_OFFSET(x0, y0))>>1);
                lpDst += ((FRAME_OFFSET(x1, y1))>>1);
                if (w0&1)
@@ -143,13 +151,13 @@ INLINE void gpuMoveImage(void)
        }
 }
 
-INLINE void gpuClearImage(void)
+void gpuClearImage(PtrUnion packet)
 {
        s32   x0, y0, w0, h0;
-       x0 = PacketBuffer.S2[2];
-       y0 = PacketBuffer.S2[3];
-       w0 = PacketBuffer.S2[4] & 0x3ff;
-       h0 = PacketBuffer.S2[5] & 0x3ff;
+       x0 = packet.S2[2];
+       y0 = packet.S2[3];
+       w0 = packet.S2[4] & 0x3ff;
+       h0 = packet.S2[5] & 0x3ff;
         
        w0 += x0;
        if (x0 < 0) x0 = 0;
@@ -162,10 +170,14 @@ INLINE void gpuClearImage(void)
        h0 -= y0;
        if (h0 <= 0) return;
 
+       #ifdef ENABLE_GPU_LOG_SUPPORT
+               fprintf(stdout,"gpuClearImage(x0=%d,y0=%d,w0=%d,h0=%d)\n",x0,y0,w0,h0);
+       #endif
+       
        if (x0&1)
        {
-               u16* pixel = (u16*)GPU_FrameBuffer + FRAME_OFFSET(x0, y0);
-               u16 rgb = GPU_RGB16(PacketBuffer.S4[0]);
+               u16* pixel = (u16*)gpu_unai.vram + FRAME_OFFSET(x0, y0);
+               u16 rgb = GPU_RGB16(packet.U4[0]);
                y0 = FRAME_WIDTH - w0;
                do {
                        x0=w0;
@@ -175,8 +187,8 @@ INLINE void gpuClearImage(void)
        }
        else
        {
-               u32* pixel = (u32*)(void*)GPU_FrameBuffer + ((FRAME_OFFSET(x0, y0))>>1);
-               u32 rgb = GPU_RGB16(PacketBuffer.S4[0]);
+               u32* pixel = (u32*)gpu_unai.vram + ((FRAME_OFFSET(x0, y0))>>1);
+               u32 rgb = GPU_RGB16(packet.U4[0]);
                rgb |= (rgb<<16);
                if (w0&1)
                {
index fc59b79..28ea074 100644 (file)
@@ -1,6 +1,7 @@
 /***************************************************************************
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#define        GPU_TESTRANGE(x)      { if((u32)(x+1024) > 2047) return; }
-
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU internal line drawing functions
+//
+// Rewritten October 2016 by senquack:
+//  Instead of one pixel at a time, lines are now drawn in runs of pixels,
+//  whether vertical, horizontal, or diagonal. A new inner driver
+//  'gpuPixelSpanFn' is used, as well as an enhanced Bresenham run-slice
+//  algorithm. For more information, see the following:
+//
+//  Michael Abrash - Graphics Programming Black Book
+//  Chapters 35 - 36 (does not implement diagonal runs)
+//  http://www.drdobbs.com/parallel/graphics-programming-black-book/184404919
+//  http://www.jagregory.com/abrash-black-book/
+//
+//  Article by Andrew Delong (does not implement diagonal runs)
+//  http://timetraces.ca/nw/drawline.htm
+//
+//  'Run-Based Multi-Point Line Drawing' by Eun Jae Lee & Larry F. Hodges
+//  https://smartech.gatech.edu/bitstream/handle/1853/3632/93-22.pdf
+//  Provided the idea of doing a half-octant transform allowing lines with
+//  slopes between 0.5 and 2.0 (diagonal runs of pixels) to be handled
+//  identically to the traditional horizontal/vertical run-slice method.
 
-#define GPU_DIGITS  16
-#define GPU_DIGITSC (GPU_DIGITS+3)
+// Use 16.16 fixed point precision for line math.
+// NOTE: Gouraud colors used by gpuPixelSpanFn can use a different precision.
+#define GPU_LINE_FIXED_BITS 16
 
-INLINE s32 GPU_DIV(s32 rs, s32 rt)
-{
-       return rt ? (rs / rt) : (0);
-}
+// If defined, Gouraud lines will use fixed-point multiply-by-inverse to
+// do most divisions. With enough accuracy, this should be OK.
+#define USE_LINES_ALL_FIXED_PT_MATH
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawLF(const PD gpuPixelDriver)
+//////////////////////
+// Flat-shaded line //
+//////////////////////
+void gpuDrawLineF(PtrUnion packet, const PSD gpuPixelSpanDriver)
 {
-       s32 temp;
-       s32 xmin, xmax;
-       s32 ymin, ymax;
-       s32 x0, x1, dx;
-       s32 y0, y1, dy;
-
-       x0 = PacketBuffer.S2[2] + DrawingOffset[0];     GPU_TESTRANGE(x0);
-       y0 = PacketBuffer.S2[3] + DrawingOffset[1];     GPU_TESTRANGE(y0);
-       x1 = PacketBuffer.S2[4] + DrawingOffset[0];     GPU_TESTRANGE(x1);
-       y1 = PacketBuffer.S2[5] + DrawingOffset[1];     GPU_TESTRANGE(y1);
-
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
-       const u16 pixeldata = GPU_RGB16(PacketBuffer.U4[0]);
-
-       dy = (y1 - y0);
-       if (dy < 0) dy = -dy;
-       dx = (x1 - x0);
-       if (dx < 0) dx = -dx;
-       if (dx > dy) {
-               if (x0 > x1) {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
+       int x0, y0, x1, y1;
+       int dx, dy;
+
+       // All three of these variables should be signed (so multiplication works)
+       ptrdiff_t sx;  // Sign of x delta, positive when x0 < x1
+       const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+       const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+       // Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+       //  bottommost pixels of the draw area. Since we render every pixel between
+       //  and including both line endpoints, subtract one from xmax/ymax.
+       const int xmin = gpu_unai.DrawingArea[0];
+       const int ymin = gpu_unai.DrawingArea[1];
+       const int xmax = gpu_unai.DrawingArea[2] - 1;
+       const int ymax = gpu_unai.DrawingArea[3] - 1;
+
+       x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_unai.DrawingOffset[0];
+       y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_unai.DrawingOffset[1];
+       x1 = GPU_EXPANDSIGN(packet.S2[4]) + gpu_unai.DrawingOffset[0];
+       y1 = GPU_EXPANDSIGN(packet.S2[5]) + gpu_unai.DrawingOffset[1];
+
+       // Always draw top to bottom, so ensure y0 <= y1
+       if (y0 > y1) {
+               SwapValues(y0, y1);
+               SwapValues(x0, x1);
+       }
+
+       // Is line totally outside Y clipping range?
+       if (y0 > ymax || y1 < ymin) return;
+
+       dx = x1 - x0;
+       dy = y1 - y0;
+
+       // X-axis range check : max distance between any two X coords is 1023
+       // (PSX hardware will not render anything violating this rule)
+       // NOTE: We'll check y coord range further below
+       if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+               return;
+
+       // Y-axis range check and clipping
+       if (dy) {
+               // Y-axis range check : max distance between any two Y coords is 511
+               // (PSX hardware will not render anything violating this rule)
+               if (dy >= CHKMAX_Y)
+                       return;
+
+               // We already know y0 < y1
+               if (y0 < ymin) {
+                       x0 += GPU_FAST_DIV(((ymin - y0) * dx), dy);
+                       y0 = ymin;
                }
-               y1 = GPU_DIV((y1 - y0) << GPU_DIGITS, dx);
-               y0 <<= GPU_DIGITS;
-               temp = xmin - x0;
-               if (temp > 0) {
-                       x0 = xmin;
-                       y0 += (y1 * temp);
+               if (y1 > ymax) {
+                       x1 += GPU_FAST_DIV(((ymax - y1) * dx), dy);
+                       y1 = ymax;
                }
-               if (x1 > xmax) x1 = xmax;
-               x1 -= x0;
-               if (x1 < 0) x1 = 0;
-
-               const int li=linesInterlace;
-               for (; x1; x1--) {
-                       temp = y0 >> GPU_DIGITS;
-                       if( 0 == (temp&li) )  {
-                               if ((u32) (temp - ymin) < (u32) (ymax - ymin)) {
-                                       gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, temp)],pixeldata);
-                               }
+
+               // Recompute in case clipping occurred:
+               dx = x1 - x0;
+               dy = y1 - y0;
+       }
+
+       // Check X clipping range, set 'sx' x-direction variable
+       if (dx == 0) {
+               // Is vertical line totally outside X clipping range?
+               if (x0 < xmin || x0 > xmax)
+                       return;
+               sx = 0;
+       } else {
+               if (dx > 0) {
+                       // x0 is leftmost coordinate
+                       if (x0 > xmax) return; // Both points outside X clip range
+
+                       if (x0 < xmin) {
+                               if (x1 < xmin) return; // Both points outside X clip range
+                               y0 += GPU_FAST_DIV(((xmin - x0) * dy), dx);
+                               x0 = xmin;
+                       }
+
+                       if (x1 > xmax) {
+                               y1 += GPU_FAST_DIV(((xmax - x1) * dy), dx);
+                               x1 = xmax;
+                       }
+
+                       sx = +1;
+                       dx = x1 - x0; // Get final value, which should also be absolute value
+               } else {
+                       // x1 is leftmost coordinate
+                       if (x1 > xmax) return; // Both points outside X clip range
+
+                       if (x1 < xmin) {
+                               if (x0 < xmin) return; // Both points outside X clip range
+
+                               y1 += GPU_FAST_DIV(((xmin - x1) * dy), dx);
+                               x1 = xmin;
                        }
-                       x0++;
-                       y0 += y1;
+
+                       if (x0 > xmax) {
+                               y0 += GPU_FAST_DIV(((xmax - x0) * dy), dx);
+                               x0 = xmax;
+                       }
+
+                       sx = -1;
+                       dx = x0 - x1; // Get final value, which should also be absolute value
+               }
+
+               // Recompute in case clipping occurred:
+               dy = y1 - y0;
+       }
+
+       // IMPORTANT: dx,dy should now contain their absolute values
+
+       int min_length,    // Minimum length of a pixel run
+           start_length,  // Length of first run
+           end_length,    // Length of last run
+           err_term,      // Cumulative error to determine when to draw longer run
+           err_adjup,     // Increment to err_term for each run drawn
+           err_adjdown;   // Subtract this from err_term after drawing longer run
+
+       // Color to draw with (16 bits, highest of which is unset mask bit)
+       uintptr_t col16 = GPU_RGB16(packet.U4[0]);
+
+       // We use u8 pointers even though PS1 has u16 framebuffer.
+       //  This allows pixel-drawing functions to increment dst pointer
+       //  directly by the passed 'incr' value, not having to shift it first.
+       u8 *dst = (u8*)gpu_unai.vram + y0 * dst_stride + x0 * dst_depth;
+
+       // SPECIAL CASE: Vertical line
+       if (dx == 0) {
+               gpuPixelSpanDriver(dst, col16, dst_stride, dy+1);
+               return;
+       }
+
+       // SPECIAL CASE: Horizontal line
+       if (dy == 0) {
+               gpuPixelSpanDriver(dst, col16, sx * dst_depth, dx+1);
+               return;
+       }
+
+       // SPECIAL CASE: Diagonal line
+       if (dx == dy) {
+               gpuPixelSpanDriver(dst, col16, dst_stride + (sx * dst_depth), dy+1);
+               return;
+       }
+
+       int       major, minor;             // Major axis, minor axis
+       ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+       if (dx > dy) {
+               major = dx;
+               minor = dy;
+       } else {
+               major = dy;
+               minor = dx;
+       }
+
+       // Determine if runs are diagonal or horizontal/vertical
+       if (major < (2 * minor)) {
+               // Diagonal runs, so perform half-octant transformation
+               minor = major - minor;
+
+               // Advance diagonally when drawing runs
+               incr_major = dst_stride + (sx * dst_depth);
+
+               // After drawing each run, correct for over-advance along minor axis
+               if (dx > dy)
+                       incr_minor = -dst_stride;
+               else
+                       incr_minor = -sx * dst_depth;
+       } else {
+               // Horizontal or vertical runs
+               if (dx > dy) {
+                       incr_major = sx * dst_depth;
+                       incr_minor = dst_stride;
+               } else {
+                       incr_major = dst_stride;
+                       incr_minor = sx * dst_depth;
                }
-       } else if (dy) {
-               if (y0 > y1) {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
+       }
+
+       if (minor > 1) {
+               // Minimum number of pixels each run
+               min_length = major / minor;
+
+               // Initial error term; reflects an initial step of 0.5 along minor axis
+               err_term = (major % minor) - (minor * 2);
+
+               // Increment err_term this much each step along minor axis; when
+               //  err_term crosses zero, draw longer pixel run.
+               err_adjup = (major % minor) * 2;
+       } else {
+               min_length = major;
+               err_term = 0;
+               err_adjup = 0;
+       }
+
+       // Error term adjustment when err_term turns over; used to factor
+       //  out the major-axis step made at that time
+       err_adjdown = minor * 2;
+
+       // The initial and last runs are partial, because minor axis advances
+       //  only 0.5 for these runs, rather than 1. Each is half a full run,
+       //  plus the initial pixel.
+       start_length = end_length = (min_length / 2) + 1;
+
+       if (min_length & 1) {
+               // If there is an odd number of pixels per run, we have 1 pixel that
+               //  can't be allocated to either the initial or last partial run, so
+               //  we'll add 0.5 to err_term so that this pixel will be handled
+               //  by the normal full-run loop
+               err_term += minor;
+       } else {
+               // If the minimum run length is even and there's no fractional advance,
+               // we have one pixel that could go to either the initial or last
+               // partial run, which we arbitrarily allocate to the last run
+               if (err_adjup == 0)
+                       start_length--; // Leave out the extra pixel at the start
+       }
+
+       // First run of pixels
+       dst = gpuPixelSpanDriver(dst, col16, incr_major, start_length);
+       dst += incr_minor;
+
+       // Middle runs of pixels
+       while (--minor > 0) {
+               int run_length = min_length;
+               err_term += err_adjup;
+
+               // If err_term passed 0, reset it and draw longer run
+               if (err_term > 0) {
+                       err_term -= err_adjdown;
+                       run_length++;
                }
-               x1 = GPU_DIV((x1 - x0) << GPU_DIGITS, dy);
-               x0 <<= GPU_DIGITS;
-               temp = ymin - y0;
-               if (temp > 0) {
+
+               dst = gpuPixelSpanDriver(dst, col16, incr_major, run_length);
+               dst += incr_minor;
+       }
+
+       // Final run of pixels
+       gpuPixelSpanDriver(dst, col16, incr_major, end_length);
+}
+
+/////////////////////////
+// Gouraud-shaded line //
+/////////////////////////
+void gpuDrawLineG(PtrUnion packet, const PSD gpuPixelSpanDriver)
+{
+       int x0, y0, x1, y1;
+       int dx, dy, dr, dg, db;
+       u32 r0, g0, b0, r1, g1, b1;
+
+       // All three of these variables should be signed (so multiplication works)
+       ptrdiff_t sx;  // Sign of x delta, positive when x0 < x1
+       const ptrdiff_t dst_depth  = FRAME_BYTES_PER_PIXEL; // PSX: 2 bytes per pixel
+       const ptrdiff_t dst_stride = FRAME_BYTE_STRIDE;     // PSX: 2048 bytes per framebuffer line
+
+       // Clip region: xmax/ymax seem to normally be one *past* the rightmost/
+       //  bottommost pixels of the draw area. We'll render every pixel between
+       //  and including both line endpoints, so subtract one from xmax/ymax.
+       const int xmin = gpu_unai.DrawingArea[0];
+       const int ymin = gpu_unai.DrawingArea[1];
+       const int xmax = gpu_unai.DrawingArea[2] - 1;
+       const int ymax = gpu_unai.DrawingArea[3] - 1;
+
+       x0 = GPU_EXPANDSIGN(packet.S2[2]) + gpu_unai.DrawingOffset[0];
+       y0 = GPU_EXPANDSIGN(packet.S2[3]) + gpu_unai.DrawingOffset[1];
+       x1 = GPU_EXPANDSIGN(packet.S2[6]) + gpu_unai.DrawingOffset[0];
+       y1 = GPU_EXPANDSIGN(packet.S2[7]) + gpu_unai.DrawingOffset[1];
+
+       u32 col0 = packet.U4[0];
+       u32 col1 = packet.U4[2];
+
+       // Always draw top to bottom, so ensure y0 <= y1
+       if (y0 > y1) {
+               SwapValues(y0, y1);
+               SwapValues(x0, x1);
+               SwapValues(col0, col1);
+       }
+
+       // Is line totally outside Y clipping range?
+       if (y0 > ymax || y1 < ymin) return;
+
+       // If defined, Gouraud colors are fixed-point 5.11, otherwise they are 8.16
+       // (This is only beneficial if using SIMD-optimized pixel driver)
+#ifdef GPU_GOURAUD_LOW_PRECISION
+       r0 = (col0 >> 3) & 0x1f;  g0 = (col0 >> 11) & 0x1f;  b0 = (col0 >> 19) & 0x1f;
+       r1 = (col1 >> 3) & 0x1f;  g1 = (col1 >> 11) & 0x1f;  b1 = (col1 >> 19) & 0x1f;
+#else
+       r0 = col0 & 0xff;  g0 = (col0 >> 8) & 0xff;  b0 = (col0 >> 16) & 0xff;
+       r1 = col1 & 0xff;  g1 = (col1 >> 8) & 0xff;  b1 = (col1 >> 16) & 0xff;
+#endif
+
+       dx = x1 - x0;
+       dy = y1 - y0;
+       dr = r1 - r0;
+       dg = g1 - g0;
+       db = b1 - b0;
+
+       // X-axis range check : max distance between any two X coords is 1023
+       // (PSX hardware will not render anything violating this rule)
+       // NOTE: We'll check y coord range further below
+       if (dx >= CHKMAX_X || dx <= -CHKMAX_X)
+               return;
+
+       // Y-axis range check and clipping
+       if (dy) {
+               // Y-axis range check : max distance between any two Y coords is 511
+               // (PSX hardware will not render anything violating this rule)
+               if (dy >= CHKMAX_Y)
+                       return;
+
+               // We already know y0 < y1
+               if (y0 < ymin) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                       s32 factor = GPU_FAST_DIV(((ymin - y0) << GPU_LINE_FIXED_BITS), dy);
+                       x0 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+                       r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                       g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                       b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                       x0 += (ymin - y0) * dx / dy;
+                       r0 += (ymin - y0) * dr / dy;
+                       g0 += (ymin - y0) * dg / dy;
+                       b0 += (ymin - y0) * db / dy;
+#endif
                        y0 = ymin;
-                       x0 += (x1 * temp);
                }
-               if (y1 > ymax) y1 = ymax;
-               y1 -= y0;
-               if (y1 < 0) y1 = 0;
-               
-               const int li=linesInterlace;
-               for (; y1; y1--) {
-                       if( 0 == (y0&li) )  {
-                               temp = x0 >> GPU_DIGITS;
-                               if ((u32) (temp - xmin) < (u32) (xmax - xmin)) {
-                                       gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(temp, y0)],pixeldata);
-                               }
-                       }
-                       y0++;
-                       x0 += x1;
+
+               if (y1 > ymax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                       s32 factor = GPU_FAST_DIV(((ymax - y1) << GPU_LINE_FIXED_BITS), dy);
+                       x1 += (dx * factor) >> GPU_LINE_FIXED_BITS;
+                       r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                       g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                       b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                       x1 += (ymax - y1) * dx / dy;
+                       r1 += (ymax - y1) * dr / dy;
+                       g1 += (ymax - y1) * dg / dy;
+                       b1 += (ymax - y1) * db / dy;
+#endif
+                       y1 = ymax;
                }
-               
+
+               // Recompute in case clipping occurred:
+               dx = x1 - x0;
+               dy = y1 - y0;
+               dr = r1 - r0;
+               dg = g1 - g0;
+               db = b1 - b0;
+       }
+
+       // Check X clipping range, set 'sx' x-direction variable
+       if (dx == 0) {
+               // Is vertical line totally outside X clipping range?
+               if (x0 < xmin || x0 > xmax)
+                       return;
+               sx = 0;
        } else {
-               if( 0 == (y0&linesInterlace) )  {
-                       if ((u32) (x0 - xmin) < (u32) (xmax - xmin)) {
-                               if ((u32) (y0 - ymin) < (u32) (ymax - ymin)) {
-                                       gpuPixelDriver(&((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)],pixeldata);
-                               }
+               if (dx > 0) {
+                       // x0 is leftmost coordinate
+                       if (x0 > xmax) return; // Both points outside X clip range
+
+                       if (x0 < xmin) {
+                               if (x1 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmin - x0) << GPU_LINE_FIXED_BITS), dx);
+                               y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y0 += (xmin - x0) * dy / dx;
+                               r0 += (xmin - x0) * dr / dx;
+                               g0 += (xmin - x0) * dg / dx;
+                               b0 += (xmin - x0) * db / dx;
+#endif
+                               x0 = xmin;
                        }
+
+                       if (x1 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmax - x1) << GPU_LINE_FIXED_BITS), dx);
+                               y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y1 += (xmax - x1) * dy / dx;
+                               r1 += (xmax - x1) * dr / dx;
+                               g1 += (xmax - x1) * dg / dx;
+                               b1 += (xmax - x1) * db / dx;
+#endif
+                               x1 = xmax;
+                       }
+
+                       sx = +1;
+                       dx = x1 - x0; // Get final value, which should also be absolute value
+               } else {
+                       // x1 is leftmost coordinate
+                       if (x1 > xmax) return; // Both points outside X clip range
+
+                       if (x1 < xmin) {
+                               if (x0 < xmin) return; // Both points outside X clip range
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmin - x1) << GPU_LINE_FIXED_BITS), dx);
+                               y1 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r1 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g1 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b1 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y1 += (xmin - x1) * dy / dx;
+                               r1 += (xmin - x1) * dr / dx;
+                               g1 += (xmin - x1) * dg / dx;
+                               b1 += (xmin - x1) * db / dx;
+#endif
+                               x1 = xmin;
+                       }
+
+                       if (x0 > xmax) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+                               s32 factor = GPU_FAST_DIV(((xmax - x0) << GPU_LINE_FIXED_BITS), dx);
+                               y0 += (dy * factor) >> GPU_LINE_FIXED_BITS;
+                               r0 += (dr * factor) >> GPU_LINE_FIXED_BITS;
+                               g0 += (dg * factor) >> GPU_LINE_FIXED_BITS;
+                               b0 += (db * factor) >> GPU_LINE_FIXED_BITS;
+#else
+                               y0 += (xmax - x0) * dy / dx;
+                               r0 += (xmax - x0) * dr / dx;
+                               g0 += (xmax - x0) * dg / dx;
+                               b0 += (xmax - x0) * db / dx;
+#endif
+                               x0 = xmax;
+                       }
+
+                       sx = -1;
+                       dx = x0 - x1; // Get final value, which should also be absolute value
                }
+
+               // Recompute in case clipping occurred:
+               dy = y1 - y0;
+               dr = r1 - r0;
+               dg = g1 - g0;
+               db = b1 - b0;
        }
-}
 
-/*----------------------------------------------------------------------
-GF
-----------------------------------------------------------------------*/
+       // IMPORTANT: dx,dy should now contain their absolute values
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawLG(const PD gpuPixelDriver)
-{
-       s32 temp;
-       s32 xmin, xmax;
-       s32 ymin, ymax;
-       s32 x0, x1, dx;
-       s32 y0, y1, dy;
-       s32 r0, r1;
-       s32 g0, g1;
-       s32 b0, b1;
-
-       x0 = PacketBuffer.S2[2] + DrawingOffset[0];     GPU_TESTRANGE(x0);
-       y0 = PacketBuffer.S2[3] + DrawingOffset[1];     GPU_TESTRANGE(y0);
-       x1 = PacketBuffer.S2[6] + DrawingOffset[0];     GPU_TESTRANGE(x1);
-       y1 = PacketBuffer.S2[7] + DrawingOffset[1];     GPU_TESTRANGE(y1);
-
-       r0 = PacketBuffer.U1[0];  g0 = PacketBuffer.U1[1];  b0 = PacketBuffer.U1[2];
-       r1 = PacketBuffer.U1[8];  g1 = PacketBuffer.U1[9];      b1 = PacketBuffer.U1[10];
-
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
-
-       dy = (y1 - y0);
-       if (dy < 0)
-       dy = -dy;
-       dx = (x1 - x0);
-       if (dx < 0)
-       dx = -dx;
-       if (dx > dy) {
-               if (x0 > x1) {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(r0, r1, temp);
-                       GPU_SWAP(g0, g1, temp);
-                       GPU_SWAP(b0, b1, temp);
-               }
-               y1 = GPU_DIV((y1 - y0) << GPU_DIGITS, dx);
-               r1 = GPU_DIV((r1 - r0) << GPU_DIGITS, dx);
-               g1 = GPU_DIV((g1 - g0) << GPU_DIGITS, dx);
-               b1 = GPU_DIV((b1 - b0) << GPU_DIGITS, dx);
-               y0 <<= GPU_DIGITS;
-               r0 <<= GPU_DIGITS;
-               g0 <<= GPU_DIGITS;
-               b0 <<= GPU_DIGITS;
-               temp = xmin - x0;
-               if (temp > 0) {
-                       x0 = xmin;
-                       y0 += (y1 * temp);
-                       r0 += (r1 * temp);
-                       g0 += (g1 * temp);
-                       b0 += (b1 * temp);
+       int min_length,    // Minimum length of a pixel run
+           start_length,  // Length of first run
+           end_length,    // Length of last run
+           err_term,      // Cumulative error to determine when to draw longer run
+           err_adjup,     // Increment to err_term for each run drawn
+           err_adjdown;   // Subtract this from err_term after drawing longer run
+
+       GouraudColor gcol;
+       gcol.r = r0 << GPU_GOURAUD_FIXED_BITS;
+       gcol.g = g0 << GPU_GOURAUD_FIXED_BITS;
+       gcol.b = b0 << GPU_GOURAUD_FIXED_BITS;
+
+       // We use u8 pointers even though PS1 has u16 framebuffer.
+       //  This allows pixel-drawing functions to increment dst pointer
+       //  directly by the passed 'incr' value, not having to shift it first.
+       u8 *dst = (u8*)gpu_unai.vram + y0 * dst_stride + x0 * dst_depth;
+
+       // SPECIAL CASE: Vertical line
+       if (dx == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dy fixed-point inverse
+               s32 inv_factor = 1 << GPU_GOURAUD_FIXED_BITS;
+               if (dy > 1) inv_factor = GPU_FAST_DIV(inv_factor, dy);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               // First, convert to Gouraud fixed point
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dy > 1) {
+                       if (dr) gcol.r_incr /= dy;
+                       if (dg) gcol.g_incr /= dy;
+                       if (db) gcol.b_incr /= dy;
                }
-               if (x1 > xmax) x1 = xmax;
-               x1 -= x0;
-               if (x1 < 0) x1 = 0;
+#endif
                
-               const int li=linesInterlace;
-               for (; x1; x1--) {
-                       temp = y0 >> GPU_DIGITS;
-                       if( 0 == (temp&li) )  {
-                               if ((u32) (temp - ymin) < (u32) (ymax - ymin)) {
-                                       gpuPixelDriver (
-                                               &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, temp)],
-                                               (((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F)
-                                       );
-                               }
-                       }
-                       x0++;
-                       y0 += y1;
-                       r0 += r1;
-                       g0 += g1;
-                       b0 += b1;
-               }
-       } else if (dy) {
-               if (y0 > y1) {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(r0, r1, temp);
-                       GPU_SWAP(g0, g1, temp);
-                       GPU_SWAP(b0, b1, temp);
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride, dy+1);
+               return;
+       }
+
+       // SPECIAL CASE: Horizontal line
+       if (dy == 0) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dx fixed-point inverse
+               s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+               if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dx > 1) {
+                       if (dr) gcol.r_incr /= dx;
+                       if (dg) gcol.g_incr /= dx;
+                       if (db) gcol.b_incr /= dx;
                }
-               x1 = GPU_DIV((x1 - x0) << GPU_DIGITS, dy);
-               r1 = GPU_DIV((r1 - r0) << GPU_DIGITS, dy);
-               g1 = GPU_DIV((g1 - g0) << GPU_DIGITS, dy);
-               b1 = GPU_DIV((b1 - b0) << GPU_DIGITS, dy);
-               x0 <<= GPU_DIGITS;
-               r0 <<= GPU_DIGITS;
-               g0 <<= GPU_DIGITS;
-               b0 <<= GPU_DIGITS;
-               temp = ymin - y0;
-               if (temp > 0) {
-                       y0 = ymin;
-                       x0 += (x1 * temp);
-                       r0 += (r1 * temp);
-                       g0 += (g1 * temp);
-                       b0 += (b1 * temp);
+#endif
+
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, sx * dst_depth, dx+1);
+               return;
+       }
+
+       // SPECIAL CASE: Diagonal line
+       if (dx == dy) {
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+               // Get dx fixed-point inverse
+               s32 inv_factor = (1 << GPU_GOURAUD_FIXED_BITS);
+               if (dx > 1) inv_factor = GPU_FAST_DIV(inv_factor, dx);
+
+               // Simultaneously divide and convert integer to Gouraud fixed point:
+               gcol.r_incr = dr * inv_factor;
+               gcol.g_incr = dg * inv_factor;
+               gcol.b_incr = db * inv_factor;
+#else
+               // First, convert to Gouraud fixed point
+               gcol.r_incr = dr << GPU_GOURAUD_FIXED_BITS;
+               gcol.g_incr = dg << GPU_GOURAUD_FIXED_BITS;
+               gcol.b_incr = db << GPU_GOURAUD_FIXED_BITS;
+
+               if (dx > 1) {
+                       if (dr) gcol.r_incr /= dx;
+                       if (dg) gcol.g_incr /= dx;
+                       if (db) gcol.b_incr /= dx;
                }
-               if (y1 > ymax) y1 = ymax;
-               y1 -= y0;
-               if (y1 < 0) y1 = 0;
-               
-               const int li=linesInterlace;
-               for (; y1; y1--) {
-                       if( 0 == (y0&li) )  {
-                               temp = x0 >> GPU_DIGITS;
-                               if ((u32) (temp - xmin) < (u32) (xmax - xmin)) {
-                                       gpuPixelDriver (
-                                               &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(temp, y0)],
-                                               (((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F)
-                                       );
-                               }
-                       }
-                       y0++;
-                       x0 += x1;
-                       r0 += r1;
-                       g0 += g1;
-                       b0 += b1;
+#endif
+
+               gpuPixelSpanDriver(dst, (uintptr_t)&gcol, dst_stride + (sx * dst_depth), dy+1);
+               return;
+       }
+
+       int       major, minor;             // Absolute val of major,minor axis delta
+       ptrdiff_t incr_major, incr_minor;   // Ptr increment for each step along axis
+
+       if (dx > dy) {
+               major = dx;
+               minor = dy;
+       } else {
+               major = dy;
+               minor = dx;
+       }
+
+       // Determine if diagonal or horizontal runs
+       if (major < (2 * minor)) {
+               // Diagonal runs, so perform half-octant transformation
+               minor = major - minor;
+
+               // Advance diagonally when drawing runs
+               incr_major = dst_stride + (sx * dst_depth);
+
+               // After drawing each run, correct for over-advance along minor axis
+               if (dx > dy)
+                       incr_minor = -dst_stride;
+               else
+                       incr_minor = -sx * dst_depth;
+       } else {
+               // Horizontal or vertical runs
+               if (dx > dy) {
+                       incr_major = sx * dst_depth;
+                       incr_minor = dst_stride;
+               } else {
+                       incr_major = dst_stride;
+                       incr_minor = sx * dst_depth;
                }
+       }
+
+#ifdef USE_LINES_ALL_FIXED_PT_MATH
+       s32 major_inv = GPU_FAST_DIV((1 << GPU_GOURAUD_FIXED_BITS), major);
+
+       // Simultaneously divide and convert from integer to Gouraud fixed point:
+       gcol.r_incr = dr * major_inv;
+       gcol.g_incr = dg * major_inv;
+       gcol.b_incr = db * major_inv;
+#else
+       gcol.r_incr = dr ? ((dr << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+       gcol.g_incr = dg ? ((dg << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+       gcol.b_incr = db ? ((db << GPU_GOURAUD_FIXED_BITS) / major) : 0;
+#endif
+
+       if (minor > 1) {
+               // Minimum number of pixels each run
+               min_length = major / minor;
+
+               // Initial error term; reflects an initial step of 0.5 along minor axis
+               err_term = (major % minor) - (minor * 2);
+
+               // Increment err_term this much each step along minor axis; when
+               //  err_term crosses zero, draw longer pixel run.
+               err_adjup = (major % minor) * 2;
        } else {
-               if( 0 == (y0&linesInterlace) )  {
-                       if ((u32) (x0 - xmin) < (u32) (xmax - xmin)) {
-                               if ((u32) (y0 - ymin) < (u32) (ymax - ymin)) {
-                                       gpuPixelDriver (
-                                               &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)],
-                                               (((b0>>GPU_DIGITSC)&0x1F)<<10) | (((g0>>GPU_DIGITSC)&0x1F)<< 5) | ((r0>>GPU_DIGITSC)&0x1F)
-                                       );
-                               }
-                       }
+               min_length = major;
+               err_term = 0;
+               err_adjup = 0;
+       }
+
+       // Error term adjustment when err_term turns over; used to factor
+       //  out the major-axis step made at that time
+       err_adjdown = minor * 2;
+
+       // The initial and last runs are partial, because minor axis advances
+       //  only 0.5 for these runs, rather than 1. Each is half a full run,
+       //  plus the initial pixel.
+       start_length = end_length = (min_length / 2) + 1;
+
+       if (min_length & 1) {
+               // If there're an odd number of pixels per run, we have 1 pixel that
+               //  can't be allocated to either the initial or last partial run, so
+               //  we'll add 0.5 to err_term so that this pixel will be handled
+               //  by the normal full-run loop
+               err_term += minor;
+       } else {
+               // If the minimum run length is even and there's no fractional advance,
+               // we have one pixel that could go to either the initial or last
+               // partial run, which we'll arbitrarily allocate to the last run
+               if (err_adjup == 0)
+                       start_length--; // Leave out the extra pixel at the start
+       }
+
+       // First run of pixels
+       dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, start_length);
+       dst += incr_minor;
+
+       // Middle runs of pixels
+       while (--minor > 0) {
+               int run_length = min_length;
+               err_term += err_adjup;
+
+               // If err_term passed 0, reset it and draw longer run
+               if (err_term > 0) {
+                       err_term -= err_adjdown;
+                       run_length++;
                }
+
+               dst = gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, run_length);
+               dst += incr_minor;
        }
+
+       // Final run of pixels
+       gpuPixelSpanDriver(dst, (uintptr_t)&gcol, incr_major, end_length);
 }
index c4b0350..f66a9e2 100644 (file)
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
-#define GPU_TESTRANGE3() \
-{ \
-       if(x0<0) { if((x1-x0)>CHKMAX_X) return; if((x2-x0)>CHKMAX_X) return; } \
-       if(x1<0) { if((x0-x1)>CHKMAX_X) return; if((x2-x1)>CHKMAX_X) return; } \
-       if(x2<0) { if((x0-x2)>CHKMAX_X) return; if((x1-x2)>CHKMAX_X) return; } \
-       if(y0<0) { if((y1-y0)>CHKMAX_Y) return; if((y2-y0)>CHKMAX_Y) return; } \
-       if(y1<0) { if((y0-y1)>CHKMAX_Y) return; if((y2-y1)>CHKMAX_Y) return; } \
-       if(y2<0) { if((y0-y2)>CHKMAX_Y) return; if((y1-y2)>CHKMAX_Y) return; } \
-}
+//senquack - NOTE: GPU Unai poly routines have been rewritten/adapted
+// from DrHell routines to fix multiple issues. See README_senquack.txt
 
 ///////////////////////////////////////////////////////////////////////////////
-//  GPU internal polygon drawing functions
+// Shared poly vertex buffer, able to handle 3 or 4-pt polys of any type.
+///////////////////////////////////////////////////////////////////////////////
 
+struct PolyVertex { // One vertex of a 3- or 4-point poly, as unpacked from a draw-command packet
+       s32 x, y; // Sign-extended 11-bit X,Y coords (polyInitVertexBuffer() also adds the drawing offset)
+       union { // Byte-wise and whole-word views of the same texture-coord storage
+               struct { u8 u, v, pad[2]; } tex; // Texture coords (if used)
+               u32 tex_word; // Whole 32-bit word; this is the view polyInitVertexBuffer() writes
+       };
+       union { // Byte-wise and whole-word views of the same color storage
+               struct { u8 r, g, b, pad; } col; // 24-bit RGB color (if used)
+               u32 col_word; // Whole 32-bit word; this is the view polyInitVertexBuffer() writes
+       };
+};
+
+enum PolyAttribute { // Bit flags for the optional per-vertex data of a poly
+       POLYATTR_TEXTURE = (1 << 0), // Each packet vertex carries a U,V texture-coord word
+       POLYATTR_GOURAUD = (1 << 1) // Each packet vertex carries its own color word
+};
+
+enum PolyType { // Every combination of the PolyAttribute flags; test with '&' against POLYATTR_*
+       POLYTYPE_F  = 0, // Neither texture coords nor per-vertex colors
+       POLYTYPE_FT = (POLYATTR_TEXTURE), // Texture coords only
+       POLYTYPE_G  = (POLYATTR_GOURAUD), // Per-vertex colors only
+       POLYTYPE_GT = (POLYATTR_TEXTURE | POLYATTR_GOURAUD) // Texture coords and per-vertex colors
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// polyInitVertexBuffer()
+// Fills vbuf[0..2] (or vbuf[0..3] when is_quad is nonzero) from a poly draw-command packet; ptype selects which optional fields are read.
 ///////////////////////////////////////////////////////////////////////////////
-void gpuDrawF3(const PP gpuPolySpanDriver)
+static void polyInitVertexBuffer(PolyVertex *vbuf, const PtrUnion packet, PolyType ptype, u32 is_quad)
 {
-       const int li=linesInterlace;
-       s32 temp;
-       s32 xa, xb, xmin, xmax;
-       s32 ya, yb, ymin, ymax;
-       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-       s32 y0, y1, y2;
+       bool texturing = ptype & POLYATTR_TEXTURE; // Packet vertices include a U,V word
+       bool gouraud   = ptype & POLYATTR_GOURAUD; // Packet vertices include a color word
+
+       int vert_stride = 1; // Stride of vertices in cmd packet, in 32-bit words (1 for the X,Y word, +1 per optional word)
+       if (texturing)
+               vert_stride++;
+       if (gouraud)
+               vert_stride++;
+
+       int num_verts = (is_quad) ? 4 : 3;
+       u32 *ptr; // Walks the packet one vertex (vert_stride words) at a time
+
+       // X,Y coords, adjusted by draw offsets
+       s32 x_off = gpu_unai.DrawingOffset[0];
+       s32 y_off = gpu_unai.DrawingOffset[1];
+       ptr = &packet.U4[1]; // The first X,Y word is packet word 1
+       for (int i=0;  i < num_verts; ++i, ptr += vert_stride) {
+               s16* coord_ptr = (s16*)ptr; // View the 32-bit word as two 16-bit values: [0] is X, [1] is Y
+               vbuf[i].x = GPU_EXPANDSIGN(coord_ptr[0]) + x_off;
+               vbuf[i].y = GPU_EXPANDSIGN(coord_ptr[1]) + y_off;
+       }
 
-       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]);
-       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]);
-       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[4]);
-       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[5]);
-       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[6]);
-       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[7]);
+       // U,V texture coords (if applicable); otherwise tex_word is left unwritten
+       if (texturing) {
+               ptr = &packet.U4[2]; // The first U,V word is packet word 2
+               for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+                       vbuf[i].tex_word = *ptr; // Whole word is copied, not just the u,v bytes
+       }
 
-       GPU_TESTRANGE3();
+       // Colors (if applicable); otherwise col_word is left unwritten
+       if (gouraud) {
+               ptr = &packet.U4[0]; // The first color word is packet word 0
+               for (int i=0;  i < num_verts; ++i, ptr += vert_stride)
+                       vbuf[i].col_word = *ptr; // Whole word is copied, not just the r,g,b bytes
+       }
+}
 
-       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
+///////////////////////////////////////////////////////////////////////////////
+//  Helper functions to determine which vertex in a 2 or 3 vertex array
+//   has the highest/lowest X/Y coordinate.
+//   Note: the comparison logic is such that, given a set of vertices with
+//    identical values for a given coordinate, a different index will be
+//    returned from vertIdxOfLeast..() than from vertIdxOfHighest..().
+//    This ensures that, during the vertex-ordering phase of rasterization,
+//    all three vertices remain unique.
+///////////////////////////////////////////////////////////////////////////////
 
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+template<typename T> // T: any type with a comparable .x member
+static inline int vertIdxOfLeastXCoord2(const T *Tptr) // Returns the index (0 or 1) of the element with the smaller .x
+{
+       return (Tptr[0].x <= Tptr[1].x) ? 0 : 1; // '<=' makes a tie resolve to index 0
+}
 
-       {
-               int rx0 = Max2(xmin,Min3(x0,x1,x2));
-               int ry0 = Max2(ymin,Min3(y0,y1,y2));
-               int rx1 = Min2(xmax,Max3(x0,x1,x2));
-               int ry1 = Min2(ymax,Max3(y0,y1,y2));
-               if( rx0>=rx1 || ry0>=ry1) return;
-       }
-       
-       PixelData = GPU_RGB16(PacketBuffer.U4[0]);
+template<typename T> // T: any type with a comparable .x member
+static inline int vertIdxOfLeastXCoord3(const T *Tptr) // Returns the index (0..2) of the smallest .x among three elements
+{
+       int least_of_v0_v1 = vertIdxOfLeastXCoord2(Tptr); // Winner between elements 0 and 1
+       return (Tptr[least_of_v0_v1].x <= Tptr[2].x) ? least_of_v0_v1 : 2; // On a tie the lower index is kept
+}
 
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
-               }
-       }
-       if (y1 >= y2)
-       {
-               if( y1!=y2 || x1>x2 )
-               {
-                       GPU_SWAP(x1, x2, temp);
-                       GPU_SWAP(y1, y2, temp);
-               }
-       }
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
-               }
-       }
+template<typename T> // T: any type with a comparable .y member
+static inline int vertIdxOfLeastYCoord2(const T *Tptr) // Returns the index (0 or 1) of the element with the smaller .y
+{
+       return (Tptr[0].y <= Tptr[1].y) ? 0 : 1; // '<=' makes a tie resolve to index 0
+}
 
-       ya = y2 - y0;
-       yb = y2 - y1;
-       dx =(x2 - x1) * ya - (x2 - x0) * yb;
+template<typename T> // T: any type with a comparable .y member
+static inline int vertIdxOfLeastYCoord3(const T *Tptr) // Returns the index (0..2) of the smallest .y among three elements
+{
+       int least_of_v0_v1 = vertIdxOfLeastYCoord2(Tptr); // Winner between elements 0 and 1
+       return (Tptr[least_of_v0_v1].y <= Tptr[2].y) ? least_of_v0_v1 : 2; // On a tie the lower index is kept
+}
+
+template<typename T> // T: any type with a comparable .x member
+static inline int vertIdxOfHighestXCoord2(const T *Tptr) // Returns the index (0 or 1) of the element with the larger .x
+{
+       return (Tptr[1].x >= Tptr[0].x) ? 1 : 0; // '>=' makes a tie resolve to index 1, the opposite of the Least helpers
+}
+
+template<typename T> // T: any type with a comparable .x member
+static inline int vertIdxOfHighestXCoord3(const T *Tptr) // Returns the index (0..2) of the largest .x among three elements
+{
+       int highest_of_v0_v1 = vertIdxOfHighestXCoord2(Tptr); // Winner between elements 0 and 1
+       return (Tptr[2].x >= Tptr[highest_of_v0_v1].x) ? 2 : highest_of_v0_v1; // On a tie the higher index (2) is chosen
+}
+
+template<typename T> // T: any type with a comparable .y member
+static inline int vertIdxOfHighestYCoord2(const T *Tptr) // Returns the index (0 or 1) of the element with the larger .y
+{
+       return (Tptr[1].y >= Tptr[0].y) ? 1 : 0; // '>=' makes a tie resolve to index 1, the opposite of the Least helpers
+}
+
+template<typename T> // T: any type with a comparable .y member
+static inline int vertIdxOfHighestYCoord3(const T *Tptr) // Returns the index (0..2) of the largest .y among three elements
+{
+       int highest_of_v0_v1 = vertIdxOfHighestYCoord2(Tptr); // Winner between elements 0 and 1
+       return (Tptr[2].y >= Tptr[highest_of_v0_v1].y) ? 2 : highest_of_v0_v1; // On a tie the higher index (2) is chosen
+}
 
-       for (s32 loop0 = 2; loop0; --loop0)
+///////////////////////////////////////////////////////////////////////////////
+// polyUseTriangle()
+//  Determines if the specified triangle should be rendered. If so, it
+//  fills the given array of vertex pointers, vert_ptrs, in order of
+//  increasing Y coordinate values, as required by rasterization algorithm.
+//  Parameter 'tri_num' is 0 for first triangle (idx 0,1,2 of vbuf[]),
+//   or 1 for second triangle of a quad (idx 1,2,3 of vbuf[]).
+//  Returns true if triangle should be rendered, false if not (vert_ptrs is then left unwritten).
+///////////////////////////////////////////////////////////////////////////////
+static bool polyUseTriangle(const PolyVertex *vbuf, int tri_num, const PolyVertex **vert_ptrs)
+{
+       // Using verts 0,1,2 or is this the 2nd pass of a quad (verts 1,2,3)?
+       const PolyVertex *tri_ptr = &vbuf[(tri_num == 0) ? 0 : 1]; // Any nonzero tri_num selects the second triangle
+
+       // Get indices of highest/lowest X,Y coords within triangle (each index is 0..2, relative to tri_ptr)
+       int idx_lowest_x  = vertIdxOfLeastXCoord3(tri_ptr);
+       int idx_highest_x = vertIdxOfHighestXCoord3(tri_ptr);
+       int idx_lowest_y  = vertIdxOfLeastYCoord3(tri_ptr);
+       int idx_highest_y = vertIdxOfHighestYCoord3(tri_ptr);
+
+       // Maximum absolute distance between any two X coordinates is 1023,
+       //  and for Y coordinates is 511 (PS1 hardware limitation)
+       int lowest_x  = tri_ptr[idx_lowest_x].x;
+       int highest_x = tri_ptr[idx_highest_x].x;
+       int lowest_y  = tri_ptr[idx_lowest_y].y;
+       int highest_y = tri_ptr[idx_highest_y].y;
+       if ((highest_x - lowest_x) >= CHKMAX_X ||
+           (highest_y - lowest_y) >= CHKMAX_Y)
+               return false; // X or Y extent reaches its CHKMAX_* limit: triangle is dropped
+
+       // Determine if triangle is completely outside clipping range
+       int xmin, xmax, ymin, ymax;
+       xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+       ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+       int clipped_lowest_x  = Max2(xmin,lowest_x);
+       int clipped_lowest_y  = Max2(ymin,lowest_y);
+       int clipped_highest_x = Min2(xmax,highest_x);
+       int clipped_highest_y = Min2(ymax,highest_y);
+       if (clipped_lowest_x >= clipped_highest_x ||
+           clipped_lowest_y >= clipped_highest_y)
+               return false; // Clipped bounding box has no width or no height
+
+       // Order vertex ptrs by increasing y value (draw routines need this).
+       // The middle index is deduced by a binary math trick that depends
+       //  on index range always being between 0..2
+       vert_ptrs[0] = tri_ptr + idx_lowest_y;
+       vert_ptrs[1] = tri_ptr + ((idx_lowest_y + idx_highest_y) ^ 3); // The two indices differ, so their sum is 1, 2 or 3; XOR with 3 gives 2, 1 or 0, the remaining index
+       vert_ptrs[2] = tri_ptr + idx_highest_y;
+       return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU internal polygon drawing functions
+///////////////////////////////////////////////////////////////////////////////
+
+/*----------------------------------------------------------------------
+gpuDrawPolyF - Flat-shaded, untextured poly
+----------------------------------------------------------------------*/
+void gpuDrawPolyF(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
+{
+       // Set up bgr555 color to be used across calls in inner driver
+       gpu_unai.PixelData = GPU_RGB16(packet.U4[0]);
+
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_F, is_quad); // POLYTYPE_F: only the X,Y words are read from the packet
+
+       int total_passes = is_quad ? 2 : 1; // A quad is drawn as two triangles, one per pass
+       int cur_pass = 0;
+       do
        {
-               if (loop0 == 2)
-               {
-                       ya = y0;
-                       yb = y1;
-                       x3 = i2x(x0);
-                       x4 = y0!=y1 ? x3 : i2x(x1);
-                       if (dx < 0)
-                       {
-                               dx3 = xLoDivx((x2 - x0), (y2 - y0));
-                               dx4 = xLoDivx((x1 - x0), (y1 - y0));
-                       }
-                       else
-                       {
-                               dx3 = xLoDivx((x1 - x0), (y1 - y0));
-                               dx4 = xLoDivx((x2 - x0), (y2 - y0));
+               const PolyVertex* vptrs[3]; // Triangle vertices ordered by increasing Y, filled by polyUseTriangle()
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue; // In a do-while, 'continue' jumps to the loop test, so cur_pass still advances
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 x0, x1, x2, y0, y1, y2;
+
+               x0 = vptrs[0]->x;  y0 = vptrs[0]->y;
+               x1 = vptrs[1]->x;  y1 = vptrs[1]->y;
+               x2 = vptrs[2]->x;  y2 = vptrs[2]->y;
+
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx = (x2 - x1) * ya - (x2 - x0) * yb; // Only the sign is used below: it picks which edge slope goes to dx3 and which to dx4
+
+               for (int loop0 = 2; loop0; loop0--) { // loop0==2 walks rows y0..y1, loop0==1 walks rows y1..y2
+                       if (loop0 == 2) {
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0); // Both edges start at vertex 0
+                               if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       dx3 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       dx3 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       dx3 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       dx3 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       x3 = i2x(x0) + (dx3 * (y1 - y0)); // Re-derive the v0->v2 edge position at row y1 using the slope kept from the first half
+                                       x4 = i2x(x1); // The other edge restarts at vertex 1
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       x3 = i2x(x1); // This edge restarts at vertex 1
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0)); // Re-derive the v0->v2 edge position at row y1 using the slope kept from the first half
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx3 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx3 = ((y2 - y1) != 0) ? xLoDivx ((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx3 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               }
                        }
-               }
-               else
-               {
-                       ya = y1;
-                       yb = y2;
-                       if (dx < 0)
-                       {
-                               x4  = i2x(x1);
-                               x3  = i2x(x0) + (dx3 * (y1 - y0));
-                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+                       ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) { // Start row lies before ymin: advance both edges to row ymin
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               ya = ymin;
                        }
-                       else
+
+                       if (yb > ymax) yb = ymax; // Rows at or beyond ymax are never walked
+
+                       int loop1 = yb - ya; // Number of rows to walk in this half
+                       if (loop1 <= 0)
+                               continue; // Nothing to draw in this half; the for-loop still decrements loop0
+
+                       u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)]; // Addresses column 0 of row ya, with vram viewed as 16-bit pixels
+                       int li=gpu_unai.ilace_mask; // Rows with (ya & li) != 0 are skipped
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1); // With progressive interlace off, pi==0 and pif==1, so the (ya&pi)==pif test below never skips
+
+                       for (; loop1; --loop1, ya++, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4 )
                        {
-                               x3  = i2x(x1);
-                               x4  = i2x(x0) + (dx4 * (y1 - y0));
-                               dx3 = xLoDivx((x2 - x1), (y2 - y1));
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4); // Span covers columns xa up to, but not including, xb
+                               if ((xmin - xa) > 0) xa = xmin;
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa)); // Driver gets the first pixel's address and the span width
                        }
                }
-
-               temp = ymin - ya;
-               if (temp > 0)
-               {
-                       ya  = ymin;
-                       x3 += dx3*temp;
-                       x4 += dx4*temp;
-               }
-               if (yb > ymax) yb = ymax;
-               if (ya>=yb) continue;
-
-               x3+= fixed_HALF;
-               x4+= fixed_HALF;
-
-               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
-               
-               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4)
-               {
-                       if (ya&li) continue;
-                       xa = x2i(x3);
-                       xb = x2i(x4);
-                       if( (xa>xmax) || (xb<xmin) ) continue;
-                       if(xa < xmin) xa = xmin;
-                       if(xb > xmax) xb = xmax;
-                       xb-=xa;
-                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
-               }
-       }
+       } while (++cur_pass < total_passes);
 }
 
 /*----------------------------------------------------------------------
-FT3
+gpuDrawPolyFT - Flat-shaded, textured poly
 ----------------------------------------------------------------------*/
-
-void gpuDrawFT3(const PP gpuPolySpanDriver)
+void gpuDrawPolyFT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
 {
-       const int li=linesInterlace;
-       s32 temp;
-       s32 xa, xb, xmin, xmax;
-       s32 ya, yb, ymin, ymax;
-       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-       s32 y0, y1, y2;
-       s32 u0, u1, u2, u3, du3=0;
-       s32 v0, v1, v2, v3, dv3=0;
-
-       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
-       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
-       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
-       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
-       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
-       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
-
-       GPU_TESTRANGE3();
-
-       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
-
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
-
+       // r8/g8/b8 used if texture-blending & dithering is applied (24-bit light)
+       gpu_unai.r8 = packet.U1[0];
+       gpu_unai.g8 = packet.U1[1];
+       gpu_unai.b8 = packet.U1[2];
+       // r5/g5/b5 used if just texture-blending is applied (15-bit light)
+       gpu_unai.r5 = packet.U1[0] >> 3;
+       gpu_unai.g5 = packet.U1[1] >> 3;
+       gpu_unai.b5 = packet.U1[2] >> 3;
+
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_FT, is_quad);
+
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
        {
-               int rx0 = Max2(xmin,Min3(x0,x1,x2));
-               int ry0 = Max2(ymin,Min3(y0,y1,y2));
-               int rx1 = Min2(xmax,Max3(x0,x1,x2));
-               int ry1 = Min2(ymax,Max3(y0,y1,y2));
-               if( rx0>=rx1 || ry0>=ry1) return;
-       }
-       
-       u0 = PacketBuffer.U1[8];  v0 = PacketBuffer.U1[9];
-       u1 = PacketBuffer.U1[16]; v1 = PacketBuffer.U1[17];
-       u2 = PacketBuffer.U1[24]; v2 = PacketBuffer.U1[25];
-
-       r4 = s32(PacketBuffer.U1[0]);
-       g4 = s32(PacketBuffer.U1[1]);
-       b4 = s32(PacketBuffer.U1[2]);
-       dr4 = dg4 = db4 = 0;
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 u3, du3, v3, dv3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 u0, u1, u2, v0, v1, v2;
+               s32 du4, dv4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+               dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       du4 = -du4;
+                       dv4 = -dv4;
+               }
 
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(u0, u1, temp);
-                       GPU_SWAP(v0, v1, temp);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       du4 = (fixed)((du4 << FIXED_BITS) * finv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+               } else {
+                       du4 = dv4 = 0;
                }
-       }
-       if (y1 >= y2)
-       {
-               if( y1!=y2 || x1>x2 )
-               {
-                       GPU_SWAP(x1, x2, temp);
-                       GPU_SWAP(y1, y2, temp);
-                       GPU_SWAP(u1, u2, temp);
-                       GPU_SWAP(v1, v2, temp);
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+               } else {
+                       du4 = dv4 = 0;
                }
-       }
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);
-                       GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(u0, u1, temp);
-                       GPU_SWAP(v0, v1, temp);
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       du4 = xInvMulx(du4, iF, iS);
+                       dv4 = xInvMulx(dv4, iF, iS);
+               } else {
+                       du4 = dv4 = 0;
                }
-       }
-
-       ya  = y2 - y0;
-       yb  = y2 - y1;
-       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
-       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
-       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+#else
+               if (dx4 != 0) {
+                       du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+                       dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+               } else {
+                       du4 = dv4 = 0;
+               }
+#endif
+#endif
+               // Set u,v increments for inner driver
+               gpu_unai.u_inc = du4;
+               gpu_unai.v_inc = dv4;
+
+               //senquack - TODO: why is it always going through 2 iterations when sometimes one would suffice here?
+               //                       (SAME ISSUE ELSEWHERE)
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               u3 = i2x(u0);  v3 = i2x(v0);
+                               if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               du3 = xInvMulx((u2 - u0), iF, iS);
+                                               dv3 = xInvMulx((v2 - v0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+                                               dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               du3 = xInvMulx((u1 - u0), iF, iS);
+                                               dv3 = xInvMulx((v1 - v0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+                                               dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       x3 = i2x(x0);
+                                       x4 = i2x(x1);
+                                       u3 = i2x(u0);
+                                       v3 = i2x(v0);
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               u3 += (du3 * (y1 - y0));
+                                               v3 += (dv3 * (y1 - y0));
+                                       }
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+                                       u3 = i2x(u1);
+                                       v3 = i2x(v1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               du3 = xInvMulx((u2 - u1), iF, iS);
+                                               dv3 = xInvMulx((v2 - v1), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#else 
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+                                               dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = du3 = dv3 = 0;
+                                       }
+#endif
+#endif
+                               }
+                       }
 
-       s32 iF,iS;
-       xInv( dx, iF, iS);
-       du4 = xInvMulx( du4, iF, iS);
-       dv4 = xInvMulx( dv4, iF, iS);
-       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
-       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+                       ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
 
-       for (s32 loop0 = 2; loop0; --loop0)
-       {
-               if (loop0 == 2)
-               {
-                       ya = y0;
-                       yb = y1;
-                       u3 = i2x(u0);
-                       v3 = i2x(v0);
-                       x3 = i2x(x0);
-                       x4 = y0!=y1 ? x3 : i2x(x1);
-                       if (dx < 0)
-                       {
-                               xInv( (y2 - y0), iF, iS);
-                               dx3 = xInvMulx( (x2 - x0), iF, iS);
-                               du3 = xInvMulx( (u2 - u0), iF, iS);
-                               dv3 = xInvMulx( (v2 - v0), iF, iS);
-                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
-                       }
-                       else
-                       {
-                               xInv( (y1 - y0), iF, iS);
-                               dx3 = xInvMulx( (x1 - x0), iF, iS);
-                               du3 = xInvMulx( (u1 - u0), iF, iS);
-                               dv3 = xInvMulx( (v1 - v0), iF, iS);
-                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
+                       if ((ymin - ya) > 0) {
+                               x3 += dx3 * (ymin - ya);
+                               x4 += dx4 * (ymin - ya);
+                               u3 += du3 * (ymin - ya);
+                               v3 += dv3 * (ymin - ya);
+                               ya = ymin;
                        }
-               }
-               else
-               {
-                       ya = y1;
-                       yb = y2;
-                       if (dx < 0)
-                       {
-                               temp = y1 - y0;
-                               u3 = i2x(u0) + (du3 * temp);
-                               v3 = i2x(v0) + (dv3 * temp);
-                               x3 = i2x(x0) + (dx3 * temp);
-                               x4 = i2x(x1);
-                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
-                       }
-                       else
+
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_unai.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       u3 += du3, v3 += dv3 )
                        {
-                               u3 = i2x(u1);
-                               v3 = i2x(v1);
-                               x3 = i2x(x1);
-                               x4 = i2x(x0) + (dx4 * (y1 - y0));
-                               xInv( (y2 - y1), iF, iS);
-                               dx3 = xInvMulx( (x2 - x1), iF, iS);
-                               du3 = xInvMulx( (u2 - u1), iF, iS);
-                               dv3 = xInvMulx( (v2 - v1), iF, iS);
-                       }
-               }
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
 
-               temp = ymin - ya;
-               if (temp > 0)
-               {
-                       ya  = ymin;
-                       x3 += dx3*temp;
-                       x4 += dx4*temp;
-                       u3 += du3*temp;
-                       v3 += dv3*temp;
-               }
-               if (yb > ymax) yb = ymax;
-               if (ya>=yb) continue;
+                               u32 u4, v4;
 
-               x3+= fixed_HALF;
-               x4+= fixed_HALF;
-               u3+= fixed_HALF;
-               v4+= fixed_HALF;
+                               xa = FixedCeilToInt(x3);  xb = FixedCeilToInt(x4);
+                               u4 = u3;  v4 = v3;
 
-               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       u4 += (du4 * itmp) >> FIXED_BITS;
+                                       v4 += (dv4 * itmp) >> FIXED_BITS;
+                               }
 
-               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3)
-               {
-                       if (ya&li) continue;
-                       xa = x2i(x3);
-                       xb = x2i(x4);
-                       if( (xa>xmax) || (xb<xmin) ) continue;
+                               u4 += fixed_HALF;
+                               v4 += fixed_HALF;
 
-                       temp = xmin - xa;
-                       if(temp > 0)
-                       {
-                               xa  = xmin;
-                               u4 = u3 + du4*temp;
-                               v4 = v3 + dv4*temp;
-                       }
-                       else
-                       {
-                               u4 = u3;
-                               v4 = v3;
+                               if ((xmin - xa) > 0) {
+                                       u4 += du4 * (xmin - xa);
+                                       v4 += dv4 * (xmin - xa);
+                                       xa = xmin;
+                               }
+
+                               // Set u,v coords for inner driver
+                               gpu_unai.u = u4;
+                               gpu_unai.v = v4;
+
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
                        }
-                       if(xb > xmax) xb = xmax;
-                       xb-=xa;
-                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
                }
-       }
+       } while (++cur_pass < total_passes);
 }
 
 /*----------------------------------------------------------------------
-G3
+gpuDrawPolyG - Gouraud-shaded, untextured poly
 ----------------------------------------------------------------------*/
-
-void gpuDrawG3(const PP gpuPolySpanDriver)
+void gpuDrawPolyG(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
 {
-       const int li=linesInterlace;
-       s32 temp;
-       s32 xa, xb, xmin, xmax;
-       s32 ya, yb, ymin, ymax;
-       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-       s32 y0, y1, y2;
-       s32 r0, r1, r2, r3, dr3=0;
-       s32 g0, g1, g2, g3, dg3=0;
-       s32 b0, b1, b2, b3, db3=0;
-
-       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
-       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
-       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[6] );
-       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[7] );
-       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[10]);
-       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[11]);
-
-       GPU_TESTRANGE3();
-
-       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
-
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_G, is_quad);
 
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
        {
-               int rx0 = Max2(xmin,Min3(x0,x1,x2));
-               int ry0 = Max2(ymin,Min3(y0,y1,y2));
-               int rx1 = Min2(xmax,Max3(x0,x1,x2));
-               int ry1 = Min2(ymax,Max3(y0,y1,y2));
-               if( rx0>=rx1 || ry0>=ry1) return;
-       }
-       
-       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
-       r1 = PacketBuffer.U1[8];        g1 = PacketBuffer.U1[9];        b1 = PacketBuffer.U1[10];
-       r2 = PacketBuffer.U1[16];       g2 = PacketBuffer.U1[17];       b2 = PacketBuffer.U1[18];
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 r3, dr3, g3, dg3, b3, db3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+               s32 dr4, dg4, db4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+               dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+               db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       dr4 = -dr4;
+                       dg4 = -dg4;
+                       db4 = -db4;
+               }
 
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+                       db4 = (fixed)((db4 << FIXED_BITS) * finv);
+               } else {
+                       dr4 = dg4 = db4 = 0;
                }
-       }
-       if (y1 >= y2)
-       {
-               if( y1!=y2 || x1>x2 )
-               {
-                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
-                       GPU_SWAP(r1, r2, temp);         GPU_SWAP(g1, g2, temp);   GPU_SWAP(b1, b2, temp);
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+                       db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+               } else {
+                       dr4 = dg4 = db4 = 0;
                }
-       }
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(r0, r1, temp);   GPU_SWAP(g0, g1, temp);               GPU_SWAP(b0, b1, temp);
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       dr4 = xInvMulx(dr4, iF, iS);
+                       dg4 = xInvMulx(dg4, iF, iS);
+                       db4 = xInvMulx(db4, iF, iS);
+               } else {
+                       dr4 = dg4 = db4 = 0;
                }
-       }
-
-       ya  = y2 - y0;
-       yb  = y2 - y1;
-       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
-       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
-       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
-       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
-
-       s32 iF,iS;
-       xInv(            dx, iF, iS);
-       dr4 = xInvMulx( dr4, iF, iS);
-       dg4 = xInvMulx( dg4, iF, iS);
-       db4 = xInvMulx( db4, iF, iS);
-       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
-       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
-       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
-       lInc = db + dg + dr;
-
-       for (s32 loop0 = 2; loop0; --loop0)
-       {
-               if (loop0 == 2)
-               {
-                       ya = y0;
-                       yb = y1;
-                       r3 = i2x(r0);
-                       g3 = i2x(g0);
-                       b3 = i2x(b0);
-                       x3 = i2x(x0);
-                       x4 = y0!=y1 ? x3 : i2x(x1);
-                       if (dx < 0)
-                       {
-                               xInv(           (y2 - y0), iF, iS);
-                               dx3 = xInvMulx( (x2 - x0), iF, iS);
-                               dr3 = xInvMulx( (r2 - r0), iF, iS);
-                               dg3 = xInvMulx( (g2 - g0), iF, iS);
-                               db3 = xInvMulx( (b2 - b0), iF, iS);
-                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
-                       }
-                       else
-                       {
-                               xInv(           (y1 - y0), iF, iS);
-                               dx3 = xInvMulx( (x1 - x0), iF, iS);
-                               dr3 = xInvMulx( (r1 - r0), iF, iS);
-                               dg3 = xInvMulx( (g1 - g0), iF, iS);
-                               db3 = xInvMulx( (b1 - b0), iF, iS);
-                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
-                       }
+#else
+               if (dx4 != 0) {
+                       dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+                       dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+                       db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+               } else {
+                       dr4 = dg4 = db4 = 0;
                }
-               else
-               {
-                       ya = y1;
-                       yb = y2;
-                       if (dx < 0)
-                       {
-                               temp = y1 - y0;
-                               r3  = i2x(r0) + (dr3 * temp);
-                               g3  = i2x(g0) + (dg3 * temp);
-                               b3  = i2x(b0) + (db3 * temp);
-                               x3  = i2x(x0) + (dx3 * temp);
-                               x4  = i2x(x1);
-                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
-                       }
-                       else
-                       {
-                               r3 = i2x(r1);
-                               g3 = i2x(g1);
-                               b3 = i2x(b1);
-                               x3 = i2x(x1);
-                               x4 = i2x(x0) + (dx4 * (y1 - y0));
-
-                               xInv(           (y2 - y1), iF, iS);
-                               dx3 = xInvMulx( (x2 - x1), iF, iS);
-                               dr3 = xInvMulx( (r2 - r1), iF, iS);
-                               dg3 = xInvMulx( (g2 - g1), iF, iS);
-                               db3 = xInvMulx( (b2 - b1), iF, iS);
+#endif
+#endif
+               // Setup packed Gouraud increment for inner driver
+               gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               ya = y0;
+                               yb = y1;
+                               x3 = x4 = i2x(x0);
+                               r3 = i2x(r0);
+                               g3 = i2x(g0);
+                               b3 = i2x(b0);
+                               if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               dr3 = xInvMulx((r2 - r0), iF, iS);
+                                               dg3 = xInvMulx((g2 - g0), iF, iS);
+                                               db3 = xInvMulx((b2 - b0), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+                                               dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+                                               db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / (float)(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               dr3 = xInvMulx((r1 - r0), iF, iS);
+                                               dg3 = xInvMulx((g1 - g0), iF, iS);
+                                               db3 = xInvMulx((b1 - b0), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+                                               dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+                                               db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       x3 = i2x(x0);  x4 = i2x(x1);
+                                       r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               r3 += (dr3 * (y1 - y0));
+                                               g3 += (dg3 * (y1 - y0));
+                                               b3 += (db3 * (y1 - y0));
+                                       }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                                       r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               dr3 = xInvMulx((r2 - r1), iF, iS);
+                                               dg3 = xInvMulx((g2 - g1), iF, iS);
+                                               db3 = xInvMulx((b2 - b1), iF, iS);
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+                                               dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+                                               db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#endif
+                               }
                        }
-               }
 
-               temp = ymin - ya;
-               if (temp > 0)
-               {
-                       ya  = ymin;
-                       x3 += dx3*temp;   x4 += dx4*temp;
-                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
-               }
-               if (yb > ymax) yb = ymax;
-               if (ya>=yb) continue;
-
-               x3+= fixed_HALF;  x4+= fixed_HALF;
-               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
-
-               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
-               
-               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, r3+=dr3, g3+=dg3, b3+=db3)
-               {
-                       if (ya&li) continue;
-                       xa = x2i(x3);
-                       xb = x2i(x4);
-                       if( (xa>xmax) || (xb<xmin) ) continue;
-
-                       temp = xmin - xa;
-                       if(temp > 0)
-                       {
-                               xa  = xmin;
-                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+                       ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) {
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               r3 += (dr3 * (ymin - ya));
+                               g3 += (dg3 * (ymin - ya));
+                               b3 += (db3 * (ymin - ya));
+                               ya = ymin;
                        }
-                       else
+
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_unai.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       r3 += dr3, g3 += dg3, b3 += db3 )
                        {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               u32 r4, g4, b4;
+
+                               xa = FixedCeilToInt(x3);
+                               xb = FixedCeilToInt(x4);
                                r4 = r3;  g4 = g3;  b4 = b3;
+
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       r4 += (dr4 * itmp) >> FIXED_BITS;
+                                       g4 += (dg4 * itmp) >> FIXED_BITS;
+                                       b4 += (db4 * itmp) >> FIXED_BITS;
+                               }
+
+                               r4 += fixed_HALF;
+                               g4 += fixed_HALF;
+                               b4 += fixed_HALF;
+
+                               if ((xmin - xa) > 0) {
+                                       r4 += (dr4 * (xmin - xa));
+                                       g4 += (dg4 * (xmin - xa));
+                                       b4 += (db4 * (xmin - xa));
+                                       xa = xmin;
+                               }
+
+                               // Setup packed Gouraud color for inner driver
+                               gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
                        }
-                       if(xb > xmax) xb = xmax;
-                       xb-=xa;
-                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
                }
-       }
+       } while (++cur_pass < total_passes);
 }
 
 /*----------------------------------------------------------------------
-GT3
+gpuDrawPolyGT - Gouraud-shaded, textured poly
 ----------------------------------------------------------------------*/
-
-void gpuDrawGT3(const PP gpuPolySpanDriver)
+void gpuDrawPolyGT(const PtrUnion packet, const PP gpuPolySpanDriver, u32 is_quad)
 {
-       const int li=linesInterlace;
-       s32 temp;
-       s32 xa, xb, xmin, xmax;
-       s32 ya, yb, ymin, ymax;
-       s32 x0, x1, x2, x3, dx3=0, x4, dx4=0, dx;
-       s32 y0, y1, y2;
-       s32 u0, u1, u2, u3, du3=0;
-       s32 v0, v1, v2, v3, dv3=0;
-       s32 r0, r1, r2, r3, dr3=0;
-       s32 g0, g1, g2, g3, dg3=0;
-       s32 b0, b1, b2, b3, db3=0;
-
-       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2] );
-       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3] );
-       x1 = GPU_EXPANDSIGN(PacketBuffer.S2[8] );
-       y1 = GPU_EXPANDSIGN(PacketBuffer.S2[9] );
-       x2 = GPU_EXPANDSIGN(PacketBuffer.S2[14]);
-       y2 = GPU_EXPANDSIGN(PacketBuffer.S2[15]);
-
-       GPU_TESTRANGE3();
-
-       x0 += DrawingOffset[0];   x1 += DrawingOffset[0];   x2 += DrawingOffset[0];
-       y0 += DrawingOffset[1];   y1 += DrawingOffset[1];   y2 += DrawingOffset[1];
-
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
+       PolyVertex vbuf[4];
+       polyInitVertexBuffer(vbuf, packet, POLYTYPE_GT, is_quad);
 
+       int total_passes = is_quad ? 2 : 1;
+       int cur_pass = 0;
+       do
        {
-               int rx0 = Max2(xmin,Min3(x0,x1,x2));
-               int ry0 = Max2(ymin,Min3(y0,y1,y2));
-               int rx1 = Min2(xmax,Max3(x0,x1,x2));
-               int ry1 = Min2(ymax,Max3(y0,y1,y2));
-               if( rx0>=rx1 || ry0>=ry1) return;
-       }
-
-       r0 = PacketBuffer.U1[0];        g0 = PacketBuffer.U1[1];        b0 = PacketBuffer.U1[2];
-       u0 = PacketBuffer.U1[8];        v0 = PacketBuffer.U1[9];
-       r1 = PacketBuffer.U1[12];       g1 = PacketBuffer.U1[13];       b1 = PacketBuffer.U1[14];
-       u1 = PacketBuffer.U1[20];       v1 = PacketBuffer.U1[21];
-       r2 = PacketBuffer.U1[24];       g2 = PacketBuffer.U1[25];       b2 = PacketBuffer.U1[26];
-       u2 = PacketBuffer.U1[32];       v2 = PacketBuffer.U1[33];
+               const PolyVertex* vptrs[3];
+               if (polyUseTriangle(vbuf, cur_pass, vptrs) == false)
+                       continue;
+
+               s32 xa, xb, ya, yb;
+               s32 x3, dx3, x4, dx4, dx;
+               s32 u3, du3, v3, dv3;
+               s32 r3, dr3, g3, dg3, b3, db3;
+               s32 x0, x1, x2, y0, y1, y2;
+               s32 u0, u1, u2, v0, v1, v2;
+               s32 r0, r1, r2, g0, g1, g2, b0, b1, b2;
+               s32 du4, dv4;
+               s32 dr4, dg4, db4;
+
+               x0 = vptrs[0]->x;      y0 = vptrs[0]->y;
+               u0 = vptrs[0]->tex.u;  v0 = vptrs[0]->tex.v;
+               r0 = vptrs[0]->col.r;  g0 = vptrs[0]->col.g;  b0 = vptrs[0]->col.b;
+               x1 = vptrs[1]->x;      y1 = vptrs[1]->y;
+               u1 = vptrs[1]->tex.u;  v1 = vptrs[1]->tex.v;
+               r1 = vptrs[1]->col.r;  g1 = vptrs[1]->col.g;  b1 = vptrs[1]->col.b;
+               x2 = vptrs[2]->x;      y2 = vptrs[2]->y;
+               u2 = vptrs[2]->tex.u;  v2 = vptrs[2]->tex.v;
+               r2 = vptrs[2]->col.r;  g2 = vptrs[2]->col.g;  b2 = vptrs[2]->col.b;
+
+               ya = y2 - y0;
+               yb = y2 - y1;
+               dx4 = (x2 - x1) * ya - (x2 - x0) * yb;
+               du4 = (u2 - u1) * ya - (u2 - u0) * yb;
+               dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
+               dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
+               dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
+               db4 = (b2 - b1) * ya - (b2 - b0) * yb;
+               dx = dx4;
+               if (dx4 < 0) {
+                       dx4 = -dx4;
+                       du4 = -du4;
+                       dv4 = -dv4;
+                       dr4 = -dr4;
+                       dg4 = -dg4;
+                       db4 = -db4;
+               }
 
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
-                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);   GPU_SWAP(b0, b1, temp);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+               if (dx4 != 0) {
+                       float finv = FloatInv(dx4);
+                       du4 = (fixed)((du4 << FIXED_BITS) * finv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) * finv);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) * finv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) * finv);
+                       db4 = (fixed)((db4 << FIXED_BITS) * finv);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
                }
-       }
-       if (y1 >= y2)
-       {
-               if( y1!=y2 || x1>x2 )
-               {
-                       GPU_SWAP(x1, x2, temp);         GPU_SWAP(y1, y2, temp);
-                       GPU_SWAP(u1, u2, temp);         GPU_SWAP(v1, v2, temp);
-                       GPU_SWAP(r1, r2, temp);   GPU_SWAP(g1, g2, temp);               GPU_SWAP(b1, b2, temp);
+#else
+               if (dx4 != 0) {
+                       float fdiv = dx4;
+                       du4 = (fixed)((du4 << FIXED_BITS) / fdiv);
+                       dv4 = (fixed)((dv4 << FIXED_BITS) / fdiv);
+                       dr4 = (fixed)((dr4 << FIXED_BITS) / fdiv);
+                       dg4 = (fixed)((dg4 << FIXED_BITS) / fdiv);
+                       db4 = (fixed)((db4 << FIXED_BITS) / fdiv);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
                }
-       }
-       if (y0 >= y1)
-       {
-               if( y0!=y1 || x0>x1 )
-               {
-                       GPU_SWAP(x0, x1, temp);         GPU_SWAP(y0, y1, temp);
-                       GPU_SWAP(u0, u1, temp);         GPU_SWAP(v0, v1, temp);
-                       GPU_SWAP(r0, r1, temp);         GPU_SWAP(g0, g1, temp);         GPU_SWAP(b0, b1, temp);
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+               if (dx4 != 0) {
+                       int iF, iS;
+                       xInv(dx4, iF, iS);
+                       du4 = xInvMulx(du4, iF, iS);
+                       dv4 = xInvMulx(dv4, iF, iS);
+                       dr4 = xInvMulx(dr4, iF, iS);
+                       dg4 = xInvMulx(dg4, iF, iS);
+                       db4 = xInvMulx(db4, iF, iS);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
                }
-       }
-
-       ya  = y2 - y0;
-       yb  = y2 - y1;
-       dx  = (x2 - x1) * ya - (x2 - x0) * yb;
-       du4 = (u2 - u1) * ya - (u2 - u0) * yb;
-       dv4 = (v2 - v1) * ya - (v2 - v0) * yb;
-       dr4 = (r2 - r1) * ya - (r2 - r0) * yb;
-       dg4 = (g2 - g1) * ya - (g2 - g0) * yb;
-       db4 = (b2 - b1) * ya - (b2 - b0) * yb;
-
-       s32 iF,iS;
-
-       xInv(            dx, iF, iS);
-       du4 = xInvMulx( du4, iF, iS);
-       dv4 = xInvMulx( dv4, iF, iS);
-       dr4 = xInvMulx( dr4, iF, iS);
-       dg4 = xInvMulx( dg4, iF, iS);
-       db4 = xInvMulx( db4, iF, iS);
-       u32 dr = (u32)(dr4<< 8)&(0xffffffff<<21);   if(dr4<0) dr+= 1<<21;
-       u32 dg = (u32)(dg4>> 3)&(0xffffffff<<10);   if(dg4<0) dg+= 1<<10;
-       u32 db = (u32)(db4>>14)&(0xffffffff    );   if(db4<0) db+= 1<< 0;
-       lInc = db + dg + dr;
-       tInc = ((u32)(du4<<7)&0x7fff0000) | ((u32)(dv4>>9)&0x00007fff);
-       tMsk = (TextureWindow[2]<<23) | (TextureWindow[3]<<7) | 0x00ff00ff;
-
-       for (s32 loop0 = 2; loop0; --loop0)
-       {
-               if (loop0 == 2)
-               {
-                       ya = y0;
-                       yb = y1;
-                       u3 = i2x(u0);
-                       v3 = i2x(v0);
-                       r3 = i2x(r0);
-                       g3 = i2x(g0);
-                       b3 = i2x(b0);
-                       x3 = i2x(x0);
-                       x4 = y0!=y1 ? x3 : i2x(x1);
-                       if (dx < 0)
-                       {
-                               xInv(           (y2 - y0), iF, iS);
-                               dx3 = xInvMulx( (x2 - x0), iF, iS);
-                               du3 = xInvMulx( (u2 - u0), iF, iS);
-                               dv3 = xInvMulx( (v2 - v0), iF, iS);
-                               dr3 = xInvMulx( (r2 - r0), iF, iS);
-                               dg3 = xInvMulx( (g2 - g0), iF, iS);
-                               db3 = xInvMulx( (b2 - b0), iF, iS);
-                               dx4 = xLoDivx ( (x1 - x0), (y1 - y0));
-                       }
-                       else
-                       {
-                               xInv(           (y1 - y0), iF, iS);
-                               dx3 = xInvMulx( (x1 - x0), iF, iS);
-                               du3 = xInvMulx( (u1 - u0), iF, iS);
-                               dv3 = xInvMulx( (v1 - v0), iF, iS);
-                               dr3 = xInvMulx( (r1 - r0), iF, iS);
-                               dg3 = xInvMulx( (g1 - g0), iF, iS);
-                               db3 = xInvMulx( (b1 - b0), iF, iS);
-                               dx4 = xLoDivx ( (x2 - x0), (y2 - y0));
-                       }
+#else
+               if (dx4 != 0) {
+                       du4 = GPU_FAST_DIV(du4 << FIXED_BITS, dx4);
+                       dv4 = GPU_FAST_DIV(dv4 << FIXED_BITS, dx4);
+                       dr4 = GPU_FAST_DIV(dr4 << FIXED_BITS, dx4);
+                       dg4 = GPU_FAST_DIV(dg4 << FIXED_BITS, dx4);
+                       db4 = GPU_FAST_DIV(db4 << FIXED_BITS, dx4);
+               } else {
+                       du4 = dv4 = dr4 = dg4 = db4 = 0;
                }
-               else
-               {
-                       ya = y1;
-                       yb = y2;
-                       if (dx < 0)
-                       {
-                               temp = y1 - y0;
-                               u3  = i2x(u0) + (du3 * temp);
-                               v3  = i2x(v0) + (dv3 * temp);
-                               r3  = i2x(r0) + (dr3 * temp);
-                               g3  = i2x(g0) + (dg3 * temp);
-                               b3  = i2x(b0) + (db3 * temp);
-                               x3  = i2x(x0) + (dx3 * temp);
-                               x4  = i2x(x1);
-                               dx4 = xLoDivx((x2 - x1), (y2 - y1));
+#endif
+#endif
+               // Set u,v increments and packed Gouraud increment for inner driver
+               gpu_unai.u_inc = du4;
+               gpu_unai.v_inc = dv4;
+               gpu_unai.gInc = gpuPackGouraudColInc(dr4, dg4, db4);
+
+               for (s32 loop0 = 2; loop0; loop0--) {
+                       if (loop0 == 2) {
+                               ya = y0;  yb = y1;
+                               x3 = x4 = i2x(x0);
+                               u3 = i2x(u0);  v3 = i2x(v0);
+                               r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+                               if (dx < 0) {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               float finv = FloatInv(y2 - y0);
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) * FloatInv(y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               float fdiv = y2 - y0;
+                                               dx3 = (fixed)(((x2 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? (fixed)(((x1 - x0) << FIXED_BITS) / (float)(y1 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y0), iF, iS);
+                                               dx3 = xInvMulx((x2 - x0), iF, iS);
+                                               du3 = xInvMulx((u2 - u0), iF, iS);
+                                               dv3 = xInvMulx((v2 - v0), iF, iS);
+                                               dr3 = xInvMulx((r2 - r0), iF, iS);
+                                               dg3 = xInvMulx((g2 - g0), iF, iS);
+                                               db3 = xInvMulx((b2 - b0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? xLoDivx((x1 - x0), (y1 - y0)) : 0;
+#else
+                                       if ((y2 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0));
+                                               du3 = GPU_FAST_DIV((u2 - u0) << FIXED_BITS, (y2 - y0));
+                                               dv3 = GPU_FAST_DIV((v2 - v0) << FIXED_BITS, (y2 - y0));
+                                               dr3 = GPU_FAST_DIV((r2 - r0) << FIXED_BITS, (y2 - y0));
+                                               dg3 = GPU_FAST_DIV((g2 - g0) << FIXED_BITS, (y2 - y0));
+                                               db3 = GPU_FAST_DIV((b2 - b0) << FIXED_BITS, (y2 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y1 - y0) != 0) ? GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0)) : 0;
+#endif
+#endif
+                               } else {
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               float finv = FloatInv(y1 - y0);
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) * FloatInv(y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               float fdiv = y1 - y0;
+                                               dx3 = (fixed)(((x1 - x0) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u1 - u0) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v1 - v0) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r1 - r0) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g1 - g0) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b1 - b0) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? (fixed)(((x2 - x0) << FIXED_BITS) / float(y2 - y0)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y1 - y0) != 0) {
+                                               int iF, iS;
+                                               xInv((y1 - y0), iF, iS);
+                                               dx3 = xInvMulx((x1 - x0), iF, iS);
+                                               du3 = xInvMulx((u1 - u0), iF, iS);
+                                               dv3 = xInvMulx((v1 - v0), iF, iS);
+                                               dr3 = xInvMulx((r1 - r0), iF, iS);
+                                               dg3 = xInvMulx((g1 - g0), iF, iS);
+                                               db3 = xInvMulx((b1 - b0), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? xLoDivx((x2 - x0), (y2 - y0)) : 0;
+#else
+                                       if ((y1 - y0) != 0) {
+                                               dx3 = GPU_FAST_DIV((x1 - x0) << FIXED_BITS, (y1 - y0));
+                                               du3 = GPU_FAST_DIV((u1 - u0) << FIXED_BITS, (y1 - y0));
+                                               dv3 = GPU_FAST_DIV((v1 - v0) << FIXED_BITS, (y1 - y0));
+                                               dr3 = GPU_FAST_DIV((r1 - r0) << FIXED_BITS, (y1 - y0));
+                                               dg3 = GPU_FAST_DIV((g1 - g0) << FIXED_BITS, (y1 - y0));
+                                               db3 = GPU_FAST_DIV((b1 - b0) << FIXED_BITS, (y1 - y0));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+                                       dx4 = ((y2 - y0) != 0) ? GPU_FAST_DIV((x2 - x0) << FIXED_BITS, (y2 - y0)) : 0;
+#endif
+#endif
+                               }
+                       } else {
+                               //senquack - break out of final loop if nothing to be drawn (1st loop
+                               //           must always be taken to setup dx3/dx4)
+                               if (y1 == y2) break;
+
+                               ya = y1;  yb = y2;
+
+                               if (dx < 0) {
+                                       x3 = i2x(x0);  x4 = i2x(x1);
+                                       u3 = i2x(u0);  v3 = i2x(v0);
+                                       r3 = i2x(r0);  g3 = i2x(g0);  b3 = i2x(b0);
+
+                                       if ((y1 - y0) != 0) {
+                                               x3 += (dx3 * (y1 - y0));
+                                               u3 += (du3 * (y1 - y0));
+                                               v3 += (dv3 * (y1 - y0));
+                                               r3 += (dr3 * (y1 - y0));
+                                               g3 += (dg3 * (y1 - y0));
+                                               b3 += (db3 * (y1 - y0));
+                                       }
+
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) * FloatInv(y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? (fixed)(((x2 - x1) << FIXED_BITS) / (float)(y2 - y1)) : 0;
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       dx4 = ((y2 - y1) != 0) ? xLoDivx((x2 - x1), (y2 - y1)) : 0;
+#else
+                                       dx4 = ((y2 - y1) != 0) ? GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1)) : 0;
+#endif
+#endif
+                               } else {
+                                       x3 = i2x(x1);
+                                       x4 = i2x(x0) + (dx4 * (y1 - y0));
+
+                                       u3 = i2x(u1);  v3 = i2x(v1);
+                                       r3 = i2x(r1);  g3 = i2x(g1);  b3 = i2x(b1);
+#ifdef GPU_UNAI_USE_FLOATMATH
+#ifdef GPU_UNAI_USE_FLOAT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               float finv = FloatInv(y2 - y1);
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) * finv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) * finv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) * finv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) * finv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) * finv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) * finv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               float fdiv = y2 - y1;
+                                               dx3 = (fixed)(((x2 - x1) << FIXED_BITS) / fdiv);
+                                               du3 = (fixed)(((u2 - u1) << FIXED_BITS) / fdiv);
+                                               dv3 = (fixed)(((v2 - v1) << FIXED_BITS) / fdiv);
+                                               dr3 = (fixed)(((r2 - r1) << FIXED_BITS) / fdiv);
+                                               dg3 = (fixed)(((g2 - g1) << FIXED_BITS) / fdiv);
+                                               db3 = (fixed)(((b2 - b1) << FIXED_BITS) / fdiv);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#else  // Integer Division:
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+                                       if ((y2 - y1) != 0) {
+                                               int iF, iS;
+                                               xInv((y2 - y1), iF, iS);
+                                               dx3 = xInvMulx((x2 - x1), iF, iS);
+                                               du3 = xInvMulx((u2 - u1), iF, iS);
+                                               dv3 = xInvMulx((v2 - v1), iF, iS);
+                                               dr3 = xInvMulx((r2 - r1), iF, iS);
+                                               dg3 = xInvMulx((g2 - g1), iF, iS);
+                                               db3 = xInvMulx((b2 - b1), iF, iS);
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#else
+                                       if ((y2 - y1) != 0) {
+                                               dx3 = GPU_FAST_DIV((x2 - x1) << FIXED_BITS, (y2 - y1));
+                                               du3 = GPU_FAST_DIV((u2 - u1) << FIXED_BITS, (y2 - y1));
+                                               dv3 = GPU_FAST_DIV((v2 - v1) << FIXED_BITS, (y2 - y1));
+                                               dr3 = GPU_FAST_DIV((r2 - r1) << FIXED_BITS, (y2 - y1));
+                                               dg3 = GPU_FAST_DIV((g2 - g1) << FIXED_BITS, (y2 - y1));
+                                               db3 = GPU_FAST_DIV((b2 - b1) << FIXED_BITS, (y2 - y1));
+                                       } else {
+                                               dx3 = du3 = dv3 = dr3 = dg3 = db3 = 0;
+                                       }
+#endif
+#endif
+                               }
                        }
-                       else
-                       {
-                               u3 = i2x(u1);
-                               v3 = i2x(v1);
-                               r3 = i2x(r1);
-                               g3 = i2x(g1);
-                               b3 = i2x(b1);
-                               x3 = i2x(x1);
-                               x4 = i2x(x0) + (dx4 * (y1 - y0));
-
-                               xInv(           (y2 - y1), iF, iS);
-                               dx3 = xInvMulx( (x2 - x1), iF, iS);
-                               du3 = xInvMulx( (u2 - u1), iF, iS);
-                               dv3 = xInvMulx( (v2 - v1), iF, iS);
-                               dr3 = xInvMulx( (r2 - r1), iF, iS);
-                               dg3 = xInvMulx( (g2 - g1), iF, iS);
-                               db3 = xInvMulx( (b2 - b1), iF, iS);
-                       }
-               }
 
-               temp = ymin - ya;
-               if (temp > 0)
-               {
-                       ya  = ymin;
-                       x3 += dx3*temp;   x4 += dx4*temp;
-                       u3 += du3*temp;   v3 += dv3*temp;
-                       r3 += dr3*temp;   g3 += dg3*temp;   b3 += db3*temp;
-               }
-               if (yb > ymax) yb = ymax;
-               if (ya>=yb) continue;
-
-               x3+= fixed_HALF;  x4+= fixed_HALF;
-               u3+= fixed_HALF;  v4+= fixed_HALF;
-               r3+= fixed_HALF;  g3+= fixed_HALF;  b3+= fixed_HALF;
-               u16* PixelBase  = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(0, ya)];
-               
-               for(;ya<yb;++ya, PixelBase += FRAME_WIDTH, x3+=dx3, x4+=dx4, u3+=du3, v3+=dv3, r3+=dr3, g3+=dg3,        b3+=db3)
-               {
-                       if (ya&li) continue;
-                       xa = x2i(x3);
-                       xb = x2i(x4);
-                       if( (xa>xmax) || (xb<xmin))     continue;
-
-                       temp = xmin - xa;
-                       if(temp > 0)
-                       {
-                               xa  = xmin;
-                               u4 = u3 + du4*temp;   v4 = v3 + dv4*temp;
-                               r4 = r3 + dr4*temp;   g4 = g3 + dg4*temp;   b4 = b3 + db4*temp;
+                       s32 xmin, xmax, ymin, ymax;
+                       xmin = gpu_unai.DrawingArea[0];  xmax = gpu_unai.DrawingArea[2];
+                       ymin = gpu_unai.DrawingArea[1];  ymax = gpu_unai.DrawingArea[3];
+
+                       if ((ymin - ya) > 0) {
+                               x3 += (dx3 * (ymin - ya));
+                               x4 += (dx4 * (ymin - ya));
+                               u3 += (du3 * (ymin - ya));
+                               v3 += (dv3 * (ymin - ya));
+                               r3 += (dr3 * (ymin - ya));
+                               g3 += (dg3 * (ymin - ya));
+                               b3 += (db3 * (ymin - ya));
+                               ya = ymin;
                        }
-                       else
+
+                       if (yb > ymax) yb = ymax;
+
+                       int loop1 = yb - ya;
+                       if (loop1 <= 0)
+                               continue;
+
+                       u16* PixelBase = &((u16*)gpu_unai.vram)[FRAME_OFFSET(0, ya)];
+                       int li=gpu_unai.ilace_mask;
+                       int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+                       int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+                       for (; loop1; --loop1, ++ya, PixelBase += FRAME_WIDTH,
+                                       x3 += dx3, x4 += dx4,
+                                       u3 += du3, v3 += dv3,
+                                       r3 += dr3, g3 += dg3, b3 += db3 )
                        {
+                               if (ya&li) continue;
+                               if ((ya&pi)==pif) continue;
+
+                               u32 u4, v4;
+                               u32 r4, g4, b4;
+
+                               xa = FixedCeilToInt(x3);
+                               xb = FixedCeilToInt(x4);
                                u4 = u3;  v4 = v3;
                                r4 = r3;  g4 = g3;  b4 = b3;
+
+                               fixed itmp = i2x(xa) - x3;
+                               if (itmp != 0) {
+                                       u4 += (du4 * itmp) >> FIXED_BITS;
+                                       v4 += (dv4 * itmp) >> FIXED_BITS;
+                                       r4 += (dr4 * itmp) >> FIXED_BITS;
+                                       g4 += (dg4 * itmp) >> FIXED_BITS;
+                                       b4 += (db4 * itmp) >> FIXED_BITS;
+                               }
+
+                               u4 += fixed_HALF;
+                               v4 += fixed_HALF;
+                               r4 += fixed_HALF;
+                               g4 += fixed_HALF;
+                               b4 += fixed_HALF;
+
+                               if ((xmin - xa) > 0) {
+                                       u4 += du4 * (xmin - xa);
+                                       v4 += dv4 * (xmin - xa);
+                                       r4 += dr4 * (xmin - xa);
+                                       g4 += dg4 * (xmin - xa);
+                                       b4 += db4 * (xmin - xa);
+                                       xa = xmin;
+                               }
+
+                               // Set packed Gouraud color and u,v coords for inner driver
+                               gpu_unai.u = u4;
+                               gpu_unai.v = v4;
+                               gpu_unai.gCol = gpuPackGouraudCol(r4, g4, b4);
+
+                               if (xb > xmax) xb = xmax;
+                               if ((xb - xa) > 0)
+                                       gpuPolySpanDriver(gpu_unai, PixelBase + xa, (xb - xa));
                        }
-                       if(xb > xmax) xb = xmax;
-                       xb-=xa;
-                       if(xb>0) gpuPolySpanDriver(PixelBase + xa,xb);
                }
-       }
+       } while (++cur_pass < total_passes);
 }
index a700db3..0afdbf5 100644 (file)
 ///////////////////////////////////////////////////////////////////////////////
 //  GPU internal sprite drawing functions
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawS(const PS gpuSpriteSpanDriver)
+void gpuDrawS(PtrUnion packet, const PS gpuSpriteSpanDriver)
 {
-       s32 x0, x1;
-       s32 y0, y1;
-       s32 u0;
-       s32 v0;
-
-       x1 = x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0];
-       y1 = y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1];
-       x1+= PacketBuffer.S2[6];
-       y1+= PacketBuffer.S2[7];
-
-       {
-               s32 xmin, xmax;
-               s32 ymin, ymax;
-               xmin = DrawingArea[0];  xmax = DrawingArea[2];
-               ymin = DrawingArea[1];  ymax = DrawingArea[3];
-
-               {
-                       int rx0 = Max2(xmin,Min2(x0,x1));
-                       int ry0 = Max2(ymin,Min2(y0,y1));
-                       int rx1 = Min2(xmax,Max2(x0,x1));
-                       int ry1 = Min2(ymax,Max2(y0,y1));
-                       if( rx0>=rx1 || ry0>=ry1) return;
-               }
-
-               u0 = PacketBuffer.U1[8];
-               v0 = PacketBuffer.U1[9];
-
-               r4 = s32(PacketBuffer.U1[0]);
-               g4 = s32(PacketBuffer.U1[1]);
-               b4 = s32(PacketBuffer.U1[2]);
-
-               {
-                       s32 temp;
-                       temp = ymin - y0;
-                       if (temp > 0) { y0 = ymin; v0 += temp; }
-                       if (y1 > ymax) y1 = ymax;
-                       if (y1 <= y0) return;
-                       
-                       temp = xmin - x0;
-                       if (temp > 0) { x0 = xmin; u0 += temp; }
-                       if (x1 > xmax) x1 = xmax;
-                       x1 -= x0;
-                       if (x1 <= 0) return;
-               }
-       }
-
-       {
-               u16 *Pixel = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)];
-               const int li=linesInterlace;
-               const u32 masku=TextureWindow[2];
-               const u32 maskv=TextureWindow[3];
-
-               for (;y0<y1;++y0) {
-                       if( 0 == (y0&li) ) gpuSpriteSpanDriver(Pixel,x1,FRAME_OFFSET(u0,v0),masku);
-                       Pixel += FRAME_WIDTH;
-                       v0 = (v0+1)&maskv;
-               }
+       s32 x0, x1, y0, y1;
+       u32 u0, v0;
+
+       //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+       // or sprites in 1st level of SkullMonkeys disappear when walking right.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]);
+
+       u32 w = packet.U2[6] & 0x3ff; // Max width is 1023
+       u32 h = packet.U2[7] & 0x1ff; // Max height is 511
+       x1 = x0 + w;
+       y1 = y0 + h;
+
+       s32 xmin, xmax, ymin, ymax;
+       xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2];
+       ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3];
+
+       u0 = packet.U1[8];
+       v0 = packet.U1[9];
+
+       s32 temp;
+       temp = ymin - y0;
+       if (temp > 0) { y0 = ymin; v0 += temp; }
+       if (y1 > ymax) y1 = ymax;
+       if (y1 <= y0) return;
+
+       temp = xmin - x0;
+       if (temp > 0) { x0 = xmin; u0 += temp; }
+       if (x1 > xmax) x1 = xmax;
+       x1 -= x0;
+       if (x1 <= 0) return;
+
+       gpu_unai.r5 = packet.U1[0] >> 3;
+       gpu_unai.g5 = packet.U1[1] >> 3;
+       gpu_unai.b5 = packet.U1[2] >> 3;
+
+       u16 *Pixel = &((u16*)gpu_unai.vram)[FRAME_OFFSET(x0, y0)];
+       const int li=gpu_unai.ilace_mask;
+       const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+       const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+       unsigned int tmode = gpu_unai.TEXT_MODE >> 5;
+       const u32 v0_mask = gpu_unai.TextureWindow[3];
+       u8* pTxt_base = (u8*)gpu_unai.TBA;
+
+       // Texture is accessed byte-wise, so adjust idx if 16bpp
+       if (tmode == 3) u0 <<= 1;
+
+       for (; y0<y1; ++y0) {
+               u8* pTxt = pTxt_base + ((v0 & v0_mask) * 2048);
+               if (!(y0&li) && (y0&pi)!=pif)
+                       gpuSpriteSpanDriver(Pixel, x1, pTxt, u0);
+               Pixel += FRAME_WIDTH;
+               v0++;
        }
 }
 
 #ifdef __arm__
 #include "gpu_arm.h"
 
-void gpuDrawS16(void)
+/* Notaz 4bit sprites optimization */
+void gpuDrawS16(PtrUnion packet)
 {
        s32 x0, y0;
        s32 u0, v0;
@@ -95,19 +92,22 @@ void gpuDrawS16(void)
        s32 ymin, ymax;
        u32 h = 16;
 
-       x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0];
-       y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1];
+       //NOTE: Must 11-bit sign-extend the whole sum here, not just packet X/Y,
+       // or sprites in 1st level of SkullMonkeys disappear when walking right.
+       // This now matches behavior of Mednafen and PCSX Rearmed's gpu_neon:
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]);
 
-       xmin = DrawingArea[0];  xmax = DrawingArea[2];
-       ymin = DrawingArea[1];  ymax = DrawingArea[3];
-       u0 = PacketBuffer.U1[8];
-       v0 = PacketBuffer.U1[9];
+       xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2];
+       ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3];
+       u0 = packet.U1[8];
+       v0 = packet.U1[9];
 
        if (x0 > xmax - 16 || x0 < xmin ||
-           ((u0 | v0) & 15) || !(TextureWindow[2] & TextureWindow[3] & 8)) {
+           ((u0 | v0) & 15) || !(gpu_unai.TextureWindow[2] & gpu_unai.TextureWindow[3] & 8)) {
                // send corner cases to general handler
-               PacketBuffer.U4[3] = 0x00100010;
-               gpuDrawS(gpuSpriteSpanFn<0x20>);
+               packet.U4[3] = 0x00100010;
+               gpuDrawS(packet, gpuSpriteSpanFn<0x20>);
                return;
        }
 
@@ -121,54 +121,45 @@ void gpuDrawS16(void)
        else if (ymax - y0 < 16)
                h = ymax - y0;
 
-       draw_spr16_full(&GPU_FrameBuffer[FRAME_OFFSET(x0, y0)], &TBA[FRAME_OFFSET(u0/4, v0)], CBA, h);
+       draw_spr16_full(&gpu_unai.vram[FRAME_OFFSET(x0, y0)], &gpu_unai.TBA[FRAME_OFFSET(u0/4, v0)], gpu_unai.CBA, h);
 }
 #endif // __arm__
 
-///////////////////////////////////////////////////////////////////////////////
-void gpuDrawT(const PT gpuTileSpanDriver)
+void gpuDrawT(PtrUnion packet, const PT gpuTileSpanDriver)
 {
-       s32 x0, y0;
-       s32 x1, y1;
-
-       x1 = x0 = GPU_EXPANDSIGN(PacketBuffer.S2[2]) + DrawingOffset[0];
-       y1 = y0 = GPU_EXPANDSIGN(PacketBuffer.S2[3]) + DrawingOffset[1];
-       x1+= PacketBuffer.S2[4];
-       y1+= PacketBuffer.S2[5];
-
-       {
-               s32 xmin, xmax;
-               s32 ymin, ymax;
-               xmin = DrawingArea[0];  xmax = DrawingArea[2];
-               ymin = DrawingArea[1];  ymax = DrawingArea[3];
-
-               {
-                       int rx0 = Max2(xmin,Min2(x0,x1));
-                       int ry0 = Max2(ymin,Min2(y0,y1));
-                       int rx1 = Min2(xmax,Max2(x0,x1));
-                       int ry1 = Min2(ymax,Max2(y0,y1));
-                       if(rx0>=rx1 || ry0>=ry1) return;
-               }
-
-               if (y0 < ymin) y0 = ymin;
-               if (y1 > ymax) y1 = ymax;
-               if (y1 <= y0) return;
-
-               if (x0 < xmin) x0 = xmin;
-               if (x1 > xmax) x1 = xmax;
-               x1 -= x0;
-               if (x1 <= 0) return;
-       }
-       
-       {
-               u16 *Pixel = &((u16*)GPU_FrameBuffer)[FRAME_OFFSET(x0, y0)];
-               const u16 Data = GPU_RGB16(PacketBuffer.U4[0]);
-               const int li=linesInterlace;
-
-               for (; y0<y1; ++y0)
-               {
-                       if( 0 == (y0&li) ) gpuTileSpanDriver(Pixel,x1,Data);
-                       Pixel += FRAME_WIDTH;
-               }
+       s32 x0, x1, y0, y1;
+
+       // Sign-extend the whole 11-bit sum (packet X/Y + drawing offset); this matches Mednafen and PCSX Rearmed's gpu_neon:
+       x0 = GPU_EXPANDSIGN(packet.S2[2] + gpu_unai.DrawingOffset[0]);
+       y0 = GPU_EXPANDSIGN(packet.S2[3] + gpu_unai.DrawingOffset[1]);
+
+       u32 w = packet.U2[4] & 0x3ff; // Max width is 1023
+       u32 h = packet.U2[5] & 0x1ff; // Max height is 511
+       x1 = x0 + w;
+       y1 = y0 + h;
+
+       s32 xmin, xmax, ymin, ymax;
+       xmin = gpu_unai.DrawingArea[0]; xmax = gpu_unai.DrawingArea[2];
+       ymin = gpu_unai.DrawingArea[1]; ymax = gpu_unai.DrawingArea[3];
+
+       if (y0 < ymin) y0 = ymin;
+       if (y1 > ymax) y1 = ymax;
+       if (y1 <= y0) return;
+
+       if (x0 < xmin) x0 = xmin;
+       if (x1 > xmax) x1 = xmax;
+       x1 -= x0;
+       if (x1 <= 0) return;
+
+       const u16 Data = GPU_RGB16(packet.U4[0]);
+       u16 *Pixel = &((u16*)gpu_unai.vram)[FRAME_OFFSET(x0, y0)];
+       const int li=gpu_unai.ilace_mask;
+       const int pi=(ProgressiveInterlaceEnabled()?(gpu_unai.ilace_mask+1):0);
+       const int pif=(ProgressiveInterlaceEnabled()?(gpu_unai.prog_ilace_flag?(gpu_unai.ilace_mask+1):0):1);
+
+       for (; y0<y1; ++y0) {
+               if (!(y0&li) && (y0&pi)!=pif)
+                       gpuTileSpanDriver(Pixel,x1,Data);
+               Pixel += FRAME_WIDTH;
        }
 }
diff --git a/plugins/gpu_unai/gpu_unai.h b/plugins/gpu_unai/gpu_unai.h
new file mode 100644 (file)
index 0000000..8fb2293
--- /dev/null
@@ -0,0 +1,318 @@
+/***************************************************************************
+*   Copyright (C) 2010 PCSX4ALL Team                                      *
+*   Copyright (C) 2010 Unai                                               *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
+***************************************************************************/
+
+#ifndef GPU_UNAI_H
+#define GPU_UNAI_H
+
+#include "gpu.h"
+
+// Header shared between both standalone gpu_unai (gpu.cpp) and new
+// gpulib-compatible gpu_unai (gpulib_if.cpp)
+// -> Anything here should be for gpu_unai's private use. <-
+
+///////////////////////////////////////////////////////////////////////////////
+//  Compile Options
+
+//#define ENABLE_GPU_NULL_SUPPORT   // Enables NullGPU support
+//#define ENABLE_GPU_LOG_SUPPORT    // Enables GPU logger (very slow; for Windows debugging only)
+//#define ENABLE_GPU_ARMV7                     // Enables ARMv7 optimized assembly
+
+//Poly routine options (default is integer math and accurate division)
+//#define GPU_UNAI_USE_FLOATMATH         // Use float math in poly routines
+//#define GPU_UNAI_USE_FLOAT_DIV_MULTINV // If GPU_UNAI_USE_FLOATMATH is defined,
+                                         //  use multiply-by-inverse for division
+//#define GPU_UNAI_USE_INT_DIV_MULTINV   // If GPU_UNAI_USE_FLOATMATH is *not*
+                                         //  defined, use old inaccurate division
+
+
+#define GPU_INLINE static inline __attribute__((always_inline))
+#define INLINE     static inline __attribute__((always_inline))
+
+#define u8  uint8_t
+#define s8  int8_t
+#define u16 uint16_t
+#define s16 int16_t
+#define u32 uint32_t
+#define s32 int32_t
+#define s64 int64_t
+
+union PtrUnion    // One pointer, typed for 32/16/8-bit access (e.g. the 'packet' arg of gpuDrawS)
+{
+       u32  *U4;   // as unsigned 32-bit elements
+       s32  *S4;   // as signed 32-bit elements
+       u16  *U2;   // as unsigned 16-bit elements
+       s16  *S2;   // as signed 16-bit elements
+       u8   *U1;   // as unsigned 8-bit elements
+       s8   *S1;   // as signed 8-bit elements
+       void *ptr;  // untyped
+};
+
+union GPUPacket   // 64 bytes of packet storage, addressable as 32/16/8-bit elements
+{
+       u32 U4[16];  // 16 unsigned 32-bit words
+       s32 S4[16];  // 16 signed 32-bit words
+       u16 U2[32];  // 32 unsigned 16-bit halfwords
+       s16 S2[32];  // 32 signed 16-bit halfwords
+       u8  U1[64];  // 64 unsigned bytes
+       s8  S1[64];  // 64 signed bytes
+};
+
+template<class T> static inline void SwapValues(T &x, T &y)
+{
+       T tmp(x);  x = y;  y = tmp;
+}
+
+template<typename T>
+static inline T Min2 (const T a, const T b)
+{
+       return (a<b)?a:b;
+}
+
+template<typename T>
+static inline T Min3 (const T a, const T b, const T c)
+{
+       return  Min2(Min2(a,b),c);
+}
+
+template<typename T>
+static inline T Max2 (const T a, const T b)
+{
+       return  (a>b)?a:b;
+}
+
+template<typename T>
+static inline T Max3 (const T a, const T b, const T c)
+{
+       return  Max2(Max2(a,b),c);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+//  GPU Raster Macros
+
+// Convert 24bpp color parameter of GPU command to 16bpp (15bpp + mask bit)
+#define        GPU_RGB16(rgb) ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
+
+// Sign-extend 11-bit coordinate command param (shifted as u32: '<<' on a negative s32 is UB)
+#define GPU_EXPANDSIGN(x) ((s32)((u32)(x)<<(32-11))>>(32-11))
+
+// Max difference between any two X or Y primitive coordinates
+#define CHKMAX_X 1024
+#define CHKMAX_Y 512
+
+#define        FRAME_BUFFER_SIZE       (1024*512*2)
+#define        FRAME_WIDTH                       1024
+#define        FRAME_HEIGHT              512
+#define        FRAME_OFFSET(x,y)       (((y)<<10)+(x))
+#define FRAME_BYTE_STRIDE     2048
+#define FRAME_BYTES_PER_PIXEL 2
+
+static inline s32 GPU_DIV(s32 rs, s32 rt)  // Returns rs / rt, or 0 when rt is 0
+{
+       return rt ? (rs / rt) : (0);
+}
+
+// 'Unsafe' version of above that doesn't check for div-by-zero
+#define GPU_FAST_DIV(rs, rt) ((signed)(rs) / (signed)(rt))
+
+struct gpu_unai_t {
+       u32 GPU_GP1;
+       GPUPacket PacketBuffer;
+       u16 *vram;
+
+       ////////////////////////////////////////////////////////////////////////////
+       // Variables used only by older standalone version of gpu_unai (gpu.cpp)
+#ifndef USE_GPULIB
+       u32  GPU_GP0;
+       u32  tex_window;       // Current texture window vals (set by GP0(E2h) cmd)
+       s32  PacketCount;
+       s32  PacketIndex;
+       bool fb_dirty;         // Framebuffer is dirty (according to GPU)
+
+       //  Display status
+       //  NOTE: Standalone older gpu_unai didn't care about horiz display range
+       u16  DisplayArea[6];   // [0] : Start of display area (in VRAM) X
+                              // [1] : Start of display area (in VRAM) Y
+                              // [2] : Display mode resolution HORIZONTAL
+                              // [3] : Display mode resolution VERTICAL
+                              // [4] : Vertical display range (on TV) START
+                              // [5] : Vertical display range (on TV) END
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Dma Transfers info
+       struct {
+               s32  px,py;
+               s32  x_end,y_end;
+               u16* pvram;
+               u32 *last_dma;     // Last dma pointer
+               bool FrameToRead;  // Load image in progress
+               bool FrameToWrite; // Store image in progress
+       } dma;
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Frameskip
+       struct {
+               int  skipCount;    // Frame skip (0,1,2,3...)
+               bool isSkip;       // Skip frame (according to GPU)
+               bool skipFrame;    // Skip this frame (according to frame skip)
+               bool wasSkip;      // Skip frame old value (according to GPU)
+               bool skipGPU;      // Skip GPU primitives
+       } frameskip;
+#endif
+       // END of standalone gpu_unai variables
+       ////////////////////////////////////////////////////////////////////////////
+
+       u32 TextureWindowCur;  // Current setting from last GP0(0xE2) cmd (raw form)
+       u8  TextureWindow[4];  // [0] : Texture window offset X
+                              // [1] : Texture window offset Y
+                              // [2] : Texture window mask X
+                              // [3] : Texture window mask Y
+
+       u16 DrawingArea[4];    // [0] : Drawing area top left X
+                              // [1] : Drawing area top left Y
+                              // [2] : Drawing area bottom right X
+                              // [3] : Drawing area bottom right Y
+
+       s16 DrawingOffset[2];  // [0] : Drawing offset X (signed)
+                              // [1] : Drawing offset Y (signed)
+
+       u16* TBA;              // Ptr to current texture in VRAM
+       u16* CBA;              // Ptr to current CLUT in VRAM
+
+       ////////////////////////////////////////////////////////////////////////////
+       //  Inner Loop parameters
+
+       // 22.10 Fixed-pt texture coords, mask, scanline advance
+       // NOTE: U,V are no longer packed together into one u32, this proved to be
+       //  too imprecise, leading to pixel dropouts.  Example: NFS3's skybox.
+       u32 u, v;
+       u32 u_msk, v_msk;
+       s32 u_inc, v_inc;
+
+       // Color for Gouraud-shaded prims
+       // Packed fixed-pt 8.3:8.3:8.2 rgb triplet
+       //  layout:  rrrrrrrrXXXggggggggXXXbbbbbbbbXX
+       //           ^ bit 31                       ^ bit 0
+       u32 gCol;
+       u32 gInc;          // Increment along scanline for gCol
+
+       // Color for flat-shaded, texture-blended prims
+       u8  r5, g5, b5;    // 5-bit light for undithered prims
+       u8  r8, g8, b8;    // 8-bit light for dithered prims
+
+       // Color for flat-shaded, untextured prims
+       u16 PixelData;      // bgr555 color for untextured flat-shaded polys
+
+       // End of inner Loop parameters
+       ////////////////////////////////////////////////////////////////////////////
+
+
+       u8 blit_mask;           // Determines what pixels to skip when rendering.
+                               //  Only useful on low-resolution devices using
+                               //  a simple pixel-dropping downscaler for PS1
+                               //  high-res modes. See 'pixel_skip' option.
+
+       u8 ilace_mask;          // Determines what lines to skip when rendering.
+                               //  Normally 0 when PS1 240 vertical res is in
+                               //  use and ilace_force is 0. When running in
+                               //  PS1 480 vertical res on a low-resolution
+                               //  device (320x240), will usually be set to 1
+                               //  so odd lines are not rendered. (Unless future
+                               //  full-screen scaling option is in use ..TODO)
+
+       bool prog_ilace_flag;   // Tracks successive frames for 'prog_ilace' option
+
+       u8 BLEND_MODE;
+       u8 TEXT_MODE;
+       u8 Masking;
+
+       u16 PixelMSB;
+
+       gpu_unai_config_t config;
+
+       u8  LightLUT[32*32];    // 5-bit lighting LUT (gpu_inner_light.h)
+       u32 DitherMatrix[64];   // Matrix of dither coefficients
+};
+
+static gpu_unai_t gpu_unai;
+
+// Global config that frontend can alter.. Values are read in GPU_init().
+// TODO: if frontend menu modifies a setting, add a function that can notify
+// GPU plugin to use new setting.
+gpu_unai_config_t gpu_unai_config_ext;
+
+///////////////////////////////////////////////////////////////////////////////
+// Internal inline funcs to get option status: (Allows flexibility)
+static inline bool LightingEnabled()
+{
+       return gpu_unai.config.lighting;
+}
+
+static inline bool FastLightingEnabled()
+{
+       return gpu_unai.config.fast_lighting;
+}
+
+static inline bool BlendingEnabled()
+{
+       return gpu_unai.config.blending;
+}
+
+static inline bool DitheringEnabled()
+{
+       return gpu_unai.config.dithering;
+}
+
+// For now, this is just for development/experimentation purposes..
+// If modified to return true, it will allow ignoring the status register
+//  bit 9 setting (dither enable). It will still restrict dithering only
+//  to Gouraud-shaded or texture-blended polys.
+static inline bool ForcedDitheringEnabled()
+{
+       return false;
+}
+
+static inline bool ProgressiveInterlaceEnabled()
+{
+#ifdef USE_GPULIB
+       // Using this old option greatly decreases quality of image. Disabled
+       //  for now when using new gpulib, since it also adds more work in loops.
+       return false;
+#else
+       return gpu_unai.config.prog_ilace;
+#endif
+}
+
+// For now, 320x240 output resolution is assumed, using simple line-skipping
+//  and pixel-skipping downscaler.
+// TODO: Flesh these out so they return useful values based on whether
+//       running on higher-res device or a resampling downscaler is enabled.
+static inline bool PixelSkipEnabled()
+{
+       return gpu_unai.config.pixel_skip;
+}
+
+static inline bool LineSkipEnabled()
+{
+       return true;
+}
+
+#endif // GPU_UNAI_H
index e9a199c..8b5174e 100644 (file)
@@ -2,6 +2,7 @@
 *   Copyright (C) 2010 PCSX4ALL Team                                      *
 *   Copyright (C) 2010 Unai                                               *
 *   Copyright (C) 2011 notaz                                              *
+*   Copyright (C) 2016 Senquack (dansilsby <AT> gmail <DOT> com)          *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
 ***************************************************************************/
 
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "../gpulib/gpu.h"
-#include "arm_features.h"
-
-#define u8 uint8_t
-#define s8 int8_t
-#define u16 uint16_t
-#define s16 int16_t
-#define u32 uint32_t
-#define s32 int32_t
-#define s64 int64_t
-
-#define INLINE static
-
-#define        FRAME_BUFFER_SIZE  (1024*512*2)
-#define        FRAME_WIDTH        1024
-#define        FRAME_HEIGHT       512
-#define        FRAME_OFFSET(x,y)  (((y)<<10)+(x))
-
-#define isSkip 0 /* skip frame (info coming from GPU) */
-#define alt_fps 0
-static int linesInterlace;  /* internal lines interlace */
-static int force_interlace;
-
-static bool light = true; /* lighting */
-static bool blend = true; /* blending */
-static bool FrameToRead = false; /* load image in progress */
-static bool FrameToWrite = false; /* store image in progress */
-
-static bool enableAbbeyHack = false; /* Abe's Odyssey hack */
-
-static u8 BLEND_MODE;
-static u8 TEXT_MODE;
-static u8 Masking;
-
-static u16 PixelMSB;
-static u16 PixelData;
-
-///////////////////////////////////////////////////////////////////////////////
-//  GPU Global data
-///////////////////////////////////////////////////////////////////////////////
-
-//  Dma Transfers info
-static s32             px,py;
-static s32             x_end,y_end;
-static u16*  pvram;
-
-static s32 PacketCount;
-static s32 PacketIndex;
-
-//  Rasterizer status
-static u32 TextureWindow [4];
-static u32 DrawingArea   [4];
-static u32 DrawingOffset [2];
-
-static u16* TBA;
-static u16* CBA;
-
-//  Inner Loops
-static s32   u4, du4;
-static s32   v4, dv4;
-static s32   r4, dr4;
-static s32   g4, dg4;
-static s32   b4, db4;
-static u32   lInc;
-static u32   tInc, tMsk;
-
-union GPUPacket
-{
-       u32 U4[16];
-       s32 S4[16];
-       u16 U2[32];
-       s16 S2[32];
-       u8  U1[64];
-       s8  S1[64];
-};
-
-static GPUPacket PacketBuffer;
-static u16  *GPU_FrameBuffer;
-static u32   GPU_GP1;
-
-///////////////////////////////////////////////////////////////////////////////
-
-#include "../gpu_unai/gpu_fixedpoint.h"
-
-//  Inner loop driver instanciation file
-#include "../gpu_unai/gpu_inner.h"
-
-//  GPU Raster Macros
-#define        GPU_RGB16(rgb)        ((((rgb)&0xF80000)>>9)|(((rgb)&0xF800)>>6)|(((rgb)&0xF8)>>3))
+//#include "port.h"
+#include "gpu_unai.h"
 
-#define GPU_EXPANDSIGN(x)  (((s32)(x)<<21)>>21)
+// GPU fixed point math
+#include "gpu_fixedpoint.h"
 
-#define CHKMAX_X 1024
-#define CHKMAX_Y 512
-
-#define        GPU_SWAP(a,b,t) {(t)=(a);(a)=(b);(b)=(t);}
+// Inner loop driver instantiation file
+#include "gpu_inner.h"
 
 // GPU internal image drawing functions
-#include "../gpu_unai/gpu_raster_image.h"
+#include "gpu_raster_image.h"
 
 // GPU internal line drawing functions
-#include "../gpu_unai/gpu_raster_line.h"
+#include "gpu_raster_line.h"
 
 // GPU internal polygon drawing functions
-#include "../gpu_unai/gpu_raster_polygon.h"
+#include "gpu_raster_polygon.h"
 
 // GPU internal sprite drawing functions
-#include "../gpu_unai/gpu_raster_sprite.h"
+#include "gpu_raster_sprite.h"
 
 // GPU command buffer execution/store
-#include "../gpu_unai/gpu_command.h"
+#include "gpu_command.h"
 
 /////////////////////////////////////////////////////////////////////////////
 
+// Resets the renderer: zeroes gpu_unai, points it at gpulib's VRAM, installs
+//  a default texture window and the derived u/v masks, copies the external
+//  config, optionally builds s_invTable, then sets up the lighting LUT and
+//  dithering constants. Always returns 0.
 int renderer_init(void)
 {
-       GPU_FrameBuffer = (u16 *)gpu.vram;
-
-       // s_invTable
-       for(int i=1;i<=(1<<TABLE_BITS);++i)
-       {
-               double v = 1.0 / double(i);
-               #ifdef GPU_TABLE_10_BITS
-               v *= double(0xffffffff>>1);
-               #else
-               v *= double(0x80000000);
-               #endif
-               s_invTable[i-1]=s32(v);
-       }
-
-       return 0;
+  memset((void*)&gpu_unai, 0, sizeof(gpu_unai));
+  gpu_unai.vram = (u16*)gpu.vram;
+
+  // Original standalone gpu_unai initialized TextureWindow[]. I added the
+  //  same behavior here, since it seems unsafe to leave [2],[3] unset when
+  //  using HLE and Rearmed gpu_neon sets this similarly on init. -senquack
+  gpu_unai.TextureWindow[0] = 0;
+  gpu_unai.TextureWindow[1] = 0;
+  gpu_unai.TextureWindow[2] = 255;
+  gpu_unai.TextureWindow[3] = 255;
+  //senquack - new vars must be updated whenever texture window is changed:
+  //           (used for polygon-drawing in gpu_inner.h, gpu_raster_polygon.h)
+  const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+  gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+  gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+  // Configuration options
+  gpu_unai.config = gpu_unai_config_ext;
+  //senquack - disabled, not sure this is needed and would require modifying
+  // sprite-span functions, perhaps unnecessarily. No Abe's Oddysee hack was
+  // present in latest PCSX4ALL sources we were using.
+  //gpu_unai.config.enableAbbeyHack = gpu_unai_config_ext.abe_hack;
+  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+
+#ifdef GPU_UNAI_USE_INT_DIV_MULTINV
+  // s_invTable
+  // NOTE(review): without GPU_TABLE_10_BITS, i == 1 yields v == 2^31, which
+  //  is out of range for the s32(v) conversion below, assuming s32 is a
+  //  32-bit signed type -- confirm the intended value for entry 0.
+  for(int i=1;i<=(1<<TABLE_BITS);++i)
+  {
+    double v = 1.0 / double(i);
+#ifdef GPU_TABLE_10_BITS
+    v *= double(0xffffffff>>1);
+#else
+    v *= double(0x80000000);
+#endif
+    s_invTable[i-1]=s32(v);
+  }
+#endif
+
+  SetupLightLUT();
+  SetupDitheringConstants();
+
+  return 0;
 }
 
 void renderer_finish(void)
@@ -161,6 +103,111 @@ void renderer_finish(void)
 
+// Recomputes blit_mask and ilace_mask from the current gpu.screen.hres /
+//  gpu.screen.vres and the pixel-skip / line-skip settings.
+// NOTE(review): do_cmd_list() reassigns ilace_mask from config.ilace_force at
+//  the start of every call (see the TODO there), so the 480-line value chosen
+//  here appears to be overwritten before drawing -- confirm.
 void renderer_notify_res_change(void)
 {
+  if (PixelSkipEnabled()) {
+    // Set blit_mask for high horizontal resolutions. This allows skipping
+    //  rendering pixels that would never get displayed on low-resolution
+    //  platforms that use simple pixel-dropping scaler.
+
+    switch (gpu.screen.hres)
+    {
+      case 512: gpu_unai.blit_mask = 0xa4; break; // GPU_BlitWWSWWSWS
+      case 640: gpu_unai.blit_mask = 0xaa; break; // GPU_BlitWS
+      default:  gpu_unai.blit_mask = 0;    break;
+    }
+  } else {
+    gpu_unai.blit_mask = 0;
+  }
+
+  if (LineSkipEnabled()) {
+    // Set rendering line-skip (only render every other line in high-res
+    //  480 vertical mode, or, optionally, force it for all video modes)
+
+    if (gpu.screen.vres == 480) {
+      if (gpu_unai.config.ilace_force) {
+        gpu_unai.ilace_mask = 3; // Only need 1/4 of lines
+      } else {
+        gpu_unai.ilace_mask = 1; // Only need 1/2 of lines
+      }
+    } else {
+      // Vert resolution changed from 480 to lower one
+      gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+    }
+  } else {
+    gpu_unai.ilace_mask = 0;
+  }
+
+  /*
+  printf("res change hres: %d   vres: %d   depth: %d   ilace_mask: %d\n",
+      gpu.screen.hres, gpu.screen.vres, gpu.status.rgb24 ? 24 : 15,
+      gpu_unai.ilace_mask);
+  */
+}
+
+// Handles GP0 draw settings commands 0xE1...0xE6
+//
+// gpu_unai: renderer state to update (this parameter shadows the file-scope
+//           instance of the same name).
+// cmd_word: raw 32-bit GP0 command word; bits 24..26 select the setting and
+//           the word is also mirrored into gpu.ex_regs[] for gpulib.
+static void gpuGP0Cmd_0xEx(gpu_unai_t &gpu_unai, u32 cmd_word)
+{
+  // Assume incoming GP0 command is 0xE1..0xE6, convert to 1..6
+  u8 num = (cmd_word >> 24) & 7;
+  gpu.ex_regs[num] = cmd_word; // Update gpulib register
+  switch (num) {
+    case 1: {
+      // GP0(E1h) - Draw Mode setting (aka "Texpage")
+      // Only the low 11 bits are tracked; skip the texture update when they
+      //  are unchanged.
+      u32 cur_texpage = gpu_unai.GPU_GP1 & 0x7FF;
+      u32 new_texpage = cmd_word & 0x7FF;
+      if (cur_texpage != new_texpage) {
+        gpu_unai.GPU_GP1 = (gpu_unai.GPU_GP1 & ~0x7FF) | new_texpage;
+        gpuSetTexture(gpu_unai.GPU_GP1);
+      }
+    } break;
+
+    case 2: {
+      // GP0(E2h) - Texture Window setting
+      // The raw word is cached in TextureWindowCur so repeated identical
+      //  settings are ignored.
+      if (cmd_word != gpu_unai.TextureWindowCur) {
+        static const u8 TextureMask[32] = {
+          255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
+          127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
+        };
+        gpu_unai.TextureWindowCur = cmd_word;
+        gpu_unai.TextureWindow[0] = ((cmd_word >> 10) & 0x1F) << 3;
+        gpu_unai.TextureWindow[1] = ((cmd_word >> 15) & 0x1F) << 3;
+        gpu_unai.TextureWindow[2] = TextureMask[(cmd_word >> 0) & 0x1F];
+        gpu_unai.TextureWindow[3] = TextureMask[(cmd_word >> 5) & 0x1F];
+        gpu_unai.TextureWindow[0] &= ~gpu_unai.TextureWindow[2];
+        gpu_unai.TextureWindow[1] &= ~gpu_unai.TextureWindow[3];
+
+        // Inner loop vars must be updated whenever texture window is changed:
+        const u32 fb = FIXED_BITS;  // # of fractional fixed-pt bits of u4/v4
+        gpu_unai.u_msk = (((u32)gpu_unai.TextureWindow[2]) << fb) | ((1 << fb) - 1);
+        gpu_unai.v_msk = (((u32)gpu_unai.TextureWindow[3]) << fb) | ((1 << fb) - 1);
+
+        gpuSetTexture(gpu_unai.GPU_GP1);
+      }
+    } break;
+
+    case 3: {
+      // GP0(E3h) - Set Drawing Area top left (X1,Y1)
+      gpu_unai.DrawingArea[0] = cmd_word         & 0x3FF;
+      gpu_unai.DrawingArea[1] = (cmd_word >> 10) & 0x3FF;
+    } break;
+
+    case 4: {
+      // GP0(E4h) - Set Drawing Area bottom right (X2,Y2)
+      // Stored as exclusive bounds, hence the +1.
+      gpu_unai.DrawingArea[2] = (cmd_word         & 0x3FF) + 1;
+      gpu_unai.DrawingArea[3] = ((cmd_word >> 10) & 0x3FF) + 1;
+    } break;
+
+    case 5: {
+      // GP0(E5h) - Set Drawing Offset (X,Y)
+      // Sign-extend the two 11-bit fields. The left shift is performed on
+      //  the unsigned word and only then converted to s32: an 0xE5 command
+      //  word has bit 31 set, and left-shifting a negative signed value is
+      //  undefined behavior before C++20.
+      gpu_unai.DrawingOffset[0] = (s32)(cmd_word << (32-11)) >> (32-11);
+      gpu_unai.DrawingOffset[1] = (s32)(cmd_word << (32-22)) >> (32-11);
+    } break;
+
+    case 6: {
+      // GP0(E6h) - Mask Bit Setting
+      gpu_unai.Masking  = (cmd_word & 0x2) <<  1;
+      gpu_unai.PixelMSB = (cmd_word & 0x1) <<  8;
+    } break;
+  }
 }
 
 extern const unsigned char cmd_lengths[256];
@@ -171,9 +218,12 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
   u32 *list_start = list;
   u32 *list_end = list + list_len;
 
-  linesInterlace = force_interlace;
+  //TODO: set ilace_mask when resolution changes instead of every time,
+  // eliminate #ifdef below.
+  gpu_unai.ilace_mask = gpu_unai.config.ilace_force;
+
 #ifdef HAVE_PRE_ARMV7 /* XXX */
-  linesInterlace |= gpu.status.interlace;
+  gpu_unai.ilace_mask |= gpu.status.interlace;
 #endif
 
   for (; list < list_end; list += 1 + len)
@@ -186,126 +236,175 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
     }
 
     #define PRIM cmd
-    PacketBuffer.U4[0] = list[0];
+    gpu_unai.PacketBuffer.U4[0] = list[0];
     for (i = 1; i <= len; i++)
-      PacketBuffer.U4[i] = list[i];
+      gpu_unai.PacketBuffer.U4[i] = list[i];
+
+    PtrUnion packet = { .ptr = (void*)&gpu_unai.PacketBuffer };
 
     switch (cmd)
     {
       case 0x02:
-        gpuClearImage();
+        gpuClearImage(packet);
         break;
 
       case 0x20:
       case 0x21:
       case 0x22:
-      case 0x23:
-        gpuDrawF3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB]);
-        break;
+      case 0x23: {          // Monochrome 3-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, false);
+      } break;
 
       case 0x24:
       case 0x25:
       case 0x26:
-      case 0x27:
-        gpuSetCLUT   (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture(PacketBuffer.U4[4] >> 16);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB]);
-        else
-          gpuDrawFT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB]);
-        break;
+      case 0x27: {          // Textured 3-pt poly
+        gpuSetCLUT   (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_unai.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, false);
+      } break;
 
       case 0x28:
       case 0x29:
       case 0x2A:
-      case 0x2B: {
-        const PP gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | PixelMSB];
-        gpuDrawF3(gpuPolySpanDriver);
-        PacketBuffer.U4[1] = PacketBuffer.U4[4];
-        gpuDrawF3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x2B: {          // Monochrome 4-pt poly
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyF(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x2C:
       case 0x2D:
       case 0x2E:
-      case 0x2F: {
-        gpuSetCLUT   (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture(PacketBuffer.U4[4] >> 16);
-        PP gpuPolySpanDriver;
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | PixelMSB];
-        else
-          gpuPolySpanDriver = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | PixelMSB];
-        gpuDrawFT3(gpuPolySpanDriver);
-        PacketBuffer.U4[1] = PacketBuffer.U4[7];
-        PacketBuffer.U4[2] = PacketBuffer.U4[8];
-        gpuDrawFT3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x2F: {          // Textured 4-pt poly
+        gpuSetCLUT   (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture(gpu_unai.PacketBuffer.U4[4] >> 16);
+
+        u32 driver_idx =
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | gpu_unai.PixelMSB;
+
+        if (!FastLightingEnabled()) {
+          driver_idx |= Lighting;
+        } else {
+          if (!((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F)))
+            driver_idx |= Lighting;
+        }
+
+        PP driver = gpuPolySpanDrivers[driver_idx];
+        gpuDrawPolyFT(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x30:
       case 0x31:
       case 0x32:
-      case 0x33:
-        gpuDrawG3(gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB]);
-        break;
+      case 0x33: {          // Gouraud-shaded 3-pt poly
+        //NOTE: The '129' here is CF_GOURAUD | CF_LIGHT, however
+        // this is an untextured poly, so CF_LIGHT (texture blend)
+        // shouldn't apply. Until the original array of template
+        // instantiation ptrs is fixed, we're stuck with this. (TODO)
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, false);
+      } break;
 
       case 0x34:
       case 0x35:
       case 0x36:
-      case 0x37:
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (PacketBuffer.U4[5] >> 16);
-        gpuDrawGT3(gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB]);
-        break;
+      case 0x37: {          // Gouraud-shaded, textured 3-pt poly
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, false);
+      } break;
 
       case 0x38:
       case 0x39:
       case 0x3A:
-      case 0x3B: {
-        const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | Masking | Blending | 129 | PixelMSB];
-        gpuDrawG3(gpuPolySpanDriver);
-        PacketBuffer.U4[0] = PacketBuffer.U4[6];
-        PacketBuffer.U4[1] = PacketBuffer.U4[7];
-        gpuDrawG3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x3B: {          // Gouraud-shaded 4-pt poly
+        // See notes regarding '129' for 0x30..0x33 further above -senquack
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode |
+          gpu_unai.Masking | Blending | 129 | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyG(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x3C:
       case 0x3D:
       case 0x3E:
-      case 0x3F: {
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (PacketBuffer.U4[5] >> 16);
-        const PP gpuPolySpanDriver  = gpuPolySpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | ((Lighting)?129:0) | PixelMSB];
-        gpuDrawGT3(gpuPolySpanDriver);
-        PacketBuffer.U4[0] = PacketBuffer.U4[9];
-        PacketBuffer.U4[1] = PacketBuffer.U4[10];
-        PacketBuffer.U4[2] = PacketBuffer.U4[11];
-        gpuDrawGT3(gpuPolySpanDriver);
-        break;
-      }
+      case 0x3F: {          // Gouraud-shaded, textured 4-pt poly
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        gpuSetTexture (gpu_unai.PacketBuffer.U4[5] >> 16);
+        PP driver = gpuPolySpanDrivers[
+          (gpu_unai.blit_mask?1024:0) |
+          Dithering |
+          Blending_Mode | gpu_unai.TEXT_MODE |
+          gpu_unai.Masking | Blending | ((Lighting)?129:0) | gpu_unai.PixelMSB
+        ];
+        gpuDrawPolyGT(packet, driver, true); // is_quad = true
+      } break;
 
       case 0x40:
       case 0x41:
       case 0x42:
-      case 0x43:
-        gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-        break;
-
-      case 0x48 ... 0x4F:
-      {
+      case 0x43: {          // Monochrome line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
+      } break;
+
+      case 0x48 ... 0x4F: { // Monochrome line strip
         u32 num_vertexes = 1;
         u32 *list_position = &(list[2]);
 
-        gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineF(packet, driver);
 
         while(1)
         {
-          PacketBuffer.U4[1] = PacketBuffer.U4[2];
-          PacketBuffer.U4[2] = *list_position++;
-          gpuDrawLF(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+          gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[2];
+          gpu_unai.PacketBuffer.U4[2] = *list_position++;
+          gpuDrawLineF(packet, driver);
 
           num_vertexes++;
           if(list_position >= list_end) {
@@ -317,30 +416,38 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
         }
 
         len += (num_vertexes - 2);
-        break;
-      }
+      } break;
 
       case 0x50:
       case 0x51:
       case 0x52:
-      case 0x53:
-        gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
-        break;
-
-      case 0x58 ... 0x5F:
-      {
+      case 0x53: {          // Gouraud-shaded line
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
+      } break;
+
+      case 0x58 ... 0x5F: { // Gouraud-shaded line strip
         u32 num_vertexes = 1;
         u32 *list_position = &(list[2]);
 
-        gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+        // Shift index right by one, as untextured prims don't use lighting
+        u32 driver_idx = (Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1;
+        // Index MSB selects Gouraud-shaded PixelSpanDriver:
+        driver_idx |= (1 << 5);
+        PSD driver = gpuPixelSpanDrivers[driver_idx];
+        gpuDrawLineG(packet, driver);
 
         while(1)
         {
-          PacketBuffer.U4[0] = PacketBuffer.U4[2];
-          PacketBuffer.U4[1] = PacketBuffer.U4[3];
-          PacketBuffer.U4[2] = *list_position++;
-          PacketBuffer.U4[3] = *list_position++;
-          gpuDrawLG(gpuPixelDrivers [ (Blending_Mode | Masking | Blending | (PixelMSB>>3)) >> 1]);
+          gpu_unai.PacketBuffer.U4[0] = gpu_unai.PacketBuffer.U4[2];
+          gpu_unai.PacketBuffer.U4[1] = gpu_unai.PacketBuffer.U4[3];
+          gpu_unai.PacketBuffer.U4[2] = *list_position++;
+          gpu_unai.PacketBuffer.U4[3] = *list_position++;
+          gpuDrawLineG(packet, driver);
 
           num_vertexes++;
           if(list_position >= list_end) {
@@ -352,91 +459,116 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
         }
 
         len += (num_vertexes - 2) * 2;
-        break;
-      }
+      } break;
 
       case 0x60:
       case 0x61:
       case 0x62:
-      case 0x63:
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x63: {          // Monochrome rectangle (variable size)
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x64:
       case 0x65:
       case 0x66:
-      case 0x67:
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (GPU_GP1);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-        else
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
-        break;
+      case 0x67: {          // Textured rectangle (variable size)
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        // This fixes Silent Hill running animation on loading screens:
+        // (On PSX, color values 0x00-0x7F darken the source texture's color,
+        //  0x81-0xFF lighten textures (ultimately clamped to 0x1F), and
+        //  0x80 leaves source texture color unchanged. HOWEVER,
+        //   gpu_unai uses a simple lighting LUT whereby only the upper
+        //   5 bits of an 8-bit color are used, so 0x80-0x87 all behave as
+        //   0x80.)
+        // 
+        // NOTE: I've changed all textured sprite draw commands here and
+        //  elsewhere to use proper behavior, but left poly commands
+        //  alone, I don't want to slow rendering down too much. (TODO)
+        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
 
       case 0x68:
       case 0x69:
       case 0x6A:
-      case 0x6B:
-        PacketBuffer.U4[2] = 0x00010001;
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x6B: {          // Monochrome rectangle (1x1 dot)
+        gpu_unai.PacketBuffer.U4[2] = 0x00010001;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x70:
       case 0x71:
       case 0x72:
-      case 0x73:
-        PacketBuffer.U4[2] = 0x00080008;
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x73: {          // Monochrome rectangle (8x8)
+        gpu_unai.PacketBuffer.U4[2] = 0x00080008;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x74:
       case 0x75:
       case 0x76:
-      case 0x77:
-        PacketBuffer.U4[3] = 0x00080008;
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (GPU_GP1);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-        else
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
-        break;
+      case 0x77: {          // Textured rectangle (8x8)
+        gpu_unai.PacketBuffer.U4[3] = 0x00080008;
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
 
       case 0x78:
       case 0x79:
       case 0x7A:
-      case 0x7B:
-        PacketBuffer.U4[2] = 0x00100010;
-        gpuDrawT(gpuTileSpanDrivers [Blending_Mode | Masking | Blending | (PixelMSB>>3)]);
-        break;
+      case 0x7B: {          // Monochrome rectangle (16x16)
+        gpu_unai.PacketBuffer.U4[2] = 0x00100010;
+        PT driver = gpuTileSpanDrivers[(Blending_Mode | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>3)) >> 1];
+        gpuDrawT(packet, driver);
+      } break;
 
       case 0x7C:
       case 0x7D:
 #ifdef __arm__
-        if ((GPU_GP1 & 0x180) == 0 && (Masking | PixelMSB) == 0)
+        if ((gpu_unai.GPU_GP1 & 0x180) == 0 && (gpu_unai.Masking | gpu_unai.PixelMSB) == 0)
         {
-          gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-          gpuSetTexture (GPU_GP1);
-          gpuDrawS16();
+          gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+          gpuDrawS16(packet);
           break;
         }
         // fallthrough
 #endif
       case 0x7E:
-      case 0x7F:
-        PacketBuffer.U4[3] = 0x00100010;
-        gpuSetCLUT    (PacketBuffer.U4[2] >> 16);
-        gpuSetTexture (GPU_GP1);
-        if ((PacketBuffer.U1[0]>0x5F) && (PacketBuffer.U1[1]>0x5F) && (PacketBuffer.U1[2]>0x5F))
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | (enableAbbeyHack<<7)  | PixelMSB]);
-        else
-          gpuDrawS(gpuSpriteSpanDrivers [Blending_Mode | TEXT_MODE | Masking | Blending | Lighting | (enableAbbeyHack<<7)  | PixelMSB]);
-        break;
+      case 0x7F: {          // Textured rectangle (16x16)
+        gpu_unai.PacketBuffer.U4[3] = 0x00100010;
+        gpuSetCLUT    (gpu_unai.PacketBuffer.U4[2] >> 16);
+        u32 driver_idx = Blending_Mode | gpu_unai.TEXT_MODE | gpu_unai.Masking | Blending | (gpu_unai.PixelMSB>>1);
+        //senquack - Only color 808080h-878787h allows skipping lighting calculation:
+        //if ((gpu_unai.PacketBuffer.U1[0]>0x5F) && (gpu_unai.PacketBuffer.U1[1]>0x5F) && (gpu_unai.PacketBuffer.U1[2]>0x5F))
+        // Strip lower 3 bits of each color and determine if lighting should be used:
+        if ((gpu_unai.PacketBuffer.U4[0] & 0xF8F8F8) != 0x808080)
+          driver_idx |= Lighting;
+        PS driver = gpuSpriteSpanDrivers[driver_idx];
+        gpuDrawS(packet, driver);
+      } break;
 
       case 0x80:          //  vid -> vid
-        gpuMoveImage();   //  prim handles updateLace && skip
+        gpuMoveImage(packet);
         break;
+
 #ifdef TEST
       case 0xA0:          //  sys -> vid
       {
@@ -445,70 +577,25 @@ int do_cmd_list(u32 *list, int list_len, int *last_cmd)
         u32 load_size = load_width * load_height;
 
         len += load_size / 2;
-        break;
-      }
+      } break;
+
       case 0xC0:
         break;
 #else
       case 0xA0:          //  sys ->vid
       case 0xC0:          //  vid -> sys
+        // Handled by gpulib
         goto breakloop;
 #endif
-      case 0xE1: {
-        const u32 temp = PacketBuffer.U4[0];
-        GPU_GP1 = (GPU_GP1 & ~0x000007FF) | (temp & 0x000007FF);
-        gpuSetTexture(temp);
-        gpu.ex_regs[1] = temp;
-        break;
-      }
-      case 0xE2: {
-        static const u8  TextureMask[32] = {
-          255, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7,
-          127, 7, 15, 7, 31, 7, 15, 7, 63, 7, 15, 7, 31, 7, 15, 7
-        };
-        const u32 temp = PacketBuffer.U4[0];
-        TextureWindow[0] = ((temp >> 10) & 0x1F) << 3;
-        TextureWindow[1] = ((temp >> 15) & 0x1F) << 3;
-        TextureWindow[2] = TextureMask[(temp >> 0) & 0x1F];
-        TextureWindow[3] = TextureMask[(temp >> 5) & 0x1F];
-        gpuSetTexture(GPU_GP1);
-        gpu.ex_regs[2] = temp;
-        break;
-      }
-      case 0xE3: {
-        const u32 temp = PacketBuffer.U4[0];
-        DrawingArea[0] = temp         & 0x3FF;
-        DrawingArea[1] = (temp >> 10) & 0x3FF;
-        gpu.ex_regs[3] = temp;
-        break;
-      }
-      case 0xE4: {
-        const u32 temp = PacketBuffer.U4[0];
-        DrawingArea[2] = (temp         & 0x3FF) + 1;
-        DrawingArea[3] = ((temp >> 10) & 0x3FF) + 1;
-        gpu.ex_regs[4] = temp;
-        break;
-      }
-      case 0xE5: {
-        const u32 temp = PacketBuffer.U4[0];
-        DrawingOffset[0] = ((s32)temp<<(32-11))>>(32-11);
-        DrawingOffset[1] = ((s32)temp<<(32-22))>>(32-11);
-        gpu.ex_regs[5] = temp;
-        break;
-      }
-      case 0xE6: {
-        const u32 temp = PacketBuffer.U4[0];
-        Masking = (temp & 0x2) <<  1;
-        PixelMSB =(temp & 0x1) <<  8;
-        gpu.ex_regs[6] = temp;
-        break;
-      }
+      case 0xE1 ... 0xE6: { // Draw settings
+        gpuGP0Cmd_0xEx(gpu_unai, gpu_unai.PacketBuffer.U4[0]);
+      } break;
     }
   }
 
 breakloop:
   gpu.ex_regs[1] &= ~0x1ff;
-  gpu.ex_regs[1] |= GPU_GP1 & 0x1ff;
+  gpu.ex_regs[1] |= gpu_unai.GPU_GP1 & 0x1ff;
 
   *last_cmd = cmd;
   return list - list_start;
@@ -532,20 +619,17 @@ void renderer_set_interlace(int enable, int is_odd)
 {
 }
 
-#ifndef TEST
-
 #include "../../frontend/plugin_lib.h"
-
+// Handle any gpulib settings applicable to gpu_unai:
+// Refreshes the VRAM pointer and copies the gpu_unai options from the
+//  frontend callback struct into the active config.
+// NOTE(review): blit_mask is only recomputed in renderer_notify_res_change(),
+//  so a pixel_skip change made here presumably takes effect only at the next
+//  resolution change -- confirm.
 void renderer_set_config(const struct rearmed_cbs *cbs)
 {
-  force_interlace = cbs->gpu_unai.lineskip;
-  enableAbbeyHack = cbs->gpu_unai.abe_hack;
-  light = !cbs->gpu_unai.no_light;
-  blend = !cbs->gpu_unai.no_blend;
-
-  GPU_FrameBuffer = (u16 *)gpu.vram;
+  gpu_unai.vram = (u16*)gpu.vram;
+  gpu_unai.config.ilace_force   = cbs->gpu_unai.ilace_force;
+  gpu_unai.config.pixel_skip    = cbs->gpu_unai.pixel_skip;
+  gpu_unai.config.lighting      = cbs->gpu_unai.lighting;
+  gpu_unai.config.fast_lighting = cbs->gpu_unai.fast_lighting;
+  gpu_unai.config.blending      = cbs->gpu_unai.blending;
+  gpu_unai.config.dithering     = cbs->gpu_unai.dithering;
 }
 
-#endif
-
 // vim:shiftwidth=2:expandtab