From: ptitSeb <sebastien.chev@gmail.com>
Date: Fri, 20 Dec 2013 14:40:11 +0000 (+0100)
Subject: ALL: Huge upstream synch + PerRom DelaySI & CountPerOp parameters
X-Git-Url: https://notaz.gp2x.de/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2d26287291331f2b1793a8e76ede08c75654fb7c;p=mupen64plus-pandora.git

ALL: Huge upstream synch + PerRom DelaySI & CountPerOp parameters
---

diff --git a/source/gles2glide64/projects/unix/Makefile b/source/gles2glide64/projects/unix/Makefile
index c509829..b0a203b 100755
--- a/source/gles2glide64/projects/unix/Makefile
+++ b/source/gles2glide64/projects/unix/Makefile
@@ -127,7 +127,8 @@ ifeq ("$(CPU)","NONE")
 endif
 
 # base CFLAGS, LDLIBS, and LDFLAGS
-OPTFLAGS ?= -Ofast -ffast-math -flto -fuse-linker-plugin
+OPTFLAGS ?= -Ofast -ffast-math
+# LTO flags (-flto -fuse-linker-plugin) are no longer part of the default; set OPTFLAGS to opt back in
 WARNFLAGS ?= -Wall
 CFLAGS += $(OPTFLAGS) $(WARNFLAGS) -ffast-math -fno-strict-aliasing -fvisibility=hidden -I../../src -I../../src/Glitch64/inc -DGCC
 CXXFLAGS += -fvisibility-inlines-hidden -std=gnu++0x
@@ -166,26 +167,18 @@ ifeq ($(OS), LINUX)
   LDLIBS += -ldl
 endif
 ifeq ($(OS), OSX)
-  # Select the proper SDK
-  # Also, SDKs are stored in a different location since XCode 4.3
-  OSX_SDK ?= $(shell sw_vers -productVersion | cut -f1 -f2 -d .)
-  OSX_XCODEMAJ = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f1 -d .)
-  OSX_XCODEMIN = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f2 -d .)
-  OSX_XCODEGE43 = $(shell echo "`expr $(OSX_XCODEMAJ) \>= 4``expr $(OSX_XCODEMIN) \>= 3`")
-  ifeq ($(OSX_XCODEGE43), 11)
-    OSX_SYSROOT := /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
-  else
-    OSX_SYSROOT := /Developer/SDKs
-  endif
+  # xcode-select has been around since XCode 3.0, i.e. OS X 10.5; ':=' runs each $(shell) once instead of on every expansion
+  OSX_SDK_ROOT := $(shell xcode-select -print-path)/Platforms/MacOSX.platform/Developer/SDKs
+  OSX_SDK_PATH := $(OSX_SDK_ROOT)/$(shell ls $(OSX_SDK_ROOT) | tail -1)
 
   CXXFLAGS += -std=c++11 -stdlib=libc++
   ifeq ($(CPU), X86)
     LDFLAGS += -bundle -L/opt/local/lib
     LDLIBS += -ldl
     ifeq ($(ARCH_DETECTED), 64BITS)
-      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
     else
-      CFLAGS += -pipe -mmmx -msse -fomit-frame-pointer -arch i686 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -mmmx -msse -fomit-frame-pointer -arch i686 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
     endif
   endif
 endif
@@ -224,7 +217,6 @@ endif
 CFLAGS += $(LIBPNG_CFLAGS)
 LDLIBS += $(LIBPNG_LDLIBS)
 
-
 # search for OpenGL libraries
 ifeq ($(OS), OSX)
   GL_LDLIBS = -framework OpenGL
@@ -379,12 +371,19 @@ SOURCE += \
 	$(SRCDIR)/GlideHQ/TxReSample.cpp \
 	$(SRCDIR)/GlideHQ/TxDbg.cpp \
 	$(SRCDIR)/GlideHQ/tc-1.1+/fxt1.c \
-	$(SRCDIR)/GlideHQ/tc-1.1+/dxtn.c \
 	$(SRCDIR)/GlideHQ/tc-1.1+/wrapper.c \
 	$(SRCDIR)/GlideHQ/tc-1.1+/texstore.c
 
-CPPFLAGS += -DTEXTURE_FILTER # -DDUMP_CACHE
+CPPFLAGS += -DTEXTURE_FILTER -DDUMP_CACHE
 LDLIBS += -lboost_filesystem$(BOOST_SUFFIX) -lboost_system$(BOOST_SUFFIX)
+
+  ifeq ($(TXCDXTN), 1)
+    CPPFLAGS += -DTXCDXTN_EXTERNAL
+  else
+    SOURCE += \
+	$(SRCDIR)/GlideHQ/tc-1.1+/s2tc/s2tc_algorithm.cpp \
+	$(SRCDIR)/GlideHQ/tc-1.1+/s2tc/s2tc_libtxc_dxtn.cpp
+  endif
 endif
 
 ifeq ($(OS),MINGW)
@@ -421,6 +420,7 @@ targets:
 	@echo "    PIC=(1|0)     == Force enable/disable of position independent code"
 	@echo "    POSTFIX=name  == String added to the name of the the build (default: '')"
 	@echo "    HIRES=(1|0)   == Enables/Disables support for hires textures and texture filters (default: 1)"
+	@echo "    TXCDXTN=(1|0) == Enable/Disable external txc_dxtn library (default: 0)"
 	@echo "  Install Options:"
 	@echo "    PREFIX=path   == install/uninstall prefix (default: /usr/local)"
 	@echo "    SHAREDIR=path == path to install shared data files (default: PREFIX/share/mupen64plus)"
@@ -449,7 +449,7 @@ clean:
 rebuild: clean all
 
 # build dependency files
-CFLAGS += -MD
+CFLAGS += -MD -MP
 -include $(OBJECTS:.o=.d)
 
 CXXFLAGS += $(CFLAGS)
diff --git a/source/gles2glide64/src/Glide64/Ini.cpp b/source/gles2glide64/src/Glide64/Ini.cpp
index b6ab0b0..2ff23ba 100755
--- a/source/gles2glide64/src/Glide64/Ini.cpp
+++ b/source/gles2glide64/src/Glide64/Ini.cpp
@@ -59,6 +59,9 @@
   #define PATH_MAX _MAX_PATH
   #define stricmp _stricmp
 #endif
+#ifndef PATH_MAX
+  #define PATH_MAX 4096
+#endif
 
 FILE *ini;
 int sectionstart;
diff --git a/source/gles2glide64/src/Glide64/Main.cpp b/source/gles2glide64/src/Glide64/Main.cpp
index c1df3ad..9b0651b 100755
--- a/source/gles2glide64/src/Glide64/Main.cpp
+++ b/source/gles2glide64/src/Glide64/Main.cpp
@@ -55,6 +55,9 @@
 #include <time.h>
 #define PATH_MAX MAX_PATH
 #endif
+#ifndef PATH_MAX
+  #define PATH_MAX 4096
+#endif
 #include "osal_dynamiclib.h"
 #ifdef TEXTURE_FILTER // Hiroshi Morii <koolsmoky@users.sourceforge.net>
 #include <stdarg.h>
@@ -1423,7 +1426,6 @@ EXPORT m64p_error CALL PluginStartup(m64p_dynlib_handle CoreLibHandle, void *Con
     if (configDir)
     {
         SetConfigDir(configDir);
-        CoreVideo_Init();
         ReadSettings();
 		return M64ERR_SUCCESS;
     }
diff --git a/source/gles2glide64/src/Glide64/Util.h b/source/gles2glide64/src/Glide64/Util.h
index f577a9c..73f0475 100644
--- a/source/gles2glide64/src/Glide64/Util.h
+++ b/source/gles2glide64/src/Glide64/Util.h
@@ -90,9 +90,7 @@ float ScaleZ(float z);
 			lx = lc; \
 		}
 
-#if defined(__GNUC__)
-  #define bswap32(x) __builtin_bswap32(x)
-#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
   #include <stdlib.h>
   #define bswap32(x) _byteswap_ulong(x)
 #else
diff --git a/source/gles2glide64/src/Glide64/m64p.h b/source/gles2glide64/src/Glide64/m64p.h
index 3f47cf6..dd6e891 100755
--- a/source/gles2glide64/src/Glide64/m64p.h
+++ b/source/gles2glide64/src/Glide64/m64p.h
@@ -38,7 +38,13 @@
 #define CONFIG_API_VERSION          0x020000
 #define VIDEXT_API_VERSION          0x030000
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 void WriteLog(m64p_msg_level level, const char *msg, ...);
+#ifdef __cplusplus
+}
+#endif
 
 //The Glide API originally used an integer to pick an enumerated resolution.
 //To accomodate arbitrary resolutions, pack it into a 32-bit struct
diff --git a/source/gles2glide64/src/Glide64/osal_dynamiclib.h b/source/gles2glide64/src/Glide64/osal_dynamiclib.h
index c24377b..7be0cab 100755
--- a/source/gles2glide64/src/Glide64/osal_dynamiclib.h
+++ b/source/gles2glide64/src/Glide64/osal_dynamiclib.h
@@ -1,5 +1,5 @@
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- *   Mupen64plus-core - osal/dynamiclib.h                                  *
+ *   Mupen64plus-video-glide64mk2 - osal_dynamiclib.h                      *
  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
  *   Copyright (C) 2009 Richard Goedeken                                   *
  *                                                                         *
@@ -22,14 +22,18 @@
 #if !defined(OSAL_DYNAMICLIB_H)
 #define OSAL_DYNAMICLIB_H
 
+#include "m64p_types.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "m64p_types.h"
+m64p_error osal_dynlib_open(m64p_dynlib_handle *pLibHandle, const char *pccLibraryPath);
 
 void *     osal_dynlib_getproc(m64p_dynlib_handle LibHandle, const char *pccProcedureName);
 
+m64p_error osal_dynlib_close(m64p_dynlib_handle LibHandle);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/source/gles2glide64/src/Glide64/osal_dynamiclib_unix.c b/source/gles2glide64/src/Glide64/osal_dynamiclib_unix.c
index b3b7ba5..25562c4 100755
--- a/source/gles2glide64/src/Glide64/osal_dynamiclib_unix.c
+++ b/source/gles2glide64/src/Glide64/osal_dynamiclib_unix.c
@@ -1,5 +1,5 @@
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- *   Mupen64plus-core - osal/dynamiclib_unix.c                             *
+ *   Mupen64plus-video-glide64mk2 - osal_dynamiclib_unix.c                 *
  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
  *   Copyright (C) 2009 Richard Goedeken                                   *
  *                                                                         *
@@ -20,12 +20,33 @@
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #include <stdlib.h>
+#include <string.h>
 #include <stdio.h>
 #include <dlfcn.h>
 
 #include "m64p_types.h"
+#include "m64p.h"
 #include "osal_dynamiclib.h"
 
+m64p_error osal_dynlib_open(m64p_dynlib_handle *pLibHandle, const char *pccLibraryPath)
+{
+    if (pLibHandle == NULL || pccLibraryPath == NULL)
+        return M64ERR_INPUT_ASSERT;
+
+    *pLibHandle = dlopen(pccLibraryPath, RTLD_NOW);
+
+    if (*pLibHandle == NULL)
+    {
+        /* only print an error message if there is a directory separator (/) in the pathname */
+        /* this prevents us from throwing an error for the use case where Mupen64Plus is not installed */
+        if (strchr(pccLibraryPath, '/') != NULL)
+            WriteLog(M64MSG_ERROR, "dlopen('%s') failed: %s", pccLibraryPath, dlerror());
+        return M64ERR_INPUT_NOT_FOUND;
+    }
+
+    return M64ERR_SUCCESS;
+}
+
 void * osal_dynlib_getproc(m64p_dynlib_handle LibHandle, const char *pccProcedureName)
 {
     if (pccProcedureName == NULL)
@@ -34,4 +55,17 @@ void * osal_dynlib_getproc(m64p_dynlib_handle LibHandle, const char *pccProcedur
     return dlsym(LibHandle, pccProcedureName);
 }
 
+m64p_error osal_dynlib_close(m64p_dynlib_handle LibHandle)
+{
+    int rval = dlclose(LibHandle);
+
+    if (rval != 0)
+    {
+        WriteLog(M64MSG_ERROR, "dlclose() failed: %s", dlerror());
+        return M64ERR_INTERNAL;
+    }
+
+    return M64ERR_SUCCESS;
+}
+
 
diff --git a/source/gles2glide64/src/GlideHQ/TxFilter.cpp b/source/gles2glide64/src/GlideHQ/TxFilter.cpp
index 2d89caf..b04662f 100644
--- a/source/gles2glide64/src/GlideHQ/TxFilter.cpp
+++ b/source/gles2glide64/src/GlideHQ/TxFilter.cpp
@@ -28,8 +28,13 @@
 #include "TxFilter.h"
 #include "TextureFilters.h"
 #include "TxDbg.h"
+#ifndef NO_FILTER_THREAD
 #include <functional>
 #include <thread>
+#endif
+#if defined(__MINGW32__)
+#define swprintf _snwprintf
+#endif
 
 void TxFilter::clear()
 {
diff --git a/source/gles2glide64/src/GlideHQ/TxQuantize.cpp b/source/gles2glide64/src/GlideHQ/TxQuantize.cpp
index b21db71..c7a9eee 100644
--- a/source/gles2glide64/src/GlideHQ/TxQuantize.cpp
+++ b/source/gles2glide64/src/GlideHQ/TxQuantize.cpp
@@ -25,8 +25,10 @@
 #pragma warning(disable: 4786)
 #endif
 
+#ifndef NO_FILTER_THREAD
 #include <functional>
 #include <thread>
+#endif
 
 /* NOTE: The codes are not optimized. They can be made faster. */
 
@@ -41,7 +43,7 @@ TxQuantize::TxQuantize()
 
   /* get dxtn extensions */
   _tx_compress_fxt1 = TxLoadLib::getInstance()->getfxtCompressTexFuncExt();
-  _tx_compress_dxtn = TxLoadLib::getInstance()->getdxtCompressTexFuncExt();
+  _tx_compress_dxtn_rgba = TxLoadLib::getInstance()->getdxtCompressTexFuncExt();
 }
 
 
@@ -1990,7 +1992,7 @@ TxQuantize::DXTn(uint8 *src, uint8 *dest,
 
   boolean bRet = 0;
 
-  if (_tx_compress_dxtn &&
+  if (_tx_compress_dxtn_rgba &&
       srcwidth >= 4 && srcheight >= 4) {
     /* compress to dxtn
      * width and height must be larger than 4
@@ -2038,7 +2040,7 @@ TxQuantize::DXTn(uint8 *src, uint8 *dest,
         unsigned int srcStride = (srcwidth * blkheight) << 2;
         unsigned int destStride = dstRowStride * blkrow;
         for (i = 0; i < numcore - 1; i++) {
-          thrd[i] = new std::thread(std::bind(_tx_compress_dxtn,
+          thrd[i] = new std::thread(std::bind(_tx_compress_dxtn_rgba,
                                               4,
                                               srcwidth,
                                               blkheight,
@@ -2049,7 +2051,7 @@ TxQuantize::DXTn(uint8 *src, uint8 *dest,
           src  += srcStride;
           dest += destStride;
         }
-        thrd[i] = new std::thread(std::bind(_tx_compress_dxtn,
+        thrd[i] = new std::thread(std::bind(_tx_compress_dxtn_rgba,
                                             4,
                                             srcwidth,
                                             srcheight - blkheight * i,
@@ -2062,7 +2064,7 @@ TxQuantize::DXTn(uint8 *src, uint8 *dest,
           delete thrd[i];
         }
       } else {
-        (*_tx_compress_dxtn)(4,             /* comps: ARGB8888=4, RGB888=3 */
+        (*_tx_compress_dxtn_rgba)(4,             /* comps: ARGB8888=4, RGB888=3 */
                              srcwidth,      /* width */
                              srcheight,     /* height */
                              src,           /* source */
@@ -2072,7 +2074,7 @@ TxQuantize::DXTn(uint8 *src, uint8 *dest,
                                              * others = 16 bytes per 4x4 texel */
       }
 #else
-      (*_tx_compress_dxtn)(4,             /* comps: ARGB8888=4, RGB888=3 */
+      (*_tx_compress_dxtn_rgba)(4,             /* comps: ARGB8888=4, RGB888=3 */
                            srcwidth,      /* width */
                            srcheight,     /* height */
                            src,           /* source */
diff --git a/source/gles2glide64/src/GlideHQ/TxQuantize.h b/source/gles2glide64/src/GlideHQ/TxQuantize.h
index d3c6ae6..e14990f 100644
--- a/source/gles2glide64/src/GlideHQ/TxQuantize.h
+++ b/source/gles2glide64/src/GlideHQ/TxQuantize.h
@@ -38,7 +38,7 @@ private:
   int _numcore;
 
   fxtCompressTexFuncExt _tx_compress_fxt1;
-  dxtCompressTexFuncExt _tx_compress_dxtn;
+  dxtCompressTexFuncExt _tx_compress_dxtn_rgba;
 
   /* fast optimized... well, sort of. */
   void ARGB1555_ARGB8888(uint32* src, uint32* dst, int width, int height);
diff --git a/source/gles2glide64/src/GlideHQ/TxUtil.cpp b/source/gles2glide64/src/GlideHQ/TxUtil.cpp
index 9ad7e44..411a25e 100644
--- a/source/gles2glide64/src/GlideHQ/TxUtil.cpp
+++ b/source/gles2glide64/src/GlideHQ/TxUtil.cpp
@@ -42,14 +42,14 @@ TxLoadLib::TxLoadLib()
     _dxtnlib = LoadLibrary("dxtn");
 
   if (_dxtnlib) {
-    if (!_tx_compress_dxtn)
-      _tx_compress_dxtn = (dxtCompressTexFuncExt)DLSYM(_dxtnlib, "tx_compress_dxtn");
+    if (!_tx_compress_dxtn_rgba)
+      _tx_compress_dxtn_rgba = (dxtCompressTexFuncExt)DLSYM(_dxtnlib, "tx_compress_dxtn_rgba");
 
     if (!_tx_compress_fxt1)
       _tx_compress_fxt1 = (fxtCompressTexFuncExt)DLSYM(_dxtnlib, "fxt1_encode");
   }
 #else
-  _tx_compress_dxtn = tx_compress_dxtn;
+  _tx_compress_dxtn_rgba = tx_compress_dxtn_rgba;
   _tx_compress_fxt1 = fxt1_encode;
 
 #endif
@@ -74,7 +74,7 @@ TxLoadLib::getfxtCompressTexFuncExt()
 dxtCompressTexFuncExt
 TxLoadLib::getdxtCompressTexFuncExt()
 {
-  return _tx_compress_dxtn;
+  return _tx_compress_dxtn_rgba;
 }
 
 
diff --git a/source/gles2glide64/src/GlideHQ/TxUtil.h b/source/gles2glide64/src/GlideHQ/TxUtil.h
index b89f660..7f9c5f4 100644
--- a/source/gles2glide64/src/GlideHQ/TxUtil.h
+++ b/source/gles2glide64/src/GlideHQ/TxUtil.h
@@ -34,7 +34,7 @@
 #ifdef __cplusplus
 extern "C"{
 #endif
-void tx_compress_dxtn(int srccomps, int width, int height,
+void tx_compress_dxtn_rgba(int srccomps, int width, int height,
                       const void *source, int destformat, void *dest,
                       int destRowStride);
 
@@ -62,7 +62,7 @@ private:
   HMODULE _dxtnlib;
 #endif
   fxtCompressTexFuncExt _tx_compress_fxt1;
-  dxtCompressTexFuncExt _tx_compress_dxtn;
+  dxtCompressTexFuncExt _tx_compress_dxtn_rgba;
   TxLoadLib();
 public:
   static TxLoadLib* getInstance() {
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/dxtn.c b/source/gles2glide64/src/GlideHQ/tc-1.1+/dxtn.c
deleted file mode 100644
index e2d335a..0000000
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/dxtn.c
+++ /dev/null
@@ -1,884 +0,0 @@
-/*
- * DXTn codec
- * Version:  1.1
- *
- * Copyright (C) 2004  Daniel Borca   All Rights Reserved.
- *
- * this is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * this is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNU Make; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.	
- */
-
-/* Copyright (C) 2007  Hiroshi Morii <koolsmoky(at)users.sourceforge.net>
- * Added support for ARGB inputs, DXT3,5 workaround for ATI Radeons, and
- * YUV conversions to determine representative colors.
- */
-
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-
-#include <stdio.h>
-
-#include "types.h"
-#include "internal.h"
-#include "dxtn.h"
-
-
-/***************************************************************************\
- * DXTn encoder
- *
- * The encoder was built by reversing the decoder,
- * and is vaguely based on FXT1 codec. Note that this code
- * is merely a proof of concept, since it is highly UNoptimized!
-\***************************************************************************/
-
-
-#define MAX_COMP 4 /* ever needed maximum number of components in texel */
-#define MAX_VECT 4 /* ever needed maximum number of base vectors to find */
-#define N_TEXELS 16 /* number of texels in a block (always 16) */
-#define COLOR565(v) (word)((((v)[RCOMP] & 0xf8) << 8) | (((v)[GCOMP] & 0xfc) << 3) | ((v)[BCOMP] >> 3))
-
-
-static const int dxtn_color_tlat[2][4] = {
-    { 0, 2, 3, 1 },
-    { 0, 2, 1, 3 }
-};
-
-static const int dxtn_alpha_tlat[2][8] = {
-    { 0, 2, 3, 4, 5, 6, 7, 1 },
-    { 0, 2, 3, 4, 5, 1, 6, 7 }
-};
-
-
-static void
-dxt1_rgb_quantize (dword *cc, const byte *lines[], int comps)
-{
-    float b, iv[MAX_COMP];   /* interpolation vector */
-
-    dword hi; /* high doubleword */
-    int color0, color1;
-    int n_vect;
-    const int n_comp = 3;
-    int black = 0;
-
-#ifndef YUV
-    int minSum = 2000; /* big enough */
-#else
-    int minSum = 2000000;
-#endif
-    int maxSum = -1; /* small enough */
-    int minCol = 0; /* phoudoin: silent compiler! */
-    int maxCol = 0; /* phoudoin: silent compiler! */
-
-    byte input[N_TEXELS][MAX_COMP];
-    int i, k, l;
-
-    /* make the whole block opaque */
-    /* we will NEVER reference ACOMP of any pixel */
-
-    /* 4 texels each line */
-#ifndef ARGB
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-	    for (i = 0; i < comps; i++) {
-		input[k + l * 4][i] = *lines[l]++;
-	    }
-	}
-    }
-#else
-    /* H.Morii - support for ARGB inputs */
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-          input[k + l * 4][2] = *lines[l]++;
-          input[k + l * 4][1] = *lines[l]++;
-          input[k + l * 4][0] = *lines[l]++;
-          if (comps == 4) input[k + l * 4][3] = *lines[l]++;
-	}
-    }
-#endif
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-    for (k = 0; k < N_TEXELS; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        /* RGB to YUV conversion according to CCIR 601 specs
-         * Y = 0.299R+0.587G+0.114B
-         * U = 0.713(R - Y) = 0.500R-0.419G-0.081B
-         * V = 0.564(B - Y) = -0.169R-0.331G+0.500B
-         */
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minCol = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxCol = k;
-	}
-	if (sum == 0) {
-	    black = 1;
-	}
-    }
-
-    color0 = COLOR565(input[minCol]);
-    color1 = COLOR565(input[maxCol]);
-
-    if (color0 == color1) {
-	/* we'll use 3-vector */
-	cc[0] = color0 | (color1 << 16);
-	hi = black ? -1 : 0;
-    } else {
-	if (black && ((color0 == 0) || (color1 == 0))) {
-	    /* we still can use 4-vector */
-	    black = 0;
-	}
-
-	if (black ^ (color0 <= color1)) {
-	    int aux;
-	    aux = color0;
-	    color0 = color1;
-	    color1 = aux;
-	    aux = minCol;
-	    minCol = maxCol;
-	    maxCol = aux;
-	}
-	n_vect = (color0 <= color1) ? 2 : 3;
-
-	MAKEIVEC(n_vect, n_comp, iv, b, input[minCol], input[maxCol]);
-
-	/* add in texels */
-	cc[0] = color0 | (color1 << 16);
-	hi = 0;
-	for (k = N_TEXELS - 1; k >= 0; k--) {
-	    int texel = 3;
-	    int sum = 0;
-	    if (black) {
-		for (i = 0; i < n_comp; i++) {
-		    sum += input[k][i];
-		}
-	    }
-	    if (!black || sum) {
-		/* interpolate color */
-		CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-		texel = dxtn_color_tlat[black][texel];
-	    }
-	    /* add in texel */
-	    hi <<= 2;
-	    hi |= texel;
-	}
-    }
-    cc[1] = hi;
-}
-
-
-static void
-dxt1_rgba_quantize (dword *cc, const byte *lines[], int comps)
-{
-    float b, iv[MAX_COMP];	/* interpolation vector */
-
-    dword hi;		/* high doubleword */
-    int color0, color1;
-    int n_vect;
-    const int n_comp = 3;
-    int transparent = 0;
-
-#ifndef YUV
-    int minSum = 2000;          /* big enough */
-#else
-    int minSum = 2000000;
-#endif
-    int maxSum = -1;		/* small enough */
-    int minCol = 0;		/* phoudoin: silent compiler! */
-    int maxCol = 0;		/* phoudoin: silent compiler! */
-
-    byte input[N_TEXELS][MAX_COMP];
-    int i, k, l;
-
-    if (comps == 3) {
-	/* make the whole block opaque */
-	memset(input, -1, sizeof(input));
-    }
-
-    /* 4 texels each line */
-#ifndef ARGB
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-	    for (i = 0; i < comps; i++) {
-		input[k + l * 4][i] = *lines[l]++;
-	    }
-	}
-    }
-#else
-    /* H.Morii - support for ARGB inputs */
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-          input[k + l * 4][2] = *lines[l]++;
-          input[k + l * 4][1] = *lines[l]++;
-          input[k + l * 4][0] = *lines[l]++;
-          if (comps == 4) input[k + l * 4][3] = *lines[l]++;
-	}
-    }
-#endif
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-    for (k = 0; k < N_TEXELS; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minCol = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxCol = k;
-	}
-	if (input[k][ACOMP] < 128) {
-	    transparent = 1;
-	}
-    }
-
-    color0 = COLOR565(input[minCol]);
-    color1 = COLOR565(input[maxCol]);
-
-    if (color0 == color1) {
-	/* we'll use 3-vector */
-	cc[0] = color0 | (color1 << 16);
-	hi = transparent ? -1 : 0;
-    } else {
-	if (transparent ^ (color0 <= color1)) {
-	    int aux;
-	    aux = color0;
-	    color0 = color1;
-	    color1 = aux;
-	    aux = minCol;
-	    minCol = maxCol;
-	    maxCol = aux;
-	}
-	n_vect = (color0 <= color1) ? 2 : 3;
-
-	MAKEIVEC(n_vect, n_comp, iv, b, input[minCol], input[maxCol]);
-
-	/* add in texels */
-	cc[0] = color0 | (color1 << 16);
-	hi = 0;
-	for (k = N_TEXELS - 1; k >= 0; k--) {
-	    int texel = 3;
-	    if (input[k][ACOMP] >= 128) {
-		/* interpolate color */
-		CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-		texel = dxtn_color_tlat[transparent][texel];
-	    }
-	    /* add in texel */
-	    hi <<= 2;
-	    hi |= texel;
-	}
-    }
-    cc[1] = hi;
-}
-
-
-static void
-dxt3_rgba_quantize (dword *cc, const byte *lines[], int comps)
-{
-    float b, iv[MAX_COMP];	/* interpolation vector */
-
-    dword lolo, lohi;	/* low quadword: lo dword, hi dword */
-    dword hihi;		/* high quadword: high dword */
-    int color0, color1;
-    const int n_vect = 3;
-    const int n_comp = 3;
-
-#ifndef YUV
-    int minSum = 2000;          /* big enough */
-#else
-    int minSum = 2000000;
-#endif
-    int maxSum = -1;		/* small enough */
-    int minCol = 0;		/* phoudoin: silent compiler! */
-    int maxCol = 0;		/* phoudoin: silent compiler! */
-
-    byte input[N_TEXELS][MAX_COMP];
-    int i, k, l;
-
-    if (comps == 3) {
-	/* make the whole block opaque */
-	memset(input, -1, sizeof(input));
-    }
-
-    /* 4 texels each line */
-#ifndef ARGB
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-	    for (i = 0; i < comps; i++) {
-		input[k + l * 4][i] = *lines[l]++;
-	    }
-	}
-    }
-#else
-    /* H.Morii - support for ARGB inputs */
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-          input[k + l * 4][2] = *lines[l]++;
-          input[k + l * 4][1] = *lines[l]++;
-          input[k + l * 4][0] = *lines[l]++;
-          if (comps == 4) input[k + l * 4][3] = *lines[l]++;
-	}
-    }
-#endif
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-    for (k = 0; k < N_TEXELS; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minCol = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxCol = k;
-	}
-    }
-
-    /* add in alphas */
-    lolo = lohi = 0;
-    for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
-	/* add in alpha */
-	lohi <<= 4;
-	lohi |= input[k][ACOMP] >> 4;
-    }
-    cc[1] = lohi;
-    for (; k >= 0; k--) {
-	/* add in alpha */
-	lolo <<= 4;
-	lolo |= input[k][ACOMP] >> 4;
-    }
-    cc[0] = lolo;
-
-    color0 = COLOR565(input[minCol]);
-    color1 = COLOR565(input[maxCol]);
-
-#ifdef RADEON
-    /* H.Morii - Workaround for ATI Radeon
-     * According to the OpenGL EXT_texture_compression_s3tc specs,
-     * the encoding of the RGB components for DXT3 and DXT5 formats
-     * use the non-transparent encodings of DXT1 but treated as
-     * though color0 > color1, regardless of the actual values of
-     * color0 and color1. ATI Radeons however require the values to
-     * be color0 > color1.
-     */
-    if (color0 < color1) {
-	int aux;
-	aux = color0;
-	color0 = color1;
-	color1 = aux;
-	aux = minCol;
-	minCol = maxCol;
-	maxCol = aux;
-    }
-#endif
-
-    cc[2] = color0 | (color1 << 16);
-
-    hihi = 0;
-    if (color0 != color1) {
-	MAKEIVEC(n_vect, n_comp, iv, b, input[minCol], input[maxCol]);
-
-	/* add in texels */
-	for (k = N_TEXELS - 1; k >= 0; k--) {
-	    int texel;
-	    /* interpolate color */
-	    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-	    texel = dxtn_color_tlat[0][texel];
-	    /* add in texel */
-	    hihi <<= 2;
-	    hihi |= texel;
-	}
-    }
-    cc[3] = hihi;
-}
-
-
-static void
-dxt5_rgba_quantize (dword *cc, const byte *lines[], int comps)
-{
-    float b, iv[MAX_COMP];	/* interpolation vector */
-
-    qword lo;			/* low quadword */
-    dword hihi;		/* high quadword: high dword */
-    int color0, color1;
-    const int n_vect = 3;
-    const int n_comp = 3;
-
-#ifndef YUV
-    int minSum = 2000;          /* big enough */
-#else
-    int minSum = 2000000;
-#endif
-    int maxSum = -1;		/* small enough */
-    int minCol = 0;		/* phoudoin: silent compiler! */
-    int maxCol = 0;		/* phoudoin: silent compiler! */
-    int alpha0 = 2000;		/* big enough */
-    int alpha1 = -1;		/* small enough */
-    int anyZero = 0, anyOne = 0;
-    int a_vect;
-
-    byte input[N_TEXELS][MAX_COMP];
-    int i, k, l;
-
-    if (comps == 3) {
-	/* make the whole block opaque */
-	memset(input, -1, sizeof(input));
-    }
-
-    /* 4 texels each line */
-#ifndef ARGB
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-	    for (i = 0; i < comps; i++) {
-		input[k + l * 4][i] = *lines[l]++;
-	    }
-	}
-    }
-#else
-    /* H.Morii - support for ARGB inputs */
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-          input[k + l * 4][2] = *lines[l]++;
-          input[k + l * 4][1] = *lines[l]++;
-          input[k + l * 4][0] = *lines[l]++;
-          if (comps == 4) input[k + l * 4][3] = *lines[l]++;
-	}
-    }
-#endif
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-    for (k = 0; k < N_TEXELS; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minCol = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxCol = k;
-	}
-	if (alpha0 > input[k][ACOMP]) {
-	    alpha0 = input[k][ACOMP];
-	}
-	if (alpha1 < input[k][ACOMP]) {
-	    alpha1 = input[k][ACOMP];
-	}
-	if (input[k][ACOMP] == 0) {
-	    anyZero = 1;
-	}
-	if (input[k][ACOMP] == 255) {
-	    anyOne = 1;
-	}
-    }
-
-    /* add in alphas */
-    if (alpha0 == alpha1) {
-	/* we'll use 6-vector */
-	cc[0] = alpha0 | (alpha1 << 8);
-	cc[1] = 0;
-    } else {
-	if (anyZero && ((alpha0 == 0) || (alpha1 == 0))) {
-	    /* we still might use 8-vector */
-	    anyZero = 0;
-	}
-	if (anyOne && ((alpha0 == 255) || (alpha1 == 255))) {
-	    /* we still might use 8-vector */
-	    anyOne = 0;
-	}
-	if ((anyZero | anyOne) ^ (alpha0 <= alpha1)) {
-	    int aux;
-	    aux = alpha0;
-	    alpha0 = alpha1;
-	    alpha1 = aux;
-	}
-	a_vect = (alpha0 <= alpha1) ? 5 : 7;
-
-	/* compute interpolation vector */
-	iv[ACOMP] = (float)a_vect / (alpha1 - alpha0);
-	b = -iv[ACOMP] * alpha0 + 0.5F;
-
-	/* add in alphas */
-	Q_MOV32(lo, 0);
-	for (k = N_TEXELS - 1; k >= 0; k--) {
-	    int texel = -1;
-	    if (anyZero | anyOne) {
-		if (input[k][ACOMP] == 0) {
-		    texel = 6;
-		} else if (input[k][ACOMP] == 255) {
-		    texel = 7;
-		}
-	    }
-	    /* interpolate alpha */
-	    if (texel == -1) {
-		float dot = input[k][ACOMP] * iv[ACOMP];
-		texel = (int)(dot + b);
-#if SAFECDOT
-		if (texel < 0) {
-		    texel = 0;
-		} else if (texel > a_vect) {
-		    texel = a_vect;
-		}
-#endif
-		texel = dxtn_alpha_tlat[anyZero | anyOne][texel];
-	    }
-	    /* add in texel */
-	    Q_SHL(lo, 3);
-	    Q_OR32(lo, texel);
-	}
-	Q_SHL(lo, 16);
-	Q_OR32(lo, alpha0 | (alpha1 << 8));
-	((qword *)cc)[0] = lo;
-    }
-
-    color0 = COLOR565(input[minCol]);
-    color1 = COLOR565(input[maxCol]);
-
-#ifdef RADEON /* H.Morii - Workaround for ATI Radeon */
-    if (color0 < color1) {
-	int aux;
-	aux = color0;
-	color0 = color1;
-	color1 = aux;
-	aux = minCol;
-	minCol = maxCol;
-	maxCol = aux;
-    }
-#endif
-
-    cc[2] = color0 | (color1 << 16);
-
-    hihi = 0;
-    if (color0 != color1) {
-	MAKEIVEC(n_vect, n_comp, iv, b, input[minCol], input[maxCol]);
-
-	/* add in texels */
-	for (k = N_TEXELS - 1; k >= 0; k--) {
-	    int texel;
-	    /* interpolate color */
-	    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-	    texel = dxtn_color_tlat[0][texel];
-	    /* add in texel */
-	    hihi <<= 2;
-	    hihi |= texel;
-	}
-    }
-    cc[3] = hihi;
-}
-
-
-#define ENCODER(dxtn, n)						\
-int TAPIENTRY								\
-dxtn##_encode (int width, int height, int comps,			\
-	       const void *source, int srcRowStride,			\
-	       void *dest, int destRowStride)				\
-{									\
-    int x, y;								\
-    const byte *data;							\
-    dword *encoded = (dword *)dest;					\
-    void *newSource = NULL;						\
-									\
-    /* Replicate image if width is not M4 or height is not M4 */	\
-    if ((width & 3) | (height & 3)) {					\
-	int newWidth = (width + 3) & ~3;				\
-	int newHeight = (height + 3) & ~3;				\
-	newSource = malloc(comps * newWidth * newHeight * sizeof(byte *));\
-	_mesa_upscale_teximage2d(width, height, newWidth, newHeight,	\
-                               comps, (const byte *)source,		\
-			       srcRowStride, (byte *)newSource);	\
-	source = newSource;						\
-	width = newWidth;						\
-	height = newHeight;						\
-	srcRowStride = comps * newWidth;				\
-    }									\
-									\
-    data = (const byte *)source;					\
-    destRowStride = (destRowStride - width * n) / 4;			\
-    for (y = 0; y < height; y += 4) {					\
-	unsigned int offs = 0 + (y + 0) * srcRowStride;			\
-	for (x = 0; x < width; x += 4) {				\
-	    const byte *lines[4];					\
-	    lines[0] = &data[offs];					\
-	    lines[1] = lines[0] + srcRowStride;				\
-	    lines[2] = lines[1] + srcRowStride;				\
-	    lines[3] = lines[2] + srcRowStride;				\
-	    offs += 4 * comps;						\
-	    dxtn##_quantize(encoded, lines, comps);			\
-	    /* 4x4 block */						\
-	    encoded += n;						\
-	}								\
-	encoded += destRowStride;					\
-    }									\
-									\
-    if (newSource != NULL) {						\
-	free(newSource);						\
-    }									\
-									\
-    return 0;								\
-}
-
-ENCODER(dxt1_rgb,  2)
-ENCODER(dxt1_rgba, 2)
-ENCODER(dxt3_rgba, 4)
-ENCODER(dxt5_rgba, 4)
-
-
-/***************************************************************************\
- * DXTn decoder
- *
- * The decoder is based on GL_EXT_texture_compression_s3tc
- * specification and serves as a concept for the encoder.
-\***************************************************************************/
-
-
-/* lookup table for scaling 4 bit colors up to 8 bits */
-static const byte _rgb_scale_4[] = {
-    0,   17,  34,  51,  68,  85,  102, 119,
-    136, 153, 170, 187, 204, 221, 238, 255
-};
-
-/* lookup table for scaling 5 bit colors up to 8 bits */
-static const byte _rgb_scale_5[] = {
-    0,   8,   16,  25,  33,  41,  49,  58,
-    66,  74,  82,  90,  99,  107, 115, 123,
-    132, 140, 148, 156, 165, 173, 181, 189,
-    197, 206, 214, 222, 230, 239, 247, 255
-};
-
-/* lookup table for scaling 6 bit colors up to 8 bits */
-static const byte _rgb_scale_6[] = {
-    0,   4,   8,   12,  16,  20,  24,  28,
-    32,  36,  40,  45,  49,  53,  57,  61,
-    65,  69,  73,  77,  81,  85,  89,  93,
-    97,  101, 105, 109, 113, 117, 121, 125,
-    130, 134, 138, 142, 146, 150, 154, 158,
-    162, 166, 170, 174, 178, 182, 186, 190,
-    194, 198, 202, 206, 210, 215, 219, 223,
-    227, 231, 235, 239, 243, 247, 251, 255
-};
-
-
-#define CC_SEL(cc, which) (((dword *)(cc))[(which) / 32] >> ((which) & 31))
-#define UP4(c) _rgb_scale_4[(c) & 15]
-#define UP5(c) _rgb_scale_5[(c) & 31]
-#define UP6(c) _rgb_scale_6[(c) & 63]
-#define ZERO_4UBV(v) *((dword *)(v)) = 0
-
-
-void TAPIENTRY
-dxt1_rgb_decode_1 (const void *texture, int stride,
-		   int i, int j, byte *rgba)
-{
-    const byte *src = (const byte *)texture
-		       + ((j / 4) * ((stride + 3) / 4) + i / 4) * 8;
-    const int code = (src[4 + (j & 3)] >> ((i & 3) * 2)) & 0x3;
-    if (code == 0) {
-	rgba[RCOMP] = UP5(CC_SEL(src, 11));
-	rgba[GCOMP] = UP6(CC_SEL(src,  5));
-	rgba[BCOMP] = UP5(CC_SEL(src,  0));
-    } else if (code == 1) {
-	rgba[RCOMP] = UP5(CC_SEL(src, 27));
-	rgba[GCOMP] = UP6(CC_SEL(src, 21));
-	rgba[BCOMP] = UP5(CC_SEL(src, 16));
-    } else {
-	const word col0 = src[0] | (src[1] << 8);
-	const word col1 = src[2] | (src[3] << 8);
-	if (col0 > col1) {
-	    if (code == 2) {
-		rgba[RCOMP] = (UP5(col0 >> 11) * 2 + UP5(col1 >> 11)) / 3;
-		rgba[GCOMP] = (UP6(col0 >>  5) * 2 + UP6(col1 >>  5)) / 3;
-		rgba[BCOMP] = (UP5(col0      ) * 2 + UP5(col1      )) / 3;
-	    } else {
-		rgba[RCOMP] = (UP5(col0 >> 11) + 2 * UP5(col1 >> 11)) / 3;
-		rgba[GCOMP] = (UP6(col0 >>  5) + 2 * UP6(col1 >>  5)) / 3;
-		rgba[BCOMP] = (UP5(col0      ) + 2 * UP5(col1      )) / 3;
-	    }
-	} else {
-	    if (code == 2) {
-		rgba[RCOMP] = (UP5(col0 >> 11) + UP5(col1 >> 11)) / 2;
-		rgba[GCOMP] = (UP6(col0 >>  5) + UP6(col1 >>  5)) / 2;
-		rgba[BCOMP] = (UP5(col0      ) + UP5(col1      )) / 2;
-	    } else {
-		ZERO_4UBV(rgba);
-	    }
-	}
-    }
-    rgba[ACOMP] = 255;
-}
-
-
-void TAPIENTRY
-dxt1_rgba_decode_1 (const void *texture, int stride,
-		    int i, int j, byte *rgba)
-{
-    /* Same as rgb_dxt1 above, except alpha=0 if col0<=col1 and code=3. */
-    const byte *src = (const byte *)texture
-		       + ((j / 4) * ((stride + 3) / 4) + i / 4) * 8;
-    const int code = (src[4 + (j & 3)] >> ((i & 3) * 2)) & 0x3;
-    if (code == 0) {
-	rgba[RCOMP] = UP5(CC_SEL(src, 11));
-	rgba[GCOMP] = UP6(CC_SEL(src,  5));
-	rgba[BCOMP] = UP5(CC_SEL(src,  0));
-	rgba[ACOMP] = 255;
-    } else if (code == 1) {
-	rgba[RCOMP] = UP5(CC_SEL(src, 27));
-	rgba[GCOMP] = UP6(CC_SEL(src, 21));
-	rgba[BCOMP] = UP5(CC_SEL(src, 16));
-	rgba[ACOMP] = 255;
-    } else {
-	const word col0 = src[0] | (src[1] << 8);
-	const word col1 = src[2] | (src[3] << 8);
-	if (col0 > col1) {
-	    if (code == 2) {
-		rgba[RCOMP] = (UP5(col0 >> 11) * 2 + UP5(col1 >> 11)) / 3;
-		rgba[GCOMP] = (UP6(col0 >>  5) * 2 + UP6(col1 >>  5)) / 3;
-		rgba[BCOMP] = (UP5(col0      ) * 2 + UP5(col1      )) / 3;
-	    } else {
-		rgba[RCOMP] = (UP5(col0 >> 11) + 2 * UP5(col1 >> 11)) / 3;
-		rgba[GCOMP] = (UP6(col0 >>  5) + 2 * UP6(col1 >>  5)) / 3;
-		rgba[BCOMP] = (UP5(col0      ) + 2 * UP5(col1      )) / 3;
-	    }
-	    rgba[ACOMP] = 255;
-	} else {
-	    if (code == 2) {
-		rgba[RCOMP] = (UP5(col0 >> 11) + UP5(col1 >> 11)) / 2;
-		rgba[GCOMP] = (UP6(col0 >>  5) + UP6(col1 >>  5)) / 2;
-		rgba[BCOMP] = (UP5(col0      ) + UP5(col1      )) / 2;
-		rgba[ACOMP] = 255;
-	    } else {
-		ZERO_4UBV(rgba);
-	    }
-	}
-    }
-}
-
-
-void TAPIENTRY
-dxt3_rgba_decode_1 (const void *texture, int stride,
-		    int i, int j, byte *rgba)
-{
-    const byte *src = (const byte *)texture
-		       + ((j / 4) * ((stride + 3) / 4) + i / 4) * 16;
-    const int code = (src[12 + (j & 3)] >> ((i & 3) * 2)) & 0x3;
-    const dword *cc = (const dword *)(src + 8);
-    if (code == 0) {
-	rgba[RCOMP] = UP5(CC_SEL(cc, 11));
-	rgba[GCOMP] = UP6(CC_SEL(cc,  5));
-	rgba[BCOMP] = UP5(CC_SEL(cc,  0));
-    } else if (code == 1) {
-	rgba[RCOMP] = UP5(CC_SEL(cc, 27));
-	rgba[GCOMP] = UP6(CC_SEL(cc, 21));
-	rgba[BCOMP] = UP5(CC_SEL(cc, 16));
-    } else if (code == 2) {
-	/* (col0 * (4 - code) + col1 * (code - 1)) / 3 */
-	rgba[RCOMP] = (UP5(CC_SEL(cc, 11)) * 2 + UP5(CC_SEL(cc, 27))) / 3;
-	rgba[GCOMP] = (UP6(CC_SEL(cc,  5)) * 2 + UP6(CC_SEL(cc, 21))) / 3;
-	rgba[BCOMP] = (UP5(CC_SEL(cc,  0)) * 2 + UP5(CC_SEL(cc, 16))) / 3;
-    } else {
-	rgba[RCOMP] = (UP5(CC_SEL(cc, 11)) + 2 * UP5(CC_SEL(cc, 27))) / 3;
-	rgba[GCOMP] = (UP6(CC_SEL(cc,  5)) + 2 * UP6(CC_SEL(cc, 21))) / 3;
-	rgba[BCOMP] = (UP5(CC_SEL(cc,  0)) + 2 * UP5(CC_SEL(cc, 16))) / 3;
-    }
-    rgba[ACOMP] = UP4(src[((j & 3) * 4 + (i & 3)) / 2] >> ((i & 1) * 4));
-}
-
-
-void TAPIENTRY
-dxt5_rgba_decode_1 (const void *texture, int stride,
-		    int i, int j, byte *rgba)
-{
-    const byte *src = (const byte *)texture
-		       + ((j / 4) * ((stride + 3) / 4) + i / 4) * 16;
-    const int code = (src[12 + (j & 3)] >> ((i & 3) * 2)) & 0x3;
-    const dword *cc = (const dword *)(src + 8);
-    const byte alpha0 = src[0];
-    const byte alpha1 = src[1];
-    const int alphaShift = (((j & 3) * 4) + (i & 3)) * 3 + 16;
-    const int acode = ((alphaShift == 31)
-			? CC_SEL(src + 2, alphaShift - 16)
-			: CC_SEL(src, alphaShift)) & 0x7;
-    if (code == 0) {
-	rgba[RCOMP] = UP5(CC_SEL(cc, 11));
-	rgba[GCOMP] = UP6(CC_SEL(cc,  5));
-	rgba[BCOMP] = UP5(CC_SEL(cc,  0));
-    } else if (code == 1) {
-	rgba[RCOMP] = UP5(CC_SEL(cc, 27));
-	rgba[GCOMP] = UP6(CC_SEL(cc, 21));
-	rgba[BCOMP] = UP5(CC_SEL(cc, 16));
-    } else if (code == 2) {
-	/* (col0 * (4 - code) + col1 * (code - 1)) / 3 */
-	rgba[RCOMP] = (UP5(CC_SEL(cc, 11)) * 2 + UP5(CC_SEL(cc, 27))) / 3;
-	rgba[GCOMP] = (UP6(CC_SEL(cc,  5)) * 2 + UP6(CC_SEL(cc, 21))) / 3;
-	rgba[BCOMP] = (UP5(CC_SEL(cc,  0)) * 2 + UP5(CC_SEL(cc, 16))) / 3;
-    } else {
-	rgba[RCOMP] = (UP5(CC_SEL(cc, 11)) + 2 * UP5(CC_SEL(cc, 27))) / 3;
-	rgba[GCOMP] = (UP6(CC_SEL(cc,  5)) + 2 * UP6(CC_SEL(cc, 21))) / 3;
-	rgba[BCOMP] = (UP5(CC_SEL(cc,  0)) + 2 * UP5(CC_SEL(cc, 16))) / 3;
-    }
-    if (acode == 0) {
-	rgba[ACOMP] = alpha0;
-    } else if (acode == 1) {
-	rgba[ACOMP] = alpha1;
-    } else if (alpha0 > alpha1) {
-	rgba[ACOMP] = ((8 - acode) * alpha0 + (acode - 1) * alpha1) / 7;
-    } else if (acode == 6) {
-	rgba[ACOMP] = 0;
-    } else if (acode == 7) {
-	rgba[ACOMP] = 255;
-    } else {
-	rgba[ACOMP] = ((6 - acode) * alpha0 + (acode - 1) * alpha1) / 5;
-    }
-}
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/dxtn.h b/source/gles2glide64/src/GlideHQ/tc-1.1+/dxtn.h
deleted file mode 100644
index 4078fd9..0000000
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/dxtn.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * DXTn codec
- * Version:  1.1
- *
- * Copyright (C) 2004  Daniel Borca   All Rights Reserved.
- *
- * this is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * this is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNU Make; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.	
- */
-
-
-#ifndef DXTN_H_included
-#define DXTN_H_included
-
-TAPI int TAPIENTRY
-dxt1_rgb_encode (int width, int height, int comps,
-		 const void *source, int srcRowStride,
-		 void *dest, int destRowStride);
-
-TAPI int TAPIENTRY
-dxt1_rgba_encode (int width, int height, int comps,
-		  const void *source, int srcRowStride,
-		  void *dest, int destRowStride);
-
-TAPI int TAPIENTRY
-dxt3_rgba_encode (int width, int height, int comps,
-		  const void *source, int srcRowStride,
-		  void *dest, int destRowStride);
-
-TAPI int TAPIENTRY
-dxt5_rgba_encode (int width, int height, int comps,
-		  const void *source, int srcRowStride,
-		  void *dest, int destRowStride);
-
-TAPI void TAPIENTRY
-dxt1_rgb_decode_1 (const void *texture, int stride /* in pixels */,
-		   int i, int j, byte *rgba);
-
-TAPI void TAPIENTRY
-dxt1_rgba_decode_1 (const void *texture, int stride /* in pixels */,
-		    int i, int j, byte *rgba);
-
-TAPI void TAPIENTRY
-dxt3_rgba_decode_1 (const void *texture, int stride /* in pixels */,
-		    int i, int j, byte *rgba);
-
-TAPI void TAPIENTRY
-dxt5_rgba_decode_1 (const void *texture, int stride /* in pixels */,
-		    int i, int j, byte *rgba);
-
-#endif
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.c b/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.c
index 623e69c..d39e749 100644
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.c
+++ b/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.c
@@ -1,8 +1,7 @@
 /*
- * FXT1 codec
- * Version:  1.1
+ * Mesa 3-D graphics library
  *
- * Copyright (C) 2004  Daniel Borca   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -17,18 +16,21 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * DANIEL BORCA BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-/* Copyright (C) 2007  Hiroshi Morii <koolsmoky(at)users.sourceforge.net>
- * Added support for ARGB inputs.
+/**
+ * \file fxt1.c
+ * GL_3DFX_texture_compression_FXT1 support (from Mesa's texcompress_fxt1.c).
  */
 
 
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
 #include "types.h"
 #include "internal.h"
@@ -54,1122 +56,1093 @@
 #define LL_RMS_E 255 /* fault tolerance (maximum error) */
 #define ALPHA_TS 2 /* alpha threshold: (255 - ALPHA_TS) deemed opaque */
 #define ISTBLACK(v) (*((dword *)(v)) == 0)
-#define COPY_4UBV(DST, SRC) *((dword *)(DST)) = *((dword *)(SRC))
 
 
 static int
 fxt1_bestcol (float vec[][MAX_COMP], int nv,
-	      byte input[MAX_COMP], int nc)
+              byte input[MAX_COMP], int nc)
 {
-    int i, j, best = -1;
-    float err = 1e9; /* big enough */
-
-    for (j = 0; j < nv; j++) {
-	float e = 0.0F;
-	for (i = 0; i < nc; i++) {
-	    e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
-	}
-	if (e < err) {
-	    err = e;
-	    best = j;
-	}
-    }
-
-    return best;
+   int i, j, best = -1;
+   float err = 1e9; /* big enough */
+
+   for (j = 0; j < nv; j++) {
+      float e = 0.0F;
+      for (i = 0; i < nc; i++) {
+         e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
+      }
+      if (e < err) {
+         err = e;
+         best = j;
+      }
+   }
+
+   return best;
 }
 
 
 static int
 fxt1_worst (float vec[MAX_COMP],
-	    byte input[N_TEXELS][MAX_COMP], int nc, int n)
+            byte input[N_TEXELS][MAX_COMP], int nc, int n)
 {
-    int i, k, worst = -1;
-    float err = -1.0F; /* small enough */
-
-    for (k = 0; k < n; k++) {
-	float e = 0.0F;
-	for (i = 0; i < nc; i++) {
-	    e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
-	}
-	if (e > err) {
-	    err = e;
-	    worst = k;
-	}
-    }
-
-    return worst;
+   int i, k, worst = -1;
+   float err = -1.0F; /* small enough */
+
+   for (k = 0; k < n; k++) {
+      float e = 0.0F;
+      for (i = 0; i < nc; i++) {
+         e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
+      }
+      if (e > err) {
+         err = e;
+         worst = k;
+      }
+   }
+
+   return worst;
 }
 
 
 static int
 fxt1_variance (double variance[MAX_COMP],
-	       byte input[N_TEXELS][MAX_COMP], int nc, int n)
+               byte input[N_TEXELS][MAX_COMP], int nc, int n)
 {
-    int i, k, best = 0;
-    dword sx, sx2;
-    double var, maxvar = -1; /* small enough */
-    double teenth = 1.0 / n;
-
-    for (i = 0; i < nc; i++) {
-	sx = sx2 = 0;
-	for (k = 0; k < n; k++) {
-	    int t = input[k][i];
-	    sx += t;
-	    sx2 += t * t;
-	}
-	var = sx2 * teenth - sx * sx * teenth * teenth;
-	if (maxvar < var) {
-	    maxvar = var;
-	    best = i;
-	}
-	if (variance) {
-	    variance[i] = var;
-	}
-    }
-
-    return best;
+   int i, k, best = 0;
+   int sx, sx2;
+   double var, maxvar = -1; /* small enough */
+   double teenth = 1.0 / n;
+
+   for (i = 0; i < nc; i++) {
+      sx = sx2 = 0;
+      for (k = 0; k < n; k++) {
+         int t = input[k][i];
+         sx += t;
+         sx2 += t * t;
+      }
+      var = sx2 * teenth - sx * sx * teenth * teenth;
+      if (maxvar < var) {
+         maxvar = var;
+         best = i;
+      }
+      if (variance) {
+         variance[i] = var;
+      }
+   }
+
+   return best;
 }
 
 
 static int
 fxt1_choose (float vec[][MAX_COMP], int nv,
-	     byte input[N_TEXELS][MAX_COMP], int nc, int n)
+             byte input[N_TEXELS][MAX_COMP], int nc, int n)
 {
 #if 0
-    /* Choose colors from a grid.
-     */
-    int i, j;
-
-    for (j = 0; j < nv; j++) {
-	int m = j * (n - 1) / (nv - 1);
-	for (i = 0; i < nc; i++) {
-	    vec[j][i] = input[m][i];
-	}
-    }
+   /* Choose colors from a grid.
+    */
+   int i, j;
+
+   for (j = 0; j < nv; j++) {
+      int m = j * (n - 1) / (nv - 1);
+      for (i = 0; i < nc; i++) {
+         vec[j][i] = input[m][i];
+      }
+   }
 #else
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 8x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-    int i, j, k;
-#ifndef YUV
-    int minSum = 2000; /* big enough */
-#else
-    int minSum = 2000000;
-#endif
-    int maxSum = -1; /* small enough */
-    int minCol = 0; /* phoudoin: silent compiler! */
-    int maxCol = 0; /* phoudoin: silent compiler! */
-
-    struct {
-	int flag;
-	dword key;
-	int freq;
-	int idx;
-    } hist[N_TEXELS];
-    int lenh = 0;
-
-    memset(hist, 0, sizeof(hist));
-
-    for (k = 0; k < n; k++) {
-	int l;
-	dword key = 0;
-	int sum = 0;
-	for (i = 0; i < nc; i++) {
-	    key <<= 8;
-	    key |= input[k][i];
-#ifndef YUV
-	    sum += input[k][i];
-#else
-            /* RGB to YUV conversion according to CCIR 601 specs
-             * Y = 0.299R+0.587G+0.114B
-             * U = 0.713(R - Y) = 0.500R-0.419G-0.081B
-             * V = 0.564(B - Y) = -0.169R-0.331G+0.500B
-             */
-            sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-        }
-	for (l = 0; l < n; l++) {
-	    if (!hist[l].flag) {
-		/* alloc new slot */
-		hist[l].flag = !0;
-		hist[l].key = key;
-		hist[l].freq = 1;
-		hist[l].idx = k;
-		lenh = l + 1;
-		break;
-	    } else if (hist[l].key == key) {
-		hist[l].freq++;
-		break;
-	    }
-	}
-	if (minSum > sum) {
-	    minSum = sum;
-	    minCol = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxCol = k;
-	}
-    }
-
-    if (lenh <= nv) {
-	for (j = 0; j < lenh; j++) {
-	    for (i = 0; i < nc; i++) {
-		vec[j][i] = (float)input[hist[j].idx][i];
-	    }
-	}
-	for (; j < nv; j++) {
-	    for (i = 0; i < nc; i++) {
-		vec[j][i] = vec[0][i];
-	    }
-	}
-	return 0;
-    }
-
-    for (j = 0; j < nv; j++) {
-	for (i = 0; i < nc; i++) {
-	    vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
-	}
-    }
+   /* Collect the distinct colors among the n input texels in hist[].  If
+    * there are at most nv of them they are used verbatim (return 0); else
+    * nv vectors are interpolated between the texels with the smallest and
+    * largest component sums and non-zero is returned. */
+   int i, j, k;
+   int minSum = 2000; /* big enough */
+   int maxSum = -1; /* small enough */
+   int minCol = 0; /* phoudoin: silent compiler! */
+   int maxCol = 0; /* phoudoin: silent compiler! */
+
+   struct {
+      int flag;
+      dword key; /* packed components, same type as the local key below */
+      int freq;
+      int idx;
+   } hist[N_TEXELS];
+   int lenh = 0;
+
+   memset(hist, 0, sizeof(hist));
+
+   for (k = 0; k < n; k++) {
+      int l;
+      dword key = 0; /* dword as before the sync: with nc == 4, int <<= 8 can hit the sign bit */
+      int sum = 0;
+      for (i = 0; i < nc; i++) {
+         key <<= 8;
+         key |= input[k][i];
+         sum += input[k][i];
+      }
+      for (l = 0; l < n; l++) {
+         if (!hist[l].flag) {
+            /* alloc new slot */
+            hist[l].flag = !0;
+            hist[l].key = key;
+            hist[l].freq = 1;
+            hist[l].idx = k;
+            lenh = l + 1;
+            break;
+         } else if (hist[l].key == key) {
+            hist[l].freq++;
+            break;
+         }
+      }
+      if (minSum > sum) {
+         minSum = sum;
+         minCol = k;
+      }
+      if (maxSum < sum) {
+         maxSum = sum;
+         maxCol = k;
+      }
+   }
+
+   if (lenh <= nv) {
+      for (j = 0; j < lenh; j++) {
+         for (i = 0; i < nc; i++) {
+            vec[j][i] = (float)input[hist[j].idx][i];
+         }
+      }
+      for (; j < nv; j++) {
+         for (i = 0; i < nc; i++) {
+            vec[j][i] = vec[0][i];
+         }
+      }
+      return 0;
+   }
+
+   for (j = 0; j < nv; j++) {
+      for (i = 0; i < nc; i++) {
+         vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
+      }
+   }
 #endif
 
-    return !0;
+   return !0;
 }
 
 
 static int
 fxt1_lloyd (float vec[][MAX_COMP], int nv,
-	    byte input[N_TEXELS][MAX_COMP], int nc, int n)
+            byte input[N_TEXELS][MAX_COMP], int nc, int n)
 {
-    /* Use the generalized lloyd's algorithm for VQ:
-     *     find 4 color vectors.
-     *
-     *     for each sample color
-     *         sort to nearest vector.
-     *
-     *     replace each vector with the centroid of it's matching colors.
-     *
-     *     repeat until RMS doesn't improve.
-     *
-     *     if a color vector has no samples, or becomes the same as another
-     *     vector, replace it with the color which is farthest from a sample.
-     *
-     * vec[][MAX_COMP]           initial vectors and resulting colors
-     * nv                        number of resulting colors required
-     * input[N_TEXELS][MAX_COMP] input texels
-     * nc                        number of components in input / vec
-     * n                         number of input samples
-     */
-
-    int sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
-    int cnt[MAX_VECT]; /* how many times a certain vector was chosen */
-    float error, lasterror = 1e9;
-
-    int i, j, k, rep;
-
-    /* the quantizer */
-    for (rep = 0; rep < LL_N_REP; rep++) {
-	/* reset sums & counters */
-	for (j = 0; j < nv; j++) {
-	    for (i = 0; i < nc; i++) {
-		sum[j][i] = 0;
-	    }
-	    cnt[j] = 0;
-	}
-	error = 0;
-
-	/* scan whole block */
-	for (k = 0; k < n; k++) {
+   /* Use the generalized lloyd's algorithm for VQ:
+    *     start from the nv color vectors passed in through vec.
+    *
+    *     for each sample color
+    *         sort to nearest vector.
+    *
+    *     replace each vector with the centroid of its matching colors.
+    *
+    *     repeat until the accumulated squared error stops improving.
+    *
+    *     if a color vector has no samples (e.g. it duplicates an earlier
+    *     vector), replace it with the sample which is farthest from it.
+    *
+    * vec[][MAX_COMP]           initial vectors and resulting colors
+    * nv                        number of resulting colors required
+    * input[N_TEXELS][MAX_COMP] input texels
+    * nc                        number of components in input / vec
+    * n                         number of input samples
+    */
+
+   int sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
+   int cnt[MAX_VECT]; /* how many times a certain vector was chosen */
+   float error, lasterror = 1e9;
+
+   int i, j, k, rep;
+
+   /* the quantizer */
+   for (rep = 0; rep < LL_N_REP; rep++) {
+      /* reset sums & counters */
+      for (j = 0; j < nv; j++) {
+         for (i = 0; i < nc; i++) {
+            sum[j][i] = 0;
+         }
+         cnt[j] = 0;
+      }
+      error = 0;
+
+      /* scan whole block */
+      for (k = 0; k < n; k++) {
 #if 1
-	    int best = -1;
-	    float err = 1e9; /* big enough */
-	    /* determine best vector */
-	    for (j = 0; j < nv; j++) {
-		float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
-			  (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
-			  (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
-		if (nc == 4) {
-		    e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
-		}
-		if (e < err) {
-		    err = e;
-		    best = j;
-		}
-	    }
+         int best = -1;
+         float err = 1e9; /* big enough */
+         /* determine best vector */
+         for (j = 0; j < nv; j++) {
+            float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
+                      (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
+                      (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
+            if (nc == 4) {
+               e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
+            }
+            if (e < err) {
+               err = e;
+               best = j;
+            }
+         }
 #else
-	    int best = fxt1_bestcol(vec, nv, input[k], nc, &err);
+         int best = fxt1_bestcol(vec, nv, input[k], nc, &err);
 #endif
-	    /* add in closest color */
-	    for (i = 0; i < nc; i++) {
-		sum[best][i] += input[k][i];
-	    }
-	    /* mark this vector as used */
-	    cnt[best]++;
-	    /* accumulate error */
-	    error += err;
-	}
-
-	/* check RMS */
-	if ((error < LL_RMS_E) ||
-	    ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
-	    return !0; /* good match */
-	}
-	lasterror = error;
-
-	/* move each vector to the barycenter of its closest colors */
-	for (j = 0; j < nv; j++) {
-	    if (cnt[j]) {
-		float div = 1.0F / cnt[j];
-		for (i = 0; i < nc; i++) {
-		    vec[j][i] = div * sum[j][i];
-		}
-	    } else {
-		/* this vec has no samples or is identical with a previous vec */
-		int worst = fxt1_worst(vec[j], input, nc, n);
-		for (i = 0; i < nc; i++) {
-		    vec[j][i] = input[worst][i];
-		}
-	    }
-	}
-    }
-
-    return 0; /* could not converge fast enough */
+         assert(best >= 0);
+         /* add in closest color */
+         for (i = 0; i < nc; i++) {
+            sum[best][i] += input[k][i];
+         }
+         /* mark this vector as used */
+         cnt[best]++;
+         /* accumulate error */
+         error += err;
+      }
+
+      /* check RMS */
+      if ((error < LL_RMS_E) ||
+          ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
+         return !0; /* good match */
+      }
+      lasterror = error;
+
+      /* move each vector to the barycenter of its closest colors */
+      for (j = 0; j < nv; j++) {
+         if (cnt[j]) {
+            float div = 1.0F / cnt[j];
+            for (i = 0; i < nc; i++) {
+               vec[j][i] = div * sum[j][i];
+            }
+         } else {
+            /* this vec has no samples or is identical with a previous vec */
+            int worst = fxt1_worst(vec[j], input, nc, n);
+            for (i = 0; i < nc; i++) {
+               vec[j][i] = input[worst][i];
+            }
+         }
+      }
+   }
+
+   return 0; /* could not converge fast enough */
 }
 
 
 static void
 fxt1_quantize_CHROMA (dword *cc,
-		      byte input[N_TEXELS][MAX_COMP])
+                      byte input[N_TEXELS][MAX_COMP])
 {
-    const int n_vect = 4; /* 4 base vectors to find */
-    const int n_comp = 3; /* 3 components: R, G, B */
-    float vec[MAX_VECT][MAX_COMP];
-    int i, j, k;
-    qword hi; /* high quadword */
-    dword lohi, lolo; /* low quadword: hi dword, lo dword */
-
-    if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
-	fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
-    }
-
-    Q_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
-    for (j = n_vect - 1; j >= 0; j--) {
-	for (i = 0; i < n_comp; i++) {
-	    /* add in colors */
-	    Q_SHL(hi, 5);
-	    Q_OR32(hi, (dword)(vec[j][i] / 8.0F));
-	}
-    }
-    ((qword *)cc)[1] = hi;
-
-    lohi = lolo = 0;
-    /* right microtile */
-    for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
-	lohi <<= 2;
-	lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
-    }
-    /* left microtile */
-    for (; k >= 0; k--) {
-	lolo <<= 2;
-	lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
-    }
-    cc[1] = lohi;
-    cc[0] = lolo;
+   const int n_vect = 4; /* 4 base vectors to find */
+   const int n_comp = 3; /* 3 components: R, G, B */
+   float vec[MAX_VECT][MAX_COMP];
+   int i, j, k;
+   qword hi; /* high quadword */
+   dword lohi, lolo; /* low quadword: hi dword, lo dword */
+
+   if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
+      fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
+   }
+
+   Q_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
+   for (j = n_vect - 1; j >= 0; j--) {
+      for (i = 0; i < n_comp; i++) {
+         /* add in colors */
+         Q_SHL(hi, 5);
+         Q_OR32(hi, (dword)(vec[j][i] / 8.0F));
+      }
+   }
+   ((qword *)cc)[1] = hi;
+
+   lohi = lolo = 0;
+   /* right microtile */
+   for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
+      lohi <<= 2;
+      lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
+   }
+   /* left microtile */
+   for (; k >= 0; k--) {
+      lolo <<= 2;
+      lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
+   }
+   cc[1] = lohi;
+   cc[0] = lolo;
 }
 
 
 static void
 fxt1_quantize_ALPHA0 (dword *cc,
-		      byte input[N_TEXELS][MAX_COMP],
-		      byte reord[N_TEXELS][MAX_COMP], int n)
+                      byte input[N_TEXELS][MAX_COMP],
+                      byte reord[N_TEXELS][MAX_COMP], int n)
 {
-    const int n_vect = 3; /* 3 base vectors to find */
-    const int n_comp = 4; /* 4 components: R, G, B, A */
-    float vec[MAX_VECT][MAX_COMP];
-    int i, j, k;
-    qword hi; /* high quadword */
-    dword lohi, lolo; /* low quadword: hi dword, lo dword */
-
-    /* the last vector indicates zero */
-    for (i = 0; i < n_comp; i++) {
-	vec[n_vect][i] = 0;
-    }
-
-    /* the first n texels in reord are guaranteed to be non-zero */
-    if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
-	fxt1_lloyd(vec, n_vect, reord, n_comp, n);
-    }
-
-    Q_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
-    for (j = n_vect - 1; j >= 0; j--) {
-	/* add in alphas */
-	Q_SHL(hi, 5);
-	Q_OR32(hi, (dword)(vec[j][ACOMP] / 8.0F));
-    }
-    for (j = n_vect - 1; j >= 0; j--) {
-	for (i = 0; i < n_comp - 1; i++) {
-	    /* add in colors */
-	    Q_SHL(hi, 5);
-	    Q_OR32(hi, (dword)(vec[j][i] / 8.0F));
-	}
-    }
-    ((qword *)cc)[1] = hi;
-
-    lohi = lolo = 0;
-    /* right microtile */
-    for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
-	lohi <<= 2;
-	lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
-    }
-    /* left microtile */
-    for (; k >= 0; k--) {
-	lolo <<= 2;
-	lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
-    }
-    cc[1] = lohi;
-    cc[0] = lolo;
+   const int n_vect = 3; /* 3 base vectors to find */
+   const int n_comp = 4; /* 4 components: R, G, B, A */
+   float vec[MAX_VECT][MAX_COMP];
+   int i, j, k;
+   qword hi; /* high quadword */
+   dword lohi, lolo; /* low quadword: hi dword, lo dword */
+
+   /* the last vector indicates zero */
+   for (i = 0; i < n_comp; i++) {
+      vec[n_vect][i] = 0;
+   }
+
+   /* the first n texels in reord are guaranteed to be non-zero */
+   if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
+      fxt1_lloyd(vec, n_vect, reord, n_comp, n);
+   }
+
+   Q_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
+   for (j = n_vect - 1; j >= 0; j--) {
+      /* add in alphas */
+      Q_SHL(hi, 5);
+      Q_OR32(hi, (dword)(vec[j][ACOMP] / 8.0F));
+   }
+   for (j = n_vect - 1; j >= 0; j--) {
+      for (i = 0; i < n_comp - 1; i++) {
+         /* add in colors */
+         Q_SHL(hi, 5);
+         Q_OR32(hi, (dword)(vec[j][i] / 8.0F));
+      }
+   }
+   ((qword *)cc)[1] = hi;
+
+   lohi = lolo = 0;
+   /* right microtile */
+   for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
+      lohi <<= 2;
+      lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
+   }
+   /* left microtile */
+   for (; k >= 0; k--) {
+      lolo <<= 2;
+      lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
+   }
+   cc[1] = lohi;
+   cc[0] = lolo;
 }
 
 
 static void
 fxt1_quantize_ALPHA1 (dword *cc,
-		      byte input[N_TEXELS][MAX_COMP])
+                      byte input[N_TEXELS][MAX_COMP]) /* texels: first half = left microtile, second half = right */
 {
-    const int n_vect = 3; /* highest vector number in each microtile */
-    const int n_comp = 4; /* 4 components: R, G, B, A */
-    float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
-    float b, iv[MAX_COMP]; /* interpolation vector */
-    int i, j, k;
-    qword hi; /* high quadword */
-    dword lohi, lolo; /* low quadword: hi dword, lo dword */
-
-    int minSum;
-    int maxSum;
-    int minColL = 0, maxColL = 0;
-    int minColR = 0, maxColR = 0;
-    int sumL = 0, sumR = 0;
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-#ifndef YUV
-    minSum = 2000; /* big enough */
-#else
-    minSum = 2000000;
-#endif
-    maxSum = -1; /* small enough */
-    for (k = 0; k < N_TEXELS / 2; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minColL = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxColL = k;
-	}
-	sumL += sum;
-    }
-#ifndef YUV
-    minSum = 2000; /* big enough */
-#else
-    minSum = 2000000;
-#endif
-    maxSum = -1; /* small enough */
-    for (; k < N_TEXELS; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minColR = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxColR = k;
-	}
-	sumR += sum;
-    }
-
-    /* choose the common vector (yuck!) */
-    {
-	int j1, j2;
-	int v1 = 0, v2 = 0;
-	float err = 1e9; /* big enough */
-	float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
-	for (i = 0; i < n_comp; i++) {
-	    tv[0][i] = input[minColL][i];
-	    tv[1][i] = input[maxColL][i];
-	    tv[2][i] = input[minColR][i];
-	    tv[3][i] = input[maxColR][i];
-	}
-	for (j1 = 0; j1 < 2; j1++) {
-	    for (j2 = 2; j2 < 4; j2++) {
-		float e = 0.0F;
-		for (i = 0; i < n_comp; i++) {
-		    e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
-		}
-		if (e < err) {
-		    err = e;
-		    v1 = j1;
-		    v2 = j2;
-		}
-	    }
-	}
-	for (i = 0; i < n_comp; i++) {
-	    vec[0][i] = tv[1 - v1][i];
-	    vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
-	    vec[2][i] = tv[5 - v2][i];
-	}
-    }
-
-    /* left microtile */
-    cc[0] = 0;
-    if (minColL != maxColL) {
-	/* compute interpolation vector */
-	MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
-
-	/* add in texels */
-	lolo = 0;
-	for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
-	    int texel;
-	    /* interpolate color */
-	    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-	    /* add in texel */
-	    lolo <<= 2;
-	    lolo |= texel;
-	}
-
-	cc[0] = lolo;
-    }
-
-    /* right microtile */
-    cc[1] = 0;
-    if (minColR != maxColR) {
-	/* compute interpolation vector */
-	MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);
-
-	/* add in texels */
-	lohi = 0;
-	for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
-	    int texel;
-	    /* interpolate color */
-	    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-	    /* add in texel */
-	    lohi <<= 2;
-	    lohi |= texel;
-	}
-
-	cc[1] = lohi;
-    }
-
-    Q_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
-    for (j = n_vect - 1; j >= 0; j--) {
-	/* add in alphas */
-	Q_SHL(hi, 5);
-	Q_OR32(hi, (dword)(vec[j][ACOMP] / 8.0F));
-    }
-    for (j = n_vect - 1; j >= 0; j--) {
-	for (i = 0; i < n_comp - 1; i++) {
-	    /* add in colors */
-	    Q_SHL(hi, 5);
-	    Q_OR32(hi, (dword)(vec[j][i] / 8.0F));
-	}
-    }
-    ((qword *)cc)[1] = hi;
+   const int n_vect = 3; /* highest vector number in each microtile */
+   const int n_comp = 4; /* 4 components: R, G, B, A */
+   float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
+   float b, iv[MAX_COMP]; /* interpolation vector */
+   int i, j, k;
+   qword hi; /* high quadword */
+   dword lohi, lolo; /* low quadword: hi dword, lo dword */
+
+   int minSum;
+   int maxSum;
+   int minColL = 0, maxColL = 0;
+   int minColR = 0, maxColR = 0;
+   int sumL = 0, sumR = 0;
+   int nn_comp; /* how many leading components the current search pass sums */
+   /* ALPHA1 quantizer: find the darkest and brightest colors (by component
+    * sum) in each 4x4 microtile; one extremum from each side is merged into
+    * a shared middle vector, giving three vectors for the whole block.
+    * There are probably better algorithms to use (histogram-based). */
+   nn_comp = n_comp;
+   while ((minColL == maxColL) && nn_comp) { /* retry while min and max pick the same texel */
+       minSum = 2000; /* big enough */
+       maxSum = -1; /* small enough */
+       for (k = 0; k < N_TEXELS / 2; k++) {
+           int sum = 0;
+           for (i = 0; i < nn_comp; i++) {
+               sum += input[k][i];
+           }
+           if (minSum > sum) {
+               minSum = sum;
+               minColL = k;
+           }
+           if (maxSum < sum) {
+               maxSum = sum;
+               maxColL = k;
+           }
+           sumL += sum; /* not reset between passes, so retries add to it */
+       }
+       
+       nn_comp--; /* the next pass ignores the last component it summed */
+   }
+
+   nn_comp = n_comp;
+   while ((minColR == maxColR) && nn_comp) { /* same retry scheme for the right half */
+       minSum = 2000; /* big enough */
+       maxSum = -1; /* small enough */
+       for (k = N_TEXELS / 2; k < N_TEXELS; k++) {
+           int sum = 0;
+           for (i = 0; i < nn_comp; i++) {
+               sum += input[k][i];
+           }
+           if (minSum > sum) {
+               minSum = sum;
+               minColR = k;
+           }
+           if (maxSum < sum) {
+               maxSum = sum;
+               maxColR = k;
+           }
+           sumR += sum; /* not reset between passes, so retries add to it */
+       }
+
+       nn_comp--; /* the next pass ignores the last component it summed */
+   }
+
+   /* choose the common vector (yuck!) */
+   {
+      int j1, j2;
+      int v1 = 0, v2 = 0;
+      float err = 1e9; /* big enough */
+      float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
+      for (i = 0; i < n_comp; i++) {
+         tv[0][i] = input[minColL][i];
+         tv[1][i] = input[maxColL][i];
+         tv[2][i] = input[minColR][i];
+         tv[3][i] = input[maxColR][i];
+      }
+      for (j1 = 0; j1 < 2; j1++) { /* find the closest (left, right) pair by squared distance */
+         for (j2 = 2; j2 < 4; j2++) {
+            float e = 0.0F;
+            for (i = 0; i < n_comp; i++) {
+               e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
+            }
+            if (e < err) {
+               err = e;
+               v1 = j1;
+               v2 = j2;
+            }
+         }
+      }
+      for (i = 0; i < n_comp; i++) {
+         vec[0][i] = tv[1 - v1][i]; /* the left extremum that is not in the closest pair */
+         vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR); /* shared vector: closest pair blended, weighted by sumL and sumR */
+         vec[2][i] = tv[5 - v2][i]; /* the right extremum that is not in the closest pair */
+      }
+   }
+
+   /* left microtile */
+   cc[0] = 0; /* kept as-is when min and max picked the same texel */
+   if (minColL != maxColL) {
+      /* compute interpolation vector */
+      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
+
+      /* add in texels */
+      lolo = 0;
+      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
+         int texel;
+         /* interpolate color */
+         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+         /* add in texel */
+         lolo <<= 2;
+         lolo |= texel;
+      }
+      
+      cc[0] = lolo;
+   }
+
+   /* right microtile */
+   cc[1] = 0; /* kept as-is when min and max picked the same texel */
+   if (minColR != maxColR) {
+      /* compute interpolation vector */
+      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);
+
+      /* add in texels */
+      lohi = 0;
+      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
+         int texel;
+         /* interpolate color */
+         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+         /* add in texel */
+         lohi <<= 2;
+         lohi |= texel;
+      }
+
+      cc[1] = lohi;
+   }
+
+   Q_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
+   for (j = n_vect - 1; j >= 0; j--) {
+      /* add in alphas */
+      Q_SHL(hi, 5);
+      Q_OR32(hi, (dword)(vec[j][ACOMP] / 8.0F)); /* value / 8 goes into the 5 bits just shifted in */
+   }
+   for (j = n_vect - 1; j >= 0; j--) {
+      for (i = 0; i < n_comp - 1; i++) {
+         /* add in colors */
+         Q_SHL(hi, 5);
+         Q_OR32(hi, (dword)(vec[j][i] / 8.0F)); /* value / 8 goes into the 5 bits just shifted in */
+      }
+   }
+   ((qword *)cc)[1] = hi; /* write the high quadword (mode bits, alphas, colors) */
 }
 
 
 static void
 fxt1_quantize_HI (dword *cc,
-		  byte input[N_TEXELS][MAX_COMP],
-		  byte reord[N_TEXELS][MAX_COMP], int n)
+                  byte input[N_TEXELS][MAX_COMP],
+                  byte reord[N_TEXELS][MAX_COMP], int n) /* only reord[0..n-1] are scanned for endpoints */
 {
-    const int n_vect = 6; /* highest vector number */
-    const int n_comp = 3; /* 3 components: R, G, B */
-    float b = 0.0F;       /* phoudoin: silent compiler! */
-    float iv[MAX_COMP];   /* interpolation vector */
-    int i, k;
-    dword hihi; /* high quadword: hi dword */
-
-#ifndef YUV
-    int minSum = 2000; /* big enough */
-#else
-    int minSum = 2000000;
-#endif
-    int maxSum = -1; /* small enough */
-    int minCol = 0; /* phoudoin: silent compiler! */
-    int maxCol = 0; /* phoudoin: silent compiler! */
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 8x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-    for (k = 0; k < n; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += reord[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minCol = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxCol = k;
-	}
-    }
-
-    hihi = 0; /* cc-hi = "00" */
-    for (i = 0; i < n_comp; i++) {
-	/* add in colors */
-	hihi <<= 5;
-	hihi |= reord[maxCol][i] >> 3;
-    }
-    for (i = 0; i < n_comp; i++) {
-	/* add in colors */
-	hihi <<= 5;
-	hihi |= reord[minCol][i] >> 3;
-    }
-    cc[3] = hihi;
-    cc[0] = cc[1] = cc[2] = 0;
-
-    /* compute interpolation vector */
-    if (minCol != maxCol) {
-	MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
-    }
-
-    /* add in texels */
-    for (k = N_TEXELS - 1; k >= 0; k--) {
-	int t = k * 3;
-	dword *kk = (dword *)((byte *)cc + t / 8);
-	int texel = n_vect + 1; /* transparent black */
-
-	if (!ISTBLACK(input[k])) {
-	    if (minCol != maxCol) {
-		/* interpolate color */
-		CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-		/* add in texel */
-		kk[0] |= texel << (t & 7);
-	    }
-	} else {
-	    /* add in texel */
-	    kk[0] |= texel << (t & 7);
-	}
-    }
+   const int n_vect = 6; /* highest vector number */
+   const int n_comp = 3; /* 3 components: R, G, B */
+   float b = 0.0F;       /* phoudoin: silent compiler! */
+   float iv[MAX_COMP];   /* interpolation vector */
+   int i, k;
+   dword hihi; /* high quadword: hi dword */
+
+   int minSum = 2000; /* big enough */
+   int maxSum = -1; /* small enough */
+   int minCol = 0; /* phoudoin: silent compiler! */
+   int maxCol = 0; /* phoudoin: silent compiler! */
+
+   /* HI quantizer: find the darkest and brightest colors (by R+G+B sum)
+    * among the first n texels of reord and use those as the two
+    * representative colors of the 8x4 tile.
+    * There are probably better algorithms to use (histogram-based). */
+   for (k = 0; k < n; k++) {
+      int sum = 0;
+      for (i = 0; i < n_comp; i++) {
+         sum += reord[k][i];
+      }
+      if (minSum > sum) {
+         minSum = sum;
+         minCol = k;
+      }
+      if (maxSum < sum) {
+         maxSum = sum;
+         maxCol = k;
+      }
+   }
+
+   hihi = 0; /* cc-hi = "00" */
+   for (i = 0; i < n_comp; i++) {
+      /* add in colors */
+      hihi <<= 5;
+      hihi |= reord[maxCol][i] >> 3; /* keep the top 5 bits of the component */
+   }
+   for (i = 0; i < n_comp; i++) {
+      /* add in colors */
+      hihi <<= 5;
+      hihi |= reord[minCol][i] >> 3; /* keep the top 5 bits of the component */
+   }
+   cc[3] = hihi;
+   cc[0] = cc[1] = cc[2] = 0; /* cleared first: texel indices are OR-ed in below */
+
+   /* compute interpolation vector */
+   if (minCol != maxCol) {
+      MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
+   }
+
+   /* add in texels */
+   for (k = N_TEXELS - 1; k >= 0; k--) {
+      int t = k * 3; /* bit offset of this texel's index: 3 bits per texel */
+      dword *kk = (dword *)((char *)cc + t / 8); /* byte-granular address, so not necessarily dword-aligned */
+      int texel = n_vect + 1; /* transparent black */
+
+      if (!ISTBLACK(input[k])) {
+         if (minCol != maxCol) { /* otherwise the index bits stay 0 for this texel */
+            /* interpolate color */
+            CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+            /* add in texel */
+            kk[0] |= texel << (t & 7);
+         }
+      } else {
+         /* add in texel */
+         kk[0] |= texel << (t & 7);
+      }
+   }
 }
 
 
 static void
 fxt1_quantize_MIXED1 (dword *cc,
-		      byte input[N_TEXELS][MAX_COMP])
+                      byte input[N_TEXELS][MAX_COMP]) /* texels: first half = left microtile, second half = right */
 {
-    const int n_vect = 2; /* highest vector number in each microtile */
-    const int n_comp = 3; /* 3 components: R, G, B */
-    byte vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
-    float b, iv[MAX_COMP]; /* interpolation vector */
-    int i, j, k;
-    qword hi; /* high quadword */
-    dword lohi, lolo; /* low quadword: hi dword, lo dword */
-
-    int minSum;
-    int maxSum;
-    int minColL = 0, maxColL = -1;
-    int minColR = 0, maxColR = -1;
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-#ifndef YUV
-    minSum = 2000; /* big enough */
-#else
-    minSum = 2000000;
-#endif
-    maxSum = -1; /* small enough */
-    for (k = 0; k < N_TEXELS / 2; k++) {
-	if (!ISTBLACK(input[k])) {
-	    int sum = 0;
-#ifndef YUV
-	    for (i = 0; i < n_comp; i++) {
-		sum += input[k][i];
-	    }
-#else
-            sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	    if (minSum > sum) {
-		minSum = sum;
-		minColL = k;
-	    }
-	    if (maxSum < sum) {
-		maxSum = sum;
-		maxColL = k;
-	    }
-	}
-    }
-#ifndef YUV
-    minSum = 2000; /* big enough */
-#else
-    minSum = 2000000;
-#endif
-    maxSum = -1; /* small enough */
-    for (; k < N_TEXELS; k++) {
-	if (!ISTBLACK(input[k])) {
-	    int sum = 0;
-#ifndef YUV
-	    for (i = 0; i < n_comp; i++) {
-		sum += input[k][i];
-	    }
-#else
-            sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	    if (minSum > sum) {
-		minSum = sum;
-		minColR = k;
-	    }
-	    if (maxSum < sum) {
-		maxSum = sum;
-		maxColR = k;
-	    }
-	}
-    }
-
-    /* left microtile */
-    if (maxColL == -1) {
-	/* all transparent black */
-	cc[0] = 0xFFFFFFFF;
-	for (i = 0; i < n_comp; i++) {
-	    vec[0][i] = 0;
-	    vec[1][i] = 0;
-	}
-    } else {
-	cc[0] = 0;
-	for (i = 0; i < n_comp; i++) {
-	    vec[0][i] = input[minColL][i];
-	    vec[1][i] = input[maxColL][i];
-	}
-	if (minColL != maxColL) {
-	    /* compute interpolation vector */
-	    MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
-
-	    /* add in texels */
-	    lolo = 0;
-	    for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
-		int texel = n_vect + 1;	/* transparent black */
-		if (!ISTBLACK(input[k])) {
-		    /* interpolate color */
-		    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-		}
-		/* add in texel */
-		lolo <<= 2;
-		lolo |= texel;
-	    }
-	    cc[0] = lolo;
-	}
-    }
-
-    /* right microtile */
-    if (maxColR == -1) {
-	/* all transparent black */
-	cc[1] = 0xFFFFFFFF;
-	for (i = 0; i < n_comp; i++) {
-	    vec[2][i] = 0;
-	    vec[3][i] = 0;
-	}
-    } else {
-	cc[1] = 0;
-	for (i = 0; i < n_comp; i++) {
-	    vec[2][i] = input[minColR][i];
-	    vec[3][i] = input[maxColR][i];
-	}
-	if (minColR != maxColR) {
-	    /* compute interpolation vector */
-	    MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
-
-	    /* add in texels */
-	    lohi = 0;
-	    for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
-		int texel = n_vect + 1;	/* transparent black */
-		if (!ISTBLACK(input[k])) {
-		    /* interpolate color */
-		    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-		}
-		/* add in texel */
-		lohi <<= 2;
-		lohi |= texel;
-	    }
-	    cc[1] = lohi;
-	}
-    }
-
-    Q_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
-    for (j = 2 * 2 - 1; j >= 0; j--) {
-	for (i = 0; i < n_comp; i++) {
-	    /* add in colors */
-	    Q_SHL(hi, 5);
-	    Q_OR32(hi, vec[j][i] >> 3);
-	}
-    }
-    ((qword *)cc)[1] = hi;
+   const int n_vect = 2; /* highest vector number in each microtile */
+   const int n_comp = 3; /* 3 components: R, G, B */
+   byte vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
+   float b, iv[MAX_COMP]; /* interpolation vector */
+   int i, j, k;
+   qword hi; /* high quadword */
+   dword lohi, lolo; /* low quadword: hi dword, lo dword */
+
+   int minSum;
+   int maxSum;
+   int minColL = 0, maxColL = -1; /* maxColL stays -1 if every left texel is transparent black */
+   int minColR = 0, maxColR = -1; /* maxColR stays -1 if every right texel is transparent black */
+
+   /* MIXED1 quantizer: find the darkest and brightest colors (by R+G+B sum)
+    * in each 4x4 microtile, skipping transparent black texels, and use
+    * those as the two representative colors of that microtile.
+    * There are probably better algorithms to use (histogram-based). */
+   minSum = 2000; /* big enough */
+   maxSum = -1; /* small enough */
+   for (k = 0; k < N_TEXELS / 2; k++) {
+      if (!ISTBLACK(input[k])) {
+         int sum = 0;
+         for (i = 0; i < n_comp; i++) {
+            sum += input[k][i];
+         }
+         if (minSum > sum) {
+            minSum = sum;
+            minColL = k;
+         }
+         if (maxSum < sum) {
+            maxSum = sum;
+            maxColL = k;
+         }
+      }
+   }
+   minSum = 2000; /* big enough */
+   maxSum = -1; /* small enough */
+   for (; k < N_TEXELS; k++) {
+      if (!ISTBLACK(input[k])) {
+         int sum = 0;
+         for (i = 0; i < n_comp; i++) {
+            sum += input[k][i];
+         }
+         if (minSum > sum) {
+            minSum = sum;
+            minColR = k;
+         }
+         if (maxSum < sum) {
+            maxSum = sum;
+            maxColR = k;
+         }
+      }
+   }
+
+   /* left microtile */
+   if (maxColL == -1) {
+      /* all transparent black */
+      cc[0] = ~0u; /* all index bits set */
+      for (i = 0; i < n_comp; i++) {
+         vec[0][i] = 0;
+         vec[1][i] = 0;
+      }
+   } else {
+      cc[0] = 0; /* kept as-is when min and max picked the same texel */
+      for (i = 0; i < n_comp; i++) {
+         vec[0][i] = input[minColL][i];
+         vec[1][i] = input[maxColL][i];
+      }
+      if (minColL != maxColL) {
+         /* compute interpolation vector */
+         MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
+
+         /* add in texels */
+         lolo = 0;
+         for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
+            int texel = n_vect + 1; /* transparent black */
+            if (!ISTBLACK(input[k])) {
+               /* interpolate color */
+               CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+            }
+            /* add in texel */
+            lolo <<= 2;
+            lolo |= texel;
+         }
+         cc[0] = lolo;
+      }
+   }
+
+   /* right microtile */
+   if (maxColR == -1) {
+      /* all transparent black */
+      cc[1] = ~0u; /* all index bits set */
+      for (i = 0; i < n_comp; i++) {
+         vec[2][i] = 0;
+         vec[3][i] = 0;
+      }
+   } else {
+      cc[1] = 0; /* kept as-is when min and max picked the same texel */
+      for (i = 0; i < n_comp; i++) {
+         vec[2][i] = input[minColR][i];
+         vec[3][i] = input[maxColR][i];
+      }
+      if (minColR != maxColR) {
+         /* compute interpolation vector */
+         MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
+
+         /* add in texels */
+         lohi = 0;
+         for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
+            int texel = n_vect + 1; /* transparent black */
+            if (!ISTBLACK(input[k])) {
+               /* interpolate color */
+               CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+            }
+            /* add in texel */
+            lohi <<= 2;
+            lohi |= texel;
+         }
+         cc[1] = lohi;
+      }
+   }
+
+   Q_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
+   for (j = 2 * 2 - 1; j >= 0; j--) {
+      for (i = 0; i < n_comp; i++) {
+         /* add in colors */
+         Q_SHL(hi, 5);
+         Q_OR32(hi, vec[j][i] >> 3); /* top 5 bits of the component */
+      }
+   }
+   ((qword *)cc)[1] = hi; /* write the high quadword (mode bits and colors) */
 }
 
 
 static void
 fxt1_quantize_MIXED0 (dword *cc,
-		      byte input[N_TEXELS][MAX_COMP])
+                      byte input[N_TEXELS][MAX_COMP]) /* texels: first half = left microtile, second half = right */
 {
-    const int n_vect = 3; /* highest vector number in each microtile */
-    const int n_comp = 3; /* 3 components: R, G, B */
-    byte vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
-    float b, iv[MAX_COMP]; /* interpolation vector */
-    int i, j, k;
-    qword hi; /* high quadword */
-    dword lohi, lolo; /* low quadword: hi dword, lo dword */
-
-    int minColL = 0, maxColL = 0;
-    int minColR = 0, maxColR = 0;
+   const int n_vect = 3; /* highest vector number in each microtile */
+   const int n_comp = 3; /* 3 components: R, G, B */
+   byte vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
+   float b, iv[MAX_COMP]; /* interpolation vector */
+   int i, j, k;
+   qword hi; /* high quadword */
+   dword lohi, lolo; /* low quadword: hi dword, lo dword */
+
+   int minColL = 0, maxColL = 0; /* indices of the chosen endpoint texels, left half */
+   int minColR = 0, maxColR = 0; /* indices of the chosen endpoint texels, right half */
 #if 0
-    int minSum;
-    int maxSum;
-
-    /* Our solution here is to find the darkest and brightest colors in
-     * the 4x4 tile and use those as the two representative colors.
-     * There are probably better algorithms to use (histogram-based).
-     */
-#ifndef YUV
-    minSum = 2000; /* big enough */
-#else
-    minSum = 2000000;
-#endif
-    maxSum = -1; /* small enough */
-    for (k = 0; k < N_TEXELS / 2; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
-#else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minColL = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxColL = k;
-	}
-    }
-    minSum = 2000; /* big enough */
-    maxSum = -1; /* small enough */
-    for (; k < N_TEXELS; k++) {
-	int sum = 0;
-#ifndef YUV
-	for (i = 0; i < n_comp; i++) {
-	    sum += input[k][i];
-	}
+   int minSum; /* this #if 0 branch (component-sum search) is compiled out */
+   int maxSum;
+
+   /* Our solution here is to find the darkest and brightest colors in
+    * the 4x4 tile and use those as the two representative colors.
+    * There are probably better algorithms to use (histogram-based).
+    */
+   minSum = 2000; /* big enough */
+   maxSum = -1; /* small enough */
+   for (k = 0; k < N_TEXELS / 2; k++) {
+      int sum = 0;
+      for (i = 0; i < n_comp; i++) {
+         sum += input[k][i];
+      }
+      if (minSum > sum) {
+         minSum = sum;
+         minColL = k;
+      }
+      if (maxSum < sum) {
+         maxSum = sum;
+         maxColL = k;
+      }
+   }
+   minSum = 2000; /* big enough */
+   maxSum = -1; /* small enough */
+   for (; k < N_TEXELS; k++) {
+      int sum = 0;
+      for (i = 0; i < n_comp; i++) {
+         sum += input[k][i];
+      }
+      if (minSum > sum) {
+         minSum = sum;
+         minColR = k;
+      }
+      if (maxSum < sum) {
+         maxSum = sum;
+         maxColR = k;
+      }
+   }
 #else
-        sum = 299 * input[k][RCOMP] + 587 * input[k][GCOMP] +  114 * input[k][BCOMP];
-#endif
-	if (minSum > sum) {
-	    minSum = sum;
-	    minColR = k;
-	}
-	if (maxSum < sum) {
-	    maxSum = sum;
-	    maxColR = k;
-	}
-    }
-#else
-    int minVal;
-    int maxVal;
-    int maxVarL = fxt1_variance(NULL, input, n_comp, N_TEXELS / 2);
-    int maxVarR = fxt1_variance(NULL, &input[N_TEXELS / 2], n_comp, N_TEXELS / 2);
-
-    /* Scan the channel with max variance for lo & hi
-     * and use those as the two representative colors.
-     */
-    minVal = 2000; /* big enough */
-    maxVal = -1; /* small enough */
-    for (k = 0; k < N_TEXELS / 2; k++) {
-	int t = input[k][maxVarL];
-	if (minVal > t) {
-	    minVal = t;
-	    minColL = k;
-	}
-	if (maxVal < t) {
-	    maxVal = t;
-	    maxColL = k;
-	}
-    }
-    minVal = 2000; /* big enough */
-    maxVal = -1; /* small enough */
-    for (; k < N_TEXELS; k++) {
-	int t = input[k][maxVarR];
-	if (minVal > t) {
-	    minVal = t;
-	    minColR = k;
-	}
-	if (maxVal < t) {
-	    maxVal = t;
-	    maxColR = k;
-	}
-    }
+   int minVal;
+   int maxVal;
+   int maxVarL = fxt1_variance(NULL, input, n_comp, N_TEXELS / 2); /* used below as a component index for the left half */
+   int maxVarR = fxt1_variance(NULL, &input[N_TEXELS / 2], n_comp, N_TEXELS / 2); /* same, computed over the right half */
+
+   /* MIXED0 quantizer: in each microtile, scan the channel with max
+    * variance (as returned by fxt1_variance) for lo & hi and use those
+    * two texels as the representative colors. */
+   minVal = 2000; /* big enough */
+   maxVal = -1; /* small enough */
+   for (k = 0; k < N_TEXELS / 2; k++) {
+      int t = input[k][maxVarL];
+      if (minVal > t) {
+         minVal = t;
+         minColL = k;
+      }
+      if (maxVal < t) {
+         maxVal = t;
+         maxColL = k;
+      }
+   }
+   minVal = 2000; /* big enough */
+   maxVal = -1; /* small enough */
+   for (; k < N_TEXELS; k++) {
+      int t = input[k][maxVarR];
+      if (minVal > t) {
+         minVal = t;
+         minColR = k;
+      }
+      if (maxVal < t) {
+         maxVal = t;
+         maxColR = k;
+      }
+   }
 #endif
 
-    /* left microtile */
-    cc[0] = 0;
-    for (i = 0; i < n_comp; i++) {
-	vec[0][i] = input[minColL][i];
-	vec[1][i] = input[maxColL][i];
-    }
-    if (minColL != maxColL) {
-	/* compute interpolation vector */
-	MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
-
-	/* add in texels */
-	lolo = 0;
-	for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
-	    int texel;
-	    /* interpolate color */
-	    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-	    /* add in texel */
-	    lolo <<= 2;
-	    lolo |= texel;
-	}
-
-	/* funky encoding for LSB of green */
-	if ((int)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
-	    for (i = 0; i < n_comp; i++) {
-		vec[1][i] = input[minColL][i];
-		vec[0][i] = input[maxColL][i];
-	    }
-	    lolo = ~lolo;
-	}
-
-	cc[0] = lolo;
-    }
-
-    /* right microtile */
-    cc[1] = 0;
-    for (i = 0; i < n_comp; i++) {
-	vec[2][i] = input[minColR][i];
-	vec[3][i] = input[maxColR][i];
-    }
-    if (minColR != maxColR) {
-	/* compute interpolation vector */
-	MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
-
-	/* add in texels */
-	lohi = 0;
-	for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
-	    int texel;
-	    /* interpolate color */
-	    CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
-	    /* add in texel */
-	    lohi <<= 2;
-	    lohi |= texel;
-	}
-
-	/* funky encoding for LSB of green */
-	if ((int)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
-	    for (i = 0; i < n_comp; i++) {
-		vec[3][i] = input[minColR][i];
-		vec[2][i] = input[maxColR][i];
-	    }
-	    lohi = ~lohi;
-	}
-
-	cc[1] = lohi;
-    }
-
-    Q_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
-    for (j = 2 * 2 - 1; j >= 0; j--) {
-	for (i = 0; i < n_comp; i++) {
-	    /* add in colors */
-	    Q_SHL(hi, 5);
-	    Q_OR32(hi, vec[j][i] >> 3);
-	}
-    }
-    ((qword *)cc)[1] = hi;
+   /* left microtile */
+   cc[0] = 0; /* kept as-is when min and max picked the same texel */
+   for (i = 0; i < n_comp; i++) {
+      vec[0][i] = input[minColL][i];
+      vec[1][i] = input[maxColL][i];
+   }
+   if (minColL != maxColL) {
+      /* compute interpolation vector */
+      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
+
+      /* add in texels */
+      lolo = 0;
+      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
+         int texel;
+         /* interpolate color */
+         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+         /* add in texel */
+         lolo <<= 2;
+         lolo |= texel;
+      }
+
+      /* funky encoding for LSB of green: if bit 1 of lolo disagrees, swap the endpoints and invert the indices */
+      if ((int)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
+         for (i = 0; i < n_comp; i++) {
+            vec[1][i] = input[minColL][i];
+            vec[0][i] = input[maxColL][i];
+         }
+         lolo = ~lolo;
+      }
+      
+      cc[0] = lolo;
+   }
+
+   /* right microtile */
+   cc[1] = 0; /* kept as-is when min and max picked the same texel */
+   for (i = 0; i < n_comp; i++) {
+      vec[2][i] = input[minColR][i];
+      vec[3][i] = input[maxColR][i];
+   }
+   if (minColR != maxColR) {
+      /* compute interpolation vector */
+      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
+
+      /* add in texels */
+      lohi = 0;
+      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
+         int texel;
+         /* interpolate color */
+         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
+         /* add in texel */
+         lohi <<= 2;
+         lohi |= texel;
+      }
+
+      /* funky encoding for LSB of green: if bit 1 of lohi disagrees, swap the endpoints and invert the indices */
+      if ((int)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
+         for (i = 0; i < n_comp; i++) {
+            vec[3][i] = input[minColR][i];
+            vec[2][i] = input[maxColR][i];
+         }
+         lohi = ~lohi;
+      }
+
+      cc[1] = lohi;
+   }
+
+   Q_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
+   for (j = 2 * 2 - 1; j >= 0; j--) {
+      for (i = 0; i < n_comp; i++) {
+         /* add in colors */
+         Q_SHL(hi, 5);
+         Q_OR32(hi, vec[j][i] >> 3); /* top 5 bits of the component */
+      }
+   }
+   ((qword *)cc)[1] = hi; /* write the high quadword (mode bits and colors) */
 }
 
 
 static void
 fxt1_quantize (dword *cc, const byte *lines[], int comps)
 {
-    int trualpha;
-    byte reord[N_TEXELS][MAX_COMP];
+   int trualpha; /* set when a kept texel has alpha below 255 - ALPHA_TS */
+   byte reord[N_TEXELS][MAX_COMP]; /* texels that are not transparent black, packed to the front */
+
+   byte input[N_TEXELS][MAX_COMP]; /* all texels, rearranged into block layout (see below) */
+   int i, k, l;
+
+   if (comps == 3) {
+      /* make the whole block opaque */
+      memset(input, -1, sizeof(input)); /* all bits set; the fill loop below only writes comps components */
+   }
+
+   /* 4 lines of 8 texels each: columns 0-3 fill the first half of input, columns 4-7 the second */
+   for (l = 0; l < 4; l++) {
+      for (k = 0; k < 4; k++) {
+         for (i = 0; i < comps; i++) {
+            input[k + l * 4][i] = *lines[l]++;
+         }
+      }
+      for (; k < 8; k++) {
+         for (i = 0; i < comps; i++) {
+            input[k + l * 4 + 12][i] = *lines[l]++; /* k >= 4 here, so the index is 16 + l * 4 + (k - 4) */
+         }
+      }
+   }
+
+   /* block layout: input[] index (hex) of each texel, as filled above
+    * 00, 01, 02, 03, 10, 11, 12, 13
+    * 04, 05, 06, 07, 14, 15, 16, 17
+    * 08, 09, 0a, 0b, 18, 19, 1a, 1b
+    * 0c, 0d, 0e, 0f, 1c, 1d, 1e, 1f
+    */
+
+   /* [dBorca] stupidity flows forth from this:
+    * count the texels that are not transparent black (l), note whether any
+    * of them is translucent (trualpha), then pick a quantizer from that. */
+   l = N_TEXELS;
+   trualpha = 0;
+   if (comps == 4) {
+      /* skip all transparent black texels */
+      l = 0;
+      for (k = 0; k < N_TEXELS; k++) {
+         /* test all components against 0 */
+         if (!ISTBLACK(input[k])) {
+            /* texel is not transparent black */
+            COPY_4UBV(reord[l], input[k]);
+            if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
+               /* non-opaque texel */
+               trualpha = !0;
+            }
+            l++;
+         }
+      }
+   }
 
-    byte input[N_TEXELS][MAX_COMP];
-#ifndef ARGB
-    int i;
-#endif
-    int k, l;
-
-    if (comps == 3) {
-	/* make the whole block opaque */
-	memset(input, -1, sizeof(input));
-    }
-
-    /* 8 texels each line */
-#ifndef ARGB
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-	    for (i = 0; i < comps; i++) {
-		input[k + l * 4][i] = *lines[l]++;
-	    }
-	}
-	for (; k < 8; k++) {
-	    for (i = 0; i < comps; i++) {
-		input[k + l * 4 + 12][i] = *lines[l]++;
-	    }
-	}
-    }
+#if 0
+   if (trualpha) {
+      fxt1_quantize_ALPHA0(cc, input, reord, l);
+   } else if (l == 0) {
+      cc[0] = cc[1] = cc[2] = -1;
+      cc[3] = 0;
+   } else if (l < N_TEXELS) {
+      fxt1_quantize_HI(cc, input, reord, l);
+   } else {
+      fxt1_quantize_CHROMA(cc, input);
+   }
+   (void)fxt1_quantize_ALPHA1;
+   (void)fxt1_quantize_MIXED1;
+   (void)fxt1_quantize_MIXED0;
 #else
-    /* H.Morii - support for ARGB inputs */
-    for (l = 0; l < 4; l++) {
-	for (k = 0; k < 4; k++) {
-          input[k + l * 4][2] = *lines[l]++;
-          input[k + l * 4][1] = *lines[l]++;
-          input[k + l * 4][0] = *lines[l]++;
-          if (comps == 4) input[k + l * 4][3] = *lines[l]++;
-	}
-	for (; k < 8; k++) {
-          input[k + l * 4 + 12][2] = *lines[l]++;
-          input[k + l * 4 + 12][1] = *lines[l]++;
-          input[k + l * 4 + 12][0] = *lines[l]++;
-          if (comps == 4) input[k + l * 4 + 12][3] = *lines[l]++;
-	}
-    }
+   if (trualpha) { /* at least one kept texel is translucent */
+      fxt1_quantize_ALPHA1(cc, input);
+   } else if (l == 0) { /* every texel is transparent black */
+      cc[0] = cc[1] = cc[2] = ~0u;
+      cc[3] = 0;
+   } else if (l < N_TEXELS) { /* some, but not all, texels are transparent black */
+      fxt1_quantize_MIXED1(cc, input);
+   } else { /* no transparent black texels */
+      fxt1_quantize_MIXED0(cc, input);
+   }
+   (void)fxt1_quantize_ALPHA0; /* unused in this branch; the (void) reference presumably silences unused-function warnings */
+   (void)fxt1_quantize_HI;
+   (void)fxt1_quantize_CHROMA;
 #endif
+}
 
-    /* block layout:
-     * 00, 01, 02, 03, 08, 09, 0a, 0b
-     * 10, 11, 12, 13, 18, 19, 1a, 1b
-     * 04, 05, 06, 07, 0c, 0d, 0e, 0f
-     * 14, 15, 16, 17, 1c, 1d, 1e, 1f
-     */
-
-    /* [dBorca]
-     * stupidity flows forth from this
-     */
-    l = N_TEXELS;
-    trualpha = 0;
-    if (comps == 4) {
-	/* skip all transparent black texels */
-	l = 0;
-	for (k = 0; k < N_TEXELS; k++) {
-	    /* test all components against 0 */
-	    if (!ISTBLACK(input[k])) {
-		/* texel is not transparent black */
-		COPY_4UBV(reord[l], input[k]);
-		if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
-		    /* non-opaque texel */
-		    trualpha = !0;
-		}
-		l++;
-	    }
-	}
-    }
 
+
+/**
+ * Upscale an image by replication (tiling), not (typical) stretching.
+ * fxt1_encode uses this to pad an image whose width is not a multiple
+ * of 8 or whose height is not a multiple of 4.
+ */
+static void
+upscale_teximage2d(int inWidth, int inHeight,
+                   int outWidth, int outHeight,
+                   int comps, const byte *src, int srcRowStride,
+                   byte *dest )
+{
+   int i, j, k;
+
+   assert(outWidth >= inWidth);
+   assert(outHeight >= inHeight);
 #if 0
-    if (trualpha) {
-	fxt1_quantize_ALPHA0(cc, input, reord, l);
-    } else if (l == 0) {
-	cc[0] = cc[1] = cc[2] = -1;
-	cc[3] = 0;
-    } else if (l < N_TEXELS) {
-	fxt1_quantize_HI(cc, input, reord, l);
-    } else {
-	fxt1_quantize_CHROMA(cc, input);
-    }
-    (void)fxt1_quantize_ALPHA1;
-    (void)fxt1_quantize_MIXED1;
-    (void)fxt1_quantize_MIXED0;
-#else
-    if (trualpha) {
-	fxt1_quantize_ALPHA1(cc, input);
-    } else if (l == 0) {
-	cc[0] = cc[1] = cc[2] = 0xFFFFFFFF;
-	cc[3] = 0;
-    } else if (l < N_TEXELS) {
-	fxt1_quantize_MIXED1(cc, input);
-    } else {
-	fxt1_quantize_MIXED0(cc, input);
-    }
-    (void)fxt1_quantize_ALPHA0;
-    (void)fxt1_quantize_HI;
-    (void)fxt1_quantize_CHROMA;
+   ASSERT(inWidth == 1 || inWidth == 2 || inHeight == 1 || inHeight == 2);
+   ASSERT((outWidth & 3) == 0);
+   ASSERT((outHeight & 3) == 0);
 #endif
-}
 
+   for (i = 0; i < outHeight; i++) {
+      const int ii = i % inHeight; /* source row, wrapping around */
+      for (j = 0; j < outWidth; j++) {
+         const int jj = j % inWidth; /* source column, wrapping around */
+         for (k = 0; k < comps; k++) {
+            dest[(i * outWidth + j) * comps + k] /* dest rows are tightly packed */
+               = src[ii * srcRowStride + jj * comps + k]; /* src rows are srcRowStride bytes apart */
+         }
+      }
+   }
+}
 
-TAPI int TAPIENTRY
-fxt1_encode (int width, int height, int comps,
-	     const void *source, int srcRowStride,
-	     void *dest, int destRowStride)
+TAPI void TAPIENTRY
+fxt1_encode (dword width, dword height, int comps,
+             const void *source, int srcRowStride,
+             void *dest, int destRowStride)
 {
-    int x, y;
-    const byte *data;
-    dword *encoded = (dword *)dest;
-    void *newSource = NULL;
-
-    /* Replicate image if width is not M8 or height is not M4 */
-    if ((width & 7) | (height & 3)) {
-	int newWidth = (width + 7) & ~7;
-	int newHeight = (height + 3) & ~3;
-	newSource = malloc(comps * newWidth * newHeight * sizeof(byte *));
-	_mesa_upscale_teximage2d(width, height, newWidth, newHeight,
-				 comps, (const byte *)source,
-				 srcRowStride, (byte *)newSource);
-	source = newSource;
-	width = newWidth;
-	height = newHeight;
-	srcRowStride = comps * newWidth;
-    }
-
-    data = (const byte *)source;
-    destRowStride = (destRowStride - width * 2) / 4;
-    for (y = 0; y < height; y += 4) {
-	unsigned int offs = 0 + (y + 0) * srcRowStride;
-	for (x = 0; x < width; x += 8) {
-	    const byte *lines[4];
-	    lines[0] = &data[offs];
-	    lines[1] = lines[0] + srcRowStride;
-	    lines[2] = lines[1] + srcRowStride;
-	    lines[3] = lines[2] + srcRowStride;
-	    offs += 8 * comps;
-	    fxt1_quantize(encoded, lines, comps);
-	    /* 128 bits per 8x4 block */
-	    encoded += 4;
-	}
-	encoded += destRowStride;
-    }
-
-    if (newSource != NULL) {
-	free(newSource);
-    }
-
-    return 0;
+   dword x, y;
+   const byte *data;
+   dword *encoded = (dword *)dest;
+   void *newSource = NULL, *newSourcetmp = NULL;
+   /* Encode a comps-bytes-per-texel image into dest as 8x4 blocks of 4 dwords each. */
+   assert(comps == 3 || comps == 4);
+   /* Work on a private copy with bytes 0 and 2 of every texel swapped. */
+   if (comps == 3)
+       newSource = reorder_source_3_alloc(source, width, height, srcRowStride);
+   if (comps == 4)
+       newSource = reorder_source_4_alloc(source, width, height, srcRowStride);
+   if (!newSource)
+       goto cleanUp;
+   source = newSource;
+
+   /* Replicate image if width is not a multiple of 8 or height is not a multiple of 4 */
+   if ((width & 7) | (height & 3)) {
+      int newWidth = (width + 7) & ~7;
+      int newHeight = (height + 3) & ~3;
+      newSourcetmp = malloc(comps * newWidth * newHeight * sizeof(byte));
+      if (!newSourcetmp) {
+         goto cleanUp; /* newSource still owns the reordered copy; cleanUp frees it */
+      }
+      upscale_teximage2d(width, height, newWidth, newHeight,
+                         comps, (const byte *) source, /* == newSource: must still be allocated */
+                         srcRowStride, (byte *) newSourcetmp);
+      free(newSource); /* release the reordered copy only after it has been read */
+      newSource = newSourcetmp;
+      source = newSource;
+      width = newWidth;
+      height = newHeight;
+      srcRowStride = comps * newWidth;
+   }
+
+   data = (const byte *) source;
+   destRowStride = (destRowStride - width * 2) / 4; /* per-row padding in dwords: a row of blocks is width * 2 bytes */
+   for (y = 0; y < height; y += 4) {
+      dword offs = 0 + (y + 0) * srcRowStride;
+      for (x = 0; x < width; x += 8) {
+         const byte *lines[4];
+         lines[0] = &data[offs];
+         lines[1] = lines[0] + srcRowStride;
+         lines[2] = lines[1] + srcRowStride;
+         lines[3] = lines[2] + srcRowStride;
+         offs += 8 * comps;
+         fxt1_quantize(encoded, lines, comps);
+         /* 128 bits per 8x4 block */
+         encoded += 4;
+      }
+      encoded += destRowStride;
+   }
+
+ cleanUp:
+   free(newSource);
 }
 
 
@@ -1183,22 +1156,22 @@ fxt1_encode (int width, int height, int comps,
 
 /* lookup table for scaling 5 bit colors up to 8 bits */
 static const byte _rgb_scale_5[] = {
-    0,   8,   16,  25,  33,  41,  49,  58,
-    66,  74,  82,  90,  99,  107, 115, 123,
-    132, 140, 148, 156, 165, 173, 181, 189,
-    197, 206, 214, 222, 230, 239, 247, 255
+   0,   8,   16,  25,  33,  41,  49,  58,
+   66,  74,  82,  90,  99,  107, 115, 123,
+   132, 140, 148, 156, 165, 173, 181, 189,
+   197, 206, 214, 222, 230, 239, 247, 255
 };
 
 /* lookup table for scaling 6 bit colors up to 8 bits */
 static const byte _rgb_scale_6[] = {
-    0,   4,   8,   12,  16,  20,  24,  28,
-    32,  36,  40,  45,  49,  53,  57,  61,
-    65,  69,  73,  77,  81,  85,  89,  93,
-    97,  101, 105, 109, 113, 117, 121, 125,
-    130, 134, 138, 142, 146, 150, 154, 158,
-    162, 166, 170, 174, 178, 182, 186, 190,
-    194, 198, 202, 206, 210, 215, 219, 223,
-    227, 231, 235, 239, 243, 247, 251, 255
+   0,   4,   8,   12,  16,  20,  24,  28,
+   32,  36,  40,  45,  49,  53,  57,  61,
+   65,  69,  73,  77,  81,  85,  89,  93,
+   97,  101, 105, 109, 113, 117, 121, 125,
+   130, 134, 138, 142, 146, 150, 154, 158,
+   162, 166, 170, 174, 178, 182, 186, 190,
+   194, 198, 202, 206, 210, 215, 219, 223,
+   227, 231, 235, 239, 243, 247, 251, 255
 };
 
 
@@ -1206,254 +1179,251 @@ static const byte _rgb_scale_6[] = {
 #define UP5(c) _rgb_scale_5[(c) & 31]
 #define UP6(c, b) _rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)]
 #define LERP(n, t, c0, c1) (((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n)
-#define ZERO_4UBV(v) *((dword *)(v)) = 0
 
 
 static void
 fxt1_decode_1HI (const byte *code, int t, byte *rgba)
 {
-    const dword *cc;
-
-    t *= 3;
-    cc = (const dword *)(code + t / 8);
-    t = (cc[0] >> (t & 7)) & 7;
-
-    if (t == 7) {
-	ZERO_4UBV(rgba);
-    } else {
-	cc = (const dword *)(code + 12);
-	if (t == 0) {
-	    rgba[BCOMP] = UP5(CC_SEL(cc, 0));
-	    rgba[GCOMP] = UP5(CC_SEL(cc, 5));
-	    rgba[RCOMP] = UP5(CC_SEL(cc, 10));
-	} else if (t == 6) {
-	    rgba[BCOMP] = UP5(CC_SEL(cc, 15));
-	    rgba[GCOMP] = UP5(CC_SEL(cc, 20));
-	    rgba[RCOMP] = UP5(CC_SEL(cc, 25));
-	} else {
-	    rgba[BCOMP] = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
-	    rgba[GCOMP] = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
-	    rgba[RCOMP] = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
-	}
-	rgba[ACOMP] = 255;
-    }
+   const dword *cc;
+
+   t *= 3;
+   cc = (const dword *)(code + t / 8);
+   t = (cc[0] >> (t & 7)) & 7;
+
+   if (t == 7) {
+      rgba[RCOMP] = rgba[GCOMP] = rgba[BCOMP] = rgba[ACOMP] = 0;
+   } else {
+      byte r, g, b;
+      cc = (const dword *)(code + 12);
+      if (t == 0) {
+         b = UP5(CC_SEL(cc, 0));
+         g = UP5(CC_SEL(cc, 5));
+         r = UP5(CC_SEL(cc, 10));
+      } else if (t == 6) {
+         b = UP5(CC_SEL(cc, 15));
+         g = UP5(CC_SEL(cc, 20));
+         r = UP5(CC_SEL(cc, 25));
+      } else {
+         b = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
+         g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
+         r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
+      }
+      rgba[RCOMP] = r;
+      rgba[GCOMP] = g;
+      rgba[BCOMP] = b;
+      rgba[ACOMP] = 255;
+   }
 }
 
 
 static void
 fxt1_decode_1CHROMA (const byte *code, int t, byte *rgba)
 {
-    const dword *cc;
-    dword kk;
-
-    cc = (const dword *)code;
-    if (t & 16) {
-	cc++;
-	t &= 15;
-    }
-    t = (cc[0] >> (t * 2)) & 3;
-
-    t *= 15;
-    cc = (const dword *)(code + 8 + t / 8);
-    kk = cc[0] >> (t & 7);
-    rgba[BCOMP] = UP5(kk);
-    rgba[GCOMP] = UP5(kk >> 5);
-    rgba[RCOMP] = UP5(kk >> 10);
-    rgba[ACOMP] = 255;
+   const dword *cc;
+   dword kk;
+
+   cc = (const dword *)code;
+   if (t & 16) {
+      cc++;
+      t &= 15;
+   }
+   t = (cc[0] >> (t * 2)) & 3;
+
+   t *= 15;
+   cc = (const dword *)(code + 8 + t / 8);
+   kk = cc[0] >> (t & 7);
+   rgba[BCOMP] = UP5(kk);
+   rgba[GCOMP] = UP5(kk >> 5);
+   rgba[RCOMP] = UP5(kk >> 10);
+   rgba[ACOMP] = 255;
 }
 
 
 static void
 fxt1_decode_1MIXED (const byte *code, int t, byte *rgba)
 {
-    const dword *cc;
-    int col[2][3];
-    int glsb, selb;
-
-    cc = (const dword *)code;
-    if (t & 16) {
-	t &= 15;
-	t = (cc[1] >> (t * 2)) & 3;
-	/* col 2 */
-	col[0][BCOMP] = (*(const dword *)(code + 11)) >> 6;
-	col[0][GCOMP] = CC_SEL(cc, 99);
-	col[0][RCOMP] = CC_SEL(cc, 104);
-	/* col 3 */
-	col[1][BCOMP] = CC_SEL(cc, 109);
-	col[1][GCOMP] = CC_SEL(cc, 114);
-	col[1][RCOMP] = CC_SEL(cc, 119);
-	glsb = CC_SEL(cc, 126);
-	selb = CC_SEL(cc, 33);
-    } else {
-	t = (cc[0] >> (t * 2)) & 3;
-	/* col 0 */
-	col[0][BCOMP] = CC_SEL(cc, 64);
-	col[0][GCOMP] = CC_SEL(cc, 69);
-	col[0][RCOMP] = CC_SEL(cc, 74);
-	/* col 1 */
-	col[1][BCOMP] = CC_SEL(cc, 79);
-	col[1][GCOMP] = CC_SEL(cc, 84);
-	col[1][RCOMP] = CC_SEL(cc, 89);
-	glsb = CC_SEL(cc, 125);
-	selb = CC_SEL(cc, 1);
-    }
-
-    if (CC_SEL(cc, 124) & 1) {
-	/* alpha[0] == 1 */
-
-	if (t == 3) {
-	    ZERO_4UBV(rgba);
-	} else {
-	    if (t == 0) {
-		rgba[BCOMP] = UP5(col[0][BCOMP]);
-		rgba[GCOMP] = UP5(col[0][GCOMP]);
-		rgba[RCOMP] = UP5(col[0][RCOMP]);
-	    } else if (t == 2) {
-		rgba[BCOMP] = UP5(col[1][BCOMP]);
-		rgba[GCOMP] = UP6(col[1][GCOMP], glsb);
-		rgba[RCOMP] = UP5(col[1][RCOMP]);
-	    } else {
-		rgba[BCOMP] = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
-		rgba[GCOMP] = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
-		rgba[RCOMP] = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
-	    }
-	    rgba[ACOMP] = 255;
-	}
-    } else {
-	/* alpha[0] == 0 */
-
-	if (t == 0) {
-	    rgba[BCOMP] = UP5(col[0][BCOMP]);
-	    rgba[GCOMP] = UP6(col[0][GCOMP], glsb ^ selb);
-	    rgba[RCOMP] = UP5(col[0][RCOMP]);
-	} else if (t == 3) {
-	    rgba[BCOMP] = UP5(col[1][BCOMP]);
-	    rgba[GCOMP] = UP6(col[1][GCOMP], glsb);
-	    rgba[RCOMP] = UP5(col[1][RCOMP]);
-	} else {
-	    rgba[BCOMP] = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
-	    rgba[GCOMP] = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
-				     UP6(col[1][GCOMP], glsb));
-	    rgba[RCOMP] = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
-	}
-	rgba[ACOMP] = 255;
-    }
+   const dword *cc;
+   dword col[2][3];
+   int glsb, selb;
+
+   cc = (const dword *)code;
+   if (t & 16) {
+      t &= 15;
+      t = (cc[1] >> (t * 2)) & 3;
+      /* col 2 */
+      col[0][BCOMP] = (*(const dword *)(code + 11)) >> 6;
+      col[0][GCOMP] = CC_SEL(cc, 99);
+      col[0][RCOMP] = CC_SEL(cc, 104);
+      /* col 3 */
+      col[1][BCOMP] = CC_SEL(cc, 109);
+      col[1][GCOMP] = CC_SEL(cc, 114);
+      col[1][RCOMP] = CC_SEL(cc, 119);
+      glsb = CC_SEL(cc, 126);
+      selb = CC_SEL(cc, 33);
+   } else {
+      t = (cc[0] >> (t * 2)) & 3;
+      /* col 0 */
+      col[0][BCOMP] = CC_SEL(cc, 64);
+      col[0][GCOMP] = CC_SEL(cc, 69);
+      col[0][RCOMP] = CC_SEL(cc, 74);
+      /* col 1 */
+      col[1][BCOMP] = CC_SEL(cc, 79);
+      col[1][GCOMP] = CC_SEL(cc, 84);
+      col[1][RCOMP] = CC_SEL(cc, 89);
+      glsb = CC_SEL(cc, 125);
+      selb = CC_SEL(cc, 1);
+   }
+
+   if (CC_SEL(cc, 124) & 1) {
+      /* alpha[0] == 1 */
+
+      if (t == 3) {
+         /* zero */
+         rgba[RCOMP] = rgba[BCOMP] = rgba[GCOMP] = rgba[ACOMP] = 0;
+      } else {
+         byte r, g, b;
+         if (t == 0) {
+            b = UP5(col[0][BCOMP]);
+            g = UP5(col[0][GCOMP]);
+            r = UP5(col[0][RCOMP]);
+         } else if (t == 2) {
+            b = UP5(col[1][BCOMP]);
+            g = UP6(col[1][GCOMP], glsb);
+            r = UP5(col[1][RCOMP]);
+         } else {
+            b = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
+            g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
+            r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
+         }
+         rgba[RCOMP] = r;
+         rgba[GCOMP] = g;
+         rgba[BCOMP] = b;
+         rgba[ACOMP] = 255;
+      }
+   } else {
+      /* alpha[0] == 0 */
+      byte r, g, b;
+      if (t == 0) {
+         b = UP5(col[0][BCOMP]);
+         g = UP6(col[0][GCOMP], glsb ^ selb);
+         r = UP5(col[0][RCOMP]);
+      } else if (t == 3) {
+         b = UP5(col[1][BCOMP]);
+         g = UP6(col[1][GCOMP], glsb);
+         r = UP5(col[1][RCOMP]);
+      } else {
+         b = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
+         g = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
+                        UP6(col[1][GCOMP], glsb));
+         r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
+      }
+      rgba[RCOMP] = r;
+      rgba[GCOMP] = g;
+      rgba[BCOMP] = b;
+      rgba[ACOMP] = 255;
+   }
 }
 
 
 static void
 fxt1_decode_1ALPHA (const byte *code, int t, byte *rgba)
 {
-    const dword *cc;
-
-    cc = (const dword *)code;
-    if (CC_SEL(cc, 124) & 1) {
-	/* lerp == 1 */
-	int col0[4];
-
-	if (t & 16) {
-	    t &= 15;
-	    t = (cc[1] >> (t * 2)) & 3;
-	    /* col 2 */
-	    col0[BCOMP] = (*(const dword *)(code + 11)) >> 6;
-	    col0[GCOMP] = CC_SEL(cc, 99);
-	    col0[RCOMP] = CC_SEL(cc, 104);
-	    col0[ACOMP] = CC_SEL(cc, 119);
-	} else {
-	    t = (cc[0] >> (t * 2)) & 3;
-	    /* col 0 */
-	    col0[BCOMP] = CC_SEL(cc, 64);
-	    col0[GCOMP] = CC_SEL(cc, 69);
-	    col0[RCOMP] = CC_SEL(cc, 74);
-	    col0[ACOMP] = CC_SEL(cc, 109);
-	}
-
-	if (t == 0) {
-	    rgba[BCOMP] = UP5(col0[BCOMP]);
-	    rgba[GCOMP] = UP5(col0[GCOMP]);
-	    rgba[RCOMP] = UP5(col0[RCOMP]);
-	    rgba[ACOMP] = UP5(col0[ACOMP]);
-	} else if (t == 3) {
-	    rgba[BCOMP] = UP5(CC_SEL(cc, 79));
-	    rgba[GCOMP] = UP5(CC_SEL(cc, 84));
-	    rgba[RCOMP] = UP5(CC_SEL(cc, 89));
-	    rgba[ACOMP] = UP5(CC_SEL(cc, 114));
-	} else {
-	    rgba[BCOMP] = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
-	    rgba[GCOMP] = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
-	    rgba[RCOMP] = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
-	    rgba[ACOMP] = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
-	}
-    } else {
-	/* lerp == 0 */
-
-	if (t & 16) {
-	    cc++;
-	    t &= 15;
-	}
-	t = (cc[0] >> (t * 2)) & 3;
-
-	if (t == 3) {
-	    ZERO_4UBV(rgba);
-	} else {
-	    dword kk;
-	    cc = (const dword *)code;
-	    rgba[ACOMP] = UP5(cc[3] >> (t * 5 + 13));
-	    t *= 15;
-	    cc = (const dword *)(code + 8 + t / 8);
-	    kk = cc[0] >> (t & 7);
-	    rgba[BCOMP] = UP5(kk);
-	    rgba[GCOMP] = UP5(kk >> 5);
-	    rgba[RCOMP] = UP5(kk >> 10);
-	}
-    }
+   const dword *cc;
+   byte r, g, b, a;
+
+   cc = (const dword *)code;
+   if (CC_SEL(cc, 124) & 1) {
+      /* lerp == 1 */
+      dword col0[4];
+
+      if (t & 16) {
+         t &= 15;
+         t = (cc[1] >> (t * 2)) & 3;
+         /* col 2 */
+         col0[BCOMP] = (*(const dword *)(code + 11)) >> 6;
+         col0[GCOMP] = CC_SEL(cc, 99);
+         col0[RCOMP] = CC_SEL(cc, 104);
+         col0[ACOMP] = CC_SEL(cc, 119);
+      } else {
+         t = (cc[0] >> (t * 2)) & 3;
+         /* col 0 */
+         col0[BCOMP] = CC_SEL(cc, 64);
+         col0[GCOMP] = CC_SEL(cc, 69);
+         col0[RCOMP] = CC_SEL(cc, 74);
+         col0[ACOMP] = CC_SEL(cc, 109);
+      }
+
+      if (t == 0) {
+         b = UP5(col0[BCOMP]);
+         g = UP5(col0[GCOMP]);
+         r = UP5(col0[RCOMP]);
+         a = UP5(col0[ACOMP]);
+      } else if (t == 3) {
+         b = UP5(CC_SEL(cc, 79));
+         g = UP5(CC_SEL(cc, 84));
+         r = UP5(CC_SEL(cc, 89));
+         a = UP5(CC_SEL(cc, 114));
+      } else {
+         b = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
+         g = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
+         r = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
+         a = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
+      }
+   } else {
+      /* lerp == 0 */
+
+      if (t & 16) {
+         cc++;
+         t &= 15;
+      }
+      t = (cc[0] >> (t * 2)) & 3;
+
+      if (t == 3) {
+         /* zero */
+         r = g = b = a = 0;
+      } else {
+         dword kk;
+         cc = (const dword *)code;
+         a = UP5(cc[3] >> (t * 5 + 13));
+         t *= 15;
+         cc = (const dword *)(code + 8 + t / 8);
+         kk = cc[0] >> (t & 7);
+         b = UP5(kk);
+         g = UP5(kk >> 5);
+         r = UP5(kk >> 10);
+      }
+   }
+   rgba[RCOMP] = r;
+   rgba[GCOMP] = g;
+   rgba[BCOMP] = b;
+   rgba[ACOMP] = a;
 }
 
 
 TAPI void TAPIENTRY
-fxt1_decode_1 (const void *texture, int stride,
-	       int i, int j, byte *rgba)
+fxt1_decode_1 (const void *texture, int stride, /* in pixels */
+               int i, int j, byte *rgba)
 {
-    static void (*decode_1[]) (const byte *, int, byte *) = {
-	fxt1_decode_1HI,	/* cc-high   = "00?" */
-	fxt1_decode_1HI,	/* cc-high   = "00?" */
-	fxt1_decode_1CHROMA,	/* cc-chroma = "010" */
-	fxt1_decode_1ALPHA,	/* alpha     = "011" */
-	fxt1_decode_1MIXED,	/* mixed     = "1??" */
-	fxt1_decode_1MIXED,	/* mixed     = "1??" */
-	fxt1_decode_1MIXED,	/* mixed     = "1??" */
-	fxt1_decode_1MIXED	/* mixed     = "1??" */
-    };
-
-    const byte *code = (const byte *)texture +
-			((j / 4) * (stride / 8) + (i / 8)) * 16;
-    int mode = CC_SEL(code, 125);
-    int t = i & 7;
-
-    if (t & 4) {
-	t += 12;
-    }
-    t += (j & 3) * 4;
-
-    decode_1[mode](code, t, rgba);
-
-#if VERBOSE
-    {
-	extern int cc_chroma;
-	extern int cc_alpha;
-	extern int cc_high;
-	extern int cc_mixed;
-	static int *cctype[] = {
-	    &cc_high,
-	    &cc_high,
-	    &cc_chroma,
-	    &cc_alpha,
-	    &cc_mixed,
-	    &cc_mixed,
-	    &cc_mixed,
-	    &cc_mixed
-	};
-	(*cctype[mode])++;
-    }
-#endif
+   static void (*decode_1[]) (const byte *, int, byte *) = {
+      fxt1_decode_1HI,     /* cc-high   = "00?" */
+      fxt1_decode_1HI,     /* cc-high   = "00?" */
+      fxt1_decode_1CHROMA, /* cc-chroma = "010" */
+      fxt1_decode_1ALPHA,  /* alpha     = "011" */
+      fxt1_decode_1MIXED,  /* mixed     = "1??" */
+      fxt1_decode_1MIXED,  /* mixed     = "1??" */
+      fxt1_decode_1MIXED,  /* mixed     = "1??" */
+      fxt1_decode_1MIXED   /* mixed     = "1??" */
+   };
+   /* Decode texel (i, j) into rgba[]: locate its 8x4 block, read the block mode, compute the texel index. */
+   const byte *code = (const byte *)texture +
+                         ((j / 4) * (stride / 8) + (i / 8)) * 16; /* 16 bytes per 8x4 block */
+   int mode = CC_SEL(code, 125); /* indexes the 8-entry decode_1 table above */
+   int t = i & 7; /* column inside the block */
+
+   if (t & 4) {
+      t += 12; /* columns 4..7 become 16..19: the right 4x4 half is numbered from 16 */
+   }
+   t += (j & 3) * 4; /* plus 4 per row inside the block */
+
+   decode_1[mode](code, t, rgba);
 }
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.h b/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.h
index c2919bb..b1fe32c 100644
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.h
+++ b/source/gles2glide64/src/GlideHQ/tc-1.1+/fxt1.h
@@ -26,13 +26,13 @@
 #ifndef FXT1_H_included
 #define FXT1_H_included
 
-TAPI int TAPIENTRY
-fxt1_encode (int width, int height, int comps,
+TAPI void TAPIENTRY
+fxt1_encode (unsigned int width, unsigned int height, int comps,
              const void *source, int srcRowStride,
              void *dest, int destRowStride);
 
 TAPI void TAPIENTRY
-fxt1_decode_1 (const void *texture, int stride /* in pixels */,
-	       int i, int j, byte *rgba);
+fxt1_decode_1 (const void *texture, int stride,
+               int i, int j, byte *rgba);
 
 #endif
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/internal.h b/source/gles2glide64/src/GlideHQ/tc-1.1+/internal.h
index f1cd6dc..7252b21 100644
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/internal.h
+++ b/source/gles2glide64/src/GlideHQ/tc-1.1+/internal.h
@@ -23,6 +23,8 @@
 #ifndef INTERNAL_H_included
 #define INTERNAL_H_included
 
+#include <stdint.h>
+
 /*****************************************************************************\
  * DLL stuff
 \*****************************************************************************/
@@ -40,34 +42,42 @@
  * 64bit types on 32bit machine
 \*****************************************************************************/
 
-#if (defined(__GNUC__) && !defined(__cplusplus)) || defined(__MSC__)
+/*
+ * Define a 64-bit unsigned integer type and macros
+ */
+#if 1
+
+#define Q_NATIVE 1
 
-typedef unsigned long long qword;
+typedef uint64_t qword;
 
 #define Q_MOV32(a, b) a = b
 #define Q_OR32(a, b)  a |= b
 #define Q_SHL(a, c)   a <<= c
 
-#else  /* !__GNUC__ */
+#else
+
+#define Q_NATIVE 0
 
 typedef struct {
-    dword lo, hi;
+   dword lo, hi;
 } qword;
 
 #define Q_MOV32(a, b) a.lo = b
 #define Q_OR32(a, b)  a.lo |= b
-#define Q_SHL(a, c)					\
-    do {						\
-	if ((c) >= 32) {				\
-	    a.hi = a.lo << ((c) - 32);			\
-	    a.lo = 0;					\
-	} else {					\
-	    a.hi = (a.hi << (c)) | (a.lo >> (32 - (c)));\
-	    a.lo <<= c;					\
-	}						\
-    } while (0)
 
-#endif /* !__GNUC__ */
+#define Q_SHL(a, c)                                 \
+   do {                                                \
+       if ((c) >= 32) {                                \
+          a.hi = a.lo << ((c) - 32);                   \
+          a.lo = 0;                                    \
+       } else {                                        \
+          a.hi = (a.hi << (c)) | (a.lo >> (32 - (c))); \
+          a.lo <<= (c);                                \
+       }                                               \
+   } while (0)
+
+#endif
 
 
 /*****************************************************************************\
@@ -86,52 +96,71 @@ typedef struct {
 #define F(i) (float)1 /* can be used to obtain an oblong metric: 0.30 / 0.59 / 0.11 */
 #define SAFECDOT 1 /* for paranoids */
 
-#define MAKEIVEC(NV, NC, IV, B, V0, V1)	\
-    do {				\
-	/* compute interpolation vector */\
-	float d2 = 0.0F;		\
-	float rd2;			\
-					\
-	for (i = 0; i < NC; i++) {	\
-	    IV[i] = (V1[i] - V0[i]) * F(i);\
-	    d2 += IV[i] * IV[i];	\
-	}				\
-	rd2 = (float)NV / d2;		\
-	B = 0;				\
-	for (i = 0; i < NC; i++) {	\
-	    IV[i] *= F(i);		\
-	    B -= IV[i] * V0[i];		\
-	    IV[i] *= rd2;		\
-	}				\
-	B = B * rd2 + 0.5F;		\
-    } while (0)
+#define MAKEIVEC(NV, NC, IV, B, V0, V1)  \
+   do {                                  \
+      /* compute interpolation vector */ \
+      float d2 = 0.0F;                   \
+      float rd2;                         \
+                                         \
+      for (i = 0; i < NC; i++) {         \
+         IV[i] = (V1[i] - V0[i]) * F(i); \
+         d2 += IV[i] * IV[i];            \
+      }                                  \
+      rd2 = (float)NV / d2;              \
+      B = 0;                             \
+      for (i = 0; i < NC; i++) {         \
+         IV[i] *= F(i);                  \
+         B -= IV[i] * V0[i];             \
+         IV[i] *= rd2;                   \
+      }                                  \
+      B = B * rd2 + 0.5f;                \
+   } while (0)
 
 #define CALCCDOT(TEXEL, NV, NC, IV, B, V)\
-    do {				\
-	float dot = 0.0F;		\
-	for (i = 0; i < NC; i++) {	\
-	    dot += V[i] * IV[i];	\
-	}				\
-	TEXEL = (int)(dot + B);		\
-	if (SAFECDOT) {			\
-	    if (TEXEL < 0) {		\
-		TEXEL = 0;		\
-	    } else if (TEXEL > NV) {	\
-		TEXEL = NV;		\
-	    }				\
-	}				\
-    } while (0)
+   do {                                  \
+      float dot = 0.0F;                  \
+      for (i = 0; i < NC; i++) {         \
+         dot += V[i] * IV[i];            \
+      }                                  \
+      TEXEL = (int)(dot + B);            \
+      if (SAFECDOT) {                    \
+         if (TEXEL < 0) {                \
+            TEXEL = 0;                   \
+         } else if (TEXEL > NV) {        \
+            TEXEL = NV;                  \
+         }                               \
+      }                                  \
+   } while (0)
 
 
 /*****************************************************************************\
  * Utility functions
 \*****************************************************************************/
 
-void
-_mesa_upscale_teximage2d (unsigned int inWidth, unsigned int inHeight,
-			  unsigned int outWidth, unsigned int outHeight,
-			  unsigned int comps,
-			  const byte *src, int srcRowStride,
-			  unsigned char *dest);
+/** Copy a 4-element vector */
+#define COPY_4V( DST, SRC )         \
+do {                                \
+   (DST)[0] = (SRC)[0];             \
+   (DST)[1] = (SRC)[1];             \
+   (DST)[2] = (SRC)[2];             \
+   (DST)[3] = (SRC)[3];             \
+} while (0)
+
+/** Copy a 4-element unsigned byte vector: dst[0..3] = src[0..3] */
+static inline void
+COPY_4UBV(uint8_t dst[4], const uint8_t src[4])
+{
+#if defined(__i386__)
+   *((uint32_t *) dst) = *((uint32_t *) src); /* one 32-bit load/store instead of four byte copies */
+#else
+   /* The uint32_t cast might fail if DST or SRC are not dword-aligned (RISC) */
+   COPY_4V(dst, src); /* element-by-element copy */
+#endif
+}
+
+void reorder_source_3(byte *tex, dword width, dword height, int srcRowStride);
+void *reorder_source_3_alloc(const byte *source, dword width, dword height, int srcRowStride);
+void reorder_source_4(byte *tex, dword width, dword height, int srcRowStride);
+void *reorder_source_4_alloc(const byte *source, dword width, dword height, int srcRowStride);
 
 #endif
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/texstore.c b/source/gles2glide64/src/GlideHQ/tc-1.1+/texstore.c
index 2eb0306..3898df7 100644
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/texstore.c
+++ b/source/gles2glide64/src/GlideHQ/tc-1.1+/texstore.c
@@ -22,72 +22,73 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-/* Copyright (C) 2007  Hiroshi Morii <koolsmoky(at)users.sourceforge.net>
- * _mesa_upscale_teximage2d speedup
- */
-
 #include <assert.h>
+#include <string.h>
+#include <stdlib.h>
 
 #include "types.h"
 #include "internal.h"
 
+void reorder_source_3(byte *tex, dword width, dword height, int srcRowStride)
+{
+    byte *line;
+    byte t;
+    dword i, j;
+    /* In place: swap bytes 0 and 2 of every 3-byte texel of a width x height image. */
+    for (i = 0; i < height; i++) {
+        line = &tex[srcRowStride * i]; /* rows are srcRowStride bytes apart */
+        for (j = 0; j < width; j++) {
+            t = line[2];
+            line[2] = line[0];
+            line[0] = t;
+            line += 3; /* advance to the next 3-byte texel */
+        }
+    }
+}
 
-void
-_mesa_upscale_teximage2d (unsigned int inWidth, unsigned int inHeight,
-			  unsigned int outWidth, unsigned int outHeight,
-			  unsigned int comps,
-			  const byte *src, int srcRowStride,
-			  byte *dest)
+void *reorder_source_3_alloc(const byte *source, dword width, dword height, int srcRowStride)
 {
-    unsigned int i, j, k;
+    byte *tex;
 
-    assert(outWidth >= inWidth);
-    assert(outHeight >= inHeight);
+    tex = malloc(height * srcRowStride);
+    if (!tex)
+        goto out;
 
-#if 1 /* H.Morii - faster loops */
-  for (i = 0; i < inHeight; i++) {
-    for (j = 0; j < inWidth; j++) {
-      const int aa = (i * outWidth + j) * comps;
-      const int bb = i * srcRowStride + j * comps;
-      for (k = 0; k < comps; k++) {
-        dest[aa + k] = src[bb + k];
-      }
-    }
-    for (; j < outWidth; j++) {
-      const int aa = (i * outWidth + j) * comps;
-      const int bb = i * srcRowStride + (j - inWidth) * comps;
-      for (k = 0; k < comps; k++) {
-        dest[aa + k] = src[bb + k];
-      }
-    }
-  }
-  for (; i < outHeight; i++) {
-    for (j = 0; j < inWidth; j++) {
-      const int aa = (i * outWidth + j) * comps;
-      const int bb = (i - inHeight) * srcRowStride + j * comps;
-      for (k = 0; k < comps; k++) {
-        dest[aa + k] = src[bb + k];
-      }
-    }
-    for (; j < outWidth; j++) {
-      const int aa = (i * outWidth + j) * comps;
-      const int bb = (i - inHeight) * srcRowStride + (j - inWidth) * comps;
-      for (k = 0; k < comps; k++) {
-        dest[aa + k] = src[bb + k];
-      }
-    }
-  }
-#else
-    for (i = 0; i < outHeight; i++) {
-	const int ii = i % inHeight;
-	for (j = 0; j < outWidth; j++) {
-	    const int jj = j % inWidth;
-            const int aa = (i * outWidth + j) * comps;
-            const int bb = ii * srcRowStride + jj * comps;
-	    for (k = 0; k < comps; k++) {
-		dest[aa + k] = src[bb + k];
-	    }
-	}
+    memcpy(tex, source, height * srcRowStride);
+    reorder_source_3(tex, width, height, srcRowStride);
+
+out:
+    return tex;
+}
+
+void reorder_source_4(byte *tex, dword width, dword height, int srcRowStride)
+{
+    byte *line;
+    byte t;
+    dword i, j;
+    /* In place: swap bytes 0 and 2 of every 4-byte texel; bytes 1 and 3 are left untouched. */
+    for (i = 0; i < height; i++) {
+        line = &tex[srcRowStride * i]; /* rows are srcRowStride bytes apart */
+        for (j = 0; j < width; j++) {
+            t = line[2];
+            line[2] = line[0];
+            line[0] = t;
+            line += 4; /* advance to the next 4-byte texel */
+        }
     }
-#endif
+}
+
+void *reorder_source_4_alloc(const byte *source, dword width, dword height, int srcRowStride)
+{
+    byte *tex;
+    /* Returns a malloc'd copy of source with reorder_source_4 applied, or NULL if malloc fails. */
+    tex = malloc(height * srcRowStride);
+    if (!tex)
+        goto out;
+
+    memcpy(tex, source, height * srcRowStride); /* copies height rows of srcRowStride bytes */
+    reorder_source_4(tex, width, height, srcRowStride);
+
+out:
+    return tex;
 }
diff --git a/source/gles2glide64/src/GlideHQ/tc-1.1+/wrapper.c b/source/gles2glide64/src/GlideHQ/tc-1.1+/wrapper.c
index 0a171ee..cff314d 100644
--- a/source/gles2glide64/src/GlideHQ/tc-1.1+/wrapper.c
+++ b/source/gles2glide64/src/GlideHQ/tc-1.1+/wrapper.c
@@ -21,87 +21,88 @@
 
 
 #include <assert.h>
+#include <stdlib.h>
 
 #include "types.h"
 #include "internal.h"
-#include "dxtn.h"
+#include <SDL_opengl.h>
+#include "../../Glide64/m64p.h"
 
+typedef void (*dxtCompressTexFuncExt)(GLint srccomps, GLint width, GLint height,
+		                      const GLubyte *srcPixData, GLenum destformat,
+                                      GLubyte *dest, GLint dstRowStride);
+static dxtCompressTexFuncExt _tx_compress_dxtn = NULL;
 
-#define GL_COMPRESSED_RGB_S3TC_DXT1_EXT   0x83F0
-#define GL_COMPRESSED_RGBA_S3TC_DXT1_EXT  0x83F1
-#define GL_COMPRESSED_RGBA_S3TC_DXT3_EXT  0x83F2
-#define GL_COMPRESSED_RGBA_S3TC_DXT5_EXT  0x83F3
+#ifdef TXCDXTN_EXTERNAL
 
+#include "../../Glide64/osal_dynamiclib.h"
 
-TAPI void TAPIENTRY
-fetch_2d_texel_rgb_dxt1 (int texImage_RowStride,
-			 const byte *texImage_Data,
-			 int i, int j,
-			 byte *texel)
-{
-    dxt1_rgb_decode_1(texImage_Data, texImage_RowStride, i, j, texel);
-}
+#if defined(_WIN32) || defined(WIN32)
+#define DXTN_LIBNAME "dxtn.dll"
+#elif defined(__DJGPP__)
+#define DXTN_LIBNAME "dxtn.dxe"
+#else
+#define DXTN_LIBNAME "libtxc_dxtn.so"
+#endif
 
+static m64p_dynlib_handle dxtn_lib_handle;
 
-TAPI void TAPIENTRY
-fetch_2d_texel_rgba_dxt1 (int texImage_RowStride,
-			  const byte *texImage_Data,
-			  int i, int j,
-			  byte *texel)
+static void tx_compress_dxtn_init()
 {
-    dxt1_rgba_decode_1(texImage_Data, texImage_RowStride, i, j, texel);
-}
+    m64p_error rval;
 
+    if (_tx_compress_dxtn)
+        return;
 
-TAPI void TAPIENTRY
-fetch_2d_texel_rgba_dxt3 (int texImage_RowStride,
-			  const byte *texImage_Data,
-			  int i, int j,
-			  byte *texel)
-{
-    dxt3_rgba_decode_1(texImage_Data, texImage_RowStride, i, j, texel);
+    rval = osal_dynlib_open(&dxtn_lib_handle, DXTN_LIBNAME);
+    if (rval != M64ERR_SUCCESS) {
+        WriteLog(M64MSG_WARNING, "Failed to open %s", DXTN_LIBNAME);
+        return;
+    }
+
+    _tx_compress_dxtn = osal_dynlib_getproc(dxtn_lib_handle, "tx_compress_dxtn");
+    if (!_tx_compress_dxtn) { /* entry point missing: report it and release the library */
+        WriteLog(M64MSG_WARNING, "Shared library '%s' invalid; no %s() function found.", DXTN_LIBNAME, "tx_compress_dxtn");
+        osal_dynlib_close(dxtn_lib_handle);
+        return;
+    }
 }
 
+#else
 
-TAPI void TAPIENTRY
-fetch_2d_texel_rgba_dxt5 (int texImage_RowStride,
-			  const byte *texImage_Data,
-			  int i, int j,
-			  byte *texel)
+#include "s2tc/txc_dxtn.h"
+
+static void tx_compress_dxtn_init()
 {
-    dxt5_rgba_decode_1(texImage_Data, texImage_RowStride, i, j, texel);
+	_tx_compress_dxtn = tx_compress_dxtn;
 }
 
+#endif
+
 
 TAPI void TAPIENTRY
-tx_compress_dxtn (int srccomps, int width, int height,
-		  const byte *source, int destformat, byte *dest,
-		  int destRowStride)
+tx_compress_dxtn_rgba(int srccomps, int width, int height,
+                      const byte *source, int destformat, byte *dest,
+                      int destRowStride)
 {
     int srcRowStride = width * srccomps;
+    void *newSource = NULL;
 
-    switch (destformat) {
-	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-	    dxt1_rgb_encode(width, height, srccomps,
-			    source, srcRowStride,
-			    dest, destRowStride);
-	    break;
-	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-	    dxt1_rgba_encode(width, height, srccomps,
-			     source, srcRowStride,
-			     dest, destRowStride);
-	    break;
-	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-	    dxt3_rgba_encode(width, height, srccomps,
-			     source, srcRowStride,
-			     dest, destRowStride);
-	    break;
-	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-	    dxt5_rgba_encode(width, height, srccomps,
-			     source, srcRowStride,
-			     dest, destRowStride);
-	    break;
-	default:
-	    assert(0);
+    tx_compress_dxtn_init();
+    if (!_tx_compress_dxtn) {
+        WriteLog(M64MSG_ERROR, "Failed to initialize S3TC compressor");
+        return;
     }
+
+    assert(srccomps == 3 || srccomps == 4);
+
+    if (srccomps == 3)
+        newSource = reorder_source_3_alloc(source, width, height, srcRowStride);
+    if (srccomps == 4)
+        newSource = reorder_source_4_alloc(source, width, height, srcRowStride);
+    if (newSource) /* NULL: copy not allocated, or unsupported srccomps with asserts compiled out */
+        _tx_compress_dxtn(srccomps, width, height, newSource, destformat, dest, destRowStride);
+    else
+        WriteLog(M64MSG_ERROR, "Failed to prepare source pixels for S3TC compression");
+    free(newSource);
 }
diff --git a/source/gles2glide64/src/Glitch64/combiner.cpp b/source/gles2glide64/src/Glitch64/combiner.cpp
index b82f8fa..ace5033 100755
--- a/source/gles2glide64/src/Glitch64/combiner.cpp
+++ b/source/gles2glide64/src/Glitch64/combiner.cpp
@@ -177,7 +177,7 @@ static const char* vertex_shader =
 SHADER_HEADER
 "#define Z_MAX 65536.0                                          \n"
 "attribute highp vec4 aVertex;                                  \n"
-"attribute mediump vec4 aColor;                                   \n"	//*SEB* highp -> lowp
+"attribute mediump vec4 aColor;                                   \n"	//*SEB* highp -> mediump
 "attribute highp vec4 aMultiTexCoord0;                          \n"
 "attribute highp vec4 aMultiTexCoord1;                          \n"
 "attribute float aFog;                                          \n"
@@ -267,12 +267,14 @@ void init_combiner()
   int log_length;
 
 //#ifndef ANDROID
+#if 0
+//	Unfortunately, the Pandora does not provide the gl_FragDepthEXT extension, so this block is disabled.
   // depth shader
   fragment_depth_shader_object = glCreateShader(GL_FRAGMENT_SHADER);
 
-  char s[128];
+  char s[512];
   // ZIGGY convert a 565 texture into depth component
-  sprintf(s, "gl_FragDepth = dot(texture2D(texture0, vec2(gl_TexCoord[0])), vec4(31*64*32, 63*32, 31, 0))*%g + %g; \n", zscale/2/65535.0, 1-zscale/2);
+  sprintf(s, "gl_FragDepthEXT = dot(texture2D(texture0, vec2(gl_TexCoord[0])), vec4(31*64*32, 63*32, 31, 0))*%g + %g; \n", zscale/2/65535.0, 1-zscale/2);
   fragment_shader = (char*)malloc(strlen(fragment_shader_header)+
     strlen(s)+
     strlen(fragment_shader_end)+1);
@@ -284,7 +286,7 @@ void init_combiner()
 
   glCompileShader(fragment_depth_shader_object);
   check_compile(fragment_depth_shader_object);
-//#endif
+#endif
 
   // default shader
   fragment_shader_object = glCreateShader(GL_FRAGMENT_SHADER);
diff --git a/source/gles2glide64/src/Glitch64/glitchmain.cpp b/source/gles2glide64/src/Glitch64/glitchmain.cpp
index 21e16b7..668cff9 100755
--- a/source/gles2glide64/src/Glitch64/glitchmain.cpp
+++ b/source/gles2glide64/src/Glitch64/glitchmain.cpp
@@ -507,7 +507,7 @@ grSstWinOpen(
   // ZIGGY viewport_offset is WIN32 specific, with SDL just set it to zero
   viewport_offset = 0; //-10 //-20;
 
-  // ZIGGY not sure, but it might be better to let the system choose
+  CoreVideo_Init();
   CoreVideo_GL_SetAttribute(M64P_GL_DOUBLEBUFFER, 1);
   CoreVideo_GL_SetAttribute(M64P_GL_SWAP_CONTROL, vsync);
   CoreVideo_GL_SetAttribute(M64P_GL_BUFFER_SIZE, 16);
@@ -817,6 +817,9 @@ grSstWinClose( GrContext_t context )
   //SDL_QuitSubSystem(SDL_INIT_VIDEO);
   //sleep(2);
 #endif
+
+  CoreVideo_Quit();
+
   return FXTRUE;
 }
 
diff --git a/source/gles2glide64/src/Glitch64/inc/glidesys.h b/source/gles2glide64/src/Glitch64/inc/glidesys.h
index b19845a..12d0fcb 100644
--- a/source/gles2glide64/src/Glitch64/inc/glidesys.h
+++ b/source/gles2glide64/src/Glitch64/inc/glidesys.h
@@ -110,7 +110,8 @@ n** -----------------------------------------------------------------------
 
 /* Check for OS */
 #if defined(__IRIX__) || defined(__sparc__) || defined(__linux__) || \
-	defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
+   defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+   defined(__FreeBSD_kernel__) || defined(__GNU__)
 #  define GLIDE_OS        GLIDE_OS_UNIX
 #elif defined(__DOS__)
 #  define GLIDE_OS        GLIDE_OS_DOS32
diff --git a/source/gles2glide64/src/Glitch64/main.h b/source/gles2glide64/src/Glitch64/main.h
index f213fe9..a0cef92 100644
--- a/source/gles2glide64/src/Glitch64/main.h
+++ b/source/gles2glide64/src/Glitch64/main.h
@@ -25,7 +25,13 @@
 
 #define LOG(...) // WriteLog(M64MSG_VERBOSE, __VA_ARGS__)
 #define LOGINFO(...) WriteLog(M64MSG_INFO, __VA_ARGS__)
+#ifdef __cplusplus
+extern "C" {
+#endif
 void WriteLog(m64p_msg_level level, const char *msg, ...);
+#ifdef __cplusplus
+}
+#endif
 
 
 #ifndef _WIN32
diff --git a/source/gles2glide64/todo!.txt b/source/gles2glide64/todo!.txt
index 7c297c5..c60df97 100644
--- a/source/gles2glide64/todo!.txt
+++ b/source/gles2glide64/todo!.txt
@@ -4,7 +4,6 @@ Glide64
 -	Add tlut support for 16bit textures. Remove hacks.
 -	Add trapezoid support to le_triangle. Remove hacks.
 -	Reduce "Known issues" list :)
--   Port over C ports of NASM functions from balrog's fork
 
 GlideHQ
 -	Add OpenGL texture format support.
diff --git a/source/gles2rice/projects/unix/Makefile b/source/gles2rice/projects/unix/Makefile
index 0b8832c..948238d 100755
--- a/source/gles2rice/projects/unix/Makefile
+++ b/source/gles2rice/projects/unix/Makefile
@@ -122,7 +122,8 @@ ifeq ("$(CPU)","NONE")
 endif
 
 # base CFLAGS, LDLIBS, and LDFLAGS
-OPTFLAGS ?= -Ofast -ffast-math -flto -fuse-linker-plugin
+OPTFLAGS ?= -O4 -ffast-math 
+#-flto -fuse-linker-plugin
 WARNFLAGS ?= -Wall
 CFLAGS += $(OPTFLAGS) $(WARNFLAGS) -ffast-math -fno-strict-aliasing -fvisibility=hidden -I../../src
 CXXFLAGS += -fvisibility-inlines-hidden
diff --git a/source/mupen64plus-audio-sdl/projects/unix/Makefile b/source/mupen64plus-audio-sdl/projects/unix/Makefile
index 518f4cc..2f3e59f 100755
--- a/source/mupen64plus-audio-sdl/projects/unix/Makefile
+++ b/source/mupen64plus-audio-sdl/projects/unix/Makefile
@@ -149,24 +149,16 @@ ifeq ($(OS), LINUX)
   LDLIBS += -ldl
 endif
 ifeq ($(OS), OSX)
-  # Select the proper SDK
-  # Also, SDKs are stored in a different location since XCode 4.3
-  OSX_SDK ?= $(shell sw_vers -productVersion | cut -f1 -f2 -d .)
-  OSX_XCODEMAJ = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f1 -d .)
-  OSX_XCODEMIN = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f2 -d .)
-  OSX_XCODEGE43 = $(shell echo "`expr $(OSX_XCODEMAJ) \>= 4``expr $(OSX_XCODEMIN) \>= 3`")
-  ifeq ($(OSX_XCODEGE43), 11)
-    OSX_SYSROOT := /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
-  else
-    OSX_SYSROOT := /Developer/SDKs
-  endif
+  #xcode-select has been around since XCode 3.0, i.e. OS X 10.5; ':=' runs the lookups once at parse time
+  OSX_SDK_ROOT := $(shell xcode-select -print-path)/Platforms/MacOSX.platform/Developer/SDKs
+  OSX_SDK_PATH := $(OSX_SDK_ROOT)/$(shell ls $(OSX_SDK_ROOT) | tail -1)
 
   ifeq ($(CPU), X86)
     ifeq ($(ARCH_DETECTED), 64BITS)
-      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
       LDLIBS += -ldl
     else
-      CFLAGS += -pipe -mmmx -msse -fomit-frame-pointer -arch i686 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -mmmx -msse -fomit-frame-pointer -arch i686 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
       LDLIBS += -ldl -read_only_relocs suppress
     endif
   endif
@@ -358,7 +350,7 @@ clean:
 rebuild: clean all
 
 # build dependency files
-CFLAGS += -MD
+CFLAGS += -MD -MP
 -include $(OBJECTS:.o=.d)
 
 # standard build rules
diff --git a/source/mupen64plus-core/doc/emuwiki-api-doc/Mupen64Plus_Core_Parameters.txt b/source/mupen64plus-core/doc/emuwiki-api-doc/Mupen64Plus_Core_Parameters.txt
index cc0876b..bf90feb 100644
--- a/source/mupen64plus-core/doc/emuwiki-api-doc/Mupen64Plus_Core_Parameters.txt
+++ b/source/mupen64plus-core/doc/emuwiki-api-doc/Mupen64Plus_Core_Parameters.txt
@@ -55,6 +55,14 @@ These are standard parameters which are used by the Mupen64Plus Core library.  T
 |M64TYPE_STRING
 |Path to a directory to search when looking for shared data files in the <tt>ConfigGetSharedDataFilepath()</tt> function.
 |-
+|CountPerOp
+|M64TYPE_INT
+|Force number of cycles per emulated instruction when set greater than 0.
+|-
+|DelaySI
+|M64TYPE_BOOL
+|Delay interrupt after DMA SI read/write.
+|-
 |}
 
 These configuration parameters are used in the Core's event loop to detect keyboard and joystick commands.  They are stored in a configuration section called "CoreEvents" and may be altered by the front-end in order to adjust the behaviour of the emulator.  These may be adjusted at any time and the effect of the change should occur immediately.  The Keysym value stored is actually <tt>(SDLMod << 16) || SDLKey</tt>, so that keypresses with modifiers like shift, control, or alt may be used.
diff --git a/source/mupen64plus-core/projects/unix/Makefile b/source/mupen64plus-core/projects/unix/Makefile
index 627f194..cc8cfe9 100755
--- a/source/mupen64plus-core/projects/unix/Makefile
+++ b/source/mupen64plus-core/projects/unix/Makefile
@@ -148,26 +148,18 @@ ifeq ($(OS), LINUX)
   LDFLAGS += -Wl,-version-script,$(SRCDIR)/api/api_export.ver
 endif
 ifeq ($(OS), OSX)
-  # Select the proper SDK
-  # Also, SDKs are stored in a different location since XCode 4.3
-  OSX_SDK ?= $(shell sw_vers -productVersion | cut -f1 -f2 -d .)
-  OSX_XCODEMAJ = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f1 -d .)
-  OSX_XCODEMIN = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f2 -d .)
-  OSX_XCODEGE43 = $(shell echo "`expr $(OSX_XCODEMAJ) \>= 4``expr $(OSX_XCODEMIN) \>= 3`")
-  ifeq ($(OSX_XCODEGE43), 11)
-    OSX_SYSROOT := /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
-  else
-    OSX_SYSROOT := /Developer/SDKs
-  endif
+  #xcode-select has been around since XCode 3.0, i.e. OS X 10.5; ':=' runs the lookups once at parse time
+  OSX_SDK_ROOT := $(shell xcode-select -print-path)/Platforms/MacOSX.platform/Developer/SDKs
+  OSX_SDK_PATH := $(OSX_SDK_ROOT)/$(shell ls $(OSX_SDK_ROOT) | tail -1)
 
   TARGET = libmupen64plus$(POSTFIX).dylib
   LDFLAGS += -bundle -read_only_relocs suppress
   LDLIBS += -ldl
   ifeq ($(CPU), X86)
     ifeq ($(ARCH_DETECTED), 64BITS)
-      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
     else
-      CFLAGS += -pipe -mmmx -msse -arch i686 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -mmmx -msse -arch i686 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
       ifneq ($(PROFILE), 1)
         CFLAGS += -fomit-frame-pointer
       endif
@@ -575,7 +567,7 @@ clean:
 	$(RM) -r $(TARGET) $(SONAME) $(OBJDIR)
 
 # build dependency files
-CFLAGS += -MD
+CFLAGS += -MD -MP
 -include $(OBJECTS:.o=.d)
 
 CXXFLAGS += $(CFLAGS)
diff --git a/source/mupen64plus-core/src/debugger/dbg_memory.c b/source/mupen64plus-core/src/debugger/dbg_memory.c
index c8988bf..387623e 100644
--- a/source/mupen64plus-core/src/debugger/dbg_memory.c
+++ b/source/mupen64plus-core/src/debugger/dbg_memory.c
@@ -75,6 +75,8 @@ static MEMBREAKWRITE(write_rom, 8);
 
 #if !defined(NO_ASM) && (defined(__i386__) || defined(__x86_64__))
 
+/* we must define PACKAGE so that bfd.h (which is included from dis-asm.h) doesn't throw an error */
+#define PACKAGE "mupen64plus-core"
 #include <dis-asm.h>
 #include <stdarg.h>
 
diff --git a/source/mupen64plus-core/src/main/cheat.c b/source/mupen64plus-core/src/main/cheat.c
index 52551ce..7266762 100644
--- a/source/mupen64plus-core/src/main/cheat.c
+++ b/source/mupen64plus-core/src/main/cheat.c
@@ -205,25 +205,87 @@ void cheat_apply_cheats(int entry)
     int execute_next;
 
     // If game is Zelda OOT, apply subscreen delay fix
-    if (strncmp((char *)ROM_HEADER.Name, "THE LEGEND OF ZELDA", 19) == 0 && entry == ENTRY_VI) {
+    if (entry == ENTRY_VI && strncmp((char *)ROM_HEADER.Name, "THE LEGEND OF ZELDA", 19) == 0) {
+        uint32_t subscreen_address = 0;
+        uint32_t credits_address[4];
+        credits_address[0] = 0;
         if (sl(ROM_HEADER.CRC1) == 0xEC7011B7 && sl(ROM_HEADER.CRC2) == 0x7616D72B) {
             // Legend of Zelda, The - Ocarina of Time (U) + (J) (V1.0)
-            execute_cheat(0x801DA5CB, 0x0002, NULL);
+            subscreen_address = 0x801DA5CB;
         } else if (sl(ROM_HEADER.CRC1) == 0xD43DA81F && sl(ROM_HEADER.CRC2) == 0x021E1E19) {
             // Legend of Zelda, The - Ocarina of Time (U) + (J) (V1.1)
-            execute_cheat(0x801DA78B, 0x0002, NULL);
+            subscreen_address = 0x801DA78B;
         } else if (sl(ROM_HEADER.CRC1) == 0x693BA2AE && sl(ROM_HEADER.CRC2) == 0xB7F14E9F) {
             // Legend of Zelda, The - Ocarina of Time (U) + (J) (V1.2)
-            execute_cheat(0x801DAE8B, 0x0002, NULL);
+            subscreen_address = 0x801DAE8B;
         } else if (sl(ROM_HEADER.CRC1) == 0xB044B569 && sl(ROM_HEADER.CRC2) == 0x373C1985) {
             // Legend of Zelda, The - Ocarina of Time (E) (V1.0)
-            execute_cheat(0x801D860B, 0x0002, NULL);
+            subscreen_address = 0x801D860B;
         } else if (sl(ROM_HEADER.CRC1) == 0xB2055FBD && sl(ROM_HEADER.CRC2) == 0x0BAB4E0C) {
             // Legend of Zelda, The - Ocarina of Time (E) (V1.1)
-            execute_cheat(0x801D864B, 0x0002, NULL);
+            subscreen_address = 0x801D864B;
+        // GC Versions such as Master Quest also require the End Credits Fix.
+        } else if (sl(ROM_HEADER.CRC1) == 0x1D4136F3 && sl(ROM_HEADER.CRC2) == 0xAF63EEA9) {
+            // Legend of Zelda, The - Ocarina of Time - Master Quest (E) (GC Version)
+            subscreen_address = 0x801D8F4B;
+            credits_address[0] = 0xD109A8C4;
+            credits_address[1] = 0x8109A8C4;
+            credits_address[2] = 0xD109A8C6;
+            credits_address[3] = 0x8109A8C6;
+        } else if (sl(ROM_HEADER.CRC1) == 0x09465AC3 && sl(ROM_HEADER.CRC2) == 0xF8CB501B) {
+            // Legend of Zelda, The - Ocarina of Time (E) (GC Version)
+            subscreen_address = 0x801D8F8B;
+            credits_address[0] = 0xD109A8E4;
+            credits_address[1] = 0x8109A8E4;
+            credits_address[2] = 0xD109A8E6;
+            credits_address[3] = 0x8109A8E6;
+        } else if (sl(ROM_HEADER.CRC1) == 0xF3DD35BA && sl(ROM_HEADER.CRC2) == 0x4152E075) {
+            // Legend of Zelda, The - Ocarina of Time (U) (GC Version)
+            subscreen_address = 0x801DB78B;
+            credits_address[0] = 0xD109A814;
+            credits_address[1] = 0x8109A814;
+            credits_address[2] = 0xD109A816;
+            credits_address[3] = 0x8109A816;
+        } else if (sl(ROM_HEADER.CRC1) == 0xF034001A && sl(ROM_HEADER.CRC2) == 0xAE47ED06) {
+            // Legend of Zelda, The - Ocarina of Time - Master Quest (U) (GC Version)
+            subscreen_address = 0x801DB74B;
+            credits_address[0] = 0xD109A7F4;
+            credits_address[1] = 0x8109A7F4;
+            credits_address[2] = 0xD109A7F6;
+            credits_address[3] = 0x8109A7F6;
+        } else if (sl(ROM_HEADER.CRC1) == 0xF7F52DB8 && sl(ROM_HEADER.CRC2) == 0x2195E636) {
+            // Zelda no Densetsu - Toki no Ocarina - Zelda Collection Version (J) (GC Version)
+            subscreen_address = 0x801DB78B;
+            credits_address[0] = 0xD109A814;
+            credits_address[1] = 0x8109A814;
+            credits_address[2] = 0xD109A816;
+            credits_address[3] = 0x8109A816;
+        } else if (sl(ROM_HEADER.CRC1) == 0xF611F4BA && sl(ROM_HEADER.CRC2) == 0xC584135C) {
+            // Zelda no Densetsu - Toki no Ocarina GC (J) (GC Version)
+            subscreen_address = 0x801DB78B;
+            credits_address[0] = 0xD109A834;
+            credits_address[1] = 0x8109A834;
+            credits_address[2] = 0xD109A836;
+            credits_address[3] = 0x8109A836;
+        } else if (sl(ROM_HEADER.CRC1) == 0xF43B45BA && sl(ROM_HEADER.CRC2) == 0x2F0E9B6F) {
+            // Zelda no Densetsu - Toki no Ocarina GC Ura (J) (GC Version)
+            subscreen_address = 0x801DB78B;
+            credits_address[0] = 0xD109A814;
+            credits_address[1] = 0x8109A814;
+            credits_address[2] = 0xD109A816;
+            credits_address[3] = 0x8109A816;
         } else {
-            // Legend of Zelda, The - Ocarina of Time Master Quest
-            execute_cheat(0x801D8F4B, 0x0002, NULL);
+            // UNKNOWN VERSION
+            DebugMessage(M64MSG_WARNING, "Warning: Ocarina of Time version could not be determined.  No fixes applied.");
+        }
+        if (subscreen_address) {
+            execute_cheat(subscreen_address, 0x0002, NULL);
+            if (credits_address[0]){ /* credits fix: each write below runs only if its paired check returns non-zero */
+                if (execute_cheat(credits_address[0], 0x0320, NULL))
+                    execute_cheat(credits_address[1], 0x0000, NULL);
+                if (execute_cheat(credits_address[2], 0xF809, NULL))
+                    execute_cheat(credits_address[3], 0x0000, NULL);
+            }
         }
     }
     
diff --git a/source/mupen64plus-core/src/main/eventloop.c b/source/mupen64plus-core/src/main/eventloop.c
index 85a06d7..2502e0b 100644
--- a/source/mupen64plus-core/src/main/eventloop.c
+++ b/source/mupen64plus-core/src/main/eventloop.c
@@ -41,6 +41,14 @@
 #define SDL_SCANCODE_G SDLK_g
 #define SDL_SCANCODE_RETURN SDLK_RETURN
 #define SDL_SCANCODE_0 SDLK_0
+#define SDL_SCANCODE_1 SDLK_1
+#define SDL_SCANCODE_2 SDLK_2
+#define SDL_SCANCODE_3 SDLK_3
+#define SDL_SCANCODE_4 SDLK_4
+#define SDL_SCANCODE_5 SDLK_5
+#define SDL_SCANCODE_6 SDLK_6
+#define SDL_SCANCODE_7 SDLK_7
+#define SDL_SCANCODE_8 SDLK_8
 #define SDL_SCANCODE_9 SDLK_9
 
 #define SDL_SetEventFilter(func, data) SDL_SetEventFilter(func)
@@ -443,17 +451,47 @@ int event_set_core_defaults(void)
     return 1;
 }
 
+static int get_saveslot_from_keysym(int keysym)
+{
+    /* Key codes for save slots 0-9, indexed by slot number.  A lookup table
+     * is used because nothing here guarantees the codes are consecutive. */
+    static const int slot_keys[] = {
+        SDL_SCANCODE_0,
+        SDL_SCANCODE_1,
+        SDL_SCANCODE_2,
+        SDL_SCANCODE_3,
+        SDL_SCANCODE_4,
+        SDL_SCANCODE_5,
+        SDL_SCANCODE_6,
+        SDL_SCANCODE_7,
+        SDL_SCANCODE_8,
+        SDL_SCANCODE_9
+    };
+    const int slot_count = (int)(sizeof(slot_keys) / sizeof(slot_keys[0]));
+    int slot;
+
+    for (slot = 0; slot < slot_count; slot++) {
+        if (slot_keys[slot] == keysym)
+            return slot;
+    }
+
+    /* keysym is not one of the ten digit keys */
+    return -1;
+}
+
 /*********************************************************************************************************
 * sdl keyup/keydown handlers
 */
 
 void event_sdl_keydown(int keysym, int keymod)
 {
+    int slot;
+
     /* check for the only 2 hard-coded key commands: Alt-enter for fullscreen and 0-9 for save state slot */
     if (keysym == SDL_SCANCODE_RETURN && keymod & (KMOD_LALT | KMOD_RALT))
         gfx.changeWindow();
-    else if (keysym >= SDL_SCANCODE_0 && keysym <= SDL_SCANCODE_9)
-        main_state_set_slot(keysym - SDL_SCANCODE_0);
+    else if ((slot = get_saveslot_from_keysym(keysym)) >= 0)
+        main_state_set_slot(slot);
     /* check all of the configurable commands */
     else if (keysym == ConfigGetParamInt(l_CoreEventsConfig, kbdStop))
         main_stop();
diff --git a/source/mupen64plus-core/src/main/main.c b/source/mupen64plus-core/src/main/main.c
index 119bd7d..c1bd208 100755
--- a/source/mupen64plus-core/src/main/main.c
+++ b/source/mupen64plus-core/src/main/main.c
@@ -194,6 +194,8 @@ int main_set_core_defaults(void)
     ConfigSetDefaultString(g_CoreConfig, "SaveStatePath", "", "Path to directory where emulator save states (snapshots) are saved. If this is blank, the default value of ${UserConfigPath}/save will be used");
     ConfigSetDefaultString(g_CoreConfig, "SaveSRAMPath", "", "Path to directory where SRAM/EEPROM data (in-game saves) are stored. If this is blank, the default value of ${UserConfigPath}/save will be used");
     ConfigSetDefaultString(g_CoreConfig, "SharedDataPath", "", "Path to a directory to search when looking for shared data files");
+    ConfigSetDefaultBool(g_CoreConfig, "DelaySI", 0, "Delay interrupt after DMA SI read/write");
+    ConfigSetDefaultInt(g_CoreConfig, "CountPerOp", 2, "Force number of cycles per emulated instruction");
 
     /* handle upgrades */
     if (bUpgrade)
@@ -734,8 +736,12 @@ m64p_error main_run(void)
     savestates_set_autoinc_slot(ConfigGetParamBool(g_CoreConfig, "AutoStateSlotIncrement"));
     savestates_select_slot(ConfigGetParamInt(g_CoreConfig, "CurrentStateSlot"));
     no_compiled_jump = ConfigGetParamBool(g_CoreConfig, "NoCompiledJump");
+	if (delay_si==-1) delay_si = ConfigGetParamBool(g_CoreConfig, "DelaySI");
+    if (count_per_op==-1) count_per_op = ConfigGetParamInt(g_CoreConfig, "CountPerOp");
+    if (count_per_op <= 0)
+        count_per_op = 2;
 
-    // initialize memory, and do byte-swapping if it's not been done yet
+	// initialize memory, and do byte-swapping if it's not been done yet
     if (g_MemHasBeenBSwapped == 0)
     {
         init_memory(1);
@@ -856,12 +862,3 @@ void main_stop(void)
     }
 #endif        
 }
-
-/*********************************************************************************************************
-* main function
-*/
-int main(int argc, char *argv[])
-{
-    return 1;
-}
-
diff --git a/source/mupen64plus-core/src/main/main.h b/source/mupen64plus-core/src/main/main.h
index d034c90..6c7df2a 100644
--- a/source/mupen64plus-core/src/main/main.h
+++ b/source/mupen64plus-core/src/main/main.h
@@ -33,6 +33,8 @@ extern int g_EmulatorRunning;
 
 extern m64p_frame_callback g_FrameCallback;
 
+extern int delay_si;
+
 const char* get_savestatepath(void);
 const char* get_savesrampath(void);
 
diff --git a/source/mupen64plus-core/src/main/rom.c b/source/mupen64plus-core/src/main/rom.c
old mode 100644
new mode 100755
index 5cd1b3e..9785857
--- a/source/mupen64plus-core/src/main/rom.c
+++ b/source/mupen64plus-core/src/main/rom.c
@@ -55,6 +55,8 @@ int rom_size = 0;
 
 unsigned char isGoldeneyeRom = 0;
 
+extern int count_per_op;
+
 m64p_rom_header   ROM_HEADER;
 rom_params        ROM_PARAMS;
 m64p_rom_settings ROM_SETTINGS;
@@ -178,6 +180,8 @@ m64p_error open_rom(const unsigned char* romimage, unsigned int size)
         ROM_SETTINGS.status = entry->status;
         ROM_SETTINGS.players = entry->players;
         ROM_SETTINGS.rumble = entry->rumble;
+        delay_si = entry->delay_si;
+        count_per_op = entry->count_per_op;
     }
     else
     {
@@ -187,7 +191,9 @@ m64p_error open_rom(const unsigned char* romimage, unsigned int size)
         ROM_SETTINGS.status = 0;
         ROM_SETTINGS.players = 0;
         ROM_SETTINGS.rumble = 0;
-    }
+        delay_si = -1;
+        count_per_op = -1;
+   }
 
     /* print out a bunch of info about the ROM */
     DebugMessage(M64MSG_INFO, "Goodname: %s", ROM_SETTINGS.goodname);
@@ -208,6 +214,8 @@ m64p_error open_rom(const unsigned char* romimage, unsigned int size)
     DebugMessage(M64MSG_INFO, "Country: %s", buffer);
     DebugMessage(M64MSG_VERBOSE, "PC = %x", sl((unsigned int)ROM_HEADER.PC));
     DebugMessage(M64MSG_VERBOSE, "Save type: %d", ROM_SETTINGS.savetype);
+	if (delay_si>=0) DebugMessage(M64MSG_INFO, "Delay SI: %d", delay_si);
+	if (count_per_op>=0) DebugMessage(M64MSG_INFO, "Count Per OP: %d", count_per_op);
 
     //Prepare Hack for GOLDENEYE
     isGoldeneyeRom = 0;
@@ -355,6 +363,9 @@ void romdatabase_open(void)
             search->entry.savetype = DEFAULT;
             search->entry.players = DEFAULT;
             search->entry.rumble = DEFAULT; 
+			/*SEB*/
+			search->entry.delay_si=-1;
+			search->entry.count_per_op=-1;
 
             search->next_entry = NULL;
             search->next_crc = NULL;
@@ -445,6 +456,22 @@ void romdatabase_open(void)
                 else
                     DebugMessage(M64MSG_WARNING, "ROM Database: Invalid rumble string on line %i", lineno);
             }
+            else if(!strcmp(l.name, "DelaySI"))
+            {
+                if(!strcmp(l.value, "True"))
+				search->entry.delay_si = 1;
+                else if(!strcmp(l.value, "False"))
+				search->entry.delay_si = 0;
+                else
+				DebugMessage(M64MSG_WARNING, "ROM Database: Invalid DelaySI string on line %i", lineno);
+            }
+            else if(!strcmp(l.name, "CountPerOp"))
+            {
+                if (string_to_int(l.value, &value) && value >= 0 && value < 8)
+				search->entry.count_per_op = value;
+                else
+				DebugMessage(M64MSG_WARNING, "ROM Database: Invalid CountPerOp on line %i", lineno);
+            }
             else
             {
                 DebugMessage(M64MSG_WARNING, "ROM Database: Unknown property on line %i", lineno);
diff --git a/source/mupen64plus-core/src/main/rom.h b/source/mupen64plus-core/src/main/rom.h
old mode 100644
new mode 100755
index 5b91ace..afab920
--- a/source/mupen64plus-core/src/main/rom.h
+++ b/source/mupen64plus-core/src/main/rom.h
@@ -111,6 +111,9 @@ typedef struct
    unsigned char savetype;
    unsigned char players; /* Local players 0-4, 2/3/4 way Netplay indicated by 5/6/7. */
    unsigned char rumble; /* 0 - No, 1 - Yes boolean for rumble support. */
+   /*SEB*/
+   signed char delay_si;	/* -1 = no value, 0 = off, 1 = on */
+   signed char count_per_op; /* -1 = no value, 0..7 = value read from the ROM database */
 } romdatabase_entry;
 
 typedef struct _romdatabase_search
diff --git a/source/mupen64plus-core/src/memory/dma.c b/source/mupen64plus-core/src/memory/dma.c
index 8d606d4..4e08878 100644
--- a/source/mupen64plus-core/src/memory/dma.c
+++ b/source/mupen64plus-core/src/memory/dma.c
@@ -45,6 +45,7 @@
 #include "main/util.h"
 
 static unsigned char sram[0x8000];
+int delay_si = 0;
 
 static char *get_sram_path(void)
 {
@@ -354,7 +355,14 @@ void dma_si_write(void)
 
     update_pif_write();
     update_count();
-    add_interupt_event(SI_INT, /*0x100*/0x900);
+
+    if (delay_si) {
+        add_interupt_event(SI_INT, /*0x100*/0x900);
+    } else {
+        MI_register.mi_intr_reg |= 0x02; // SI
+        si_register.si_stat |= 0x1000; // INTERRUPT
+        check_interupt();
+    }
 }
 
 void dma_si_read(void)
@@ -375,6 +383,13 @@ void dma_si_read(void)
     }
 
     update_count();
-    add_interupt_event(SI_INT, /*0x100*/0x900);
+
+    if (delay_si) {
+        add_interupt_event(SI_INT, /*0x100*/0x900);
+    } else {
+        MI_register.mi_intr_reg |= 0x02; // SI
+        si_register.si_stat |= 0x1000; // INTERRUPT
+        check_interupt();
+    }
 }
 
diff --git a/source/mupen64plus-core/src/memory/pif.c b/source/mupen64plus-core/src/memory/pif.c
index b20c4e7..9bec74f 100644
--- a/source/mupen64plus-core/src/memory/pif.c
+++ b/source/mupen64plus-core/src/memory/pif.c
@@ -492,6 +492,8 @@ void update_pif_write(void)
             }
             // calculate the proper response for the given challenge (X-Scale's algorithm)
             n64_cic_nus_6105(challenge, response, CHL_LEN - 2);
+            PIF_RAMb[46] = 0;
+            PIF_RAMb[47] = 0;
             // re-format the 'response' into a byte stream
             for (i = 0; i < 15; i++)
             {
diff --git a/source/mupen64plus-core/src/r4300/interupt.c b/source/mupen64plus-core/src/r4300/interupt.c
index f3f185c..102d8a8 100755
--- a/source/mupen64plus-core/src/r4300/interupt.c
+++ b/source/mupen64plus-core/src/r4300/interupt.c
@@ -424,9 +424,9 @@ void gen_interupt(void)
     
         case COMPARE_INT:
             remove_interupt_event();
-            Count+=2;
+            Count+=count_per_op;
             add_interupt_event_count(COMPARE_INT, Compare);
-            Count-=2;
+            Count-=count_per_op;
     
             Cause = (Cause | 0x8000) & 0xFFFFFF83;
             if ((Status & 7) != 1) return;
diff --git a/source/mupen64plus-core/src/r4300/new_dynarec/new_dynarec.c b/source/mupen64plus-core/src/r4300/new_dynarec/new_dynarec.c
index d177324..53e569f 100755
--- a/source/mupen64plus-core/src/r4300/new_dynarec/new_dynarec.c
+++ b/source/mupen64plus-core/src/r4300/new_dynarec/new_dynarec.c
@@ -49,7 +49,7 @@
 
 #define MAXBLOCK 4096
 #define MAX_OUTPUT_BLOCK_SIZE 262144
-#define CLOCK_DIVIDER 2
+#define CLOCK_DIVIDER count_per_op
 
 void *base_addr;
 
diff --git a/source/mupen64plus-core/src/r4300/r4300.c b/source/mupen64plus-core/src/r4300/r4300.c
index 7f58def..5b57ce6 100755
--- a/source/mupen64plus-core/src/r4300/r4300.c
+++ b/source/mupen64plus-core/src/r4300/r4300.c
@@ -45,6 +45,7 @@
 
 unsigned int r4300emu = 0;
 int no_compiled_jump = 0;
+unsigned int count_per_op = 2;
 int llbit, rompause;
 #if NEW_DYNAREC != NEW_DYNAREC_ARM
 int stop;
@@ -704,7 +705,7 @@ void update_count(void)
     if (r4300emu != CORE_DYNAREC)
     {
 #endif
-        Count = Count + (PC->addr - last_addr)/2;
+        Count += ((PC->addr - last_addr) >> 2) * count_per_op;
         last_addr = PC->addr;
 #ifdef NEW_DYNAREC
     }
diff --git a/source/mupen64plus-core/src/r4300/r4300.h b/source/mupen64plus-core/src/r4300/r4300.h
index a4142a1..6796903 100755
--- a/source/mupen64plus-core/src/r4300/r4300.h
+++ b/source/mupen64plus-core/src/r4300/r4300.h
@@ -46,6 +46,7 @@ extern unsigned int last_addr;
 extern char invalid_code[0x100000];
 extern unsigned int jump_to_address;
 extern int no_compiled_jump;
+extern unsigned int count_per_op;
 
 void init_blocks(void);
 void free_blocks(void);
diff --git a/source/mupen64plus-core/src/r4300/x86/gr4300.c b/source/mupen64plus-core/src/r4300/x86/gr4300.c
index d2ed999..830d4c3 100644
--- a/source/mupen64plus-core/src/r4300/x86/gr4300.c
+++ b/source/mupen64plus-core/src/r4300/x86/gr4300.c
@@ -47,22 +47,18 @@ int branch_taken;
 
 static void genupdate_count(unsigned int addr)
 {
-#ifndef COMPARE_CORE
-#ifndef DBG
+#if !defined(COMPARE_CORE) && !defined(DBG)
    mov_reg32_imm32(EAX, addr);
    sub_reg32_m32(EAX, (unsigned int*)(&last_addr));
-   shr_reg32_imm8(EAX, 1);
+   shr_reg32_imm8(EAX, 2);
+   mov_reg32_m32(EDX, &count_per_op);
+   mul_reg32(EDX);
    add_m32_reg32((unsigned int*)(&Count), EAX);
 #else
    mov_m32_imm32((unsigned int*)(&PC), (unsigned int)(dst+1));
    mov_reg32_imm32(EAX, (unsigned int)update_count);
    call_reg32(EAX);
 #endif
-#else
-   mov_m32_imm32((unsigned int*)(&PC), (unsigned int)(dst+1));
-   mov_reg32_imm32(EAX, (unsigned int)update_count);
-   call_reg32(EAX);
-#endif
 }
 
 static void gencheck_interupt(unsigned int instr_structure)
diff --git a/source/mupen64plus-core/src/r4300/x86_64/gr4300.c b/source/mupen64plus-core/src/r4300/x86_64/gr4300.c
index 15db43f..41d836a 100644
--- a/source/mupen64plus-core/src/r4300/x86_64/gr4300.c
+++ b/source/mupen64plus-core/src/r4300/x86_64/gr4300.c
@@ -90,7 +90,9 @@ static void genupdate_count(unsigned int addr)
 #if !defined(COMPARE_CORE) && !defined(DBG)
    mov_reg32_imm32(EAX, addr);
    sub_xreg32_m32rel(EAX, (unsigned int*)(&last_addr));
-   shr_reg32_imm8(EAX, 1);
+   shr_reg32_imm8(EAX, 2);
+   mov_xreg32_m32rel(EDX, (void*)&count_per_op);
+   mul_reg32(EDX);
    add_m32rel_xreg32((unsigned int*)(&Count), EAX);
 #else
    mov_reg64_imm64(RAX, (unsigned long long) (dst+1));
diff --git a/source/mupen64plus-rsp-hle/projects/unix/Makefile b/source/mupen64plus-rsp-hle/projects/unix/Makefile
index cafd807..03a1aaa 100755
--- a/source/mupen64plus-rsp-hle/projects/unix/Makefile
+++ b/source/mupen64plus-rsp-hle/projects/unix/Makefile
@@ -150,23 +150,15 @@ ifeq ($(OS), LINUX)
   LDFLAGS += -Wl,-version-script,$(SRCDIR)/rsp_api_export.ver
 endif
 ifeq ($(OS), OSX)
-  # Select the proper SDK
-  # Also, SDKs are stored in a different location since XCode 4.3
-  OSX_SDK ?= $(shell sw_vers -productVersion | cut -f1 -f2 -d .)
-  OSX_XCODEMAJ = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f1 -d .)
-  OSX_XCODEMIN = $(shell xcodebuild -version | grep '[0-9]*\.[0-9]*' | cut -f2 -d ' ' | cut -f2 -d .)
-  OSX_XCODEGE43 = $(shell echo "`expr $(OSX_XCODEMAJ) \>= 4``expr $(OSX_XCODEMIN) \>= 3`")
-  ifeq ($(OSX_XCODEGE43), 11)
-    OSX_SYSROOT := /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
-  else
-    OSX_SYSROOT := /Developer/SDKs
-  endif
+  # xcode-select has been around since Xcode 3.0 (OS X 10.5); ':=' runs each shell call once at parse time, not on every expansion
+  OSX_SDK_ROOT := $(shell xcode-select -print-path)/Platforms/MacOSX.platform/Developer/SDKs
+  OSX_SDK_PATH := $(OSX_SDK_ROOT)/$(shell ls $(OSX_SDK_ROOT) | tail -1)
 
   ifeq ($(CPU), X86)
     ifeq ($(ARCH_DETECTED), 64BITS)
-      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -arch x86_64 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
     else
-      CFLAGS += -pipe -mmmx -msse -fomit-frame-pointer -arch i686 -mmacosx-version-min=$(OSX_SDK) -isysroot $(OSX_SYSROOT)/MacOSX$(OSX_SDK).sdk
+      CFLAGS += -pipe -mmmx -msse -fomit-frame-pointer -arch i686 -mmacosx-version-min=10.5 -isysroot $(OSX_SDK_PATH)
       LDFLAGS += -read_only_relocs suppress
     endif
   endif
@@ -296,7 +288,7 @@ clean:
 rebuild: clean all
 
 # build dependency files
-CFLAGS += -MD
+CFLAGS += -MD -MP
 -include $(OBJECTS:.o=.d)
 
 CXXFLAGS += $(CFLAGS)
diff --git a/source/mupen64plus-rsp-hle/src/jpeg.c b/source/mupen64plus-rsp-hle/src/jpeg.c
index 28fcc8b..3db2e89 100755
--- a/source/mupen64plus-rsp-hle/src/jpeg.c
+++ b/source/mupen64plus-rsp-hle/src/jpeg.c
@@ -33,7 +33,7 @@
 #define SUBBLOCK_SIZE 64
 
 typedef void (*tile_line_emitter_t)(const int16_t *y, const int16_t *u, uint32_t address);
-typedef void (*std_macroblock_decoder_t)(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
+typedef void (*subblock_transform_t)(int16_t* dst, const int16_t* src);
 
 /* rdram operations */
 // FIXME: these functions deserve their own module
@@ -43,7 +43,10 @@ static uint32_t rdram_read_u32(uint32_t address);
 static void rdram_write_many_u32(const uint32_t *src, uint32_t address, unsigned int count);
 
 /* standard jpeg ucode decoder */
-static void jpeg_decode_std(const char * const version, const std_macroblock_decoder_t decode_mb, const tile_line_emitter_t emit_line);
+static void jpeg_decode_std(const char * const version,
+        const subblock_transform_t transform_luma,
+        const subblock_transform_t transform_chroma,
+        const tile_line_emitter_t emit_line);
 
 /* helper functions */
 static uint8_t clamp_u8(int16_t x);
@@ -60,9 +63,11 @@ static void EmitYUVTileLine(const int16_t *y, const int16_t *u, uint32_t address
 static void EmitRGBATileLine(const int16_t *y, const int16_t *u, uint32_t address);
 
 /* macroblocks operations */
-static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
-static void DecodeMacroblock2(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
-static void DecodeMacroblock3(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
+static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
+static void decode_macroblock_std(
+        const subblock_transform_t transform_luma,
+        const subblock_transform_t transform_chroma,
+        int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE]);
 static void EmitTilesMode0(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
 static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
 
@@ -145,7 +150,7 @@ static const float IDCT_K[10] =
  **************************************************************************/
 void jpeg_decode_PS0()
 {
-    jpeg_decode_std("PS0", DecodeMacroblock3, EmitYUVTileLine);
+    jpeg_decode_std("PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
 }
 
 /***************************************************************************
@@ -154,7 +159,7 @@ void jpeg_decode_PS0()
  **************************************************************************/
 void jpeg_decode_PS()
 {
-    jpeg_decode_std("PS", DecodeMacroblock2, EmitRGBATileLine);
+    jpeg_decode_std("PS", NULL, NULL, EmitRGBATileLine);
 }
 
 /***************************************************************************
@@ -197,7 +202,7 @@ void jpeg_decode_OB()
         int16_t macroblock[6*SUBBLOCK_SIZE];
 
         rdram_read_many_u16((uint16_t*)macroblock, address, 6*SUBBLOCK_SIZE);
-        DecodeMacroblock1(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
+        decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
         EmitTilesMode2(EmitYUVTileLine, macroblock, address);
 
         address += (2*6*SUBBLOCK_SIZE);
@@ -206,7 +211,10 @@ void jpeg_decode_OB()
 
 
 /* local functions */
-static void jpeg_decode_std(const char * const version, const std_macroblock_decoder_t decode_mb, const tile_line_emitter_t emit_line)
+static void jpeg_decode_std(const char * const version,
+        const subblock_transform_t transform_luma,
+        const subblock_transform_t transform_chroma,
+        const tile_line_emitter_t emit_line)
 {
     int16_t qtables[3][SUBBLOCK_SIZE];
     unsigned int mb;
@@ -218,7 +226,7 @@ static void jpeg_decode_std(const char * const version, const std_macroblock_dec
     uint32_t qtableV_ptr;
     unsigned int subblock_count;
     unsigned int macroblock_size;
-    int16_t *macroblock;
+    int16_t macroblock[6*SUBBLOCK_SIZE]; /* macroblock contains at most 6 subblocks */
     const OSTask_t * const task = get_task();
 
     if (task->flags & 0x1)
@@ -250,23 +258,17 @@ static void jpeg_decode_std(const char * const version, const std_macroblock_dec
     }
     
     subblock_count = mode + 4;
-    macroblock_size = 2*subblock_count*SUBBLOCK_SIZE;
+    macroblock_size = subblock_count*SUBBLOCK_SIZE;
 
     rdram_read_many_u16((uint16_t*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
     rdram_read_many_u16((uint16_t*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
     rdram_read_many_u16((uint16_t*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
 
-    macroblock = malloc(sizeof(*macroblock) * macroblock_size);
-    if (!macroblock)
-    {
-        DebugMessage(M64MSG_WARNING, "jpeg_decode_%s: could not allocate macroblock", version);
-        return;
-    }
-
     for (mb = 0; mb < macroblock_count; ++mb)
     {
-        rdram_read_many_u16((uint16_t*)macroblock, address, macroblock_size >> 1);
-        decode_mb(macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
+        rdram_read_many_u16((uint16_t*)macroblock, address, macroblock_size);
+        decode_macroblock_std(transform_luma, transform_chroma,
+                macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
 
         if (mode == 0)
         {
@@ -277,9 +279,8 @@ static void jpeg_decode_std(const char * const version, const std_macroblock_dec
             EmitTilesMode2(emit_line, macroblock, address);
         }
 
-        address += macroblock_size;
+        address += 2*macroblock_size;
     }
-    free(macroblock);
 }
 
 static uint8_t clamp_u8(int16_t x)
@@ -407,7 +408,7 @@ static void EmitTilesMode2(const tile_line_emitter_t emit_line, const int16_t *m
     }
 }
 
-static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
+static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
 {
     int sb;
 
@@ -434,28 +435,10 @@ static void DecodeMacroblock1(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc,
     }
 }
 
-static void DecodeMacroblock2(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
-{
-    unsigned int sb;
-    unsigned int q = 0;
-
-    for (sb = 0; sb < subblock_count; ++sb)
-    {
-        int16_t tmp_sb[SUBBLOCK_SIZE];
-        const int isChromaSubBlock = (subblock_count - sb <= 2);
-
-        if (isChromaSubBlock) { ++q; }
-
-        MultSubBlocks(macroblock, macroblock, qtables[q], 4);
-        ZigZagSubBlock(tmp_sb, macroblock);
-        InverseDCTSubBlock(macroblock, tmp_sb);
-
-        macroblock += SUBBLOCK_SIZE;
-    }
-
-}
-
-static void DecodeMacroblock3(int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
+static void decode_macroblock_std(
+        const subblock_transform_t transform_luma,
+        const subblock_transform_t transform_chroma,
+        int16_t *macroblock, unsigned int subblock_count, const int16_t qtables[3][SUBBLOCK_SIZE])
 {
     unsigned int sb;
     unsigned int q = 0;
@@ -473,11 +456,13 @@ static void DecodeMacroblock3(int16_t *macroblock, unsigned int subblock_count,
 
         if (isChromaSubBlock)
         {
-            RescaleUVSubBlock(macroblock, macroblock);
+            if (transform_chroma != NULL)
+                transform_chroma(macroblock, macroblock);
         }
         else
         {
-            RescaleYSubBlock(macroblock, macroblock);
+            if (transform_luma != NULL)
+                transform_luma(macroblock, macroblock);
         }
 
         macroblock += SUBBLOCK_SIZE;
diff --git a/source/mupen64plus-rsp-hle/src/main.c b/source/mupen64plus-rsp-hle/src/main.c
index ff6525a..396321f 100644
--- a/source/mupen64plus-rsp-hle/src/main.c
+++ b/source/mupen64plus-rsp-hle/src/main.c
@@ -226,7 +226,9 @@ static void normal_task_dispatching()
         case 0x2caa6: jpeg_decode_PS(); return;
 
         /* JPEG: found in Ogre Battle, Bottom of the 9th */
-        case 0x130de: jpeg_decode_OB(); return;
+        case 0x130de:
+        case 0x278b0:
+            jpeg_decode_OB(); return;
     }
 
     handle_unknown_task(sum);
diff --git a/source/rice_gles/INSTALL b/source/rice_gles/INSTALL
new file mode 100644
index 0000000..e0bd5c3
--- /dev/null
+++ b/source/rice_gles/INSTALL
@@ -0,0 +1,26 @@
+Mupen64Plus-Video-Rice INSTALL
+------------------------------
+
+This text file was written to explain the installation process of the
+Mupen64Plus-Video-Rice module.
+
+If this module is part of a Mupen64Plus source code bundle, the user should run
+the "m64p_install.sh" script in the root of the unzipped bundle to install all
+of the included modules in the bundle.
+
+If this module is a standalone source code release, you should build the library
+from source code and install it via the makefile, like this:
+
+$ cd projects/unix
+$ make all
+$ sudo make install
+
+If you want to build the Mupen64Plus-Video-Rice module for installation in a
+home folder for a single user, you may build it like this (replacing
+<my-folder> with your desired local installation path):
+
+$ cd projects/unix
+$ make all
+$ make install LIBDIR=<my-folder> SHAREDIR=<my-folder>
+
+
diff --git a/source/rice_gles/projects/unix/Makefile b/source/rice_gles/projects/unix/Makefile
index 50245aa..466f183 100755
--- a/source/rice_gles/projects/unix/Makefile
+++ b/source/rice_gles/projects/unix/Makefile
@@ -124,7 +124,8 @@ ifeq ("$(CPU)","NONE")
 endif
 
 # base CFLAGS, LDLIBS, and LDFLAGS
-OPTFLAGS ?= -O3 -flto -fuse-linker-plugin
+OPTFLAGS ?= -O3 
+# disabled by default: -flto -fuse-linker-plugin
 WARNFLAGS ?= -Wall
 CFLAGS += $(OPTFLAGS) $(WARNFLAGS) -ffast-math -fno-strict-aliasing -fvisibility=hidden -I../../src
 CXXFLAGS += -fvisibility-inlines-hidden