From fe18df39aa9dc0a3afc5a7474a27698672eceeca Mon Sep 17 00:00:00 2001
From: notaz <notasas@gmail.com>
Date: Sat, 25 Apr 2015 23:40:02 +0300
Subject: [PATCH] translate: float: stack mode and some more ops

---
 tests/Makefile       |   2 +-
 tests/x87.asm        |   7 +-
 tests/x87.expect.c   |  17 +--
 tests/x87_f.asm      |  24 ++++
 tests/x87_f.expect.c |  17 +++
 tests/x87_f.seed.h   |   0
 tests/x87_s.asm      |  37 +++++
 tests/x87_s.expect.c |  30 ++++
 tests/x87_s.seed.h   |   0
 tools/translate.c    | 325 ++++++++++++++++++++++++++++++++-----------
 10 files changed, 366 insertions(+), 93 deletions(-)
 create mode 100644 tests/x87_f.asm
 create mode 100644 tests/x87_f.expect.c
 create mode 100644 tests/x87_f.seed.h
 create mode 100644 tests/x87_s.asm
 create mode 100644 tests/x87_s.expect.c
 create mode 100644 tests/x87_s.seed.h

diff --git a/tests/Makefile b/tests/Makefile
index 16b404d..81d5d99 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,6 +1,6 @@
 
 TESTS = reg_call1 reg_call2 reg_call3 reg_call_tail reg_save \
-	varargs ops x87
+	varargs ops x87 x87_f x87_s
 
 all: $(addsuffix .ok,$(TESTS))
 
diff --git a/tests/x87.asm b/tests/x87.asm
index 240730f..e6b40e5 100644
--- a/tests/x87.asm
+++ b/tests/x87.asm
@@ -17,8 +17,13 @@ arg_0           = dword ptr  8
                 fdiv    st(1), st
                 faddp   st(1), st
                 fld1
-                fdivrp  [ebp+var_18]
+                fdivr   [ebp+var_18]
+                fxch    st(1)
+                fchs
+                fsubrp  st, st
                 fld     st(1)
+                fyl2x
+                fld     st
                 fstp    [ebp+var_18]
                 fst     [ebp+var_20]
                 call    __ftol
diff --git a/tests/x87.expect.c b/tests/x87.expect.c
index d114f10..a5d0e70 100644
--- a/tests/x87.expect.c
+++ b/tests/x87.expect.c
@@ -9,17 +9,18 @@ int sub_test(int a1, int a2)
   f_st0 = (double)(s32)sf.d[0];  // var_20 fild
   f_st0 /= (double)(s32)a1;  // arg_0
   f_st0 *= *((double *)(u32)&sf.q[1]);  // var_18
-  f_st1 = f_st0;
-  f_st0 = (double)(s32)sf.d[0];  // var_20 fild
+  f_st1 = f_st0;  f_st0 = (double)(s32)sf.d[0];  // var_20 fild
   f_st1 /= f_st0;
   f_st0 = f_st1 + f_st0;
-  f_st1 = f_st0;
-  f_st0 = 1.0;
+  f_st1 = f_st0;  f_st0 = 1.0;
   f_st0 = *((double *)(u32)&sf.q[1]) / f_st0;  // var_18
-  f_st1 = f_st0;
-  f_st0 = f_st1;  // fld
-  *((double *)(u32)&sf.q[1]) = f_st0;
-  f_st0 = f_st1;  // var_18 fst
+  { double t = f_st0; f_st0 = f_st1; f_st1 = t; }  // fxch
+  f_st0 = -f_st0;
+  f_st0 = f_st1;
+  f_st1 = f_st0;  f_st0 = f_st1;  // fld
+  f_st0 = f_st1 * log2(f_st0);
+  f_st1 = f_st0;  // fld st
+  *((double *)(u32)&sf.q[1]) = f_st0;  f_st0 = f_st1;  // var_18 fst
   *((float *)(u32)&sf.d[0]) = f_st0;  // var_20 fst
   eax = (s32)f_st0;  // ftol
   return eax;
diff --git a/tests/x87_f.asm b/tests/x87_f.asm
new file mode 100644
index 0000000..06a9118
--- /dev/null
+++ b/tests/x87_f.asm
@@ -0,0 +1,24 @@
+
+_text           segment para public 'CODE' use32
+
+sub_test        proc near
+
+var_4           = dword ptr -4
+
+                push    ebp
+                mov     ebp, esp
+                sub     esp, 4
+                mov     [ebp+var_4], 4
+                fild    [ebp+var_4]
+                fild    [ebp+var_4]
+                fsqrt
+                fpatan
+                call    __ftol
+                mov     esp, ebp
+                pop     ebp
+                retn
+sub_test        endp
+
+_text           ends
+
+; vim:expandtab
diff --git a/tests/x87_f.expect.c b/tests/x87_f.expect.c
new file mode 100644
index 0000000..b943c96
--- /dev/null
+++ b/tests/x87_f.expect.c
@@ -0,0 +1,17 @@
+int sub_test()
+{
+  union { u32 d[1]; u8 b[4]; } sf;
+  u32 eax;
+  u32 edx;
+  float f_st0;
+  float f_st1;
+
+  sf.d[0] = 4;  // var_4
+  f_st0 = (float)(s32)sf.d[0];  // var_4 fild
+  f_st1 = f_st0;  f_st0 = (float)(s32)sf.d[0];  // var_4 fild
+  f_st0 = sqrtf(f_st0);
+  f_st0 = atanf(f_st1 / f_st0);
+  eax = (s32)f_st0;  // ftol
+  return eax;
+}
+
diff --git a/tests/x87_f.seed.h b/tests/x87_f.seed.h
new file mode 100644
index 0000000..e69de29
diff --git a/tests/x87_s.asm b/tests/x87_s.asm
new file mode 100644
index 0000000..9b86871
--- /dev/null
+++ b/tests/x87_s.asm
@@ -0,0 +1,37 @@
+
+_text           segment para public 'CODE' use32
+
+sub_test        proc near
+
+var_4           = dword ptr -4
+
+                push    ebp
+                mov     ebp, esp
+                sub     esp, 4
+                mov     [ebp+var_4], 4
+                fild    [ebp+var_4]
+                fld     [ebp+var_4]
+                fild    [ebp+var_4]
+                fld1
+                fild    [ebp+var_4]
+                fldz
+                fldln2
+                fild    [ebp+var_4]
+                faddp   st, st(7)
+                fdivp   st(5), st
+                fyl2x
+                fsubp   st(2), st
+                fsubrp  st, st
+                fxch    st(6)
+                fchs
+                fpatan
+                fstp    [ebp+var_4]
+                call    __ftol
+                mov     esp, ebp
+                pop     ebp
+                retn
+sub_test        endp
+
+_text           ends
+
+; vim:expandtab
diff --git a/tests/x87_s.expect.c b/tests/x87_s.expect.c
new file mode 100644
index 0000000..6b01b42
--- /dev/null
+++ b/tests/x87_s.expect.c
@@ -0,0 +1,30 @@
+int sub_test()
+{
+  union { u32 d[1]; u8 b[4]; double q[1]; } sf;
+  u32 eax;
+  u32 edx;
+  float f_st[8];
+  int f_stp = 0;
+
+  sf.d[0] = 4;  // var_4
+  f_st[--f_stp & 7] = (float)(s32)sf.d[0];  // var_4 fild
+  f_st[--f_stp & 7] = *((float *)(u32)&sf.d[0]);  // var_4 fld
+  f_st[--f_stp & 7] = (float)(s32)sf.d[0];  // var_4 fild
+  f_st[--f_stp & 7] = 1.0;
+  f_st[--f_stp & 7] = (float)(s32)sf.d[0];  // var_4 fild
+  f_st[--f_stp & 7] = 0.0;
+  f_st[--f_stp & 7] = 0.693147180559945;
+  f_st[--f_stp & 7] = (float)(s32)sf.d[0];  // var_4 fild
+  f_stp++;
+  f_st[(f_stp + 5) & 7] /= f_st[f_stp & 7];  f_stp++;
+  f_st[(f_stp + 1) & 7] = f_st[(f_stp + 1) & 7] * log2f(f_st[f_stp & 7]); f_stp++;
+  f_st[(f_stp + 2) & 7] -= f_st[f_stp & 7];  f_stp++;
+  f_stp++;
+  { float t = f_st[f_stp & 7]; f_st[f_stp & 7] = f_st[(f_stp + 6) & 7]; f_st[(f_stp + 6) & 7] = t; }  // fxch
+  f_st[f_stp & 7] = -f_st[f_stp & 7];
+  f_st[(f_stp + 1) & 7] = atanf(f_st[(f_stp + 1) & 7] / f_st[f_stp & 7]); f_stp++;
+  *((float *)(u32)&sf.d[0]) = f_st[f_stp & 7];  f_stp++;  // var_4 fst
+  eax = (s32)f_st[f_stp & 7]; f_stp++;  // ftol
+  return eax;
+}
+
diff --git a/tests/x87_s.seed.h b/tests/x87_s.seed.h
new file mode 100644
index 0000000..e69de29
diff --git a/tools/translate.c b/tools/translate.c
index a7f629e..2239106 100644
--- a/tools/translate.c
+++ b/tools/translate.c
@@ -136,11 +136,14 @@ enum op_op {
   OP_FISUB,
   OP_FIDIVR,
   OP_FISUBR,
+  OP_FCHS,
   OP_FCOS,
   OP_FPATAN,
   OP_FPTAN,
   OP_FSIN,
   OP_FSQRT,
+  OP_FXCH,
+  OP_FYL2X,
   // mmx
   OP_EMMS,
   // pseudo-ops for lib calls
@@ -337,6 +340,9 @@ enum x86_regs {
 #define mxDX     (1 << xDX)
 #define mxST0    (1 << xST0)
 #define mxST1    (1 << xST1)
+#define mxST1_0  (mxST1 | mxST0)
+#define mxST7_2  (0xfc << xST0)
+#define mxSTa    (0xff << xST0)
 
 // possible basic comparison types (without inversion)
 enum parsed_flag_op {
@@ -1029,6 +1035,7 @@ static const struct {
   { "fld",    OP_FLD,    1, 1, OPF_FPUSH },
   { "fild",   OP_FILD,   1, 1, OPF_FPUSH },
   { "fld1",   OP_FLDc,   0, 0, OPF_FPUSH },
+  { "fldln2", OP_FLDc,   0, 0, OPF_FPUSH },
   { "fldz",   OP_FLDc,   0, 0, OPF_FPUSH },
   { "fstp",   OP_FST,    1, 1, OPF_FPOP },
   { "fst",    OP_FST,    1, 1, 0 },
@@ -1050,11 +1057,14 @@ static const struct {
   { "fisub",  OP_FISUB,  1, 1, 0 },
   { "fidivr", OP_FIDIVR, 1, 1, 0 },
   { "fisubr", OP_FISUBR, 1, 1, 0 },
+  { "fchs",   OP_FCHS,   0, 0, 0 },
   { "fcos",   OP_FCOS,   0, 0, 0 },
   { "fpatan", OP_FPATAN, 0, 0, OPF_FPOP },
   { "fptan",  OP_FPTAN,  0, 0, OPF_FPUSH },
   { "fsin",   OP_FSIN,   0, 0, 0 },
   { "fsqrt",  OP_FSQRT,  0, 0, 0 },
+  { "fxch",   OP_FXCH,   1, 1, 0 },
+  { "fyl2x",  OP_FYL2X,  0, 0, OPF_FPOP },
   // mmx
   { "emms",   OP_EMMS,   0, 0, OPF_DATA },
   { "movq",   OP_MOV,    2, 2, OPF_DATA },
@@ -1345,6 +1355,8 @@ static void parse_op(struct parsed_op *op, char words[16][256], int wordc)
     op->regmask_dst |= mxST0;
     if      (IS(words[op_w] + 3, "1"))
       op->operand[0].val = X87_CONST_1;
+    else if (IS(words[op_w] + 3, "ln2"))
+      op->operand[0].val = X87_CONST_LN2;
     else if (IS(words[op_w] + 3, "z"))
       op->operand[0].val = X87_CONST_Z;
     else
@@ -1382,14 +1394,17 @@ static void parse_op(struct parsed_op *op, char words[16][256], int wordc)
   case OP_FISUB:
   case OP_FIDIVR:
   case OP_FISUBR:
+  case OP_FCHS:
   case OP_FCOS:
   case OP_FSIN:
   case OP_FSQRT:
+  case OP_FXCH:
     op->regmask_src |= mxST0;
     op->regmask_dst |= mxST0;
     break;
 
   case OP_FPATAN:
+  case OP_FYL2X:
     op->regmask_src |= mxST0 | mxST1;
     op->regmask_dst |= mxST0;
     break;
@@ -2190,7 +2205,7 @@ static char *out_src_opr_u32(char *buf, size_t buf_size,
 }
 
 static char *out_src_opr_float(char *buf, size_t buf_size,
-  struct parsed_op *po, struct parsed_opr *popr)
+  struct parsed_op *po, struct parsed_opr *popr, int need_float_stack)
 {
   const char *cast = NULL;
   char tmp[256];
@@ -2200,7 +2215,15 @@ static char *out_src_opr_float(char *buf, size_t buf_size,
     if (popr->reg < xST0 || popr->reg > xST7)
       ferr(po, "bad reg: %d\n", popr->reg);
 
-    snprintf(buf, buf_size, "f_st%d", popr->reg - xST0);
+    if (need_float_stack) {
+      if (popr->reg == xST0)
+        snprintf(buf, buf_size, "f_st[f_stp & 7]");
+      else
+        snprintf(buf, buf_size, "f_st[(f_stp + %d) & 7]",
+          popr->reg - xST0);
+    }
+    else
+      snprintf(buf, buf_size, "f_st%d", popr->reg - xST0);
     break;
 
   case OPT_REGMEM:
@@ -2229,10 +2252,10 @@ static char *out_src_opr_float(char *buf, size_t buf_size,
 }
 
 static char *out_dst_opr_float(char *buf, size_t buf_size,
-  struct parsed_op *po, struct parsed_opr *popr)
+  struct parsed_op *po, struct parsed_opr *popr, int need_float_stack)
 {
   // same?
-  return out_src_opr_float(buf, buf_size, po, popr);
+  return out_src_opr_float(buf, buf_size, po, popr, need_float_stack);
 }
 
 static void out_test_for_cc(char *buf, size_t buf_size,
@@ -3803,11 +3826,11 @@ static void scan_prologue_epilogue(int opcnt)
   {
     g_sp_frame = 1;
 
-    i++;
     do {
       for (; i < opcnt; i++)
         if (ops[i].flags & OPF_TAIL)
           break;
+
       j = i - 1;
       if (i == opcnt && (ops[j].flags & OPF_JMP)) {
         if (ops[j].bt_i != -1 || ops[j].btj != NULL)
@@ -4663,7 +4686,6 @@ static void reg_use_pass(int i, int opcnt, unsigned char *cbits,
   int *regmask_init, int regmask_arg)
 {
   struct parsed_op *po;
-  unsigned int mask;
   int already_saved;
   int regmask_new;
   int regmask_op;
@@ -4763,6 +4785,10 @@ static void reg_use_pass(int i, int opcnt, unsigned char *cbits,
             po->regmask_dst &= ~(1 << xAX);
         }
       }
+
+      // not "full stack" mode and have something in stack
+      if (!(regmask_now & mxST7_2) && (regmask_now & mxST1_0))
+        ferr(po, "float stack is not empty on func call\n");
     }
 
     if (po->flags & OPF_NOREGS)
@@ -4770,11 +4796,13 @@ static void reg_use_pass(int i, int opcnt, unsigned char *cbits,
 
     if (po->flags & OPF_FPUSH) {
       if (regmask_now & mxST1)
-        ferr(po, "TODO: FPUSH on active ST1\n");
-      if (regmask_now & mxST0)
+        regmask_now |= mxSTa; // switch to "full stack" mode
+      if (regmask_now & mxSTa)
         po->flags |= OPF_FSHIFT;
-      mask = mxST0 | mxST1;
-      regmask_now = (regmask_now & ~mask) | ((regmask_now & mxST0) << 1);
+      if (!(regmask_now & mxST7_2)) {
+        regmask_now =
+          (regmask_now & ~mxST1_0) | ((regmask_now & mxST0) << 1);
+      }
     }
 
     // if incomplete register is used, clear it on init to avoid
@@ -4814,16 +4842,18 @@ static void reg_use_pass(int i, int opcnt, unsigned char *cbits,
 
     // released regs
     if (po->flags & OPF_FPOP) {
-      mask = mxST0 | mxST1;
-      if (!(regmask_now & mask))
+      if ((regmask_now & mxSTa) == 0)
         ferr(po, "float pop on empty stack?\n");
-      if (regmask_now & mxST1)
+      if (regmask_now & (mxST7_2 | mxST1))
         po->flags |= OPF_FSHIFT;
-      regmask_now = (regmask_now & ~mask) | ((regmask_now & mxST1) >> 1);
+      if (!(regmask_now & mxST7_2)) {
+        regmask_now =
+          (regmask_now & ~mxST1_0) | ((regmask_now & mxST1) >> 1);
+      }
     }
 
     if (po->flags & OPF_TAIL) {
-      if (regmask_now & (mxST0 | mxST1))
+      if (!(regmask_now & mxST7_2) && (regmask_now & mxST1_0))
         ferr(po, "float regs on tail: %x\n", regmask_now);
 
       // there is support for "conditional tailcall", sort of
@@ -4954,9 +4984,12 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
   int save_arg_vars[MAX_ARG_GRP] = { 0, };
   unsigned char cbits[MAX_OPS / 8];
   const char *float_type;
-  int cond_vars = 0;
+  const char *float_st0;
+  const char *float_st1;
+  int need_float_stack = 0;
   int need_tmp_var = 0;
   int need_tmp64 = 0;
+  int cond_vars = 0;
   int had_decl = 0;
   int label_pending = 0;
   int need_double = 0;
@@ -4969,6 +5002,7 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
   int regmask = 0;      // used regs
   int pfomask = 0;
   int found = 0;
+  int dead_dst;
   int no_output;
   int i, j, l;
   int arg;
@@ -5284,6 +5318,9 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
   }
 
   float_type = need_double ? "double" : "float";
+  need_float_stack = !!(regmask & mxST7_2);
+  float_st0 = need_float_stack ? "f_st[f_stp & 7]" : "f_st0";
+  float_st1 = need_float_stack ? "f_st[(f_stp + 1) & 7]" : "f_st1";
 
   // output starts here
 
@@ -5441,14 +5478,21 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
     }
   }
   // ... x87
-  if (regmask_now & 0xff0000) {
-    for (reg = 16; reg < 24; reg++) {
-      if (regmask_now & (1 << reg)) {
-        fprintf(fout, "  %s f_st%d", float_type, reg - 16);
-        if (regmask_init & (1 << reg))
-          fprintf(fout, " = 0");
-        fprintf(fout, ";\n");
-        had_decl = 1;
+  if (need_float_stack) {
+    fprintf(fout, "  %s f_st[8];\n", float_type);
+    fprintf(fout, "  int f_stp = 0;\n");
+    had_decl = 1;
+  }
+  else {
+    if (regmask_now & 0xff0000) {
+      for (reg = 16; reg < 24; reg++) {
+        if (regmask_now & (1 << reg)) {
+          fprintf(fout, "  %s f_st%d", float_type, reg - 16);
+          if (regmask_init & (1 << reg))
+            fprintf(fout, " = 0");
+          fprintf(fout, ";\n");
+          had_decl = 1;
+        }
       }
     }
   }
@@ -6350,7 +6394,11 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
               fprintf(fout, "(u32)");
           }
           else if (po->regmask_dst & mxST0) {
-            fprintf(fout, "f_st0 = ");
+            ferr_assert(po, po->flags & OPF_FPUSH);
+            if (need_float_stack)
+              fprintf(fout, "f_st[--f_stp & 7] = ");
+            else
+              fprintf(fout, "f_st0 = ");
           }
         }
 
@@ -6571,57 +6619,97 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
 
       // x87
       case OP_FLD:
-        if (po->flags & OPF_FSHIFT)
-          fprintf(fout, "  f_st1 = f_st0;\n");
-        if (po->operand[0].type == OPT_REG
-          && po->operand[0].reg == xST0)
-        {
-          strcat(g_comment, " fld st");
-          break;
+        if (need_float_stack) {
+          out_src_opr_float(buf1, sizeof(buf1),
+            po, &po->operand[0], 1);
+          if (po->regmask_src & mxSTa) {
+            fprintf(fout, "  f_st[(f_stp - 1) & 7] = %s; f_stp--;",
+              buf1);
+          }
+          else
+            fprintf(fout, "  f_st[--f_stp & 7] = %s;", buf1);
+        }
+        else {
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_st1 = f_st0;");
+          if (po->operand[0].type == OPT_REG
+            && po->operand[0].reg == xST0)
+          {
+            strcat(g_comment, " fld st");
+            break;
+          }
+          fprintf(fout, "  f_st0 = %s;",
+            out_src_opr_float(buf1, sizeof(buf1),
+              po, &po->operand[0], 0));
         }
-        fprintf(fout, "  f_st0 = %s;",
-          out_src_opr_float(buf1, sizeof(buf1), po, &po->operand[0]));
         strcat(g_comment, " fld");
         break;
 
       case OP_FILD:
-        if (po->flags & OPF_FSHIFT)
-          fprintf(fout, "  f_st1 = f_st0;\n");
-        fprintf(fout, "  f_st0 = (%s)%s;", float_type,
-          out_src_opr(buf1, sizeof(buf1), po, &po->operand[0],
-            lmod_cast(po, po->operand[0].lmod, 1), 0));
+        out_src_opr(buf1, sizeof(buf1), po, &po->operand[0],
+          lmod_cast(po, po->operand[0].lmod, 1), 0);
+        snprintf(buf2, sizeof(buf2), "(%s)%s", float_type, buf1);
+        if (need_float_stack) {
+          fprintf(fout, "  f_st[--f_stp & 7] = %s;", buf2);
+        }
+        else {
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_st1 = f_st0;");
+          fprintf(fout, "  f_st0 = %s;", buf2);
+        }
         strcat(g_comment, " fild");
         break;
 
       case OP_FLDc:
-        if (po->flags & OPF_FSHIFT)
-          fprintf(fout, "  f_st1 = f_st0;\n");
-        fprintf(fout, "  f_st0 = ");
+        if (need_float_stack)
+          fprintf(fout, "  f_st[--f_stp & 7] = ");
+        else {
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_st1 = f_st0;");
+          fprintf(fout, "  f_st0 = ");
+        }
         switch (po->operand[0].val) {
-        case X87_CONST_1: fprintf(fout, "1.0;"); break;
-        case X87_CONST_Z: fprintf(fout, "0.0;"); break;
+        case X87_CONST_1:   fprintf(fout, "1.0;"); break;
+        case X87_CONST_LN2: fprintf(fout, "0.693147180559945;"); break;
+        case X87_CONST_Z:   fprintf(fout, "0.0;"); break;
         default: ferr(po, "TODO\n"); break;
         }
         break;
 
       case OP_FST:
-        if ((po->flags & OPF_FPOP) && po->operand[0].type == OPT_REG
-          && po->operand[0].reg == xST0)
-        {
-          no_output = 1;
-          break;
+        out_dst_opr_float(buf1, sizeof(buf1), po, &po->operand[0],
+          need_float_stack);
+        dead_dst = po->operand[0].type == OPT_REG
+          && po->operand[0].reg == xST0;
+        if (need_float_stack) {
+          if (!dead_dst)
+            fprintf(fout, "  %s = f_st[f_stp & 7];", buf1);
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_stp++;");
+        }
+        else {
+          if (!dead_dst)
+            fprintf(fout, "  %s = f_st0;", buf1);
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_st0 = f_st1;");
         }
-        fprintf(fout, "  %s = f_st0;",
-          out_dst_opr_float(buf1, sizeof(buf1), po, &po->operand[0]));
-        if (po->flags & OPF_FSHIFT)
-          fprintf(fout, "\n  f_st0 = f_st1;");
-        strcat(g_comment, " fst");
+        if (dead_dst && !(po->flags & OPF_FSHIFT))
+          no_output = 1;
+        else
+          strcat(g_comment, " fst");
         break;
 
       case OP_FADD:
       case OP_FDIV:
       case OP_FMUL:
       case OP_FSUB:
+        out_dst_opr_float(buf1, sizeof(buf1), po, &po->operand[0],
+          need_float_stack);
+        out_src_opr_float(buf2, sizeof(buf2), po, &po->operand[1],
+          need_float_stack);
+        dead_dst = (po->flags & OPF_FPOP)
+          && po->operand[0].type == OPT_REG
+          && po->operand[0].reg == xST0;
         switch (po->op) {
         case OP_FADD: j = '+'; break;
         case OP_FDIV: j = '/'; break;
@@ -6629,27 +6717,55 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
         case OP_FSUB: j = '-'; break;
         default: j = 'x'; break;
         }
-        if (po->flags & OPF_FSHIFT) {
-          fprintf(fout, "  f_st0 = f_st1 %c f_st0;", j);
+        if (need_float_stack) {
+          if (!dead_dst)
+            fprintf(fout, "  %s %c= %s;", buf1, j, buf2);
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_stp++;");
         }
         else {
-          fprintf(fout, "  %s %c= %s;",
-           out_dst_opr_float(buf1, sizeof(buf1), po, &po->operand[0]),
-           j,
-           out_src_opr_float(buf2, sizeof(buf2), po, &po->operand[1]));
+          if (po->flags & OPF_FSHIFT) {
+            // note: assumes only 2 regs handled
+            if (!dead_dst)
+              fprintf(fout, "  f_st0 = f_st1 %c f_st0;", j);
+            else
+              fprintf(fout, "  f_st0 = f_st1;");
+          }
+          else if (!dead_dst)
+            fprintf(fout, "  %s %c= %s;", buf1, j, buf2);
         }
+        no_output = (dead_dst && !(po->flags & OPF_FSHIFT));
         break;
 
       case OP_FDIVR:
       case OP_FSUBR:
-        if (po->flags & OPF_FSHIFT)
-          snprintf(buf1, sizeof(buf1), "f_st0");
-        else
-          out_dst_opr_float(buf1, sizeof(buf1), po, &po->operand[0]);
-        fprintf(fout, "  %s = %s %c %s;", buf1,
-          out_src_opr_float(buf2, sizeof(buf2), po, &po->operand[1]),
-          po->op == OP_FDIVR ? '/' : '-',
-          out_src_opr_float(buf3, sizeof(buf3), po, &po->operand[0]));
+        out_dst_opr_float(buf1, sizeof(buf1), po, &po->operand[0],
+          need_float_stack);
+        out_src_opr_float(buf2, sizeof(buf2), po, &po->operand[1],
+          need_float_stack);
+        out_src_opr_float(buf3, sizeof(buf3), po, &po->operand[0],
+          need_float_stack);
+        dead_dst = (po->flags & OPF_FPOP)
+          && po->operand[0].type == OPT_REG
+          && po->operand[0].reg == xST0;
+        j = po->op == OP_FDIVR ? '/' : '-';
+        if (need_float_stack) {
+          if (!dead_dst)
+            fprintf(fout, "  %s = %s %c %s;", buf1, buf2, j, buf3);
+          if (po->flags & OPF_FSHIFT)
+            fprintf(fout, "  f_stp++;");
+        }
+        else {
+          if (po->flags & OPF_FSHIFT) {
+            if (!dead_dst)
+              fprintf(fout, "  f_st0 = f_st0 %c f_st1;", j);
+            else
+              fprintf(fout, "  f_st0 = f_st1;");
+          }
+          else if (!dead_dst)
+            fprintf(fout, "  %s = %s %c %s;", buf1, buf2, j, buf3);
+        }
+        no_output = (dead_dst && !(po->flags & OPF_FSHIFT));
         break;
 
       case OP_FIADD:
@@ -6663,43 +6779,86 @@ static void gen_func(FILE *fout, FILE *fhdr, const char *funcn, int opcnt)
         case OP_FISUB: j = '-'; break;
         default: j = 'x'; break;
         }
-        fprintf(fout, "  f_st0 %c= (%s)%s;", j, float_type,
+        fprintf(fout, "  %s %c= (%s)%s;", float_st0,
+          j, float_type,
           out_src_opr(buf1, sizeof(buf1), po, &po->operand[0],
             lmod_cast(po, po->operand[0].lmod, 1), 0));
         break;
 
       case OP_FIDIVR:
       case OP_FISUBR:
-        fprintf(fout, "  f_st0 = %s %c f_st0;",
-          out_src_opr_float(buf2, sizeof(buf2), po, &po->operand[1]),
-          po->op == OP_FIDIVR ? '/' : '-');
+        fprintf(fout, "  %s = %s %c %s;", float_st0,
+          out_src_opr_float(buf2, sizeof(buf2), po, &po->operand[1],
+            need_float_stack),
+          po->op == OP_FIDIVR ? '/' : '-', float_st0);
+        break;
+
+      case OP_FCHS:
+        fprintf(fout, "  %s = -%s;", float_st0, float_st0);
         break;
 
       case OP_FCOS:
-        fprintf(fout, "  f_st0 = cos%s(f_st0);",
-          need_double ? "" : "f");
+        fprintf(fout, "  %s = cos%s(%s);", float_st0,
+          need_double ? "" : "f", float_st0);
         break;
 
       case OP_FPATAN:
-        fprintf(fout, "  f_st0 = atan%s(f_st1 / f_st0);",
-          need_double ? "" : "f");
+        if (need_float_stack) {
+          fprintf(fout, "  %s = atan%s(%s / %s);", float_st1,
+            need_double ? "" : "f", float_st1, float_st0);
+          fprintf(fout, " f_stp++;");
+        }
+        else {
+          fprintf(fout, "  f_st0 = atan%s(f_st1 / f_st0);",
+            need_double ? "" : "f");
+        }
+        break;
+
+      case OP_FYL2X:
+        if (need_float_stack) {
+          fprintf(fout, "  %s = %s * log2%s(%s);", float_st1,
+            float_st1, need_double ? "" : "f", float_st0);
+          fprintf(fout, " f_stp++;");
+        }
+        else {
+          fprintf(fout, "  f_st0 = f_st1 * log2%s(f_st0);",
+            need_double ? "" : "f");
+        }
         break;
 
       case OP_FSIN:
-        fprintf(fout, "  f_st0 = sin%s(f_st0);",
-          need_double ? "" : "f");
+        fprintf(fout, "  %s = sin%s(%s);", float_st0,
+          need_double ? "" : "f", float_st0);
         break;
 
       case OP_FSQRT:
-        fprintf(fout, "  f_st0 = sqrt%s(f_st0);",
-          need_double ? "" : "f");
+        fprintf(fout, "  %s = sqrt%s(%s);", float_st0,
+          need_double ? "" : "f", float_st0);
+        break;
+
+      case OP_FXCH:
+        dead_dst = po->operand[0].type == OPT_REG
+          && po->operand[0].reg == xST0;
+        if (!dead_dst) {
+          out_src_opr_float(buf1, sizeof(buf1), po, &po->operand[0],
+            need_float_stack);
+          fprintf(fout, "  { %s t = %s; %s = %s; %s = t; }", float_type,
+            float_st0, float_st0, buf1, buf1);
+          strcat(g_comment, " fxch");
+        }
+        else
+          no_output = 1;
         break;
 
       case OPP_FTOL:
         ferr_assert(po, po->flags & OPF_32BIT);
-        fprintf(fout, "  eax = (s32)f_st0;");
-        if (po->flags & OPF_FSHIFT)
-          fprintf(fout, "\n  f_st0 = f_st1;");
+        fprintf(fout, "  eax = (s32)%s;", float_st0);
+        if (po->flags & OPF_FSHIFT) {
+          if (need_float_stack)
+            fprintf(fout, " f_stp++;");
+          else
+            fprintf(fout, " f_st0 = f_st1;");
+        }
         strcat(g_comment, " ftol");
         break;
 
-- 
2.47.3