/*
 * Copyright (C) 2014-2020 Paul Cercueil <paul@crapouillou.net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */

#include "disassembler.h"
#include "memmanager.h"
#include "optimizer.h"

struct optimizer_list {
	void (**optimizers)(struct opcode *);
	unsigned int nb_optimizers;
};
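
/* Returns true if the opcode in 'op' reads the value of register 'reg'. */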
bool opcode_reads_register(union code op, u8 reg)
{
	switch (op.i.op) {
	case OP_SPECIAL:
		switch (op.r.op) {
		case OP_SPECIAL_SYSCALL:
		case OP_SPECIAL_BREAK:
			return false;
		case OP_SPECIAL_JR:
		case OP_SPECIAL_JALR:
		case OP_SPECIAL_MTHI:
		case OP_SPECIAL_MTLO:
			return op.r.rs == reg;
		case OP_SPECIAL_MFHI:
			return reg == REG_HI;
		case OP_SPECIAL_MFLO:
			return reg == REG_LO;
		case OP_SPECIAL_SLL:
		case OP_SPECIAL_SRL:
		case OP_SPECIAL_SRA:
			return op.r.rt == reg;
		default:
			return op.r.rs == reg || op.r.rt == reg;
		}
	case OP_CP0:
		switch (op.r.rs) {
		case OP_CP0_MTC0:
		case OP_CP0_CTC0:
			return op.r.rt == reg;
		default:
			return false;
		}
	case OP_CP2:
		if (op.r.op == OP_CP2_BASIC) {
			switch (op.r.rs) {
			case OP_CP2_BASIC_MTC2:
			case OP_CP2_BASIC_CTC2:
				return op.r.rt == reg;
			default:
				return false;
			}
		} else {
			return false;
		}
	case OP_J:
	case OP_JAL:
	case OP_LUI:
		return false;
	case OP_BEQ:
	case OP_BNE:
	case OP_LWL:
	case OP_LWR:
	case OP_SB:
	case OP_SH:
	case OP_SWL:
	case OP_SW:
	case OP_SWR:
		return op.i.rs == reg || op.i.rt == reg;
	default:
		return op.i.rs == reg;
	}
}
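
/* Returns true if the opcode in 'op' writes to register 'reg'. The
 * pseudo-registers REG_LO / REG_HI stand for the LO / HI registers. */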
bool opcode_writes_register(union code op, u8 reg)
{
	switch (op.i.op) {
	case OP_SPECIAL:
		switch (op.r.op) {
		case OP_SPECIAL_JR:
		case OP_SPECIAL_SYSCALL:
		case OP_SPECIAL_BREAK:
			return false;
		case OP_SPECIAL_MULT:
		case OP_SPECIAL_MULTU:
		case OP_SPECIAL_DIV:
		case OP_SPECIAL_DIVU:
			return reg == REG_LO || reg == REG_HI;
		case OP_SPECIAL_MTHI:
			return reg == REG_HI;
		case OP_SPECIAL_MTLO:
			return reg == REG_LO;
		default:
			return op.r.rd == reg;
		}
	case OP_ADDI:
	case OP_ADDIU:
	case OP_SLTI:
	case OP_SLTIU:
	case OP_ANDI:
	case OP_ORI:
	case OP_XORI:
	case OP_LUI:
	case OP_LB:
	case OP_LH:
	case OP_LWL:
	case OP_LW:
	case OP_LBU:
	case OP_LHU:
	case OP_LWR:
		return op.i.rt == reg;
	case OP_CP0:
		switch (op.r.rs) {
		case OP_CP0_MFC0:
		case OP_CP0_CFC0:
			return op.i.rt == reg;
		default:
			return false;
		}
	case OP_CP2:
		if (op.r.op == OP_CP2_BASIC) {
			switch (op.r.rs) {
			case OP_CP2_BASIC_MFC2:
			case OP_CP2_BASIC_CFC2:
				return op.i.rt == reg;
			default:
				return false;
			}
		} else {
			return false;
		}
	case OP_META_MOV:
		return op.r.rd == reg;
	default:
		return false;
	}
}
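
/* Returns true if the opcode has no visible effect, e.g. a write to
 * register $zero, an "addu $rd, $rd, $zero", or a branch that can
 * never be taken; such opcodes can be replaced with a plain NOP. */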
static bool is_nop(union code op)
{
	if (opcode_writes_register(op, 0)) {
		switch (op.i.op) {
		case OP_CP0:
			return op.r.rs != OP_CP0_MFC0;
		case OP_LB:
		case OP_LH:
		case OP_LWL:
		case OP_LW:
		case OP_LBU:
		case OP_LHU:
		case OP_LWR:
			return false;
		default:
			return true;
		}
	}

	switch (op.i.op) {
	case OP_SPECIAL:
		switch (op.r.op) {
		case OP_SPECIAL_AND:
			return op.r.rd == op.r.rt && op.r.rd == op.r.rs;
		case OP_SPECIAL_ADD:
		case OP_SPECIAL_ADDU:
			return (op.r.rd == op.r.rt && op.r.rs == 0) ||
				(op.r.rd == op.r.rs && op.r.rt == 0);
		case OP_SPECIAL_SUB:
		case OP_SPECIAL_SUBU:
			return op.r.rd == op.r.rs && op.r.rt == 0;
		case OP_SPECIAL_OR:
			if (op.r.rd == op.r.rt)
				return op.r.rd == op.r.rs || op.r.rs == 0;
			else
				return (op.r.rd == op.r.rs) && op.r.rt == 0;
		case OP_SPECIAL_SLL:
		case OP_SPECIAL_SRL:
		case OP_SPECIAL_SRA:
			return op.r.rd == op.r.rt && op.r.imm == 0;
		default:
			return false;
		}
	case OP_ORI:
	case OP_ADDI:
	case OP_ADDIU:
		return op.i.rt == op.i.rs && op.i.imm == 0;
	case OP_BGTZ:
		return (op.i.rs == 0 || op.i.imm == 1);
	case OP_REGIMM:
		/* The REGIMM sub-opcode lives in the 'rt' field; testing
		 * 'op.i.op' here would always fail, as it reads OP_REGIMM. */
		return (op.r.rt == OP_REGIMM_BLTZ ||
			op.r.rt == OP_REGIMM_BLTZAL) &&
			(op.i.rs == 0 || op.i.imm == 1);
	case OP_BNE:
		return (op.i.rs == op.i.rt || op.i.imm == 1);
	default:
		return false;
	}
}
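
/* Returns true if the opcode loads a register, either from memory or
 * from a coprocessor; these need special care when they appear in the
 * delay slot of a branch. */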
bool load_in_delay_slot(union code op)
{
	switch (op.i.op) {
	case OP_CP0:
		switch (op.r.rs) {
		case OP_CP0_MFC0:
		case OP_CP0_CFC0:
			return true;
		default:
			break;
		}
		break;
	case OP_CP2:
		if (op.r.op == OP_CP2_BASIC) {
			switch (op.r.rs) {
			case OP_CP2_BASIC_MFC2:
			case OP_CP2_BASIC_CFC2:
				return true;
			default:
				break;
			}
		}
		break;
	case OP_LB:
	case OP_LH:
	case OP_LWL:
	case OP_LW:
	case OP_LBU:
	case OP_LHU:
	case OP_LWR:
		return true;
	default:
		break;
	}

	return false;
}
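
/* Propagate constant values across one opcode. 'known' is a bitmask of
 * the registers whose values are currently known, 'v' holds these
 * values, and the updated bitmask is returned. For instance, after
 * "lui $t0, 0x1f80", $t0 (register 8) is known to contain 0x1f800000;
 * a following "ori $t1, $t0, 0x1014" then makes $t1 (register 9) known
 * to contain 0x1f801014. */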
static u32 lightrec_propagate_consts(union code c, u32 known, u32 *v)
{
	switch (c.i.op) {
	case OP_SPECIAL:
		switch (c.r.op) {
		case OP_SPECIAL_SLL:
			if (known & BIT(c.r.rt)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] << c.r.imm;
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SRL:
			if (known & BIT(c.r.rt)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] >> c.r.imm;
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SRA:
			if (known & BIT(c.r.rt)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = (s32)v[c.r.rt] >> c.r.imm;
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SLLV:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] << (v[c.r.rs] & 0x1f);
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SRLV:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] >> (v[c.r.rs] & 0x1f);
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SRAV:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = (s32)v[c.r.rt]
					  >> (v[c.r.rs] & 0x1f);
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_ADD:
		case OP_SPECIAL_ADDU:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = (s32)v[c.r.rt] + (s32)v[c.r.rs];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SUB:
		case OP_SPECIAL_SUBU:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] - v[c.r.rs];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_AND:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] & v[c.r.rs];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_OR:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] | v[c.r.rs];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_XOR:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rt] ^ v[c.r.rs];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_NOR:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = ~(v[c.r.rt] | v[c.r.rs]);
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SLT:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = (s32)v[c.r.rs] < (s32)v[c.r.rt];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		case OP_SPECIAL_SLTU:
			if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
				known |= BIT(c.r.rd);
				v[c.r.rd] = v[c.r.rs] < v[c.r.rt];
			} else {
				known &= ~BIT(c.r.rd);
			}
			break;
		default:
			break;
		}
		break;
	case OP_ADDI:
	case OP_ADDIU:
		if (known & BIT(c.i.rs)) {
			known |= BIT(c.i.rt);
			v[c.i.rt] = v[c.i.rs] + (s32)(s16)c.i.imm;
		} else {
			known &= ~BIT(c.i.rt);
		}
		break;
	case OP_SLTI:
		if (known & BIT(c.i.rs)) {
			known |= BIT(c.i.rt);
			v[c.i.rt] = (s32)v[c.i.rs] < (s32)(s16)c.i.imm;
		} else {
			known &= ~BIT(c.i.rt);
		}
		break;
	case OP_SLTIU:
		if (known & BIT(c.i.rs)) {
			known |= BIT(c.i.rt);
			v[c.i.rt] = v[c.i.rs] < (u32)(s32)(s16)c.i.imm;
		} else {
			known &= ~BIT(c.i.rt);
		}
		break;
	case OP_ANDI:
		if (known & BIT(c.i.rs)) {
			known |= BIT(c.i.rt);
			v[c.i.rt] = v[c.i.rs] & c.i.imm;
		} else {
			known &= ~BIT(c.i.rt);
		}
		break;
	case OP_ORI:
		if (known & BIT(c.i.rs)) {
			known |= BIT(c.i.rt);
			v[c.i.rt] = v[c.i.rs] | c.i.imm;
		} else {
			known &= ~BIT(c.i.rt);
		}
		break;
	case OP_XORI:
		if (known & BIT(c.i.rs)) {
			known |= BIT(c.i.rt);
			v[c.i.rt] = v[c.i.rs] ^ c.i.imm;
		} else {
			known &= ~BIT(c.i.rt);
		}
		break;
	case OP_LUI:
		known |= BIT(c.i.rt);
		v[c.i.rt] = c.i.imm << 16;
		break;
	case OP_CP0:
		switch (c.r.rs) {
		case OP_CP0_MFC0:
		case OP_CP0_CFC0:
			known &= ~BIT(c.r.rt);
			break;
		}
		break;
	case OP_CP2:
		if (c.r.op == OP_CP2_BASIC) {
			switch (c.r.rs) {
			case OP_CP2_BASIC_MFC2:
			case OP_CP2_BASIC_CFC2:
				known &= ~BIT(c.r.rt);
				break;
			}
		}
		break;
	case OP_LB:
	case OP_LH:
	case OP_LWL:
	case OP_LW:
	case OP_LBU:
	case OP_LHU:
	case OP_LWR:
		known &= ~BIT(c.i.rt);
		break;
	case OP_META_MOV:
		if (known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = v[c.r.rs];
		} else {
			known &= ~BIT(c.r.rd);
		}
		break;
	default:
		break;
	}

	return known;
}
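
/* Insert a meta-opcode into the IR list, right after 'op', or at the
 * head of the list if 'op' is NULL. */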
static int lightrec_add_meta(struct block *block,
			     struct opcode *op, union code code)
{
	struct opcode *meta;

	meta = lightrec_malloc(block->state, MEM_FOR_IR, sizeof(*meta));
	if (!meta)
		return -ENOMEM;

	meta->c = code;
	meta->flags = 0;

	if (op) {
		meta->offset = op->offset;
		meta->next = op->next;
		op->next = meta;
	} else {
		meta->offset = 0;
		meta->next = block->opcode_list;
		block->opcode_list = meta;
	}

	return 0;
}
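
/* Insert an OP_META_SYNC meta-opcode right after 'prev'. */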
static int lightrec_add_sync(struct block *block, struct opcode *prev)
{
	return lightrec_add_meta(block, prev, (union code){
		.j.op = OP_META_SYNC,
	});
}
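
/* Rewrite opcodes into cheaper forms: useless opcodes become real
 * NOPs, conditional branches against $zero become the BEQZ / BNEZ
 * meta-opcodes, and trivial ALU operations become the MOV
 * meta-opcode. */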
static int lightrec_transform_ops(struct block *block)
{
	struct opcode *list = block->opcode_list;

	for (; list; list = list->next) {

		/* Transform all opcodes detected as useless to real NOPs
		 * (0x0: SLL r0, r0, #0) */
		if (list->opcode != 0 && is_nop(list->c)) {
			pr_debug("Converting useless opcode 0x%08x to NOP\n",
					list->opcode);
			list->opcode = 0x0;
		}

		if (!list->opcode)
			continue;

		switch (list->i.op) {
		/* Transform BEQ / BNE to BEQZ / BNEZ meta-opcodes if one of the
		 * two registers is zero. */
		case OP_BEQ:
			if ((list->i.rs == 0) ^ (list->i.rt == 0)) {
				list->i.op = OP_META_BEQZ;
				if (list->i.rs == 0) {
					list->i.rs = list->i.rt;
					list->i.rt = 0;
				}
			} else if (list->i.rs == list->i.rt) {
				list->i.rs = 0;
				list->i.op = OP_META_BEQZ;
			}
			break;
		case OP_BNE:
			if (list->i.rs == 0) {
				list->i.op = OP_META_BNEZ;
				list->i.rs = list->i.rt;
				list->i.rt = 0;
			} else if (list->i.rt == 0) {
				list->i.op = OP_META_BNEZ;
			}
			break;

		/* Transform ORI/ADDI/ADDIU with imm #0 or OR/ADD/ADDU/SUB/SUBU
		 * with register $zero to the MOV meta-opcode */
		case OP_ORI:
		case OP_ADDI:
		case OP_ADDIU:
			if (list->i.imm == 0) {
				pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
				list->i.op = OP_META_MOV;
				list->r.rd = list->i.rt;
			}
			break;
		case OP_SPECIAL:
			switch (list->r.op) {
			case OP_SPECIAL_SLL:
			case OP_SPECIAL_SRA:
			case OP_SPECIAL_SRL:
				if (list->r.imm == 0) {
					pr_debug("Convert SLL/SRL/SRA #0 to MOV\n");
					list->i.op = OP_META_MOV;
					list->r.rs = list->r.rt;
				}
				break;
			case OP_SPECIAL_OR:
			case OP_SPECIAL_ADD:
			case OP_SPECIAL_ADDU:
				if (list->r.rs == 0) {
					pr_debug("Convert OR/ADD $zero to MOV\n");
					list->i.op = OP_META_MOV;
					list->r.rs = list->r.rt;
				}
			case OP_SPECIAL_SUB: /* fall-through */
			case OP_SPECIAL_SUBU:
				if (list->r.rt == 0) {
					pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
					list->i.op = OP_META_MOV;
				}
			default: /* fall-through */
				break;
			}
		default: /* fall-through */
			break;
		}
	}

	return 0;
}
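
/* Swap a branch with the opcode in its delay slot whenever the two do
 * not depend on each other, so that the delay slot opcode can simply
 * be compiled before the branch. For instance, "beq $v0, $v1, .target"
 * followed by "addiu $a0, $a0, 4" can be emitted in reverse order,
 * since the branch neither reads nor writes $a0. */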
static int lightrec_switch_delay_slots(struct block *block)
{
	struct opcode *list, *prev;
	u8 flags;

	for (list = block->opcode_list, prev = NULL; list->next;
	     prev = list, list = list->next) {
		union code op = list->c;
		union code next_op = list->next->c;

		if (!has_delay_slot(op) ||
		    list->flags & (LIGHTREC_NO_DS | LIGHTREC_EMULATE_BRANCH) ||
		    op.opcode == 0 || next_op.opcode == 0)
			continue;

		if (prev && has_delay_slot(prev->c))
			continue;

		switch (list->i.op) {
		case OP_SPECIAL:
			switch (op.r.op) {
			case OP_SPECIAL_JALR:
				if (opcode_reads_register(next_op, op.r.rd) ||
				    opcode_writes_register(next_op, op.r.rd))
					continue;
			case OP_SPECIAL_JR: /* fall-through */
				if (opcode_writes_register(next_op, op.r.rs))
					continue;
			default: /* fall-through */
				break;
			}
		case OP_J: /* fall-through */
			break;
		case OP_JAL:
			if (opcode_reads_register(next_op, 31) ||
			    opcode_writes_register(next_op, 31))
				continue;
			break;
		case OP_BEQ:
		case OP_BNE:
			if (op.i.rt && opcode_writes_register(next_op, op.i.rt))
				continue;
		case OP_BLEZ: /* fall-through */
		case OP_BGTZ:
		case OP_META_BEQZ:
		case OP_META_BNEZ:
			if (op.i.rs && opcode_writes_register(next_op, op.i.rs))
				continue;
			break;
		case OP_REGIMM:
			switch (op.r.rt) {
			case OP_REGIMM_BLTZAL:
			case OP_REGIMM_BGEZAL:
				if (opcode_reads_register(next_op, 31) ||
				    opcode_writes_register(next_op, 31))
					continue;
			case OP_REGIMM_BLTZ: /* fall-through */
			case OP_REGIMM_BGEZ:
				if (op.i.rs &&
				    opcode_writes_register(next_op, op.i.rs))
					continue;
				break;
			}
		default: /* fall-through */
			break;
		}

		pr_debug("Swap branch and delay slot opcodes "
			 "at offsets 0x%x / 0x%x\n", list->offset << 2,
			 list->next->offset << 2);

		flags = list->next->flags;
		list->c = next_op;
		list->next->c = op;
		list->next->flags = list->flags | LIGHTREC_NO_DS;
		list->flags = flags | LIGHTREC_NO_DS;
		list->offset++;
		list->next->offset--;
	}

	return 0;
}
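
/* Detect branches whose delay slot cannot be compiled as-is (a load,
 * another branch, or a RFE), and flag them with
 * LIGHTREC_EMULATE_BRANCH so that they get special treatment. */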
static int lightrec_detect_impossible_branches(struct block *block)
{
	struct opcode *op, *next;

	for (op = block->opcode_list, next = op->next; next;
	     op = next, next = op->next) {
		if (!has_delay_slot(op->c) ||
		    (!load_in_delay_slot(next->c) &&
		     !has_delay_slot(next->c) &&
		     !(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE)))
			continue;

		if (op->c.opcode == next->c.opcode) {
			/* The delay slot is the exact same opcode as the branch
			 * opcode: this is effectively a NOP */
			next->c.opcode = 0x0;
			continue;
		}

		if (op == block->opcode_list) {
			/* If the first opcode is an 'impossible' branch, we
			 * only keep the first two opcodes of the block (the
			 * branch itself + its delay slot) */
			lightrec_free_opcode_list(block->state, next->next);
			next->next = NULL;
			block->nb_ops = 2;
		}

		op->flags |= LIGHTREC_EMULATE_BRANCH;
	}

	return 0;
}
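
/* Detect branches that target an offset inside the current block, and
 * flag them with LIGHTREC_LOCAL_BRANCH; an OP_META_SYNC is inserted
 * before each branch target. */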
static int lightrec_local_branches(struct block *block)
{
	struct opcode *list, *target, *prev;
	s32 offset;
	int ret;

	for (list = block->opcode_list; list; list = list->next) {
		if (list->flags & LIGHTREC_EMULATE_BRANCH)
			continue;

		switch (list->i.op) {
		case OP_BEQ:
		case OP_BNE:
		case OP_BLEZ:
		case OP_BGTZ:
		case OP_REGIMM:
		case OP_META_BEQZ:
		case OP_META_BNEZ:
			offset = list->offset + 1 + (s16)list->i.imm;
			if (offset >= 0 && offset < block->nb_ops)
				break;
		default: /* fall-through */
			continue;
		}

		pr_debug("Found local branch to offset 0x%x\n", offset << 2);

		for (target = block->opcode_list, prev = NULL;
		     target; prev = target, target = target->next) {
			if (target->offset != offset ||
			    target->j.op == OP_META_SYNC)
				continue;

			if (target->flags & LIGHTREC_EMULATE_BRANCH) {
				pr_debug("Branch target must be emulated"
					 " - skip\n");
				break;
			}

			if (prev && has_delay_slot(prev->c)) {
				pr_debug("Branch target is a delay slot"
					 " - skip\n");
				break;
			}

			if (prev && prev->j.op != OP_META_SYNC) {
				pr_debug("Adding sync before offset "
					 "0x%x\n", offset << 2);
				ret = lightrec_add_sync(block, prev);
				if (ret)
					return ret;

				prev->next->offset = target->offset;
			}

			list->flags |= LIGHTREC_LOCAL_BRANCH;
			break;
		}
	}

	return 0;
}
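
/* Returns true if the opcode is a jump or branch, i.e. if the opcode
 * that follows it is executed in its delay slot. */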
bool has_delay_slot(union code op)
{
	switch (op.i.op) {
	case OP_SPECIAL:
		switch (op.r.op) {
		case OP_SPECIAL_JR:
		case OP_SPECIAL_JALR:
			return true;
		default:
			return false;
		}
	case OP_J:
	case OP_JAL:
	case OP_BEQ:
	case OP_BNE:
	case OP_BLEZ:
	case OP_BGTZ:
	case OP_REGIMM:
	case OP_META_BEQZ:
	case OP_META_BNEZ:
		return true;
	default:
		return false;
	}
}
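
/* Insert an OP_META_REG_UNLOAD meta-opcode right after 'op', marking
 * the point where register 'reg' can be unloaded. */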
static int lightrec_add_unload(struct block *block, struct opcode *op, u8 reg)
{
	return lightrec_add_meta(block, op, (union code){
		.i.op = OP_META_REG_UNLOAD,
		.i.rs = reg,
	});
}
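
/* For each register, find its last read and its last write in the
 * block, and insert an unload meta-opcode after the last access, so
 * that the register is freed as early as possible. */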
static int lightrec_early_unload(struct block *block)
{
	struct opcode *list = block->opcode_list;
	unsigned int i;
	int ret;

	for (i = 1; i < 34; i++) {
		struct opcode *op, *last_r = NULL, *last_w = NULL;
		unsigned int last_r_id = 0, last_w_id = 0, id = 0;

		for (op = list; op->next; op = op->next, id++) {
			if (opcode_reads_register(op->c, i)) {
				last_r = op;
				last_r_id = id;
			}

			if (opcode_writes_register(op->c, i)) {
				last_w = op;
				last_w_id = id;
			}
		}

		if (last_w_id > last_r_id) {
			if (has_delay_slot(last_w->c) &&
			    !(last_w->flags & LIGHTREC_NO_DS))
				last_w = last_w->next;

			ret = lightrec_add_unload(block, last_w, i);
			if (ret)
				return ret;
		} else if (last_r) {
			if (has_delay_slot(last_r->c) &&
			    !(last_r->flags & LIGHTREC_NO_DS))
				last_r = last_r->next;

			ret = lightrec_add_unload(block, last_r, i);
			if (ret)
				return ret;
		}
	}

	return 0;
}
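
/* Flag store opcodes that are known not to require code invalidation,
 * and detect blocks that write to their own code. */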
static int lightrec_flag_stores(struct block *block)
{
	struct opcode *list;
	u32 known = BIT(0);
	u32 values[32] = { 0 };

	for (list = block->opcode_list; list; list = list->next) {
		/* Register $zero is always, well, zero */
		known |= BIT(0);
		values[0] = 0;

		switch (list->i.op) {
		case OP_SB:
		case OP_SH:
		case OP_SW:
			/* Mark all store operations that target $sp or $gp
			 * as not requiring code invalidation. This is based
			 * on the heuristic that stores using one of these
			 * registers as address will never hit a code page. */
			if (list->i.rs >= 28 && list->i.rs <= 29 &&
			    !block->state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
				pr_debug("Flagging opcode 0x%08x as not requiring invalidation\n",
					 list->opcode);
				list->flags |= LIGHTREC_NO_INVALIDATE;
			}

			/* Detect writes whose destination address is inside the
			 * current block, using constant propagation. When these
			 * occur, we mark the blocks as not compilable. */
			if ((known & BIT(list->i.rs)) &&
			    kunseg(values[list->i.rs]) >= kunseg(block->pc) &&
			    kunseg(values[list->i.rs]) < (kunseg(block->pc) +
							  block->nb_ops * 4)) {
				pr_debug("Self-modifying block detected\n");
				block->flags |= BLOCK_NEVER_COMPILE;
				list->flags |= LIGHTREC_SMC;
			}
		default: /* fall-through */
			break;
		}

		known = lightrec_propagate_consts(list->c, known, values);
	}

	return 0;
}
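
/* Returns true if the HI result of the MULT(U) opcode at 'op' is
 * overwritten before being read, in which case computing the 32-bit
 * LO result is enough. */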
static bool is_mult32(const struct block *block, const struct opcode *op)
{
	const struct opcode *next, *last = NULL;
	u32 offset;

	for (op = op->next; op != last; op = op->next) {
		switch (op->i.op) {
		case OP_BEQ:
		case OP_BNE:
		case OP_BLEZ:
		case OP_BGTZ:
		case OP_REGIMM:
		case OP_META_BEQZ:
		case OP_META_BNEZ:
			/* TODO: handle backwards branches too */
			if ((op->flags & LIGHTREC_LOCAL_BRANCH) &&
			    (s16)op->c.i.imm >= 0) {
				offset = op->offset + 1 + (s16)op->c.i.imm;

				for (next = op; next->offset != offset;
				     next = next->next);

				if (!is_mult32(block, next))
					return false;

				last = next;
				continue;
			} else {
				return false;
			}
		case OP_SPECIAL:
			switch (op->r.op) {
			case OP_SPECIAL_MULT:
			case OP_SPECIAL_MULTU:
			case OP_SPECIAL_DIV:
			case OP_SPECIAL_DIVU:
			case OP_SPECIAL_MTHI:
				return true;
			case OP_SPECIAL_JR:
				return op->r.rs == 31 &&
				       ((op->flags & LIGHTREC_NO_DS) ||
					!(op->next->i.op == OP_SPECIAL &&
					  op->next->r.op == OP_SPECIAL_MFHI));
			case OP_SPECIAL_JALR:
			case OP_SPECIAL_MFHI:
				return false;
			default:
				break;
			}
		default: /* fall-through */
			break;
		}
	}

	return true;
}
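
/* Flag MULT(U) opcodes whose HI result is never used with
 * LIGHTREC_MULT32, so that they can be compiled as 32-bit
 * multiplications. */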
static int lightrec_flag_mults(struct block *block)
{
	struct opcode *list, *prev;

	for (list = block->opcode_list, prev = NULL; list;
	     prev = list, list = list->next) {
		if (list->i.op != OP_SPECIAL)
			continue;

		switch (list->r.op) {
		case OP_SPECIAL_MULT:
		case OP_SPECIAL_MULTU:
			break;
		default:
			continue;
		}

		/* Don't support MULT(U) opcodes in delay slots */
		if (prev && has_delay_slot(prev->c))
			continue;

		if (is_mult32(block, list)) {
			pr_debug("Mark MULT(U) opcode at offset 0x%x as"
				 " 32-bit\n", list->offset << 2);
			list->flags |= LIGHTREC_MULT32;
		}
	}

	return 0;
}
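
/* The optimization passes, run in order on each block. */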
static int (*lightrec_optimizers[])(struct block *) = {
	&lightrec_detect_impossible_branches,
	&lightrec_transform_ops,
	&lightrec_local_branches,
	&lightrec_switch_delay_slots,
	&lightrec_flag_stores,
	&lightrec_flag_mults,
	&lightrec_early_unload,
};
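
/* Run all optimization passes on the given block. */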
int lightrec_optimize(struct block *block)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(lightrec_optimizers); i++) {
		int ret = lightrec_optimizers[i](block);

		if (ret)
			return ret;
	}

	return 0;
}