1 // SPDX-License-Identifier: LGPL-2.1-or-later
3 * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
6 #include "lightrec-config.h"
7 #include "disassembler.h"
9 #include "memmanager.h"
10 #include "optimizer.h"
/* Compile-time optimizer gating: yields ptr when the build option is
 * enabled, NULL otherwise. Used to populate lightrec_optimizers[] so that
 * disabled passes become NULL entries that lightrec_optimize() skips. */
18 #define IF_OPT(opt, ptr) ((opt) ? (ptr) : NULL)
20 struct optimizer_list {
21 void (**optimizers)(struct opcode *);
22 unsigned int nb_optimizers;
25 static bool is_nop(union code op);
27 bool is_unconditional_jump(union code c)
31 return c.r.op == OP_SPECIAL_JR || c.r.op == OP_SPECIAL_JALR;
37 return c.i.rs == c.i.rt;
39 return (c.r.rt == OP_REGIMM_BGEZ ||
40 c.r.rt == OP_REGIMM_BGEZAL) && c.i.rs == 0;
46 bool is_syscall(union code c)
48 return (c.i.op == OP_SPECIAL && c.r.op == OP_SPECIAL_SYSCALL) ||
49 (c.i.op == OP_CP0 && (c.r.rs == OP_CP0_MTC0 ||
50 c.r.rs == OP_CP0_CTC0) &&
51 (c.r.rd == 12 || c.r.rd == 13));
54 static u64 opcode_read_mask(union code op)
59 case OP_SPECIAL_SYSCALL:
60 case OP_SPECIAL_BREAK:
76 return BIT(op.r.rs) | BIT(op.r.rt);
87 if (op.r.op == OP_CP2_BASIC) {
89 case OP_CP2_BASIC_MTC2:
90 case OP_CP2_BASIC_CTC2:
110 return BIT(op.i.rs) | BIT(op.i.rt);
116 static u64 opcode_write_mask(union code op)
124 case OP_SPECIAL_SYSCALL:
125 case OP_SPECIAL_BREAK:
127 case OP_SPECIAL_MULT:
128 case OP_SPECIAL_MULTU:
130 case OP_SPECIAL_DIVU:
131 if (!OPT_FLAG_MULT_DIV)
132 return BIT(REG_LO) | BIT(REG_HI);
135 flags = BIT(op.r.rd);
139 flags |= BIT(op.r.imm);
141 flags |= BIT(REG_HI);
143 case OP_SPECIAL_MTHI:
145 case OP_SPECIAL_MTLO:
177 if (op.r.op == OP_CP2_BASIC) {
179 case OP_CP2_BASIC_MFC2:
180 case OP_CP2_BASIC_CFC2:
189 case OP_REGIMM_BLTZAL:
190 case OP_REGIMM_BGEZAL:
202 bool opcode_reads_register(union code op, u8 reg)
204 return opcode_read_mask(op) & BIT(reg);
207 bool opcode_writes_register(union code op, u8 reg)
209 return opcode_write_mask(op) & BIT(reg);
212 static int find_prev_writer(const struct opcode *list, unsigned int offset, u8 reg)
217 if (list[offset].flags & LIGHTREC_SYNC)
220 for (i = offset; i > 0; i--) {
223 if (opcode_writes_register(c, reg)) {
224 if (i > 1 && has_delay_slot(list[i - 2].c))
230 if ((list[i - 1].flags & LIGHTREC_SYNC) ||
232 opcode_reads_register(c, reg))
239 static int find_next_reader(const struct opcode *list, unsigned int offset, u8 reg)
244 if (list[offset].flags & LIGHTREC_SYNC)
247 for (i = offset; ; i++) {
250 if (opcode_reads_register(c, reg)) {
251 if (i > 0 && has_delay_slot(list[i - 1].c))
257 if ((list[i].flags & LIGHTREC_SYNC) ||
258 has_delay_slot(c) || opcode_writes_register(c, reg))
265 static bool reg_is_dead(const struct opcode *list, unsigned int offset, u8 reg)
269 if (list[offset].flags & LIGHTREC_SYNC)
272 for (i = offset + 1; ; i++) {
273 if (opcode_reads_register(list[i].c, reg))
276 if (opcode_writes_register(list[i].c, reg))
279 if (has_delay_slot(list[i].c)) {
280 if (list[i].flags & LIGHTREC_NO_DS ||
281 opcode_reads_register(list[i + 1].c, reg))
284 return opcode_writes_register(list[i + 1].c, reg);
289 static bool reg_is_read(const struct opcode *list,
290 unsigned int a, unsigned int b, u8 reg)
292 /* Return true if reg is read in one of the opcodes of the interval
295 if (!is_nop(list[a].c) && opcode_reads_register(list[a].c, reg))
302 static bool reg_is_written(const struct opcode *list,
303 unsigned int a, unsigned int b, u8 reg)
305 /* Return true if reg is written in one of the opcodes of the interval
309 if (!is_nop(list[a].c) && opcode_writes_register(list[a].c, reg))
316 static bool reg_is_read_or_written(const struct opcode *list,
317 unsigned int a, unsigned int b, u8 reg)
319 return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg);
322 static bool opcode_is_load(union code op)
339 static bool opcode_is_store(union code op)
354 bool opcode_is_io(union code op)
356 return opcode_is_load(op) || opcode_is_store(op);
360 static bool is_nop(union code op)
362 if (opcode_writes_register(op, 0)) {
365 return op.r.rs != OP_CP0_MFC0;
383 return op.r.rd == op.r.rt && op.r.rd == op.r.rs;
385 case OP_SPECIAL_ADDU:
386 return (op.r.rd == op.r.rt && op.r.rs == 0) ||
387 (op.r.rd == op.r.rs && op.r.rt == 0);
389 case OP_SPECIAL_SUBU:
390 return op.r.rd == op.r.rs && op.r.rt == 0;
392 if (op.r.rd == op.r.rt)
393 return op.r.rd == op.r.rs || op.r.rs == 0;
395 return (op.r.rd == op.r.rs) && op.r.rt == 0;
399 return op.r.rd == op.r.rt && op.r.imm == 0;
400 case OP_SPECIAL_MFHI:
401 case OP_SPECIAL_MFLO:
409 return op.i.rt == op.i.rs && op.i.imm == 0;
411 return (op.i.rs == 0 || op.i.imm == 1);
413 return (op.i.op == OP_REGIMM_BLTZ ||
414 op.i.op == OP_REGIMM_BLTZAL) &&
415 (op.i.rs == 0 || op.i.imm == 1);
417 return (op.i.rs == op.i.rt || op.i.imm == 1);
423 bool load_in_delay_slot(union code op)
437 if (op.r.op == OP_CP2_BASIC) {
439 case OP_CP2_BASIC_MFC2:
440 case OP_CP2_BASIC_CFC2:
463 static u32 lightrec_propagate_consts(const struct opcode *op,
464 const struct opcode *prev,
467 union code c = prev->c;
469 /* Register $zero is always, well, zero */
473 if (op->flags & LIGHTREC_SYNC)
480 if (known & BIT(c.r.rt)) {
481 known |= BIT(c.r.rd);
482 v[c.r.rd] = v[c.r.rt] << c.r.imm;
484 known &= ~BIT(c.r.rd);
488 if (known & BIT(c.r.rt)) {
489 known |= BIT(c.r.rd);
490 v[c.r.rd] = v[c.r.rt] >> c.r.imm;
492 known &= ~BIT(c.r.rd);
496 if (known & BIT(c.r.rt)) {
497 known |= BIT(c.r.rd);
498 v[c.r.rd] = (s32)v[c.r.rt] >> c.r.imm;
500 known &= ~BIT(c.r.rd);
503 case OP_SPECIAL_SLLV:
504 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
505 known |= BIT(c.r.rd);
506 v[c.r.rd] = v[c.r.rt] << (v[c.r.rs] & 0x1f);
508 known &= ~BIT(c.r.rd);
511 case OP_SPECIAL_SRLV:
512 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
513 known |= BIT(c.r.rd);
514 v[c.r.rd] = v[c.r.rt] >> (v[c.r.rs] & 0x1f);
516 known &= ~BIT(c.r.rd);
519 case OP_SPECIAL_SRAV:
520 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
521 known |= BIT(c.r.rd);
522 v[c.r.rd] = (s32)v[c.r.rt]
523 >> (v[c.r.rs] & 0x1f);
525 known &= ~BIT(c.r.rd);
529 case OP_SPECIAL_ADDU:
530 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
531 known |= BIT(c.r.rd);
532 v[c.r.rd] = (s32)v[c.r.rt] + (s32)v[c.r.rs];
534 known &= ~BIT(c.r.rd);
538 case OP_SPECIAL_SUBU:
539 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
540 known |= BIT(c.r.rd);
541 v[c.r.rd] = v[c.r.rt] - v[c.r.rs];
543 known &= ~BIT(c.r.rd);
547 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
548 known |= BIT(c.r.rd);
549 v[c.r.rd] = v[c.r.rt] & v[c.r.rs];
551 known &= ~BIT(c.r.rd);
555 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
556 known |= BIT(c.r.rd);
557 v[c.r.rd] = v[c.r.rt] | v[c.r.rs];
559 known &= ~BIT(c.r.rd);
563 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
564 known |= BIT(c.r.rd);
565 v[c.r.rd] = v[c.r.rt] ^ v[c.r.rs];
567 known &= ~BIT(c.r.rd);
571 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
572 known |= BIT(c.r.rd);
573 v[c.r.rd] = ~(v[c.r.rt] | v[c.r.rs]);
575 known &= ~BIT(c.r.rd);
579 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
580 known |= BIT(c.r.rd);
581 v[c.r.rd] = (s32)v[c.r.rs] < (s32)v[c.r.rt];
583 known &= ~BIT(c.r.rd);
586 case OP_SPECIAL_SLTU:
587 if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
588 known |= BIT(c.r.rd);
589 v[c.r.rd] = v[c.r.rs] < v[c.r.rt];
591 known &= ~BIT(c.r.rd);
602 if (known & BIT(c.i.rs)) {
603 known |= BIT(c.i.rt);
604 v[c.i.rt] = v[c.i.rs] + (s32)(s16)c.i.imm;
606 known &= ~BIT(c.i.rt);
610 if (known & BIT(c.i.rs)) {
611 known |= BIT(c.i.rt);
612 v[c.i.rt] = (s32)v[c.i.rs] < (s32)(s16)c.i.imm;
614 known &= ~BIT(c.i.rt);
618 if (known & BIT(c.i.rs)) {
619 known |= BIT(c.i.rt);
620 v[c.i.rt] = v[c.i.rs] < (u32)(s32)(s16)c.i.imm;
622 known &= ~BIT(c.i.rt);
626 if (known & BIT(c.i.rs)) {
627 known |= BIT(c.i.rt);
628 v[c.i.rt] = v[c.i.rs] & c.i.imm;
630 known &= ~BIT(c.i.rt);
634 if (known & BIT(c.i.rs)) {
635 known |= BIT(c.i.rt);
636 v[c.i.rt] = v[c.i.rs] | c.i.imm;
638 known &= ~BIT(c.i.rt);
642 if (known & BIT(c.i.rs)) {
643 known |= BIT(c.i.rt);
644 v[c.i.rt] = v[c.i.rs] ^ c.i.imm;
646 known &= ~BIT(c.i.rt);
650 known |= BIT(c.i.rt);
651 v[c.i.rt] = c.i.imm << 16;
657 known &= ~BIT(c.r.rt);
662 if (c.r.op == OP_CP2_BASIC) {
664 case OP_CP2_BASIC_MFC2:
665 case OP_CP2_BASIC_CFC2:
666 known &= ~BIT(c.r.rt);
679 known &= ~BIT(c.i.rt);
682 if (known & BIT(c.r.rs)) {
683 known |= BIT(c.r.rd);
684 v[c.r.rd] = v[c.r.rs];
686 known &= ~BIT(c.r.rd);
696 static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset)
698 struct opcode *prev, *prev2 = NULL, *curr = &list[offset];
699 struct opcode *to_change, *to_nop;
702 if (curr->r.imm != 24 && curr->r.imm != 16)
705 idx = find_prev_writer(list, offset, curr->r.rt);
711 if (prev->i.op != OP_SPECIAL || prev->r.op != OP_SPECIAL_SLL ||
712 prev->r.imm != curr->r.imm || prev->r.rd != curr->r.rt)
715 if (prev->r.rd != prev->r.rt && curr->r.rd != curr->r.rt) {
720 if (!reg_is_dead(list, offset, curr->r.rt) ||
721 reg_is_read_or_written(list, idx, offset, curr->r.rd))
724 /* If rY is dead after the SRL, and rZ is not used after the SLL,
725 * we can change rY to rZ */
727 pr_debug("Detected SLL/SRA with middle temp register\n");
728 prev->r.rd = curr->r.rd;
729 curr->r.rt = prev->r.rd;
732 /* We got a SLL/SRA combo. If imm #16, that's a cast to u16.
733 * If imm #24 that's a cast to u8.
735 * First of all, make sure that the target register of the SLL is not
736 * read before the SRA. */
738 if (prev->r.rd == prev->r.rt) {
745 /* rX is used after the SRA - we cannot convert it. */
746 if (prev->r.rd != curr->r.rd && !reg_is_dead(list, offset, prev->r.rd))
756 idx2 = find_prev_writer(list, idx, prev->r.rt);
758 /* Note that PSX games sometimes do casts after
759 * a LHU or LBU; in this case we can change the
760 * load opcode to a LH or LB, and the cast can
761 * be changed to a MOV or a simple NOP. */
765 if (curr->r.rd != prev2->i.rt &&
766 !reg_is_dead(list, offset, prev2->i.rt))
768 else if (curr->r.imm == 16 && prev2->i.op == OP_LHU)
770 else if (curr->r.imm == 24 && prev2->i.op == OP_LBU)
776 if (curr->r.rd == prev2->i.rt) {
777 to_change->opcode = 0;
778 } else if (reg_is_dead(list, offset, prev2->i.rt) &&
779 !reg_is_read_or_written(list, idx2 + 1, offset, curr->r.rd)) {
780 /* The target register of the SRA is dead after the
781 * LBU/LHU; we can change the target register of the
782 * LBU/LHU to the one of the SRA. */
783 prev2->i.rt = curr->r.rd;
784 to_change->opcode = 0;
786 to_change->i.op = OP_META_MOV;
787 to_change->r.rd = curr->r.rd;
788 to_change->r.rs = prev2->i.rt;
791 if (to_nop->r.imm == 24)
792 pr_debug("Convert LBU+SLL+SRA to LB\n");
794 pr_debug("Convert LHU+SLL+SRA to LH\n");
799 pr_debug("Convert SLL/SRA #%u to EXT%c\n",
801 prev->r.imm == 24 ? 'C' : 'S');
803 if (to_change == prev) {
804 to_change->i.rs = prev->r.rt;
805 to_change->i.rt = curr->r.rd;
807 to_change->i.rt = curr->r.rd;
808 to_change->i.rs = prev->r.rt;
811 if (to_nop->r.imm == 24)
812 to_change->i.op = OP_META_EXTC;
814 to_change->i.op = OP_META_EXTS;
820 static void lightrec_remove_useless_lui(struct block *block, unsigned int offset,
821 u32 known, u32 *values)
823 struct opcode *list = block->opcode_list,
824 *op = &block->opcode_list[offset];
827 if (!(op->flags & LIGHTREC_SYNC) && (known & BIT(op->i.rt)) &&
828 values[op->i.rt] == op->i.imm << 16) {
829 pr_debug("Converting duplicated LUI to NOP\n");
834 if (op->i.imm != 0 || op->i.rt == 0)
837 reader = find_next_reader(list, offset + 1, op->i.rt);
841 if (opcode_writes_register(list[reader].c, op->i.rt) ||
842 reg_is_dead(list, reader, op->i.rt)) {
843 pr_debug("Removing useless LUI 0x0\n");
845 if (list[reader].i.rs == op->i.rt)
846 list[reader].i.rs = 0;
847 if (list[reader].i.op == OP_SPECIAL &&
848 list[reader].i.rt == op->i.rt)
849 list[reader].i.rt = 0;
854 static void lightrec_modify_lui(struct block *block, unsigned int offset)
856 union code c, *lui = &block->opcode_list[offset].c;
857 bool stop = false, stop_next = false;
860 for (i = offset + 1; !stop && i < block->nb_ops; i++) {
861 c = block->opcode_list[i].c;
864 if ((opcode_is_store(c) && c.i.rt == lui->i.rt)
865 || (!opcode_is_load(c) && opcode_reads_register(c, lui->i.rt)))
868 if (opcode_writes_register(c, lui->i.rt)) {
869 pr_debug("Convert LUI at offset 0x%x to kuseg\n",
871 lui->i.imm = kunseg(lui->i.imm << 16) >> 16;
875 if (has_delay_slot(c))
880 static int lightrec_transform_ops(struct lightrec_state *state, struct block *block)
882 struct opcode *list = block->opcode_list;
883 struct opcode *prev, *op = NULL;
885 u32 values[32] = { 0 };
888 for (i = 0; i < block->nb_ops; i++) {
893 known = lightrec_propagate_consts(op, prev, known, values);
895 /* Transform all opcodes detected as useless to real NOPs
896 * (0x0: SLL r0, r0, #0) */
897 if (op->opcode != 0 && is_nop(op->c)) {
898 pr_debug("Converting useless opcode 0x%08x to NOP\n",
908 if (op->i.rs == op->i.rt) {
911 } else if (op->i.rs == 0) {
925 if (!prev || !has_delay_slot(prev->c))
926 lightrec_modify_lui(block, i);
927 lightrec_remove_useless_lui(block, i, known, values);
930 /* Transform ORI/ADDI/ADDIU with imm #0 or ORR/ADD/ADDU/SUB/SUBU
931 * with register $zero to the MOV meta-opcode */
935 if (op->i.imm == 0) {
936 pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
937 op->i.op = OP_META_MOV;
944 if (op->r.imm == 0) {
945 pr_debug("Convert SRA #0 to MOV\n");
946 op->i.op = OP_META_MOV;
951 lightrec_optimize_sll_sra(block->opcode_list, i);
955 if (op->r.imm == 0) {
956 pr_debug("Convert SLL/SRL #0 to MOV\n");
957 op->i.op = OP_META_MOV;
963 case OP_SPECIAL_ADDU:
965 pr_debug("Convert OR/ADD $zero to MOV\n");
966 op->i.op = OP_META_MOV;
971 case OP_SPECIAL_SUBU:
973 pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
974 op->i.op = OP_META_MOV;
989 static int lightrec_switch_delay_slots(struct lightrec_state *state, struct block *block)
991 struct opcode *list, *next = &block->opcode_list[0];
993 union code op, next_op;
996 for (i = 0; i < block->nb_ops - 1; i++) {
998 next = &block->opcode_list[i + 1];
1002 if (!has_delay_slot(op) ||
1003 list->flags & (LIGHTREC_NO_DS | LIGHTREC_EMULATE_BRANCH) ||
1004 op.opcode == 0 || next_op.opcode == 0)
1007 if (i && has_delay_slot(block->opcode_list[i - 1].c) &&
1008 !(block->opcode_list[i - 1].flags & LIGHTREC_NO_DS))
1011 if ((list->flags & LIGHTREC_SYNC) ||
1012 (next->flags & LIGHTREC_SYNC))
1015 switch (list->i.op) {
1018 case OP_SPECIAL_JALR:
1019 if (opcode_reads_register(next_op, op.r.rd) ||
1020 opcode_writes_register(next_op, op.r.rd))
1024 if (opcode_writes_register(next_op, op.r.rs))
1034 if (opcode_reads_register(next_op, 31) ||
1035 opcode_writes_register(next_op, 31))
1041 if (op.i.rt && opcode_writes_register(next_op, op.i.rt))
1046 if (op.i.rs && opcode_writes_register(next_op, op.i.rs))
1051 case OP_REGIMM_BLTZAL:
1052 case OP_REGIMM_BGEZAL:
1053 if (opcode_reads_register(next_op, 31) ||
1054 opcode_writes_register(next_op, 31))
1057 case OP_REGIMM_BLTZ:
1058 case OP_REGIMM_BGEZ:
1060 opcode_writes_register(next_op, op.i.rs))
1069 pr_debug("Swap branch and delay slot opcodes "
1070 "at offsets 0x%x / 0x%x\n",
1071 i << 2, (i + 1) << 2);
1073 flags = next->flags;
1076 next->flags = list->flags | LIGHTREC_NO_DS;
1077 list->flags = flags | LIGHTREC_NO_DS;
1083 static int shrink_opcode_list(struct lightrec_state *state, struct block *block, u16 new_size)
1085 struct opcode *list;
1087 if (new_size >= block->nb_ops) {
1088 pr_err("Invalid shrink size (%u vs %u)\n",
1089 new_size, block->nb_ops);
1094 list = lightrec_malloc(state, MEM_FOR_IR,
1095 sizeof(*list) * new_size);
1097 pr_err("Unable to allocate memory\n");
1101 memcpy(list, block->opcode_list, sizeof(*list) * new_size);
1103 lightrec_free_opcode_list(state, block);
1104 block->opcode_list = list;
1105 block->nb_ops = new_size;
1107 pr_debug("Shrunk opcode list of block PC 0x%08x to %u opcodes\n",
1108 block->pc, new_size);
1113 static int lightrec_detect_impossible_branches(struct lightrec_state *state,
1114 struct block *block)
1116 struct opcode *op, *next = &block->opcode_list[0];
1120 for (i = 0; i < block->nb_ops - 1; i++) {
1122 next = &block->opcode_list[i + 1];
1124 if (!has_delay_slot(op->c) ||
1125 (!load_in_delay_slot(next->c) &&
1126 !has_delay_slot(next->c) &&
1127 !(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE)))
1130 if (op->c.opcode == next->c.opcode) {
1131 /* The delay slot is the exact same opcode as the branch
1132 * opcode: this is effectively a NOP */
1137 op->flags |= LIGHTREC_EMULATE_BRANCH;
1139 if (op == block->opcode_list) {
1140 pr_debug("First opcode of block PC 0x%08x is an impossible branch\n",
1143 /* If the first opcode is an 'impossible' branch, we
1144 * only keep the first two opcodes of the block (the
1145 * branch itself + its delay slot) */
1146 if (block->nb_ops > 2)
1147 ret = shrink_opcode_list(state, block, 2);
1155 static int lightrec_local_branches(struct lightrec_state *state, struct block *block)
1157 struct opcode *list;
1161 for (i = 0; i < block->nb_ops; i++) {
1162 list = &block->opcode_list[i];
1164 if (should_emulate(list))
1167 switch (list->i.op) {
1173 offset = i + 1 + (s16)list->i.imm;
1174 if (offset >= 0 && offset < block->nb_ops)
1181 pr_debug("Found local branch to offset 0x%x\n", offset << 2);
1183 if (should_emulate(&block->opcode_list[offset])) {
1184 pr_debug("Branch target must be emulated - skip\n");
1188 if (offset && has_delay_slot(block->opcode_list[offset - 1].c)) {
1189 pr_debug("Branch target is a delay slot - skip\n");
1193 pr_debug("Adding sync at offset 0x%x\n", offset << 2);
1195 block->opcode_list[offset].flags |= LIGHTREC_SYNC;
1196 list->flags |= LIGHTREC_LOCAL_BRANCH;
1202 bool has_delay_slot(union code op)
1208 case OP_SPECIAL_JALR:
1226 bool should_emulate(const struct opcode *list)
1228 return has_delay_slot(list->c) &&
1229 (list->flags & LIGHTREC_EMULATE_BRANCH);
1232 static void lightrec_add_unload(struct opcode *op, u8 reg)
1234 if (op->i.op == OP_SPECIAL && reg == op->r.rd)
1235 op->flags |= LIGHTREC_UNLOAD_RD;
1237 if (op->i.rs == reg)
1238 op->flags |= LIGHTREC_UNLOAD_RS;
1239 if (op->i.rt == reg)
1240 op->flags |= LIGHTREC_UNLOAD_RT;
1243 static int lightrec_early_unload(struct lightrec_state *state, struct block *block)
1245 unsigned int i, offset;
1249 for (reg = 1; reg < 34; reg++) {
1250 int last_r_id = -1, last_w_id = -1;
1252 for (i = 0; i < block->nb_ops; i++) {
1253 union code c = block->opcode_list[i].c;
1255 if (opcode_reads_register(c, reg))
1257 if (opcode_writes_register(c, reg))
1261 if (last_w_id > last_r_id)
1262 offset = (unsigned int)last_w_id;
1263 else if (last_r_id >= 0)
1264 offset = (unsigned int)last_r_id;
1268 op = &block->opcode_list[offset];
1270 if (has_delay_slot(op->c) && (op->flags & LIGHTREC_NO_DS))
1273 if (offset == block->nb_ops)
1276 lightrec_add_unload(&block->opcode_list[offset], reg);
1282 static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
1284 struct opcode *prev = NULL, *list = NULL;
1285 enum psx_map psx_map;
1287 u32 values[32] = { 0 };
1289 u32 val, kunseg_val;
1291 for (i = 0; i < block->nb_ops; i++) {
1293 list = &block->opcode_list[i];
1296 known = lightrec_propagate_consts(list, prev, known, values);
1298 switch (list->i.op) {
1302 if (OPT_FLAG_STORES) {
1303 /* Mark all store operations that target $sp or $gp
1304 * as not requiring code invalidation. This is based
1305 * on the heuristic that stores using one of these
1306 * registers as address will never hit a code page. */
1307 if (list->i.rs >= 28 && list->i.rs <= 29 &&
1308 !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
1309 pr_debug("Flaging opcode 0x%08x as not "
1310 "requiring invalidation\n",
1312 list->flags |= LIGHTREC_NO_INVALIDATE;
1315 /* Detect writes whose destination address is inside the
1316 * current block, using constant propagation. When these
1317 * occur, we mark the blocks as not compilable. */
1318 if ((known & BIT(list->i.rs)) &&
1319 kunseg(values[list->i.rs]) >= kunseg(block->pc) &&
1320 kunseg(values[list->i.rs]) < (kunseg(block->pc) +
1321 block->nb_ops * 4)) {
1322 pr_debug("Self-modifying block detected\n");
1323 block->flags |= BLOCK_NEVER_COMPILE;
1324 list->flags |= LIGHTREC_SMC;
1339 if (OPT_FLAG_IO && (known & BIT(list->i.rs))) {
1340 val = values[list->i.rs] + (s16) list->i.imm;
1341 kunseg_val = kunseg(val);
1342 psx_map = lightrec_get_map_idx(state, kunseg_val);
1345 case PSX_MAP_KERNEL_USER_RAM:
1346 if (val == kunseg_val)
1347 list->flags |= LIGHTREC_NO_MASK;
1349 case PSX_MAP_MIRROR1:
1350 case PSX_MAP_MIRROR2:
1351 case PSX_MAP_MIRROR3:
1352 pr_debug("Flaging opcode %u as RAM access\n", i);
1353 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_RAM);
1356 pr_debug("Flaging opcode %u as BIOS access\n", i);
1357 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_BIOS);
1359 case PSX_MAP_SCRATCH_PAD:
1360 pr_debug("Flaging opcode %u as scratchpad access\n", i);
1361 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_SCRATCH);
1363 /* Consider that we're never going to run code from
1364 * the scratchpad. */
1365 list->flags |= LIGHTREC_NO_INVALIDATE;
1368 pr_debug("Flagging opcode %u as I/O access\n",
1370 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
1383 static u8 get_mfhi_mflo_reg(const struct block *block, u16 offset,
1384 const struct opcode *last,
1385 u32 mask, bool sync, bool mflo, bool another)
1387 const struct opcode *op, *next = &block->opcode_list[offset];
1389 u8 reg2, reg = mflo ? REG_LO : REG_HI;
1393 for (i = offset; i < block->nb_ops; i++) {
1395 next = &block->opcode_list[i + 1];
1398 /* If any other opcode writes or reads to the register
1399 * we'd use, then we cannot use it anymore. */
1400 mask |= opcode_read_mask(op->c);
1401 mask |= opcode_write_mask(op->c);
1403 if (op->flags & LIGHTREC_SYNC)
1412 /* TODO: handle backwards branches too */
1414 (op->flags & LIGHTREC_LOCAL_BRANCH) &&
1415 (s16)op->c.i.imm >= 0) {
1416 branch_offset = i + 1 + (s16)op->c.i.imm
1417 - !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));
1419 reg = get_mfhi_mflo_reg(block, branch_offset, NULL,
1420 mask, sync, mflo, false);
1421 reg2 = get_mfhi_mflo_reg(block, offset + 1, next,
1422 mask, sync, mflo, false);
1423 if (reg > 0 && reg == reg2)
1429 return mflo ? REG_LO : REG_HI;
1432 case OP_SPECIAL_MULT:
1433 case OP_SPECIAL_MULTU:
1434 case OP_SPECIAL_DIV:
1435 case OP_SPECIAL_DIVU:
1437 case OP_SPECIAL_MTHI:
1441 case OP_SPECIAL_MTLO:
1450 !(op->flags & LIGHTREC_NO_DS) &&
1451 (next->i.op == OP_SPECIAL) &&
1452 ((!mflo && next->r.op == OP_SPECIAL_MFHI) ||
1453 (mflo && next->r.op == OP_SPECIAL_MFLO)))
1457 case OP_SPECIAL_JALR:
1459 case OP_SPECIAL_MFHI:
1463 /* Must use REG_HI if there is another MFHI target*/
1464 reg2 = get_mfhi_mflo_reg(block, i + 1, next,
1465 0, sync, mflo, true);
1466 if (reg2 > 0 && reg2 != REG_HI)
1469 if (!sync && !(old_mask & BIT(op->r.rd)))
1475 case OP_SPECIAL_MFLO:
1479 /* Must use REG_LO if there is another MFLO target*/
1480 reg2 = get_mfhi_mflo_reg(block, i + 1, next,
1481 0, sync, mflo, true);
1482 if (reg2 > 0 && reg2 != REG_LO)
1485 if (!sync && !(old_mask & BIT(op->r.rd)))
1504 static void lightrec_replace_lo_hi(struct block *block, u16 offset,
1510 /* This function will remove the following MFLO/MFHI. It must be called
1511 * only if get_mfhi_mflo_reg() returned a non-zero value. */
1513 for (i = offset; i < last; i++) {
1514 struct opcode *op = &block->opcode_list[i];
1522 /* TODO: handle backwards branches too */
1523 if ((op->flags & LIGHTREC_LOCAL_BRANCH) &&
1524 (s16)op->c.i.imm >= 0) {
1525 branch_offset = i + 1 + (s16)op->c.i.imm
1526 - !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));
1528 lightrec_replace_lo_hi(block, branch_offset, last, lo);
1529 lightrec_replace_lo_hi(block, i + 1, branch_offset, lo);
1534 if (lo && op->r.op == OP_SPECIAL_MFLO) {
1535 pr_debug("Removing MFLO opcode at offset 0x%x\n",
1539 } else if (!lo && op->r.op == OP_SPECIAL_MFHI) {
1540 pr_debug("Removing MFHI opcode at offset 0x%x\n",
1553 static bool lightrec_always_skip_div_check(void)
1562 static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block *block)
1564 struct opcode *prev, *list = NULL;
1568 u32 values[32] = { 0 };
1570 for (i = 0; i < block->nb_ops - 1; i++) {
1572 list = &block->opcode_list[i];
1575 known = lightrec_propagate_consts(list, prev, known, values);
1577 if (list->i.op != OP_SPECIAL)
1580 switch (list->r.op) {
1581 case OP_SPECIAL_DIV:
1582 case OP_SPECIAL_DIVU:
1583 /* If we are dividing by a non-zero constant, don't
1584 * emit the div-by-zero check. */
1585 if (lightrec_always_skip_div_check() ||
1586 (known & BIT(list->c.r.rt) && values[list->c.r.rt]))
1587 list->flags |= LIGHTREC_NO_DIV_CHECK;
1589 case OP_SPECIAL_MULT:
1590 case OP_SPECIAL_MULTU:
1596 /* Don't support opcodes in delay slots */
1597 if ((i && has_delay_slot(block->opcode_list[i - 1].c)) ||
1598 (list->flags & LIGHTREC_NO_DS)) {
1602 reg_lo = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, true, false);
1604 pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
1605 " not writing LO\n", i << 2);
1606 list->flags |= LIGHTREC_NO_LO;
1609 reg_hi = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, false, false);
1611 pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
1612 " not writing HI\n", i << 2);
1613 list->flags |= LIGHTREC_NO_HI;
1616 if (!reg_lo && !reg_hi) {
1617 pr_debug("Both LO/HI unused in this block, they will "
1618 "probably be used in parent block - removing "
1620 list->flags &= ~(LIGHTREC_NO_LO | LIGHTREC_NO_HI);
1623 if (reg_lo > 0 && reg_lo != REG_LO) {
1624 pr_debug("Found register %s to hold LO (rs = %u, rt = %u)\n",
1625 lightrec_reg_name(reg_lo), list->r.rs, list->r.rt);
1627 lightrec_replace_lo_hi(block, i + 1, block->nb_ops, true);
1628 list->r.rd = reg_lo;
1633 if (reg_hi > 0 && reg_hi != REG_HI) {
1634 pr_debug("Found register %s to hold HI (rs = %u, rt = %u)\n",
1635 lightrec_reg_name(reg_hi), list->r.rs, list->r.rt);
1637 lightrec_replace_lo_hi(block, i + 1, block->nb_ops, false);
1638 list->r.imm = reg_hi;
1647 static bool remove_div_sequence(struct block *block, unsigned int offset)
1650 unsigned int i, found = 0;
1653 * Scan for the zero-checking sequence that GCC automatically introduced
1654 * after most DIV/DIVU opcodes. This sequence checks the value of the
1655 * divisor, and if zero, executes a BREAK opcode, causing the BIOS
1656 * handler to crash the PS1.
1658 * For DIV opcodes, this sequence additionally checks that the signed
1659 * operation does not overflow.
1661 * With the assumption that the games never crashed the PS1, we can
1662 * therefore assume that the games never divided by zero or overflowed,
1663 * and these sequences can be removed.
1666 for (i = offset; i < block->nb_ops; i++) {
1667 op = &block->opcode_list[i];
1670 if (op->i.op == OP_SPECIAL &&
1671 (op->r.op == OP_SPECIAL_DIV || op->r.op == OP_SPECIAL_DIVU))
1674 if ((op->opcode & 0xfc1fffff) == 0x14000002) {
1675 /* BNE ???, zero, +8 */
1680 } else if (found == 1 && !op->opcode) {
1683 } else if (found == 2 && op->opcode == 0x0007000d) {
1686 } else if (found == 3 && op->opcode == 0x2401ffff) {
1689 } else if (found == 4 && (op->opcode & 0xfc1fffff) == 0x14010004) {
1690 /* BNE ???, at, +16 */
1692 } else if (found == 5 && op->opcode == 0x3c018000) {
1693 /* LUI at, 0x8000 */
1695 } else if (found == 6 && (op->opcode & 0x141fffff) == 0x14010002) {
1696 /* BNE ???, at, +16 */
1698 } else if (found == 7 && !op->opcode) {
1701 } else if (found == 8 && op->opcode == 0x0006000d) {
1714 pr_debug("Removing DIV%s sequence at offset 0x%x\n",
1715 found == 9 ? "" : "U", offset << 2);
1717 for (i = 0; i < found; i++)
1718 block->opcode_list[offset + i].opcode = 0;
1726 static int lightrec_remove_div_by_zero_check_sequence(struct lightrec_state *state,
1727 struct block *block)
1732 for (i = 0; i < block->nb_ops; i++) {
1733 op = &block->opcode_list[i];
1735 if (op->i.op == OP_SPECIAL &&
1736 (op->r.op == OP_SPECIAL_DIVU || op->r.op == OP_SPECIAL_DIV) &&
1737 remove_div_sequence(block, i + 1))
1738 op->flags |= LIGHTREC_NO_DIV_CHECK;
1744 static const u32 memset_code[] = {
1745 0x10a00006, // beqz a1, 2f
1746 0x24a2ffff, // addiu v0,a1,-1
1747 0x2403ffff, // li v1,-1
1748 0xac800000, // 1: sw zero,0(a0)
1749 0x2442ffff, // addiu v0,v0,-1
1750 0x1443fffd, // bne v0,v1, 1b
1751 0x24840004, // addiu a0,a0,4
1752 0x03e00008, // 2: jr ra
1756 static int lightrec_replace_memset(struct lightrec_state *state, struct block *block)
1761 for (i = 0; i < block->nb_ops; i++) {
1762 c = block->opcode_list[i].c;
1764 if (c.opcode != memset_code[i])
1767 if (i == ARRAY_SIZE(memset_code) - 1) {
1769 pr_debug("Block at PC 0x%x is a memset\n", block->pc);
1770 block->flags |= BLOCK_IS_MEMSET | BLOCK_NEVER_COMPILE;
1772 /* Return non-zero to skip other optimizers. */
1780 static int (*lightrec_optimizers[])(struct lightrec_state *state, struct block *) = {
1781 IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence),
1782 IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset),
1783 IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches),
1784 IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches),
1785 IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops),
1786 IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots),
1787 IF_OPT(OPT_FLAG_IO || OPT_FLAG_STORES, &lightrec_flag_io),
1788 IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs),
1789 IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload),
1792 int lightrec_optimize(struct lightrec_state *state, struct block *block)
1797 for (i = 0; i < ARRAY_SIZE(lightrec_optimizers); i++) {
1798 if (lightrec_optimizers[i]) {
1799 ret = (*lightrec_optimizers[i])(state, block);