// SPDX-License-Identifier: LGPL-2.1-or-later
/*
 * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
 */

#include "lightrec-config.h"
#include "disassembler.h"
#include "memmanager.h"
#include "optimizer.h"

#define IF_OPT(opt, ptr) ((opt) ? (ptr) : NULL)
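/* IF_OPT() drops a pointer at compile time when the corresponding OPT_*
 * switch from lightrec-config.h is disabled; the lightrec_optimizers[] table
 * at the end of this file uses it so that disabled passes become NULL entries
 * which lightrec_optimize() simply skips. */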
struct optimizer_list {
	void (**optimizers)(struct opcode *);
	unsigned int nb_optimizers;
};

static bool is_nop(union code op);
bool is_unconditional_jump(union code c)
	return c.r.op == OP_SPECIAL_JR || c.r.op == OP_SPECIAL_JALR;
	return c.i.rs == c.i.rt;
	return (c.r.rt == OP_REGIMM_BGEZ ||
		c.r.rt == OP_REGIMM_BGEZAL) && c.i.rs == 0;

bool is_syscall(union code c)
{
	return (c.i.op == OP_SPECIAL && c.r.op == OP_SPECIAL_SYSCALL) ||
		(c.i.op == OP_CP0 && (c.r.rs == OP_CP0_MTC0 ||
				      c.r.rs == OP_CP0_CTC0) &&
		 (c.r.rd == 12 || c.r.rd == 13));
}
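/* opcode_read_mask() and opcode_write_mask() below return a bitmask with one
 * bit set per CPU register that the opcode reads or writes. Bits 0-31 map to
 * the general-purpose registers, while REG_LO and REG_HI get bits of their
 * own above 31, hence the u64 return type. */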
static u64 opcode_read_mask(union code op)
	case OP_SPECIAL_SYSCALL:
	case OP_SPECIAL_BREAK:
	return BIT(op.r.rs) | BIT(op.r.rt);
	if (op.r.op == OP_CP2_BASIC) {
	case OP_CP2_BASIC_MTC2:
	case OP_CP2_BASIC_CTC2:
	return BIT(op.i.rs) | BIT(op.i.rt);

static u64 opcode_write_mask(union code op)
	case OP_SPECIAL_SYSCALL:
	case OP_SPECIAL_BREAK:
	case OP_SPECIAL_MULT:
	case OP_SPECIAL_MULTU:
	case OP_SPECIAL_DIVU:
		if (!OPT_FLAG_MULT_DIV)
			return BIT(REG_LO) | BIT(REG_HI);
		flags = BIT(op.r.rd);
		flags |= BIT(op.r.imm);
		flags |= BIT(REG_HI);
	case OP_SPECIAL_MTHI:
	case OP_SPECIAL_MTLO:
	if (op.r.op == OP_CP2_BASIC) {
	case OP_CP2_BASIC_MFC2:
	case OP_CP2_BASIC_CFC2:
	case OP_REGIMM_BLTZAL:
	case OP_REGIMM_BGEZAL:
bool opcode_reads_register(union code op, u8 reg)
{
	return opcode_read_mask(op) & BIT(reg);
}

bool opcode_writes_register(union code op, u8 reg)
{
	return opcode_write_mask(op) & BIT(reg);
}
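/* find_prev_writer() walks backwards from `offset` and returns the index of
 * the closest previous opcode that writes `reg`; it gives up (negative return
 * value) when it hits a sync point, a delay slot, or an opcode that also
 * reads the register. find_next_reader() is the forward counterpart. */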
static int find_prev_writer(const struct opcode *list, unsigned int offset, u8 reg)
	if (list[offset].flags & LIGHTREC_SYNC)
	for (i = offset; i > 0; i--) {
		if (opcode_writes_register(c, reg)) {
			if (i > 1 && has_delay_slot(list[i - 2].c))
		if ((list[i - 1].flags & LIGHTREC_SYNC) ||
		    opcode_reads_register(c, reg))

static int find_next_reader(const struct opcode *list, unsigned int offset, u8 reg)
	if (list[offset].flags & LIGHTREC_SYNC)
	for (i = offset; ; i++) {
		if (opcode_reads_register(c, reg)) {
			if (i > 0 && has_delay_slot(list[i - 1].c))
		if ((list[i].flags & LIGHTREC_SYNC) ||
		    has_delay_slot(c) || opcode_writes_register(c, reg))

static bool reg_is_dead(const struct opcode *list, unsigned int offset, u8 reg)
	if (list[offset].flags & LIGHTREC_SYNC)
	for (i = offset + 1; ; i++) {
		if (opcode_reads_register(list[i].c, reg))
		if (opcode_writes_register(list[i].c, reg))
		if (has_delay_slot(list[i].c)) {
			if (list[i].flags & LIGHTREC_NO_DS ||
			    opcode_reads_register(list[i + 1].c, reg))
			return opcode_writes_register(list[i + 1].c, reg);
static bool reg_is_read(const struct opcode *list,
			unsigned int a, unsigned int b, u8 reg)
	/* Return true if reg is read by one of the opcodes of the
	 * interval [a, b[. */
	if (!is_nop(list[a].c) && opcode_reads_register(list[a].c, reg))

static bool reg_is_written(const struct opcode *list,
			   unsigned int a, unsigned int b, u8 reg)
	/* Return true if reg is written by one of the opcodes of the
	 * interval [a, b[. */
	if (!is_nop(list[a].c) && opcode_writes_register(list[a].c, reg))

static bool reg_is_read_or_written(const struct opcode *list,
				   unsigned int a, unsigned int b, u8 reg)
{
	return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg);
}
static bool opcode_is_load(union code op)

static bool opcode_is_store(union code op)

bool opcode_is_io(union code op)
{
	return opcode_is_load(op) || opcode_is_store(op);
}
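/* is_nop() flags opcodes that have no architectural effect, either because
 * they only write $zero or because they are degenerate forms such as
 * ADDU $rd, $rd, $zero or SLL/SRL/SRA $rd, $rd, #0; lightrec_transform_ops()
 * later rewrites them to the canonical NOP encoding (0x00000000). */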
static bool is_nop(union code op)
	if (opcode_writes_register(op, 0)) {
		return op.r.rs != OP_CP0_MFC0;
	return op.r.rd == op.r.rt && op.r.rd == op.r.rs;
	case OP_SPECIAL_ADDU:
		return (op.r.rd == op.r.rt && op.r.rs == 0) ||
			(op.r.rd == op.r.rs && op.r.rt == 0);
	case OP_SPECIAL_SUBU:
		return op.r.rd == op.r.rs && op.r.rt == 0;
	if (op.r.rd == op.r.rt)
		return op.r.rd == op.r.rs || op.r.rs == 0;
	return (op.r.rd == op.r.rs) && op.r.rt == 0;
	return op.r.rd == op.r.rt && op.r.imm == 0;
	case OP_SPECIAL_MFHI:
	case OP_SPECIAL_MFLO:
	return op.i.rt == op.i.rs && op.i.imm == 0;
	return (op.i.rs == 0 || op.i.imm == 1);
	return (op.i.op == OP_REGIMM_BLTZ ||
		op.i.op == OP_REGIMM_BLTZAL) &&
		(op.i.rs == 0 || op.i.imm == 1);
	return (op.i.rs == op.i.rt || op.i.imm == 1);
bool load_in_delay_slot(union code op)
	if (op.r.op == OP_CP2_BASIC) {
	case OP_CP2_BASIC_MFC2:
	case OP_CP2_BASIC_CFC2:
static u32 lightrec_propagate_consts(const struct opcode *op,
				     const struct opcode *prev,
	union code c = prev->c;

	/* Register $zero is always, well, zero */
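	/* `known` is a bitmask of the registers whose value is known at this
	 * point, and v[] holds those values; each case below either computes
	 * the destination value from known sources or clears the
	 * destination's bit when it cannot. For example, LUI $t0, 0x1f80
	 * followed by ORI $t0, $t0, 0x1814 leaves $t0 known to hold
	 * 0x1f801814, which later passes can use to classify the access. */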
	if (op->flags & LIGHTREC_SYNC)

	if (known & BIT(c.r.rt)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = v[c.r.rt] << c.r.imm;
		known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = v[c.r.rt] >> c.r.imm;
		known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = (s32)v[c.r.rt] >> c.r.imm;
		known &= ~BIT(c.r.rd);
	case OP_SPECIAL_SLLV:
		if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = v[c.r.rt] << (v[c.r.rs] & 0x1f);
			known &= ~BIT(c.r.rd);
	case OP_SPECIAL_SRLV:
		if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = v[c.r.rt] >> (v[c.r.rs] & 0x1f);
			known &= ~BIT(c.r.rd);
	case OP_SPECIAL_SRAV:
		if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = (s32)v[c.r.rt]
					>> (v[c.r.rs] & 0x1f);
			known &= ~BIT(c.r.rd);
	case OP_SPECIAL_ADDU:
		if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = (s32)v[c.r.rt] + (s32)v[c.r.rs];
			known &= ~BIT(c.r.rd);
	case OP_SPECIAL_SUBU:
		if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = v[c.r.rt] - v[c.r.rs];
			known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = v[c.r.rt] & v[c.r.rs];
		known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = v[c.r.rt] | v[c.r.rs];
		known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = v[c.r.rt] ^ v[c.r.rs];
		known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = ~(v[c.r.rt] | v[c.r.rs]);
		known &= ~BIT(c.r.rd);
	if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = (s32)v[c.r.rs] < (s32)v[c.r.rt];
		known &= ~BIT(c.r.rd);
	case OP_SPECIAL_SLTU:
		if (known & BIT(c.r.rt) && known & BIT(c.r.rs)) {
			known |= BIT(c.r.rd);
			v[c.r.rd] = v[c.r.rs] < v[c.r.rt];
			known &= ~BIT(c.r.rd);
	if (known & BIT(c.i.rs)) {
		known |= BIT(c.i.rt);
		v[c.i.rt] = v[c.i.rs] + (s32)(s16)c.i.imm;
		known &= ~BIT(c.i.rt);
	if (known & BIT(c.i.rs)) {
		known |= BIT(c.i.rt);
		v[c.i.rt] = (s32)v[c.i.rs] < (s32)(s16)c.i.imm;
		known &= ~BIT(c.i.rt);
	if (known & BIT(c.i.rs)) {
		known |= BIT(c.i.rt);
		v[c.i.rt] = v[c.i.rs] < (u32)(s32)(s16)c.i.imm;
		known &= ~BIT(c.i.rt);
	if (known & BIT(c.i.rs)) {
		known |= BIT(c.i.rt);
		v[c.i.rt] = v[c.i.rs] & c.i.imm;
		known &= ~BIT(c.i.rt);
	if (known & BIT(c.i.rs)) {
		known |= BIT(c.i.rt);
		v[c.i.rt] = v[c.i.rs] | c.i.imm;
		known &= ~BIT(c.i.rt);
	if (known & BIT(c.i.rs)) {
		known |= BIT(c.i.rt);
		v[c.i.rt] = v[c.i.rs] ^ c.i.imm;
		known &= ~BIT(c.i.rt);
	known |= BIT(c.i.rt);
	v[c.i.rt] = c.i.imm << 16;
	known &= ~BIT(c.r.rt);
	if (c.r.op == OP_CP2_BASIC) {
	case OP_CP2_BASIC_MFC2:
	case OP_CP2_BASIC_CFC2:
		known &= ~BIT(c.r.rt);
	known &= ~BIT(c.i.rt);
	if (known & BIT(c.r.rs)) {
		known |= BIT(c.r.rd);
		v[c.r.rd] = v[c.r.rs];
		known &= ~BIT(c.r.rd);
static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset)
	struct opcode *prev, *prev2 = NULL, *curr = &list[offset];
	struct opcode *to_change, *to_nop;

	if (curr->r.imm != 24 && curr->r.imm != 16)

	idx = find_prev_writer(list, offset, curr->r.rt);

	if (prev->i.op != OP_SPECIAL || prev->r.op != OP_SPECIAL_SLL ||
	    prev->r.imm != curr->r.imm || prev->r.rd != curr->r.rt)

	if (prev->r.rd != prev->r.rt && curr->r.rd != curr->r.rt) {
		if (!reg_is_dead(list, offset, curr->r.rt) ||
		    reg_is_read_or_written(list, idx, offset, curr->r.rd))

		/* If rY is dead after the SRA, and rZ is not used after the
		 * SLL, we can change rY to rZ */
		pr_debug("Detected SLL/SRA with middle temp register\n");
		prev->r.rd = curr->r.rd;
		curr->r.rt = prev->r.rd;

	/* We got a SLL/SRA combo. If imm #16, that's a cast to s16.
	 * If imm #24, that's a cast to s8.
	 *
	 * First of all, make sure that the target register of the SLL is not
	 * read before the SRA. */
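	/* Illustration (register names are examples): the 16-bit cast
	 *
	 *	sll	$v0, $v1, 16
	 *	sra	$v0, $v0, 16
	 *
	 * is rewritten below to the META_EXTS meta-opcode (META_EXTC for the
	 * 8-bit #24 form), with rs taken from the SLL source and rt from the
	 * SRA destination. */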
	if (prev->r.rd == prev->r.rt) {

	/* rX is used after the SRA - we cannot convert it. */
	if (prev->r.rd != curr->r.rd && !reg_is_dead(list, offset, prev->r.rd))

	idx2 = find_prev_writer(list, idx, prev->r.rt);

	/* Note that PSX games sometimes do casts after
	 * a LHU or LBU; in this case we can change the
	 * load opcode to a LH or LB, and the cast can
	 * be changed to a MOV or a simple NOP. */
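	/* Illustration: LHU $v0, 0($a0); SLL $v0, $v0, 16; SRA $v0, $v0, 16
	 * becomes LH $v0, 0($a0), with the shift pair dropped or replaced by
	 * a MOV when the load and the cast target different registers. */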
		if (curr->r.rd != prev2->i.rt &&
		    !reg_is_dead(list, offset, prev2->i.rt))
		else if (curr->r.imm == 16 && prev2->i.op == OP_LHU)
		else if (curr->r.imm == 24 && prev2->i.op == OP_LBU)

		if (curr->r.rd == prev2->i.rt) {
			to_change->opcode = 0;
		} else if (reg_is_dead(list, offset, prev2->i.rt) &&
			   !reg_is_read_or_written(list, idx2 + 1, offset, curr->r.rd)) {
			/* The target register of the SRA is dead after the
			 * LBU/LHU; we can change the target register of the
			 * LBU/LHU to the one of the SRA. */
			prev2->i.rt = curr->r.rd;
			to_change->opcode = 0;
			to_change->i.op = OP_META_MOV;
			to_change->r.rd = curr->r.rd;
			to_change->r.rs = prev2->i.rt;

		if (to_nop->r.imm == 24)
			pr_debug("Convert LBU+SLL+SRA to LB\n");
			pr_debug("Convert LHU+SLL+SRA to LH\n");

	pr_debug("Convert SLL/SRA #%u to EXT%c\n",
		 prev->r.imm == 24 ? 'C' : 'S');

	if (to_change == prev) {
		to_change->i.rs = prev->r.rt;
		to_change->i.rt = curr->r.rd;
		to_change->i.rt = curr->r.rd;
		to_change->i.rs = prev->r.rt;

	if (to_nop->r.imm == 24)
		to_change->i.op = OP_META_EXTC;
		to_change->i.op = OP_META_EXTS;

static int lightrec_transform_ops(struct lightrec_state *state, struct block *block)
	struct opcode *list = block->opcode_list;
	struct opcode *prev, *op = NULL;
	u32 values[32] = { 0 };

	for (i = 0; i < block->nb_ops; i++) {
		known = lightrec_propagate_consts(op, prev, known, values);

		/* Transform all opcodes detected as useless to real NOPs
		 * (0x0: SLL r0, r0, #0) */
		if (op->opcode != 0 && is_nop(op->c)) {
			pr_debug("Converting useless opcode 0x%08x to NOP\n",

		if (op->i.rs == op->i.rt) {
		} else if (op->i.rs == 0) {

		if (!(op->flags & LIGHTREC_SYNC) &&
		    (known & BIT(op->i.rt)) &&
		    values[op->i.rt] == op->i.imm << 16) {
			pr_debug("Converting duplicated LUI to NOP\n");

		if (op->i.imm != 0 || op->i.rt == 0)

		reader = find_next_reader(list, i + 1, op->i.rt);
		    (opcode_writes_register(list[reader].c, op->i.rt) ||
		     reg_is_dead(list, reader, op->i.rt))) {
			pr_debug("Removing useless LUI 0x0\n");

			if (list[reader].i.rs == op->i.rt)
				list[reader].i.rs = 0;
			if (list[reader].i.op == OP_SPECIAL &&
			    list[reader].i.rt == op->i.rt)
				list[reader].i.rt = 0;
		/* Transform ORI/ADDI/ADDIU with imm #0 or OR/ADD/ADDU/SUB/SUBU
		 * with register $zero to the MOV meta-opcode */
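		/* e.g. ADDIU $t0, $t1, 0 and OR $t0, $t1, $zero both become
		 * META_MOV $t0, $t1. */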
		if (op->i.imm == 0) {
			pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
			op->i.op = OP_META_MOV;
		if (op->r.imm == 0) {
			pr_debug("Convert SRA #0 to MOV\n");
			op->i.op = OP_META_MOV;
		lightrec_optimize_sll_sra(block->opcode_list, i);
		if (op->r.imm == 0) {
			pr_debug("Convert SLL/SRL #0 to MOV\n");
			op->i.op = OP_META_MOV;
		case OP_SPECIAL_ADDU:
			pr_debug("Convert OR/ADD $zero to MOV\n");
			op->i.op = OP_META_MOV;
		case OP_SPECIAL_SUB: /* fall-through */
		case OP_SPECIAL_SUBU:
			pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
			op->i.op = OP_META_MOV;
		default: /* fall-through */
		default: /* fall-through */
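/* A branch and the opcode in its delay slot can be swapped when neither one
 * reads or writes a register the other depends on (checked per opcode class
 * below); the delay-slot opcode is then emitted first and the branch is
 * compiled with LIGHTREC_NO_DS set. */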
static int lightrec_switch_delay_slots(struct lightrec_state *state, struct block *block)
	struct opcode *list, *next = &block->opcode_list[0];
	union code op, next_op;

	for (i = 0; i < block->nb_ops - 1; i++) {
		next = &block->opcode_list[i + 1];

		if (!has_delay_slot(op) ||
		    list->flags & (LIGHTREC_NO_DS | LIGHTREC_EMULATE_BRANCH) ||
		    op.opcode == 0 || next_op.opcode == 0)

		if (i && has_delay_slot(block->opcode_list[i - 1].c) &&
		    !(block->opcode_list[i - 1].flags & LIGHTREC_NO_DS))

		if ((list->flags & LIGHTREC_SYNC) ||
		    (next->flags & LIGHTREC_SYNC))

		switch (list->i.op) {
		case OP_SPECIAL_JALR:
			if (opcode_reads_register(next_op, op.r.rd) ||
			    opcode_writes_register(next_op, op.r.rd))
		case OP_SPECIAL_JR: /* fall-through */
			if (opcode_writes_register(next_op, op.r.rs))
		default: /* fall-through */
		case OP_J: /* fall-through */
			if (opcode_reads_register(next_op, 31) ||
			    opcode_writes_register(next_op, 31))
			if (op.i.rt && opcode_writes_register(next_op, op.i.rt))
		case OP_BLEZ: /* fall-through */
			if (op.i.rs && opcode_writes_register(next_op, op.i.rs))
		case OP_REGIMM_BLTZAL:
		case OP_REGIMM_BGEZAL:
			if (opcode_reads_register(next_op, 31) ||
			    opcode_writes_register(next_op, 31))
		case OP_REGIMM_BLTZ: /* fall-through */
		case OP_REGIMM_BGEZ:
			    opcode_writes_register(next_op, op.i.rs))
		default: /* fall-through */

		pr_debug("Swap branch and delay slot opcodes "
			 "at offsets 0x%x / 0x%x\n",
			 i << 2, (i + 1) << 2);

		flags = next->flags;
		next->flags = list->flags | LIGHTREC_NO_DS;
		list->flags = flags | LIGHTREC_NO_DS;
static int shrink_opcode_list(struct lightrec_state *state, struct block *block, u16 new_size)
	struct opcode *list;

	if (new_size >= block->nb_ops) {
		pr_err("Invalid shrink size (%u vs %u)\n",
		       new_size, block->nb_ops);

	list = lightrec_malloc(state, MEM_FOR_IR,
			       sizeof(*list) * new_size);
		pr_err("Unable to allocate memory\n");

	memcpy(list, block->opcode_list, sizeof(*list) * new_size);

	lightrec_free_opcode_list(state, block);
	block->opcode_list = list;
	block->nb_ops = new_size;

	pr_debug("Shrunk opcode list of block PC 0x%08x to %u opcodes\n",
		 block->pc, new_size);
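/* A branch is "impossible" to recompile directly when its delay slot contains
 * another branch, a load, or an RFE; such branches are flagged with
 * LIGHTREC_EMULATE_BRANCH and emulated instead of being recompiled. */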
static int lightrec_detect_impossible_branches(struct lightrec_state *state,
					       struct block *block)
	struct opcode *op, *next = &block->opcode_list[0];

	for (i = 0; i < block->nb_ops - 1; i++) {
		next = &block->opcode_list[i + 1];

		if (!has_delay_slot(op->c) ||
		    (!load_in_delay_slot(next->c) &&
		     !has_delay_slot(next->c) &&
		     !(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE)))

		if (op->c.opcode == next->c.opcode) {
			/* The delay slot is the exact same opcode as the branch
			 * opcode: this is effectively a NOP */

		op->flags |= LIGHTREC_EMULATE_BRANCH;

		if (op == block->opcode_list) {
			pr_debug("First opcode of block PC 0x%08x is an impossible branch\n",

			/* If the first opcode is an 'impossible' branch, we
			 * only keep the first two opcodes of the block (the
			 * branch itself + its delay slot) */
			if (block->nb_ops > 2)
				ret = shrink_opcode_list(state, block, 2);
static int lightrec_local_branches(struct lightrec_state *state, struct block *block)
	struct opcode *list;

	for (i = 0; i < block->nb_ops; i++) {
		list = &block->opcode_list[i];

		if (should_emulate(list))

		switch (list->i.op) {
			offset = i + 1 + (s16)list->i.imm;
			if (offset >= 0 && offset < block->nb_ops)
		default: /* fall-through */

		pr_debug("Found local branch to offset 0x%x\n", offset << 2);

		if (should_emulate(&block->opcode_list[offset])) {
			pr_debug("Branch target must be emulated - skip\n");

		if (offset && has_delay_slot(block->opcode_list[offset - 1].c)) {
			pr_debug("Branch target is a delay slot - skip\n");

		pr_debug("Adding sync at offset 0x%x\n", offset << 2);

		block->opcode_list[offset].flags |= LIGHTREC_SYNC;
		list->flags |= LIGHTREC_LOCAL_BRANCH;
bool has_delay_slot(union code op)
	case OP_SPECIAL_JALR:

bool should_emulate(const struct opcode *list)
{
	return has_delay_slot(list->c) &&
		(list->flags & LIGHTREC_EMULATE_BRANCH);
}
static void lightrec_add_unload(struct opcode *op, u8 reg)
	if (op->i.op == OP_SPECIAL && reg == op->r.rd)
		op->flags |= LIGHTREC_UNLOAD_RD;

	if (op->i.rs == reg)
		op->flags |= LIGHTREC_UNLOAD_RS;
	if (op->i.rt == reg)
		op->flags |= LIGHTREC_UNLOAD_RT;
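/* For each register (GPRs plus LO/HI), find the last opcode of the block that
 * uses it and tag that opcode with the matching LIGHTREC_UNLOAD_* flag, so
 * that the code generator can release the register as soon as it is dead. */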
static int lightrec_early_unload(struct lightrec_state *state, struct block *block)
	unsigned int i, offset;

	for (reg = 1; reg < 34; reg++) {
		int last_r_id = -1, last_w_id = -1;

		for (i = 0; i < block->nb_ops; i++) {
			union code c = block->opcode_list[i].c;

			if (opcode_reads_register(c, reg))
			if (opcode_writes_register(c, reg))

		if (last_w_id > last_r_id)
			offset = (unsigned int)last_w_id;
		else if (last_r_id >= 0)
			offset = (unsigned int)last_r_id;

		op = &block->opcode_list[offset];

		if (has_delay_slot(op->c) && (op->flags & LIGHTREC_NO_DS))

		if (offset == block->nb_ops)

		lightrec_add_unload(&block->opcode_list[offset], reg);
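/* lightrec_flag_io() uses constant propagation to classify memory accesses:
 * stores that cannot hit a code page are marked as not requiring
 * invalidation, stores landing inside the current block mark it as
 * self-modifying, and accesses whose target map is known (RAM, BIOS,
 * scratchpad or hardware I/O) are flagged with their access type so the
 * emitter can generate a direct access. */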
static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
	const struct lightrec_mem_map *map;
	struct opcode *prev2, *prev = NULL, *list = NULL;
	u32 values[32] = { 0 };

	for (i = 0; i < block->nb_ops; i++) {
		list = &block->opcode_list[i];

		known = lightrec_propagate_consts(list, prev, known, values);

		switch (list->i.op) {
			if (OPT_FLAG_STORES) {
				/* Mark all store operations that target $sp or $gp
				 * as not requiring code invalidation. This is based
				 * on the heuristic that stores using one of these
				 * registers as the address will never hit a code page. */
				if (list->i.rs >= 28 && list->i.rs <= 29 &&
				    !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
					pr_debug("Flagging opcode 0x%08x as not "
						 "requiring invalidation\n",
					list->flags |= LIGHTREC_NO_INVALIDATE;

			/* Detect writes whose destination address is inside the
			 * current block, using constant propagation. When these
			 * occur, we mark the block as not compilable. */
			if ((known & BIT(list->i.rs)) &&
			    kunseg(values[list->i.rs]) >= kunseg(block->pc) &&
			    kunseg(values[list->i.rs]) < (kunseg(block->pc) +
							  block->nb_ops * 4)) {
				pr_debug("Self-modifying block detected\n");
				block->flags |= BLOCK_NEVER_COMPILE;
				list->flags |= LIGHTREC_SMC;
		case OP_SWL: /* fall-through */
			if (OPT_FLAG_IO && (known & BIT(list->i.rs))) {
				if (prev && prev->i.op == OP_LUI &&
				    !(prev2 && has_delay_slot(prev2->c)) &&
				    prev->i.rt == list->i.rs &&
				    list->i.rt == list->i.rs &&
				    prev->i.imm & 0x8000) {
					pr_debug("Convert LUI at offset 0x%x to kuseg\n",

					val = kunseg(prev->i.imm << 16);
					prev->i.imm = val >> 16;
					values[list->i.rs] = val;

				val = values[list->i.rs] + (s16) list->i.imm;
				map = lightrec_get_map(state, NULL, kunseg(val));

				if (!map || map->ops ||
				    map == &state->maps[PSX_MAP_PARALLEL_PORT]) {
					pr_debug("Flagging opcode %u as I/O access\n",
					list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);

				if (val - map->pc < map->length)
					list->flags |= LIGHTREC_NO_MASK;

				if (map == &state->maps[PSX_MAP_KERNEL_USER_RAM]) {
1320 pr_debug("Flaging opcode %u as RAM access\n", i);
1321 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_RAM);
1322 } else if (map == &state->maps[PSX_MAP_BIOS]) {
1323 pr_debug("Flaging opcode %u as BIOS access\n", i);
1324 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_BIOS);
1325 } else if (map == &state->maps[PSX_MAP_SCRATCH_PAD]) {
1326 pr_debug("Flaging opcode %u as scratchpad access\n", i);
1327 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_SCRATCH);
1330 default: /* fall-through */
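/* get_mfhi_mflo_reg() looks for a general-purpose register that can receive
 * the HI or LO result of a MULT/DIV directly, so that the following
 * MFHI/MFLO can be dropped. A return value of 0 means the half is unused,
 * REG_HI/REG_LO means the value must stay in the dedicated register, and any
 * other value is the GPR to use (semantics inferred from the callers in
 * lightrec_flag_mults_divs()). */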
static u8 get_mfhi_mflo_reg(const struct block *block, u16 offset,
			    const struct opcode *last,
			    u32 mask, bool sync, bool mflo, bool another)
	const struct opcode *op, *next = &block->opcode_list[offset];
	u8 reg2, reg = mflo ? REG_LO : REG_HI;

	for (i = offset; i < block->nb_ops; i++) {
		next = &block->opcode_list[i + 1];

		/* If any other opcode writes or reads the register
		 * we'd use, then we cannot use it anymore. */
		mask |= opcode_read_mask(op->c);
		mask |= opcode_write_mask(op->c);

		if (op->flags & LIGHTREC_SYNC)

		/* TODO: handle backwards branches too */
		    (op->flags & LIGHTREC_LOCAL_BRANCH) &&
		    (s16)op->c.i.imm >= 0) {
			branch_offset = i + 1 + (s16)op->c.i.imm
				- !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));

			reg = get_mfhi_mflo_reg(block, branch_offset, NULL,
						mask, sync, mflo, false);
			reg2 = get_mfhi_mflo_reg(block, offset + 1, next,
						 mask, sync, mflo, false);
			if (reg > 0 && reg == reg2)

			return mflo ? REG_LO : REG_HI;
		case OP_SPECIAL_MULT:
		case OP_SPECIAL_MULTU:
		case OP_SPECIAL_DIV:
		case OP_SPECIAL_DIVU:
		case OP_SPECIAL_MTHI:
		case OP_SPECIAL_MTLO:
			    !(op->flags & LIGHTREC_NO_DS) &&
			    (next->i.op == OP_SPECIAL) &&
			    ((!mflo && next->r.op == OP_SPECIAL_MFHI) ||
			     (mflo && next->r.op == OP_SPECIAL_MFLO)))
		case OP_SPECIAL_JALR:
		case OP_SPECIAL_MFHI:
			/* Must use REG_HI if there is another MFHI target */
			reg2 = get_mfhi_mflo_reg(block, i + 1, next,
						 0, sync, mflo, true);
			if (reg2 > 0 && reg2 != REG_HI)

			if (!sync && !(old_mask & BIT(op->r.rd)))
		case OP_SPECIAL_MFLO:
			/* Must use REG_LO if there is another MFLO target */
			reg2 = get_mfhi_mflo_reg(block, i + 1, next,
						 0, sync, mflo, true);
			if (reg2 > 0 && reg2 != REG_LO)

			if (!sync && !(old_mask & BIT(op->r.rd)))
static void lightrec_replace_lo_hi(struct block *block, u16 offset,
	/* This function will remove the following MFLO/MFHI. It must be called
	 * only if get_mfhi_mflo_reg() returned a non-zero value. */

	for (i = offset; i < last; i++) {
		struct opcode *op = &block->opcode_list[i];

		/* TODO: handle backwards branches too */
		if ((op->flags & LIGHTREC_LOCAL_BRANCH) &&
		    (s16)op->c.i.imm >= 0) {
			branch_offset = i + 1 + (s16)op->c.i.imm
				- !!(OPT_SWITCH_DELAY_SLOTS && (op->flags & LIGHTREC_NO_DS));

			lightrec_replace_lo_hi(block, branch_offset, last, lo);
			lightrec_replace_lo_hi(block, i + 1, branch_offset, lo);

		if (lo && op->r.op == OP_SPECIAL_MFLO) {
			pr_debug("Removing MFLO opcode at offset 0x%x\n",
		} else if (!lo && op->r.op == OP_SPECIAL_MFHI) {
			pr_debug("Removing MFHI opcode at offset 0x%x\n",

static bool lightrec_always_skip_div_check(void)
static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block *block)
	struct opcode *prev, *list = NULL;
	u32 values[32] = { 0 };

	for (i = 0; i < block->nb_ops - 1; i++) {
		list = &block->opcode_list[i];

		known = lightrec_propagate_consts(list, prev, known, values);

		if (list->i.op != OP_SPECIAL)

		switch (list->r.op) {
		case OP_SPECIAL_DIV:
		case OP_SPECIAL_DIVU:
			/* If we are dividing by a non-zero constant, don't
			 * emit the div-by-zero check. */
			if (lightrec_always_skip_div_check() ||
			    (known & BIT(list->c.r.rt) && values[list->c.r.rt]))
				list->flags |= LIGHTREC_NO_DIV_CHECK;
		case OP_SPECIAL_MULT: /* fall-through */
		case OP_SPECIAL_MULTU:

		/* Don't support opcodes in delay slots */
		if ((i && has_delay_slot(block->opcode_list[i - 1].c)) ||
		    (list->flags & LIGHTREC_NO_DS)) {

		reg_lo = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, true, false);
			pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
				 " not writing LO\n", i << 2);
			list->flags |= LIGHTREC_NO_LO;

		reg_hi = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, false, false);
			pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
				 " not writing HI\n", i << 2);
			list->flags |= LIGHTREC_NO_HI;

		if (!reg_lo && !reg_hi) {
			pr_debug("Both LO/HI unused in this block, they will "
				 "probably be used in parent block - removing "
			list->flags &= ~(LIGHTREC_NO_LO | LIGHTREC_NO_HI);

		if (reg_lo > 0 && reg_lo != REG_LO) {
			pr_debug("Found register %s to hold LO (rs = %u, rt = %u)\n",
				 lightrec_reg_name(reg_lo), list->r.rs, list->r.rt);

			lightrec_replace_lo_hi(block, i + 1, block->nb_ops, true);
			list->r.rd = reg_lo;

		if (reg_hi > 0 && reg_hi != REG_HI) {
			pr_debug("Found register %s to hold HI (rs = %u, rt = %u)\n",
				 lightrec_reg_name(reg_hi), list->r.rs, list->r.rt);

			lightrec_replace_lo_hi(block, i + 1, block->nb_ops, false);
			list->r.imm = reg_hi;
static bool remove_div_sequence(struct block *block, unsigned int offset)
	unsigned int i, found = 0;

	/*
	 * Scan for the zero-checking sequence that GCC automatically
	 * introduces after most DIV/DIVU opcodes. This sequence checks the
	 * value of the divisor, and if it is zero, executes a BREAK opcode,
	 * causing the BIOS handler to crash the PS1.
	 *
	 * For DIV opcodes, this sequence additionally checks that the signed
	 * operation does not overflow.
	 *
	 * With the assumption that the games never crashed the PS1, we can
	 * therefore assume that the games never divided by zero or
	 * overflowed, and these sequences can be removed.
	 */
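	/*
	 * For reference, the DIVU guard matched below typically looks like
	 * this (register names are illustrative):
	 *
	 *	divu	$a0, $a1
	 *	bne	$a1, $zero, 1f
	 *	nop
	 *	break	0x1c00
	 * 1:
	 *
	 * The signed DIV variant additionally compares the divisor against -1
	 * and the dividend against 0x80000000 before a second BREAK.
	 */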
	for (i = offset; i < block->nb_ops; i++) {
		op = &block->opcode_list[i];

		if (op->i.op == OP_SPECIAL &&
		    (op->r.op == OP_SPECIAL_DIV || op->r.op == OP_SPECIAL_DIVU))

		if ((op->opcode & 0xfc1fffff) == 0x14000002) {
			/* BNE ???, zero, +8 */
		} else if (found == 1 && !op->opcode) {
		} else if (found == 2 && op->opcode == 0x0007000d) {
		} else if (found == 3 && op->opcode == 0x2401ffff) {
		} else if (found == 4 && (op->opcode & 0xfc1fffff) == 0x14010004) {
			/* BNE ???, at, +16 */
		} else if (found == 5 && op->opcode == 0x3c018000) {
			/* LUI at, 0x8000 */
		} else if (found == 6 && (op->opcode & 0x141fffff) == 0x14010002) {
			/* BNE ???, at, +8 */
		} else if (found == 7 && !op->opcode) {
		} else if (found == 8 && op->opcode == 0x0006000d) {

	pr_debug("Removing DIV%s sequence at offset 0x%x\n",
		 found == 9 ? "" : "U", offset << 2);

	for (i = 0; i < found; i++)
		block->opcode_list[offset + i].opcode = 0;
static int lightrec_remove_div_by_zero_check_sequence(struct lightrec_state *state,
						      struct block *block)
	for (i = 0; i < block->nb_ops; i++) {
		op = &block->opcode_list[i];

		if (op->i.op == OP_SPECIAL &&
		    (op->r.op == OP_SPECIAL_DIVU || op->r.op == OP_SPECIAL_DIV) &&
		    remove_div_sequence(block, i + 1))
			op->flags |= LIGHTREC_NO_DIV_CHECK;
static const u32 memset_code[] = {
	0x10a00006,	// beqz a1, 2f
	0x24a2ffff,	// addiu v0,a1,-1
	0x2403ffff,	// li v1,-1
	0xac800000,	// 1: sw zero,0(a0)
	0x2442ffff,	// addiu v0,v0,-1
	0x1443fffd,	// bne v0,v1, 1b
	0x24840004,	// addiu a0,a0,4
	0x03e00008,	// 2: jr ra

static int lightrec_replace_memset(struct lightrec_state *state, struct block *block)
	for (i = 0; i < block->nb_ops; i++) {
		c = block->opcode_list[i].c;

		if (c.opcode != memset_code[i])

		if (i == ARRAY_SIZE(memset_code) - 1) {
			pr_debug("Block at PC 0x%x is a memset\n", block->pc);
			block->flags |= BLOCK_IS_MEMSET | BLOCK_NEVER_COMPILE;

			/* Return non-zero to skip other optimizers. */
static int (*lightrec_optimizers[])(struct lightrec_state *state, struct block *) = {
	IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence),
	IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset),
	IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches),
	IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches),
	IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops),
	IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots),
	IF_OPT(OPT_FLAG_IO || OPT_FLAG_STORES, &lightrec_flag_io),
	IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs),
	IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload),
};

int lightrec_optimize(struct lightrec_state *state, struct block *block)
	for (i = 0; i < ARRAY_SIZE(lightrec_optimizers); i++) {
		if (lightrec_optimizers[i]) {
			ret = (*lightrec_optimizers[i])(state, block);