git subrepo pull (merge) --force deps/lightning
[pcsx_rearmed.git] / deps / lightrec / optimizer.c
CommitLineData
98fa08a5 1// SPDX-License-Identifier: LGPL-2.1-or-later
d16005f8 2/*
98fa08a5 3 * Copyright (C) 2014-2021 Paul Cercueil <paul@crapouillou.net>
d16005f8
PC
4 */
5
9259d748 6#include "constprop.h"
98fa08a5 7#include "lightrec-config.h"
d16005f8
PC
8#include "disassembler.h"
9#include "lightrec.h"
10#include "memmanager.h"
11#include "optimizer.h"
12#include "regcache.h"
13
14#include <errno.h>
15#include <stdbool.h>
16#include <stdlib.h>
98fa08a5
PC
17#include <string.h>
18
19#define IF_OPT(opt, ptr) ((opt) ? (ptr) : NULL)
d16005f8
PC
20
/* A table of optimizer callbacks, each taking a pointer to a struct opcode.
 * NOTE(review): nb_optimizers is presumably the number of entries in the
 * optimizers array - confirm against the users of this structure. */
struct optimizer_list {
	/* Pointer to an array of function pointers */
	void (**optimizers)(struct opcode *);
	unsigned int nb_optimizers;
};
25
98fa08a5
PC
26static bool is_nop(union code op);
27
28bool is_unconditional_jump(union code c)
29{
30 switch (c.i.op) {
31 case OP_SPECIAL:
32 return c.r.op == OP_SPECIAL_JR || c.r.op == OP_SPECIAL_JALR;
33 case OP_J:
34 case OP_JAL:
35 return true;
36 case OP_BEQ:
37 case OP_BLEZ:
38 return c.i.rs == c.i.rt;
39 case OP_REGIMM:
40 return (c.r.rt == OP_REGIMM_BGEZ ||
41 c.r.rt == OP_REGIMM_BGEZAL) && c.i.rs == 0;
42 default:
43 return false;
44 }
45}
46
47bool is_syscall(union code c)
48{
49 return (c.i.op == OP_SPECIAL && c.r.op == OP_SPECIAL_SYSCALL) ||
50 (c.i.op == OP_CP0 && (c.r.rs == OP_CP0_MTC0 ||
51 c.r.rs == OP_CP0_CTC0) &&
52 (c.r.rd == 12 || c.r.rd == 13));
53}
54
55static u64 opcode_read_mask(union code op)
d16005f8
PC
56{
57 switch (op.i.op) {
58 case OP_SPECIAL:
59 switch (op.r.op) {
60 case OP_SPECIAL_SYSCALL:
61 case OP_SPECIAL_BREAK:
98fa08a5 62 return 0;
d16005f8
PC
63 case OP_SPECIAL_JR:
64 case OP_SPECIAL_JALR:
65 case OP_SPECIAL_MTHI:
66 case OP_SPECIAL_MTLO:
98fa08a5 67 return BIT(op.r.rs);
d16005f8 68 case OP_SPECIAL_MFHI:
98fa08a5 69 return BIT(REG_HI);
d16005f8 70 case OP_SPECIAL_MFLO:
98fa08a5 71 return BIT(REG_LO);
d16005f8 72 case OP_SPECIAL_SLL:
03535202
PC
73 if (!op.r.imm)
74 return 0;
75 fallthrough;
d16005f8
PC
76 case OP_SPECIAL_SRL:
77 case OP_SPECIAL_SRA:
98fa08a5 78 return BIT(op.r.rt);
d16005f8 79 default:
98fa08a5 80 return BIT(op.r.rs) | BIT(op.r.rt);
d16005f8
PC
81 }
82 case OP_CP0:
83 switch (op.r.rs) {
84 case OP_CP0_MTC0:
85 case OP_CP0_CTC0:
98fa08a5 86 return BIT(op.r.rt);
d16005f8 87 default:
98fa08a5 88 return 0;
d16005f8
PC
89 }
90 case OP_CP2:
91 if (op.r.op == OP_CP2_BASIC) {
92 switch (op.r.rs) {
93 case OP_CP2_BASIC_MTC2:
94 case OP_CP2_BASIC_CTC2:
98fa08a5 95 return BIT(op.r.rt);
d16005f8 96 default:
98fa08a5 97 break;
d16005f8 98 }
d16005f8 99 }
98fa08a5 100 return 0;
d16005f8
PC
101 case OP_J:
102 case OP_JAL:
103 case OP_LUI:
98fa08a5 104 return 0;
d16005f8 105 case OP_BEQ:
03535202
PC
106 if (op.i.rs == op.i.rt)
107 return 0;
108 fallthrough;
d16005f8
PC
109 case OP_BNE:
110 case OP_LWL:
111 case OP_LWR:
112 case OP_SB:
113 case OP_SH:
114 case OP_SWL:
115 case OP_SW:
116 case OP_SWR:
98fa08a5 117 return BIT(op.i.rs) | BIT(op.i.rt);
cb72ea13
PC
118 case OP_META:
119 return BIT(op.m.rs);
d16005f8 120 default:
98fa08a5 121 return BIT(op.i.rs);
d16005f8
PC
122 }
123}
124
ba3814c1 125static u64 mult_div_write_mask(union code op)
d16005f8 126{
98fa08a5
PC
127 u64 flags;
128
ba3814c1
PC
129 if (!OPT_FLAG_MULT_DIV)
130 return BIT(REG_LO) | BIT(REG_HI);
131
132 if (op.r.rd)
133 flags = BIT(op.r.rd);
134 else
135 flags = BIT(REG_LO);
136 if (op.r.imm)
137 flags |= BIT(op.r.imm);
138 else
139 flags |= BIT(REG_HI);
140
141 return flags;
142}
143
cb72ea13 144u64 opcode_write_mask(union code op)
ba3814c1 145{
d16005f8 146 switch (op.i.op) {
ba3814c1
PC
147 case OP_META_MULT2:
148 case OP_META_MULTU2:
149 return mult_div_write_mask(op);
cb72ea13
PC
150 case OP_META:
151 return BIT(op.m.rd);
d16005f8
PC
152 case OP_SPECIAL:
153 switch (op.r.op) {
154 case OP_SPECIAL_JR:
d16005f8
PC
155 case OP_SPECIAL_SYSCALL:
156 case OP_SPECIAL_BREAK:
98fa08a5 157 return 0;
d16005f8
PC
158 case OP_SPECIAL_MULT:
159 case OP_SPECIAL_MULTU:
160 case OP_SPECIAL_DIV:
161 case OP_SPECIAL_DIVU:
ba3814c1 162 return mult_div_write_mask(op);
d16005f8 163 case OP_SPECIAL_MTHI:
98fa08a5 164 return BIT(REG_HI);
d16005f8 165 case OP_SPECIAL_MTLO:
98fa08a5 166 return BIT(REG_LO);
03535202
PC
167 case OP_SPECIAL_SLL:
168 if (!op.r.imm)
169 return 0;
170 fallthrough;
d16005f8 171 default:
98fa08a5 172 return BIT(op.r.rd);
d16005f8
PC
173 }
174 case OP_ADDI:
175 case OP_ADDIU:
176 case OP_SLTI:
177 case OP_SLTIU:
178 case OP_ANDI:
179 case OP_ORI:
180 case OP_XORI:
181 case OP_LUI:
182 case OP_LB:
183 case OP_LH:
184 case OP_LWL:
185 case OP_LW:
186 case OP_LBU:
187 case OP_LHU:
188 case OP_LWR:
98fa08a5
PC
189 return BIT(op.i.rt);
190 case OP_JAL:
191 return BIT(31);
d16005f8
PC
192 case OP_CP0:
193 switch (op.r.rs) {
194 case OP_CP0_MFC0:
195 case OP_CP0_CFC0:
98fa08a5 196 return BIT(op.i.rt);
d16005f8 197 default:
98fa08a5 198 return 0;
d16005f8
PC
199 }
200 case OP_CP2:
201 if (op.r.op == OP_CP2_BASIC) {
202 switch (op.r.rs) {
203 case OP_CP2_BASIC_MFC2:
204 case OP_CP2_BASIC_CFC2:
98fa08a5 205 return BIT(op.i.rt);
d16005f8 206 default:
98fa08a5 207 break;
d16005f8 208 }
98fa08a5
PC
209 }
210 return 0;
211 case OP_REGIMM:
212 switch (op.r.rt) {
213 case OP_REGIMM_BLTZAL:
214 case OP_REGIMM_BGEZAL:
215 return BIT(31);
216 default:
217 return 0;
d16005f8 218 }
d16005f8 219 default:
98fa08a5
PC
220 return 0;
221 }
222}
223
224bool opcode_reads_register(union code op, u8 reg)
225{
226 return opcode_read_mask(op) & BIT(reg);
227}
228
229bool opcode_writes_register(union code op, u8 reg)
230{
231 return opcode_write_mask(op) & BIT(reg);
232}
233
234static int find_prev_writer(const struct opcode *list, unsigned int offset, u8 reg)
235{
236 union code c;
237 unsigned int i;
238
03535202 239 if (op_flag_sync(list[offset].flags))
98fa08a5
PC
240 return -1;
241
242 for (i = offset; i > 0; i--) {
243 c = list[i - 1].c;
244
245 if (opcode_writes_register(c, reg)) {
246 if (i > 1 && has_delay_slot(list[i - 2].c))
247 break;
248
249 return i - 1;
250 }
251
03535202 252 if (op_flag_sync(list[i - 1].flags) ||
98fa08a5
PC
253 has_delay_slot(c) ||
254 opcode_reads_register(c, reg))
255 break;
256 }
257
258 return -1;
259}
260
261static int find_next_reader(const struct opcode *list, unsigned int offset, u8 reg)
262{
263 unsigned int i;
264 union code c;
265
03535202 266 if (op_flag_sync(list[offset].flags))
98fa08a5
PC
267 return -1;
268
269 for (i = offset; ; i++) {
270 c = list[i].c;
271
9259d748 272 if (opcode_reads_register(c, reg))
98fa08a5 273 return i;
98fa08a5 274
9259d748
PC
275 if (op_flag_sync(list[i].flags)
276 || (op_flag_no_ds(list[i].flags) && has_delay_slot(c))
277 || is_delay_slot(list, i)
278 || opcode_writes_register(c, reg))
98fa08a5
PC
279 break;
280 }
281
282 return -1;
283}
284
285static bool reg_is_dead(const struct opcode *list, unsigned int offset, u8 reg)
286{
287 unsigned int i;
288
9259d748 289 if (op_flag_sync(list[offset].flags) || is_delay_slot(list, offset))
d16005f8 290 return false;
98fa08a5
PC
291
292 for (i = offset + 1; ; i++) {
293 if (opcode_reads_register(list[i].c, reg))
294 return false;
295
296 if (opcode_writes_register(list[i].c, reg))
297 return true;
298
299 if (has_delay_slot(list[i].c)) {
03535202 300 if (op_flag_no_ds(list[i].flags) ||
22eee2ac 301 opcode_reads_register(list[i + 1].c, reg))
98fa08a5
PC
302 return false;
303
304 return opcode_writes_register(list[i + 1].c, reg);
305 }
d16005f8
PC
306 }
307}
308
98fa08a5
PC
309static bool reg_is_read(const struct opcode *list,
310 unsigned int a, unsigned int b, u8 reg)
311{
312 /* Return true if reg is read in one of the opcodes of the interval
313 * [a, b[ */
314 for (; a < b; a++) {
315 if (!is_nop(list[a].c) && opcode_reads_register(list[a].c, reg))
316 return true;
317 }
318
319 return false;
320}
321
322static bool reg_is_written(const struct opcode *list,
323 unsigned int a, unsigned int b, u8 reg)
324{
325 /* Return true if reg is written in one of the opcodes of the interval
326 * [a, b[ */
327
328 for (; a < b; a++) {
329 if (!is_nop(list[a].c) && opcode_writes_register(list[a].c, reg))
330 return true;
331 }
332
333 return false;
334}
335
336static bool reg_is_read_or_written(const struct opcode *list,
337 unsigned int a, unsigned int b, u8 reg)
338{
339 return reg_is_read(list, a, b, reg) || reg_is_written(list, a, b, reg);
340}
341
cb72ea13
PC
342bool opcode_is_mfc(union code op)
343{
344 switch (op.i.op) {
345 case OP_CP0:
346 switch (op.r.rs) {
347 case OP_CP0_MFC0:
348 case OP_CP0_CFC0:
349 return true;
350 default:
351 break;
352 }
353
354 break;
355 case OP_CP2:
356 if (op.r.op == OP_CP2_BASIC) {
357 switch (op.r.rs) {
358 case OP_CP2_BASIC_MFC2:
359 case OP_CP2_BASIC_CFC2:
360 return true;
361 default:
362 break;
363 }
364 }
365
366 break;
367 default:
368 break;
369 }
370
371 return false;
372}
373
374bool opcode_is_load(union code op)
98fa08a5
PC
375{
376 switch (op.i.op) {
377 case OP_LB:
378 case OP_LH:
379 case OP_LWL:
380 case OP_LW:
381 case OP_LBU:
382 case OP_LHU:
383 case OP_LWR:
384 case OP_LWC2:
385 return true;
386 default:
387 return false;
388 }
389}
390
391static bool opcode_is_store(union code op)
392{
393 switch (op.i.op) {
394 case OP_SB:
395 case OP_SH:
396 case OP_SW:
397 case OP_SWL:
398 case OP_SWR:
399 case OP_SWC2:
400 return true;
401 default:
402 return false;
403 }
404}
405
ba3814c1
PC
406static u8 opcode_get_io_size(union code op)
407{
408 switch (op.i.op) {
409 case OP_LB:
410 case OP_LBU:
411 case OP_SB:
412 return 8;
413 case OP_LH:
414 case OP_LHU:
415 case OP_SH:
416 return 16;
417 default:
418 return 32;
419 }
420}
421
98fa08a5
PC
422bool opcode_is_io(union code op)
423{
424 return opcode_is_load(op) || opcode_is_store(op);
425}
426
d16005f8
PC
427/* TODO: Complete */
428static bool is_nop(union code op)
429{
430 if (opcode_writes_register(op, 0)) {
431 switch (op.i.op) {
432 case OP_CP0:
433 return op.r.rs != OP_CP0_MFC0;
434 case OP_LB:
435 case OP_LH:
436 case OP_LWL:
437 case OP_LW:
438 case OP_LBU:
439 case OP_LHU:
440 case OP_LWR:
441 return false;
442 default:
443 return true;
444 }
445 }
446
447 switch (op.i.op) {
448 case OP_SPECIAL:
449 switch (op.r.op) {
450 case OP_SPECIAL_AND:
451 return op.r.rd == op.r.rt && op.r.rd == op.r.rs;
452 case OP_SPECIAL_ADD:
453 case OP_SPECIAL_ADDU:
454 return (op.r.rd == op.r.rt && op.r.rs == 0) ||
455 (op.r.rd == op.r.rs && op.r.rt == 0);
456 case OP_SPECIAL_SUB:
457 case OP_SPECIAL_SUBU:
458 return op.r.rd == op.r.rs && op.r.rt == 0;
459 case OP_SPECIAL_OR:
460 if (op.r.rd == op.r.rt)
461 return op.r.rd == op.r.rs || op.r.rs == 0;
462 else
463 return (op.r.rd == op.r.rs) && op.r.rt == 0;
464 case OP_SPECIAL_SLL:
465 case OP_SPECIAL_SRA:
466 case OP_SPECIAL_SRL:
467 return op.r.rd == op.r.rt && op.r.imm == 0;
98fa08a5
PC
468 case OP_SPECIAL_MFHI:
469 case OP_SPECIAL_MFLO:
470 return op.r.rd == 0;
d16005f8
PC
471 default:
472 return false;
473 }
474 case OP_ORI:
475 case OP_ADDI:
476 case OP_ADDIU:
477 return op.i.rt == op.i.rs && op.i.imm == 0;
478 case OP_BGTZ:
479 return (op.i.rs == 0 || op.i.imm == 1);
480 case OP_REGIMM:
481 return (op.i.op == OP_REGIMM_BLTZ ||
482 op.i.op == OP_REGIMM_BLTZAL) &&
483 (op.i.rs == 0 || op.i.imm == 1);
484 case OP_BNE:
485 return (op.i.rs == op.i.rt || op.i.imm == 1);
486 default:
487 return false;
488 }
489}
490
9259d748
PC
491static void lightrec_optimize_sll_sra(struct opcode *list, unsigned int offset,
492 struct constprop_data *v)
d16005f8 493{
9259d748 494 struct opcode *ldop = NULL, *curr = &list[offset], *next;
98fa08a5
PC
495 struct opcode *to_change, *to_nop;
496 int idx, idx2;
d16005f8 497
98fa08a5
PC
498 if (curr->r.imm != 24 && curr->r.imm != 16)
499 return;
500
9259d748
PC
501 if (is_delay_slot(list, offset))
502 return;
503
504 idx = find_next_reader(list, offset + 1, curr->r.rd);
98fa08a5
PC
505 if (idx < 0)
506 return;
507
9259d748 508 next = &list[idx];
98fa08a5 509
9259d748
PC
510 if (next->i.op != OP_SPECIAL || next->r.op != OP_SPECIAL_SRA ||
511 next->r.imm != curr->r.imm || next->r.rt != curr->r.rd)
98fa08a5 512 return;
d16005f8 513
9259d748 514 if (curr->r.rd != curr->r.rt && next->r.rd != next->r.rt) {
98fa08a5
PC
515 /* sll rY, rX, 16
516 * ...
9259d748 517 * sra rZ, rY, 16 */
d16005f8 518
9259d748
PC
519 if (!reg_is_dead(list, idx, curr->r.rd) ||
520 reg_is_read_or_written(list, offset, idx, next->r.rd))
98fa08a5
PC
521 return;
522
523 /* If rY is dead after the SRL, and rZ is not used after the SLL,
524 * we can change rY to rZ */
525
526 pr_debug("Detected SLL/SRA with middle temp register\n");
9259d748
PC
527 curr->r.rd = next->r.rd;
528 next->r.rt = curr->r.rd;
98fa08a5
PC
529 }
530
9259d748
PC
531 /* We got a SLL/SRA combo. If imm #16, that's a cast to s16.
532 * If imm #24 that's a cast to s8.
98fa08a5
PC
533 *
534 * First of all, make sure that the target register of the SLL is not
9259d748 535 * read after the SRA. */
98fa08a5 536
9259d748 537 if (curr->r.rd == curr->r.rt) {
98fa08a5
PC
538 /* sll rX, rX, 16
539 * ...
9259d748
PC
540 * sra rY, rX, 16 */
541 to_change = next;
542 to_nop = curr;
98fa08a5
PC
543
544 /* rX is used after the SRA - we cannot convert it. */
9259d748 545 if (curr->r.rd != next->r.rd && !reg_is_dead(list, idx, curr->r.rd))
98fa08a5 546 return;
d16005f8 547 } else {
98fa08a5
PC
548 /* sll rY, rX, 16
549 * ...
9259d748
PC
550 * sra rY, rY, 16 */
551 to_change = curr;
552 to_nop = next;
d16005f8
PC
553 }
554
9259d748 555 idx2 = find_prev_writer(list, offset, curr->r.rt);
98fa08a5
PC
556 if (idx2 >= 0) {
557 /* Note that PSX games sometimes do casts after
558 * a LHU or LBU; in this case we can change the
559 * load opcode to a LH or LB, and the cast can
560 * be changed to a MOV or a simple NOP. */
561
9259d748 562 ldop = &list[idx2];
98fa08a5 563
9259d748
PC
564 if (next->r.rd != ldop->i.rt &&
565 !reg_is_dead(list, idx, ldop->i.rt))
566 ldop = NULL;
567 else if (curr->r.imm == 16 && ldop->i.op == OP_LHU)
568 ldop->i.op = OP_LH;
569 else if (curr->r.imm == 24 && ldop->i.op == OP_LBU)
570 ldop->i.op = OP_LB;
98fa08a5 571 else
9259d748 572 ldop = NULL;
98fa08a5 573
9259d748
PC
574 if (ldop) {
575 if (next->r.rd == ldop->i.rt) {
98fa08a5 576 to_change->opcode = 0;
9259d748
PC
577 } else if (reg_is_dead(list, idx, ldop->i.rt) &&
578 !reg_is_read_or_written(list, idx2 + 1, idx, next->r.rd)) {
98fa08a5
PC
579 /* The target register of the SRA is dead after the
580 * LBU/LHU; we can change the target register of the
581 * LBU/LHU to the one of the SRA. */
9259d748
PC
582 v[ldop->i.rt].known = 0;
583 v[ldop->i.rt].sign = 0;
584 ldop->i.rt = next->r.rd;
98fa08a5
PC
585 to_change->opcode = 0;
586 } else {
cb72ea13
PC
587 to_change->i.op = OP_META;
588 to_change->m.op = OP_META_MOV;
589 to_change->m.rd = next->r.rd;
590 to_change->m.rs = ldop->i.rt;
98fa08a5 591 }
d16005f8 592
98fa08a5
PC
593 if (to_nop->r.imm == 24)
594 pr_debug("Convert LBU+SLL+SRA to LB\n");
595 else
596 pr_debug("Convert LHU+SLL+SRA to LH\n");
9259d748
PC
597
598 v[ldop->i.rt].known = 0;
684432ad 599 v[ldop->i.rt].sign = 0xffffff80 << (24 - curr->r.imm);
98fa08a5
PC
600 }
601 }
602
9259d748 603 if (!ldop) {
98fa08a5 604 pr_debug("Convert SLL/SRA #%u to EXT%c\n",
9259d748 605 curr->r.imm, curr->r.imm == 24 ? 'C' : 'S');
98fa08a5 606
cb72ea13
PC
607 to_change->m.rs = curr->r.rt;
608 to_change->m.op = to_nop->r.imm == 24 ? OP_META_EXTC : OP_META_EXTS;
609 to_change->i.op = OP_META;
98fa08a5
PC
610 }
611
612 to_nop->opcode = 0;
d16005f8
PC
613}
614
9259d748
PC
615static void
616lightrec_remove_useless_lui(struct block *block, unsigned int offset,
617 const struct constprop_data *v)
02487de7
PC
618{
619 struct opcode *list = block->opcode_list,
620 *op = &block->opcode_list[offset];
621 int reader;
622
9259d748
PC
623 if (!op_flag_sync(op->flags) && is_known(v, op->i.rt) &&
624 v[op->i.rt].value == op->i.imm << 16) {
02487de7
PC
625 pr_debug("Converting duplicated LUI to NOP\n");
626 op->opcode = 0x0;
627 return;
628 }
629
9259d748 630 if (op->i.imm != 0 || op->i.rt == 0 || offset == block->nb_ops - 1)
02487de7
PC
631 return;
632
633 reader = find_next_reader(list, offset + 1, op->i.rt);
634 if (reader <= 0)
635 return;
636
637 if (opcode_writes_register(list[reader].c, op->i.rt) ||
638 reg_is_dead(list, reader, op->i.rt)) {
639 pr_debug("Removing useless LUI 0x0\n");
640
641 if (list[reader].i.rs == op->i.rt)
642 list[reader].i.rs = 0;
643 if (list[reader].i.op == OP_SPECIAL &&
644 list[reader].i.rt == op->i.rt)
645 list[reader].i.rt = 0;
646 op->opcode = 0x0;
647 }
648}
649
684432ad
PC
650static void lightrec_lui_to_movi(struct block *block, unsigned int offset)
651{
652 struct opcode *ori, *lui = &block->opcode_list[offset];
653 int next;
654
655 if (lui->i.op != OP_LUI)
656 return;
657
658 next = find_next_reader(block->opcode_list, offset + 1, lui->i.rt);
659 if (next > 0) {
660 ori = &block->opcode_list[next];
661
662 switch (ori->i.op) {
663 case OP_ORI:
664 case OP_ADDI:
665 case OP_ADDIU:
666 if (ori->i.rs == ori->i.rt && ori->i.imm) {
667 ori->flags |= LIGHTREC_MOVI;
668 lui->flags |= LIGHTREC_MOVI;
669 }
670 break;
671 }
672 }
673}
674
02487de7
PC
675static void lightrec_modify_lui(struct block *block, unsigned int offset)
676{
677 union code c, *lui = &block->opcode_list[offset].c;
678 bool stop = false, stop_next = false;
679 unsigned int i;
680
681 for (i = offset + 1; !stop && i < block->nb_ops; i++) {
682 c = block->opcode_list[i].c;
683 stop = stop_next;
684
685 if ((opcode_is_store(c) && c.i.rt == lui->i.rt)
686 || (!opcode_is_load(c) && opcode_reads_register(c, lui->i.rt)))
687 break;
688
689 if (opcode_writes_register(c, lui->i.rt)) {
cb72ea13
PC
690 if (c.i.op == OP_LWL || c.i.op == OP_LWR) {
691 /* LWL/LWR only partially write their target register;
692 * therefore the LUI should not write a different value. */
693 break;
694 }
695
02487de7 696 pr_debug("Convert LUI at offset 0x%x to kuseg\n",
684432ad 697 (i - 1) << 2);
02487de7
PC
698 lui->i.imm = kunseg(lui->i.imm << 16) >> 16;
699 break;
700 }
701
702 if (has_delay_slot(c))
703 stop_next = true;
704 }
705}
706
03535202
PC
707static int lightrec_transform_branches(struct lightrec_state *state,
708 struct block *block)
709{
710 struct opcode *op;
711 unsigned int i;
712 s32 offset;
713
714 for (i = 0; i < block->nb_ops; i++) {
715 op = &block->opcode_list[i];
716
717 switch (op->i.op) {
718 case OP_J:
719 /* Transform J opcode into BEQ $zero, $zero if possible. */
720 offset = (s32)((block->pc & 0xf0000000) >> 2 | op->j.imm)
721 - (s32)(block->pc >> 2) - (s32)i - 1;
722
723 if (offset == (s16)offset) {
724 pr_debug("Transform J into BEQ $zero, $zero\n");
725 op->i.op = OP_BEQ;
726 op->i.rs = 0;
727 op->i.rt = 0;
728 op->i.imm = offset;
729
730 }
ba3814c1
PC
731 fallthrough;
732 default:
03535202
PC
733 break;
734 }
735 }
736
737 return 0;
738}
739
ba3814c1
PC
740static inline bool is_power_of_two(u32 value)
741{
742 return popcount32(value) == 1;
743}
744
9259d748
PC
745static void lightrec_patch_known_zero(struct opcode *op,
746 const struct constprop_data *v)
747{
748 switch (op->i.op) {
749 case OP_SPECIAL:
750 switch (op->r.op) {
751 case OP_SPECIAL_JR:
752 case OP_SPECIAL_JALR:
753 case OP_SPECIAL_MTHI:
754 case OP_SPECIAL_MTLO:
755 if (is_known_zero(v, op->r.rs))
756 op->r.rs = 0;
757 break;
758 default:
759 if (is_known_zero(v, op->r.rs))
760 op->r.rs = 0;
761 fallthrough;
762 case OP_SPECIAL_SLL:
763 case OP_SPECIAL_SRL:
764 case OP_SPECIAL_SRA:
765 if (is_known_zero(v, op->r.rt))
766 op->r.rt = 0;
767 break;
768 case OP_SPECIAL_SYSCALL:
769 case OP_SPECIAL_BREAK:
770 case OP_SPECIAL_MFHI:
771 case OP_SPECIAL_MFLO:
772 break;
773 }
774 break;
775 case OP_CP0:
776 switch (op->r.rs) {
777 case OP_CP0_MTC0:
778 case OP_CP0_CTC0:
779 if (is_known_zero(v, op->r.rt))
780 op->r.rt = 0;
781 break;
782 default:
783 break;
784 }
785 break;
786 case OP_CP2:
787 if (op->r.op == OP_CP2_BASIC) {
788 switch (op->r.rs) {
789 case OP_CP2_BASIC_MTC2:
790 case OP_CP2_BASIC_CTC2:
791 if (is_known_zero(v, op->r.rt))
792 op->r.rt = 0;
793 break;
794 default:
795 break;
796 }
797 }
798 break;
799 case OP_BEQ:
800 case OP_BNE:
801 if (is_known_zero(v, op->i.rt))
802 op->i.rt = 0;
803 fallthrough;
804 case OP_REGIMM:
805 case OP_BLEZ:
806 case OP_BGTZ:
807 case OP_ADDI:
808 case OP_ADDIU:
809 case OP_SLTI:
810 case OP_SLTIU:
811 case OP_ANDI:
812 case OP_ORI:
813 case OP_XORI:
9259d748
PC
814 case OP_META_MULT2:
815 case OP_META_MULTU2:
cb72ea13
PC
816 case OP_META:
817 if (is_known_zero(v, op->m.rs))
818 op->m.rs = 0;
9259d748
PC
819 break;
820 case OP_SB:
821 case OP_SH:
822 case OP_SWL:
823 case OP_SW:
824 case OP_SWR:
825 if (is_known_zero(v, op->i.rt))
826 op->i.rt = 0;
827 fallthrough;
828 case OP_LB:
829 case OP_LH:
830 case OP_LWL:
831 case OP_LW:
832 case OP_LBU:
833 case OP_LHU:
834 case OP_LWR:
835 case OP_LWC2:
836 case OP_SWC2:
837 if (is_known(v, op->i.rs)
838 && kunseg(v[op->i.rs].value) == 0)
839 op->i.rs = 0;
840 break;
841 default:
842 break;
843 }
844}
845
846static void lightrec_reset_syncs(struct block *block)
847{
848 struct opcode *op, *list = block->opcode_list;
849 unsigned int i;
850 s32 offset;
851
852 for (i = 0; i < block->nb_ops; i++)
853 list[i].flags &= ~LIGHTREC_SYNC;
854
855 for (i = 0; i < block->nb_ops; i++) {
856 op = &list[i];
857
cb72ea13
PC
858 if (has_delay_slot(op->c)) {
859 if (op_flag_local_branch(op->flags)) {
860 offset = i + 1 - op_flag_no_ds(op->flags) + (s16)op->i.imm;
861 list[offset].flags |= LIGHTREC_SYNC;
862 }
863
864 if (op_flag_emulate_branch(op->flags) && i + 2 < block->nb_ops)
865 list[i + 2].flags |= LIGHTREC_SYNC;
9259d748
PC
866 }
867 }
868}
869
f5ee77ca
PC
870static void maybe_remove_load_delay(struct opcode *op)
871{
872 if (op_flag_load_delay(op->flags) && opcode_is_load(op->c))
873 op->flags &= ~LIGHTREC_LOAD_DELAY;
874}
875
98fa08a5 876static int lightrec_transform_ops(struct lightrec_state *state, struct block *block)
d16005f8 877{
9259d748
PC
878 struct opcode *op, *list = block->opcode_list;
879 struct constprop_data v[32] = LIGHTREC_CONSTPROP_INITIALIZER;
98fa08a5 880 unsigned int i;
9259d748 881 bool local;
ba3814c1 882 u8 tmp;
d16005f8 883
98fa08a5
PC
884 for (i = 0; i < block->nb_ops; i++) {
885 op = &list[i];
d16005f8 886
cb72ea13 887 lightrec_consts_propagate(block, i, v);
9259d748
PC
888
889 lightrec_patch_known_zero(op, v);
22eee2ac 890
d16005f8
PC
891 /* Transform all opcodes detected as useless to real NOPs
892 * (0x0: SLL r0, r0, #0) */
98fa08a5 893 if (op->opcode != 0 && is_nop(op->c)) {
d16005f8 894 pr_debug("Converting useless opcode 0x%08x to NOP\n",
98fa08a5
PC
895 op->opcode);
896 op->opcode = 0x0;
d16005f8
PC
897 }
898
98fa08a5 899 if (!op->opcode)
d16005f8
PC
900 continue;
901
98fa08a5 902 switch (op->i.op) {
d16005f8 903 case OP_BEQ:
9259d748
PC
904 if (op->i.rs == op->i.rt ||
905 (is_known(v, op->i.rs) && is_known(v, op->i.rt) &&
906 v[op->i.rs].value == v[op->i.rt].value)) {
907 if (op->i.rs != op->i.rt)
908 pr_debug("Found always-taken BEQ\n");
909
98fa08a5
PC
910 op->i.rs = 0;
911 op->i.rt = 0;
9259d748
PC
912 } else if (v[op->i.rs].known & v[op->i.rt].known &
913 (v[op->i.rs].value ^ v[op->i.rt].value)) {
914 pr_debug("Found never-taken BEQ\n");
915
f5ee77ca
PC
916 if (!op_flag_no_ds(op->flags))
917 maybe_remove_load_delay(&list[i + 1]);
918
9259d748
PC
919 local = op_flag_local_branch(op->flags);
920 op->opcode = 0;
921 op->flags = 0;
922
923 if (local)
924 lightrec_reset_syncs(block);
98fa08a5
PC
925 } else if (op->i.rs == 0) {
926 op->i.rs = op->i.rt;
927 op->i.rt = 0;
d16005f8
PC
928 }
929 break;
98fa08a5 930
d16005f8 931 case OP_BNE:
9259d748
PC
932 if (v[op->i.rs].known & v[op->i.rt].known &
933 (v[op->i.rs].value ^ v[op->i.rt].value)) {
934 pr_debug("Found always-taken BNE\n");
935
936 op->i.op = OP_BEQ;
937 op->i.rs = 0;
938 op->i.rt = 0;
939 } else if (is_known(v, op->i.rs) && is_known(v, op->i.rt) &&
940 v[op->i.rs].value == v[op->i.rt].value) {
941 pr_debug("Found never-taken BNE\n");
942
f5ee77ca
PC
943 if (!op_flag_no_ds(op->flags))
944 maybe_remove_load_delay(&list[i + 1]);
945
9259d748
PC
946 local = op_flag_local_branch(op->flags);
947 op->opcode = 0;
948 op->flags = 0;
949
950 if (local)
951 lightrec_reset_syncs(block);
952 } else if (op->i.rs == 0) {
98fa08a5
PC
953 op->i.rs = op->i.rt;
954 op->i.rt = 0;
955 }
956 break;
957
9259d748
PC
958 case OP_BLEZ:
959 if (v[op->i.rs].known & BIT(31) &&
960 v[op->i.rs].value & BIT(31)) {
961 pr_debug("Found always-taken BLEZ\n");
962
963 op->i.op = OP_BEQ;
964 op->i.rs = 0;
965 op->i.rt = 0;
966 }
967 break;
968
969 case OP_BGTZ:
970 if (v[op->i.rs].known & BIT(31) &&
971 v[op->i.rs].value & BIT(31)) {
972 pr_debug("Found never-taken BGTZ\n");
973
f5ee77ca
PC
974 if (!op_flag_no_ds(op->flags))
975 maybe_remove_load_delay(&list[i + 1]);
976
9259d748
PC
977 local = op_flag_local_branch(op->flags);
978 op->opcode = 0;
979 op->flags = 0;
980
981 if (local)
982 lightrec_reset_syncs(block);
983 }
984 break;
985
98fa08a5 986 case OP_LUI:
9259d748 987 if (i == 0 || !has_delay_slot(list[i - 1].c))
d8b04acd 988 lightrec_modify_lui(block, i);
9259d748 989 lightrec_remove_useless_lui(block, i, v);
684432ad
PC
990 if (i == 0 || !has_delay_slot(list[i - 1].c))
991 lightrec_lui_to_movi(block, i);
d16005f8
PC
992 break;
993
994 /* Transform ORI/ADDI/ADDIU with imm #0 or ORR/ADD/ADDU/SUB/SUBU
995 * with register $zero to the MOV meta-opcode */
996 case OP_ORI:
997 case OP_ADDI:
998 case OP_ADDIU:
98fa08a5 999 if (op->i.imm == 0) {
d16005f8 1000 pr_debug("Convert ORI/ADDI/ADDIU #0 to MOV\n");
cb72ea13
PC
1001 op->m.rd = op->i.rt;
1002 op->m.op = OP_META_MOV;
1003 op->i.op = OP_META;
d16005f8
PC
1004 }
1005 break;
9259d748
PC
1006 case OP_ANDI:
1007 if (bits_are_known_zero(v, op->i.rs, ~op->i.imm)) {
1008 pr_debug("Found useless ANDI 0x%x\n", op->i.imm);
1009
1010 if (op->i.rs == op->i.rt) {
1011 op->opcode = 0;
1012 } else {
cb72ea13
PC
1013 op->m.rd = op->i.rt;
1014 op->m.op = OP_META_MOV;
1015 op->i.op = OP_META;
9259d748
PC
1016 }
1017 }
1018 break;
1019 case OP_REGIMM:
1020 switch (op->r.rt) {
1021 case OP_REGIMM_BLTZ:
1022 case OP_REGIMM_BGEZ:
1023 if (!(v[op->r.rs].known & BIT(31)))
1024 break;
1025
1026 if (!!(v[op->r.rs].value & BIT(31))
1027 ^ (op->r.rt == OP_REGIMM_BGEZ)) {
1028 pr_debug("Found always-taken BLTZ/BGEZ\n");
1029 op->i.op = OP_BEQ;
1030 op->i.rs = 0;
1031 op->i.rt = 0;
1032 } else {
1033 pr_debug("Found never-taken BLTZ/BGEZ\n");
1034
f5ee77ca
PC
1035 if (!op_flag_no_ds(op->flags))
1036 maybe_remove_load_delay(&list[i + 1]);
1037
9259d748
PC
1038 local = op_flag_local_branch(op->flags);
1039 op->opcode = 0;
1040 op->flags = 0;
1041
1042 if (local)
1043 lightrec_reset_syncs(block);
1044 }
1045 break;
1046 case OP_REGIMM_BLTZAL:
1047 case OP_REGIMM_BGEZAL:
1048 /* TODO: Detect always-taken and replace with JAL */
1049 break;
1050 }
1051 break;
d16005f8 1052 case OP_SPECIAL:
98fa08a5 1053 switch (op->r.op) {
9259d748
PC
1054 case OP_SPECIAL_SRAV:
1055 if ((v[op->r.rs].known & 0x1f) != 0x1f)
1056 break;
1057
1058 pr_debug("Convert SRAV to SRA\n");
1059 op->r.imm = v[op->r.rs].value & 0x1f;
1060 op->r.op = OP_SPECIAL_SRA;
1061
1062 fallthrough;
d16005f8 1063 case OP_SPECIAL_SRA:
98fa08a5
PC
1064 if (op->r.imm == 0) {
1065 pr_debug("Convert SRA #0 to MOV\n");
cb72ea13
PC
1066 op->m.rs = op->r.rt;
1067 op->m.op = OP_META_MOV;
1068 op->i.op = OP_META;
98fa08a5
PC
1069 break;
1070 }
98fa08a5 1071 break;
9259d748
PC
1072
1073 case OP_SPECIAL_SLLV:
1074 if ((v[op->r.rs].known & 0x1f) != 0x1f)
1075 break;
1076
1077 pr_debug("Convert SLLV to SLL\n");
1078 op->r.imm = v[op->r.rs].value & 0x1f;
1079 op->r.op = OP_SPECIAL_SLL;
1080
1081 fallthrough;
98fa08a5 1082 case OP_SPECIAL_SLL:
9259d748
PC
1083 if (op->r.imm == 0) {
1084 pr_debug("Convert SLL #0 to MOV\n");
cb72ea13
PC
1085 op->m.rs = op->r.rt;
1086 op->m.op = OP_META_MOV;
1087 op->i.op = OP_META;
9259d748
PC
1088 }
1089
1090 lightrec_optimize_sll_sra(block->opcode_list, i, v);
1091 break;
1092
1093 case OP_SPECIAL_SRLV:
1094 if ((v[op->r.rs].known & 0x1f) != 0x1f)
1095 break;
1096
1097 pr_debug("Convert SRLV to SRL\n");
1098 op->r.imm = v[op->r.rs].value & 0x1f;
1099 op->r.op = OP_SPECIAL_SRL;
1100
1101 fallthrough;
d16005f8 1102 case OP_SPECIAL_SRL:
98fa08a5 1103 if (op->r.imm == 0) {
9259d748 1104 pr_debug("Convert SRL #0 to MOV\n");
cb72ea13
PC
1105 op->m.rs = op->r.rt;
1106 op->m.op = OP_META_MOV;
1107 op->i.op = OP_META;
d16005f8
PC
1108 }
1109 break;
9259d748 1110
ba3814c1
PC
1111 case OP_SPECIAL_MULT:
1112 case OP_SPECIAL_MULTU:
9259d748
PC
1113 if (is_known(v, op->r.rs) &&
1114 is_power_of_two(v[op->r.rs].value)) {
ba3814c1
PC
1115 tmp = op->c.i.rs;
1116 op->c.i.rs = op->c.i.rt;
1117 op->c.i.rt = tmp;
9259d748
PC
1118 } else if (!is_known(v, op->r.rt) ||
1119 !is_power_of_two(v[op->r.rt].value)) {
ba3814c1
PC
1120 break;
1121 }
1122
1123 pr_debug("Multiply by power-of-two: %u\n",
9259d748 1124 v[op->r.rt].value);
ba3814c1
PC
1125
1126 if (op->r.op == OP_SPECIAL_MULT)
1127 op->i.op = OP_META_MULT2;
1128 else
1129 op->i.op = OP_META_MULTU2;
1130
9259d748 1131 op->r.op = ctz32(v[op->r.rt].value);
ba3814c1 1132 break;
cb72ea13
PC
1133 case OP_SPECIAL_NOR:
1134 if (op->r.rs == 0 || op->r.rt == 0) {
1135 pr_debug("Convert NOR $zero to COM\n");
1136 op->i.op = OP_META;
1137 op->m.op = OP_META_COM;
1138 if (!op->m.rs)
1139 op->m.rs = op->r.rt;
1140 }
1141 break;
d16005f8
PC
1142 case OP_SPECIAL_OR:
1143 case OP_SPECIAL_ADD:
1144 case OP_SPECIAL_ADDU:
98fa08a5 1145 if (op->r.rs == 0) {
d16005f8 1146 pr_debug("Convert OR/ADD $zero to MOV\n");
cb72ea13
PC
1147 op->m.rs = op->r.rt;
1148 op->m.op = OP_META_MOV;
1149 op->i.op = OP_META;
d16005f8 1150 }
d8b04acd
PC
1151 fallthrough;
1152 case OP_SPECIAL_SUB:
d16005f8 1153 case OP_SPECIAL_SUBU:
98fa08a5 1154 if (op->r.rt == 0) {
d16005f8 1155 pr_debug("Convert OR/ADD/SUB $zero to MOV\n");
cb72ea13
PC
1156 op->m.op = OP_META_MOV;
1157 op->i.op = OP_META;
d16005f8 1158 }
d8b04acd
PC
1159 fallthrough;
1160 default:
d16005f8
PC
1161 break;
1162 }
d8b04acd
PC
1163 fallthrough;
1164 default:
d16005f8
PC
1165 break;
1166 }
1167 }
1168
1169 return 0;
1170}
1171
ba3814c1
PC
1172static bool lightrec_can_switch_delay_slot(union code op, union code next_op)
1173{
1174 switch (op.i.op) {
1175 case OP_SPECIAL:
1176 switch (op.r.op) {
1177 case OP_SPECIAL_JALR:
1178 if (opcode_reads_register(next_op, op.r.rd) ||
1179 opcode_writes_register(next_op, op.r.rd))
1180 return false;
1181 fallthrough;
1182 case OP_SPECIAL_JR:
1183 if (opcode_writes_register(next_op, op.r.rs))
1184 return false;
1185 fallthrough;
1186 default:
1187 break;
1188 }
1189 fallthrough;
1190 case OP_J:
1191 break;
1192 case OP_JAL:
1193 if (opcode_reads_register(next_op, 31) ||
1194 opcode_writes_register(next_op, 31))
1195 return false;;
1196
1197 break;
1198 case OP_BEQ:
1199 case OP_BNE:
1200 if (op.i.rt && opcode_writes_register(next_op, op.i.rt))
1201 return false;
1202 fallthrough;
1203 case OP_BLEZ:
1204 case OP_BGTZ:
1205 if (op.i.rs && opcode_writes_register(next_op, op.i.rs))
1206 return false;
1207 break;
1208 case OP_REGIMM:
1209 switch (op.r.rt) {
1210 case OP_REGIMM_BLTZAL:
1211 case OP_REGIMM_BGEZAL:
1212 if (opcode_reads_register(next_op, 31) ||
1213 opcode_writes_register(next_op, 31))
1214 return false;
1215 fallthrough;
1216 case OP_REGIMM_BLTZ:
1217 case OP_REGIMM_BGEZ:
1218 if (op.i.rs && opcode_writes_register(next_op, op.i.rs))
1219 return false;
1220 break;
1221 }
1222 fallthrough;
1223 default:
1224 break;
1225 }
1226
1227 return true;
1228}
1229
98fa08a5 1230static int lightrec_switch_delay_slots(struct lightrec_state *state, struct block *block)
d16005f8 1231{
98fa08a5
PC
1232 struct opcode *list, *next = &block->opcode_list[0];
1233 unsigned int i;
1234 union code op, next_op;
03535202 1235 u32 flags;
d16005f8 1236
98fa08a5
PC
1237 for (i = 0; i < block->nb_ops - 1; i++) {
1238 list = next;
1239 next = &block->opcode_list[i + 1];
1240 next_op = next->c;
1241 op = list->c;
d16005f8 1242
03535202
PC
1243 if (!has_delay_slot(op) || op_flag_no_ds(list->flags) ||
1244 op_flag_emulate_branch(list->flags) ||
98fa08a5
PC
1245 op.opcode == 0 || next_op.opcode == 0)
1246 continue;
1247
9259d748 1248 if (is_delay_slot(block->opcode_list, i))
d16005f8
PC
1249 continue;
1250
ba3814c1 1251 if (op_flag_sync(next->flags))
d16005f8
PC
1252 continue;
1253
cb72ea13
PC
1254 if (op_flag_load_delay(next->flags) && opcode_is_load(next_op))
1255 continue;
1256
ba3814c1
PC
1257 if (!lightrec_can_switch_delay_slot(list->c, next_op))
1258 continue;
d16005f8
PC
1259
1260 pr_debug("Swap branch and delay slot opcodes "
98fa08a5
PC
1261 "at offsets 0x%x / 0x%x\n",
1262 i << 2, (i + 1) << 2);
d16005f8 1263
ba3814c1 1264 flags = next->flags | (list->flags & LIGHTREC_SYNC);
d16005f8 1265 list->c = next_op;
98fa08a5 1266 next->c = op;
ba3814c1 1267 next->flags = (list->flags | LIGHTREC_NO_DS) & ~LIGHTREC_SYNC;
a59e5536 1268 list->flags = flags | LIGHTREC_NO_DS;
d16005f8
PC
1269 }
1270
1271 return 0;
1272}
1273
98fa08a5
PC
1274static int lightrec_detect_impossible_branches(struct lightrec_state *state,
1275 struct block *block)
d16005f8 1276{
03535202 1277 struct opcode *op, *list = block->opcode_list, *next = &list[0];
98fa08a5
PC
1278 unsigned int i;
1279 int ret = 0;
1280
1281 for (i = 0; i < block->nb_ops - 1; i++) {
1282 op = next;
03535202 1283 next = &list[i + 1];
d16005f8 1284
d16005f8 1285 if (!has_delay_slot(op->c) ||
cb72ea13
PC
1286 (!has_delay_slot(next->c) &&
1287 !opcode_is_mfc(next->c) &&
d16005f8
PC
1288 !(next->i.op == OP_CP0 && next->r.rs == OP_CP0_RFE)))
1289 continue;
1290
1291 if (op->c.opcode == next->c.opcode) {
1292 /* The delay slot is the exact same opcode as the branch
1293 * opcode: this is effectively a NOP */
1294 next->c.opcode = 0;
1295 continue;
1296 }
1297
cb72ea13
PC
1298 op->flags |= LIGHTREC_EMULATE_BRANCH;
1299
1300 if (OPT_LOCAL_BRANCHES && i + 2 < block->nb_ops) {
1301 /* The interpreter will only emulate the branch, then
1302 * return to the compiled code. Add a SYNC after the
1303 * branch + delay slot in the case where the branch
1304 * was not taken. */
1305 list[i + 2].flags |= LIGHTREC_SYNC;
1306 }
1307 }
1308
1309 return ret;
1310}
1311
1312static bool is_local_branch(const struct block *block, unsigned int idx)
1313{
1314 const struct opcode *op = &block->opcode_list[idx];
1315 s32 offset;
1316
1317 switch (op->c.i.op) {
1318 case OP_BEQ:
1319 case OP_BNE:
1320 case OP_BLEZ:
1321 case OP_BGTZ:
1322 case OP_REGIMM:
1323 offset = idx + 1 + (s16)op->c.i.imm;
1324 if (offset >= 0 && offset < block->nb_ops)
1325 return true;
1326 fallthrough;
1327 default:
1328 return false;
1329 }
1330}
1331
1332static int lightrec_handle_load_delays(struct lightrec_state *state,
1333 struct block *block)
1334{
1335 struct opcode *op, *list = block->opcode_list;
1336 unsigned int i;
1337 s16 imm;
1338
1339 for (i = 0; i < block->nb_ops; i++) {
1340 op = &list[i];
1341
1342 if (!opcode_is_load(op->c) || !op->c.i.rt || op->c.i.op == OP_LWC2)
1343 continue;
1344
1345 if (!is_delay_slot(list, i)) {
1346 /* Only handle load delays in delay slots.
1347 * PSX games never abused load delay slots otherwise. */
03535202
PC
1348 continue;
1349 }
1350
cb72ea13
PC
1351 if (is_local_branch(block, i - 1)) {
1352 imm = (s16)list[i - 1].c.i.imm;
98fa08a5 1353
cb72ea13
PC
1354 if (!opcode_reads_register(list[i + imm].c, op->c.i.rt)) {
1355 /* The target opcode of the branch is inside
1356 * the block, and it does not read the register
1357 * written to by the load opcode; we can ignore
1358 * the load delay. */
1359 continue;
1360 }
1361 }
98fa08a5 1362
cb72ea13
PC
1363 op->flags |= LIGHTREC_LOAD_DELAY;
1364 }
1365
1366 return 0;
1367}
1368
1369static int lightrec_swap_load_delays(struct lightrec_state *state,
1370 struct block *block)
1371{
1372 unsigned int i;
1373 union code c, next;
1374 bool in_ds = false, skip_next = false;
1375 struct opcode op;
1376
1377 if (block->nb_ops < 2)
1378 return 0;
1379
1380 for (i = 0; i < block->nb_ops - 2; i++) {
1381 c = block->opcode_list[i].c;
1382
1383 if (skip_next) {
1384 skip_next = false;
1385 } else if (!in_ds && opcode_is_load(c) && c.i.op != OP_LWC2) {
1386 next = block->opcode_list[i + 1].c;
1387
2ec79b77
PC
1388 switch (next.i.op) {
1389 case OP_LWL:
1390 case OP_LWR:
1391 case OP_REGIMM:
1392 case OP_BEQ:
1393 case OP_BNE:
1394 case OP_BLEZ:
1395 case OP_BGTZ:
cb72ea13 1396 continue;
2ec79b77 1397 }
cb72ea13
PC
1398
1399 if (opcode_reads_register(next, c.i.rt)
1400 && !opcode_writes_register(next, c.i.rs)) {
1401 pr_debug("Swapping opcodes at offset 0x%x to "
1402 "respect load delay\n", i << 2);
1403
1404 op = block->opcode_list[i];
1405 block->opcode_list[i] = block->opcode_list[i + 1];
1406 block->opcode_list[i + 1] = op;
1407 skip_next = true;
1408 }
d16005f8 1409 }
cb72ea13
PC
1410
1411 in_ds = has_delay_slot(c);
d16005f8
PC
1412 }
1413
cb72ea13 1414 return 0;
d16005f8
PC
1415}
1416
98fa08a5 1417static int lightrec_local_branches(struct lightrec_state *state, struct block *block)
d16005f8 1418{
cb72ea13 1419 const struct opcode *ds;
98fa08a5
PC
1420 struct opcode *list;
1421 unsigned int i;
d16005f8 1422 s32 offset;
d16005f8 1423
98fa08a5
PC
1424 for (i = 0; i < block->nb_ops; i++) {
1425 list = &block->opcode_list[i];
1426
cb72ea13 1427 if (should_emulate(list) || !is_local_branch(block, i))
d16005f8
PC
1428 continue;
1429
cb72ea13 1430 offset = i + 1 + (s16)list->c.i.imm;
d16005f8
PC
1431
1432 pr_debug("Found local branch to offset 0x%x\n", offset << 2);
1433
cb72ea13
PC
1434 ds = get_delay_slot(block->opcode_list, i);
1435 if (op_flag_load_delay(ds->flags) && opcode_is_load(ds->c)) {
1436 pr_debug("Branch delay slot has a load delay - skip\n");
1437 continue;
1438 }
1439
98fa08a5
PC
1440 if (should_emulate(&block->opcode_list[offset])) {
1441 pr_debug("Branch target must be emulated - skip\n");
1442 continue;
1443 }
d16005f8 1444
98fa08a5
PC
1445 if (offset && has_delay_slot(block->opcode_list[offset - 1].c)) {
1446 pr_debug("Branch target is a delay slot - skip\n");
1447 continue;
1448 }
d16005f8 1449
98fa08a5 1450 list->flags |= LIGHTREC_LOCAL_BRANCH;
d16005f8
PC
1451 }
1452
9259d748
PC
1453 lightrec_reset_syncs(block);
1454
d16005f8
PC
1455 return 0;
1456}
1457
1458bool has_delay_slot(union code op)
1459{
1460 switch (op.i.op) {
1461 case OP_SPECIAL:
1462 switch (op.r.op) {
1463 case OP_SPECIAL_JR:
1464 case OP_SPECIAL_JALR:
1465 return true;
1466 default:
1467 return false;
1468 }
1469 case OP_J:
1470 case OP_JAL:
1471 case OP_BEQ:
1472 case OP_BNE:
1473 case OP_BLEZ:
1474 case OP_BGTZ:
1475 case OP_REGIMM:
d16005f8
PC
1476 return true;
1477 default:
1478 return false;
1479 }
1480}
1481
9259d748
PC
1482bool is_delay_slot(const struct opcode *list, unsigned int offset)
1483{
1484 return offset > 0
1485 && !op_flag_no_ds(list[offset - 1].flags)
1486 && has_delay_slot(list[offset - 1].c);
1487}
1488
98fa08a5 1489bool should_emulate(const struct opcode *list)
d16005f8 1490{
03535202
PC
1491 return op_flag_emulate_branch(list->flags) && has_delay_slot(list->c);
1492}
1493
1494static bool op_writes_rd(union code c)
1495{
1496 switch (c.i.op) {
1497 case OP_SPECIAL:
cb72ea13 1498 case OP_META:
03535202
PC
1499 return true;
1500 default:
1501 return false;
1502 }
1503}
1504
1505static void lightrec_add_reg_op(struct opcode *op, u8 reg, u32 reg_op)
1506{
1507 if (op_writes_rd(op->c) && reg == op->r.rd)
1508 op->flags |= LIGHTREC_REG_RD(reg_op);
1509 else if (op->i.rs == reg)
1510 op->flags |= LIGHTREC_REG_RS(reg_op);
1511 else if (op->i.rt == reg)
1512 op->flags |= LIGHTREC_REG_RT(reg_op);
1513 else
1514 pr_debug("Cannot add unload/clean/discard flag: "
1515 "opcode does not touch register %s!\n",
1516 lightrec_reg_name(reg));
d16005f8
PC
1517}
1518
98fa08a5 1519static void lightrec_add_unload(struct opcode *op, u8 reg)
d16005f8 1520{
03535202
PC
1521 lightrec_add_reg_op(op, reg, LIGHTREC_REG_UNLOAD);
1522}
d16005f8 1523
03535202
PC
1524static void lightrec_add_discard(struct opcode *op, u8 reg)
1525{
1526 lightrec_add_reg_op(op, reg, LIGHTREC_REG_DISCARD);
1527}
1528
1529static void lightrec_add_clean(struct opcode *op, u8 reg)
1530{
1531 lightrec_add_reg_op(op, reg, LIGHTREC_REG_CLEAN);
1532}
1533
1534static void
1535lightrec_early_unload_sync(struct opcode *list, s16 *last_r, s16 *last_w)
1536{
1537 unsigned int reg;
1538 s16 offset;
1539
1540 for (reg = 0; reg < 34; reg++) {
1541 offset = s16_max(last_w[reg], last_r[reg]);
1542
1543 if (offset >= 0)
1544 lightrec_add_unload(&list[offset], reg);
1545 }
1546
1547 memset(last_r, 0xff, sizeof(*last_r) * 34);
1548 memset(last_w, 0xff, sizeof(*last_w) * 34);
98fa08a5 1549}
d16005f8 1550
98fa08a5
PC
/*
 * Optimizer pass: walk every opcode of the block and tag opcodes with
 * register clean / unload / discard operations, following the rules listed
 * in the comment inside the function body.
 *
 * For each of the 34 tracked register indexes, last_r[] and last_w[] hold the
 * index of the last opcode seen reading / writing that register (all entries
 * start filled with 0xff bytes). 'dirty' and 'loaded' are bitmasks accumulated
 * from the opcodes' write and read masks. 'last_sync' is updated to the index
 * recorded in 'next_sync', which is set after each opcode that has a delay
 * slot.
 *
 * 'state' is not used by this function. Always returns 0.
 */
1551static int lightrec_early_unload(struct lightrec_state *state, struct block *block)
1552{
03535202 1553 u16 i, offset;
98fa08a5 1554 struct opcode *op;
03535202
PC
1555 s16 last_r[34], last_w[34], last_sync = 0, next_sync = 0;
1556 u64 mask_r, mask_w, dirty = 0, loaded = 0;
cb72ea13 1557 u8 reg, load_delay_reg = 0;
d16005f8 1558
03535202
PC
1559 memset(last_r, 0xff, sizeof(last_r));
1560 memset(last_w, 0xff, sizeof(last_w));
98fa08a5 1561
03535202
PC
1562 /*
1563 * Clean if:
1564 * - the register is dirty, and is read again after a branch opcode
1565 *
1566 * Unload if:
1567 * - the register is dirty or loaded, and is not read again
1568 * - the register is dirty or loaded, and is written again after a branch opcode
1569 * - the next opcode has the SYNC flag set
1570 *
1571 * Discard if:
1572 * - the register is dirty or loaded, and is written again
1573 */
98fa08a5 1574
03535202
PC
1575 for (i = 0; i < block->nb_ops; i++) {
1576 op = &block->opcode_list[i];
1577
cb72ea13
PC
1578 if (OPT_HANDLE_LOAD_DELAYS && load_delay_reg) {
1579 /* Handle delayed register write from load opcodes in
1580 * delay slots */
1581 last_w[load_delay_reg] = i;
1582 load_delay_reg = 0;
1583 }
1584
03535202
PC
1585 if (op_flag_sync(op->flags) || should_emulate(op)) {
1586 /* The next opcode has the SYNC flag set, or is a branch
1587 * that should be emulated: unload all registers. */
1588 lightrec_early_unload_sync(block->opcode_list, last_r, last_w);
1589 dirty = 0;
1590 loaded = 0;
d16005f8
PC
1591 }
1592
03535202
PC
1593 if (next_sync == i) {
1594 last_sync = i;
1595 pr_debug("Last sync: 0x%x\n", last_sync << 2);
1596 }
d16005f8 1597
03535202
PC
1598 if (has_delay_slot(op->c)) {
/* The sync point is the opcode after the branch, or after its delay slot
 * when the branch is not flagged NO_DS. */
1599 next_sync = i + 1 + !op_flag_no_ds(op->flags);
1600 pr_debug("Next sync: 0x%x\n", next_sync << 2);
1601 }
d16005f8 1602
03535202
PC
1603 mask_r = opcode_read_mask(op->c);
1604 mask_w = opcode_write_mask(op->c);
98fa08a5 1605
cb72ea13
PC
1606 if (op_flag_load_delay(op->flags) && opcode_is_load(op->c)) {
1607 /* If we have a load opcode in a delay slot, its target
1608 * register is actually not written there but at a
1609 * later point, in the dispatcher. Prevent the algorithm
1610 * from discarding its previous value. */
1611 load_delay_reg = op->c.i.rt;
1612 mask_w &= ~BIT(op->c.i.rt);
1613 }
1614
03535202
PC
1615 for (reg = 0; reg < 34; reg++) {
1616 if (mask_r & BIT(reg)) {
1617 if (dirty & BIT(reg) && last_w[reg] < last_sync) {
1618 /* The register is dirty, and is read
1619 * again after a branch: clean it */
1620
1621 lightrec_add_clean(&block->opcode_list[last_w[reg]], reg);
1622 dirty &= ~BIT(reg);
1623 loaded |= BIT(reg);
1624 }
1625
1626 last_r[reg] = i;
1627 }
1628
1629 if (mask_w & BIT(reg)) {
1630 if ((dirty & BIT(reg) && last_w[reg] < last_sync) ||
1631 (loaded & BIT(reg) && last_r[reg] < last_sync)) {
1632 /* The register is dirty or loaded, and
1633 * is written again after a branch:
1634 * unload it */
1635
1636 offset = s16_max(last_w[reg], last_r[reg]);
1637 lightrec_add_unload(&block->opcode_list[offset], reg);
1638 dirty &= ~BIT(reg);
1639 loaded &= ~BIT(reg);
1640 } else if (!(mask_r & BIT(reg)) &&
1641 ((dirty & BIT(reg) && last_w[reg] > last_sync) ||
1642 (loaded & BIT(reg) && last_r[reg] > last_sync))) {
1643 /* The register is dirty or loaded, and
1644 * is written again: discard it */
1645
1646 offset = s16_max(last_w[reg], last_r[reg]);
1647 lightrec_add_discard(&block->opcode_list[offset], reg);
1648 dirty &= ~BIT(reg);
1649 loaded &= ~BIT(reg);
1650 }
1651
1652 last_w[reg] = i;
1653 }
98fa08a5 1654
03535202
PC
1655 }
1656
/* Registers written / read by this opcode become dirty / loaded only after
 * the per-register checks above have run. */
1657 dirty |= mask_w;
1658 loaded |= mask_r;
d16005f8
PC
1659 }
1660
03535202
PC
1661 /* Unload all registers that are dirty or loaded at the end of block. */
1662 lightrec_early_unload_sync(block->opcode_list, last_r, last_w);
1663
d16005f8
PC
1664 return 0;
1665}
1666
/*
 * Optimizer pass: tag load/store opcodes with I/O-related flags.
 *
 * Constant-propagation data is updated for every opcode via
 * lightrec_consts_propagate(). Then, for load/store opcodes:
 *  - SB/SH/SW whose base register (rs) is register 28 or 29 are flagged
 *    NO_INVALIDATE when the KERNEL_USER_RAM map has no ops;
 *  - SB/SH/SW whose base register value is fully known and falls inside the
 *    current block mark the block BLOCK_NEVER_COMPILE and the opcode SMC;
 *  - when any bit of the base register is known, the map returned by
 *    lightrec_get_constprop_map() selects the I/O mode flag (RAM, BIOS,
 *    scratchpad, direct HW or HW), and NO_MASK is added in the cases below;
 *  - when no I/O mode was set and the base register is 28 or 29, the I/O mode
 *    is set to RAM or DIRECT depending on LIGHTREC_OPT_SP_GP_HIT_RAM.
 *
 * Always returns 0.
 */
98fa08a5 1667static int lightrec_flag_io(struct lightrec_state *state, struct block *block)
d16005f8 1668{
9259d748 1669 struct opcode *list;
02487de7 1670 enum psx_map psx_map;
9259d748 1671 struct constprop_data v[32] = LIGHTREC_CONSTPROP_INITIALIZER;
98fa08a5 1672 unsigned int i;
02487de7 1673 u32 val, kunseg_val;
ba3814c1 1674 bool no_mask;
98fa08a5
PC
1675
1676 for (i = 0; i < block->nb_ops; i++) {
1677 list = &block->opcode_list[i];
d16005f8 1678
cb72ea13 1679 lightrec_consts_propagate(block, i, v);
22eee2ac 1680
d16005f8
PC
1681 switch (list->i.op) {
1682 case OP_SB:
1683 case OP_SH:
1684 case OP_SW:
cb72ea13
PC
1685 /* Mark all store operations that target $sp or $gp
1686 * as not requiring code invalidation. This is based
1687 * on the heuristic that stores using one of these
1688 * registers as address will never hit a code page. */
1689 if (list->i.rs >= 28 && list->i.rs <= 29 &&
1690 !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
1691 pr_debug("Flaging opcode 0x%08x as not requiring invalidation\n",
1692 list->opcode);
1693 list->flags |= LIGHTREC_NO_INVALIDATE;
1694 }
98fa08a5 1695
cb72ea13
PC
1696 /* Detect writes whose destination address is inside the
1697 * current block, using constant propagation. When these
1698 * occur, we mark the blocks as not compilable. */
1699 if (is_known(v, list->i.rs) &&
1700 kunseg(v[list->i.rs].value) >= kunseg(block->pc) &&
1701 kunseg(v[list->i.rs].value) < (kunseg(block->pc) + block->nb_ops * 4)) {
1702 pr_debug("Self-modifying block detected\n");
1703 block_set_flags(block, BLOCK_NEVER_COMPILE);
1704 list->flags |= LIGHTREC_SMC;
98fa08a5 1705 }
d8b04acd
PC
1706 fallthrough;
1707 case OP_SWL:
98fa08a5
PC
1708 case OP_SWR:
1709 case OP_SWC2:
1710 case OP_LB:
1711 case OP_LBU:
1712 case OP_LH:
1713 case OP_LHU:
1714 case OP_LW:
1715 case OP_LWL:
1716 case OP_LWR:
1717 case OP_LWC2:
/* Proceed as soon as at least one bit of the base register is known (either
 * the 'known' or the 'sign' mask is non-zero). */
cb72ea13 1718 if (v[list->i.rs].known | v[list->i.rs].sign) {
9259d748
PC
1719 psx_map = lightrec_get_constprop_map(state, v,
1720 list->i.rs,
1721 (s16) list->i.imm);
1722
1723 if (psx_map != PSX_MAP_UNKNOWN && !is_known(v, list->i.rs))
1724 pr_debug("Detected map thanks to bit-level const propagation!\n");
02487de7 1725
03535202 1726 list->flags &= ~LIGHTREC_IO_MASK;
9259d748
PC
1727
1728 val = v[list->i.rs].value + (s16) list->i.imm;
1729 kunseg_val = kunseg(val);
1730
/* no_mask is true when the three top bits of the base register are all known
 * and all zero. */
1731 no_mask = (v[list->i.rs].known & ~v[list->i.rs].value
1732 & 0xe0000000) == 0xe0000000;
03535202 1733
02487de7
PC
1734 switch (psx_map) {
1735 case PSX_MAP_KERNEL_USER_RAM:
ba3814c1 1736 if (no_mask)
02487de7 1737 list->flags |= LIGHTREC_NO_MASK;
d8b04acd 1738 fallthrough;
02487de7
PC
1739 case PSX_MAP_MIRROR1:
1740 case PSX_MAP_MIRROR2:
1741 case PSX_MAP_MIRROR3:
22eee2ac
PC
1742 pr_debug("Flaging opcode %u as RAM access\n", i);
1743 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_RAM);
ba3814c1
PC
1744 if (no_mask && state->mirrors_mapped)
1745 list->flags |= LIGHTREC_NO_MASK;
02487de7
PC
1746 break;
1747 case PSX_MAP_BIOS:
22eee2ac
PC
1748 pr_debug("Flaging opcode %u as BIOS access\n", i);
1749 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_BIOS);
ba3814c1
PC
1750 if (no_mask)
1751 list->flags |= LIGHTREC_NO_MASK;
02487de7
PC
1752 break;
1753 case PSX_MAP_SCRATCH_PAD:
22eee2ac
PC
1754 pr_debug("Flaging opcode %u as scratchpad access\n", i);
1755 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_SCRATCH);
ba3814c1
PC
1756 if (no_mask)
1757 list->flags |= LIGHTREC_NO_MASK;
02487de7
PC
1758
1759 /* Consider that we're never going to run code from
1760 * the scratchpad. */
1761 list->flags |= LIGHTREC_NO_INVALIDATE;
1762 break;
ba3814c1
PC
1763 case PSX_MAP_HW_REGISTERS:
1764 if (state->ops.hw_direct &&
1765 state->ops.hw_direct(kunseg_val,
1766 opcode_is_store(list->c),
1767 opcode_get_io_size(list->c))) {
1768 pr_debug("Flagging opcode %u as direct I/O access\n",
1769 i);
1770 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT_HW);
cdfa3536
PC
1771
1772 if (no_mask)
1773 list->flags |= LIGHTREC_NO_MASK;
9259d748
PC
1774 } else {
1775 pr_debug("Flagging opcode %u as I/O access\n",
1776 i);
1777 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_HW);
ba3814c1 1778 }
9259d748 1779 break;
02487de7 1780 default:
02487de7 1781 break;
98fa08a5 1782 }
d16005f8 1783 }
cb72ea13
PC
1784
1785 if (!LIGHTREC_FLAGS_GET_IO_MODE(list->flags)
1786 && list->i.rs >= 28 && list->i.rs <= 29
1787 && !state->maps[PSX_MAP_KERNEL_USER_RAM].ops) {
1788 /* Assume that all I/O operations that target
1789 * $sp or $gp will always only target a mapped
1790 * memory (RAM, BIOS, scratchpad). */
684432ad
PC
1791 if (state->opt_flags & LIGHTREC_OPT_SP_GP_HIT_RAM)
1792 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_RAM);
1793 else
1794 list->flags |= LIGHTREC_IO_MODE(LIGHTREC_IO_DIRECT);
cb72ea13
PC
1795 }
1796
d8b04acd
PC
1797 fallthrough;
1798 default:
d16005f8
PC
1799 break;
1800 }
d16005f8
PC
1801 }
1802
1803 return 0;
1804}
1805
98fa08a5
PC
/*
 * Scan the block forward from index 'offset' to decide which register can
 * hold the LO (mflo == true) or HI (mflo == false) result.
 *
 * Return value, as established by the code below:
 *  - 0 when a MULT/MULTU/DIV/DIVU or META_MULT2/MULTU2 opcode is reached, when
 *    the matching MTLO/MTHI is reached, or on a "JR $31" that is not followed
 *    by a usable MFLO/MFHI in its delay slot;
 *  - REG_LO / REG_HI when the scan ends without a better candidate (end of
 *    block, other JR, JALR, unresolved branch);
 *  - the 'rd' field of an MFLO/MFHI opcode when that register can be used.
 *
 * 'mask' accumulates the read and write masks of the opcodes scanned so far;
 * 'sync' becomes true once an opcode with the SYNC flag has been seen.
 * 'last' is only tested for being non-NULL: when set, local branches are not
 * followed. 'another' makes the function return the rd of the first matching
 * MFLO/MFHI found, without further checks.
 */
1806static u8 get_mfhi_mflo_reg(const struct block *block, u16 offset,
1807 const struct opcode *last,
1808 u32 mask, bool sync, bool mflo, bool another)
d16005f8 1809{
98fa08a5
PC
1810 const struct opcode *op, *next = &block->opcode_list[offset];
1811 u32 old_mask;
1812 u8 reg2, reg = mflo ? REG_LO : REG_HI;
1813 u16 branch_offset;
1814 unsigned int i;
1815
1816 for (i = offset; i < block->nb_ops; i++) {
1817 op = next;
/* On the last iteration this points one element past the opcode list. */
1818 next = &block->opcode_list[i + 1];
1819 old_mask = mask;
1820
1821 /* If any other opcode writes or reads to the register
1822 * we'd use, then we cannot use it anymore. */
1823 mask |= opcode_read_mask(op->c);
1824 mask |= opcode_write_mask(op->c);
1825
03535202 1826 if (op_flag_sync(op->flags))
98fa08a5 1827 sync = true;
d16005f8 1828
d16005f8
PC
1829 switch (op->i.op) {
1830 case OP_BEQ:
1831 case OP_BNE:
1832 case OP_BLEZ:
1833 case OP_BGTZ:
1834 case OP_REGIMM:
d16005f8 1835 /* TODO: handle backwards branches too */
03535202 1836 if (!last && op_flag_local_branch(op->flags) &&
d16005f8 1837 (s16)op->c.i.imm >= 0) {
98fa08a5 1838 branch_offset = i + 1 + (s16)op->c.i.imm
03535202 1839 - !!op_flag_no_ds(op->flags);
98fa08a5
PC
1840
1841 reg = get_mfhi_mflo_reg(block, branch_offset, NULL,
1842 mask, sync, mflo, false);
/* NOTE(review): this second scan starts at 'offset + 1' (the start of the
 * current scan plus one), not at 'i + 1' (the opcode following this branch).
 * Confirm that this is intended. */
1843 reg2 = get_mfhi_mflo_reg(block, offset + 1, next,
1844 mask, sync, mflo, false);
1845 if (reg > 0 && reg == reg2)
1846 return reg;
1847 if (!reg && !reg2)
1848 return 0;
d16005f8 1849 }
98fa08a5
PC
1850
1851 return mflo ? REG_LO : REG_HI;
ba3814c1
PC
1852 case OP_META_MULT2:
1853 case OP_META_MULTU2:
1854 return 0;
d16005f8
PC
1855 case OP_SPECIAL:
1856 switch (op->r.op) {
1857 case OP_SPECIAL_MULT:
1858 case OP_SPECIAL_MULTU:
1859 case OP_SPECIAL_DIV:
1860 case OP_SPECIAL_DIVU:
98fa08a5 1861 return 0;
d16005f8 1862 case OP_SPECIAL_MTHI:
98fa08a5
PC
1863 if (!mflo)
1864 return 0;
1865 continue;
1866 case OP_SPECIAL_MTLO:
1867 if (mflo)
1868 return 0;
1869 continue;
d16005f8 1870 case OP_SPECIAL_JR:
98fa08a5
PC
1871 if (op->r.rs != 31)
1872 return reg;
1873
03535202 1874 if (!sync && !op_flag_no_ds(op->flags) &&
98fa08a5
PC
1875 (next->i.op == OP_SPECIAL) &&
1876 ((!mflo && next->r.op == OP_SPECIAL_MFHI) ||
1877 (mflo && next->r.op == OP_SPECIAL_MFLO)))
1878 return next->r.rd;
1879
1880 return 0;
d16005f8 1881 case OP_SPECIAL_JALR:
98fa08a5 1882 return reg;
d16005f8 1883 case OP_SPECIAL_MFHI:
98fa08a5
PC
1884 if (!mflo) {
1885 if (another)
1886 return op->r.rd;
1887 /* Must use REG_HI if there is another MFHI target*/
1888 reg2 = get_mfhi_mflo_reg(block, i + 1, next,
1889 0, sync, mflo, true);
1890 if (reg2 > 0 && reg2 != REG_HI)
1891 return REG_HI;
1892
1893 if (!sync && !(old_mask & BIT(op->r.rd)))
1894 return op->r.rd;
1895 else
1896 return REG_HI;
1897 }
1898 continue;
1899 case OP_SPECIAL_MFLO:
1900 if (mflo) {
1901 if (another)
1902 return op->r.rd;
1903 /* Must use REG_LO if there is another MFLO target*/
1904 reg2 = get_mfhi_mflo_reg(block, i + 1, next,
1905 0, sync, mflo, true);
1906 if (reg2 > 0 && reg2 != REG_LO)
1907 return REG_LO;
1908
1909 if (!sync && !(old_mask & BIT(op->r.rd)))
1910 return op->r.rd;
1911 else
1912 return REG_LO;
1913 }
d16005f8 1914 continue;
98fa08a5
PC
1915 default:
1916 break;
d16005f8 1917 }
98fa08a5 1918
d8b04acd 1919 fallthrough;
d16005f8
PC
1920 default:
1921 continue;
1922 }
1923 }
1924
98fa08a5
PC
1925 return reg;
1926}
1927
1928static void lightrec_replace_lo_hi(struct block *block, u16 offset,
1929 u16 last, bool lo)
1930{
1931 unsigned int i;
1932 u32 branch_offset;
1933
1934 /* This function will remove the following MFLO/MFHI. It must be called
1935 * only if get_mfhi_mflo_reg() returned a non-zero value. */
1936
1937 for (i = offset; i < last; i++) {
1938 struct opcode *op = &block->opcode_list[i];
1939
1940 switch (op->i.op) {
1941 case OP_BEQ:
1942 case OP_BNE:
1943 case OP_BLEZ:
1944 case OP_BGTZ:
1945 case OP_REGIMM:
1946 /* TODO: handle backwards branches too */
03535202 1947 if (op_flag_local_branch(op->flags) && (s16)op->c.i.imm >= 0) {
98fa08a5 1948 branch_offset = i + 1 + (s16)op->c.i.imm
03535202 1949 - !!op_flag_no_ds(op->flags);
98fa08a5
PC
1950
1951 lightrec_replace_lo_hi(block, branch_offset, last, lo);
1952 lightrec_replace_lo_hi(block, i + 1, branch_offset, lo);
1953 }
1954 break;
1955
1956 case OP_SPECIAL:
1957 if (lo && op->r.op == OP_SPECIAL_MFLO) {
1958 pr_debug("Removing MFLO opcode at offset 0x%x\n",
1959 i << 2);
1960 op->opcode = 0;
1961 return;
1962 } else if (!lo && op->r.op == OP_SPECIAL_MFHI) {
1963 pr_debug("Removing MFHI opcode at offset 0x%x\n",
1964 i << 2);
1965 op->opcode = 0;
1966 return;
1967 }
1968
d8b04acd 1969 fallthrough;
98fa08a5
PC
1970 default:
1971 break;
1972 }
1973 }
d16005f8
PC
1974}
1975
fd58fa32
PC
/*
 * Compile-time switch: returns true when built with __mips__ defined,
 * false otherwise.
 */
static bool lightrec_always_skip_div_check(void)
{
#if defined(__mips__)
	const bool skip = true;
#else
	const bool skip = false;
#endif

	return skip;
}
1984
98fa08a5 1985static int lightrec_flag_mults_divs(struct lightrec_state *state, struct block *block)
d16005f8 1986{
9259d748
PC
1987 struct opcode *list = NULL;
1988 struct constprop_data v[32] = LIGHTREC_CONSTPROP_INITIALIZER;
98fa08a5
PC
1989 u8 reg_hi, reg_lo;
1990 unsigned int i;
1991
1992 for (i = 0; i < block->nb_ops - 1; i++) {
1993 list = &block->opcode_list[i];
d16005f8 1994
cb72ea13 1995 lightrec_consts_propagate(block, i, v);
22eee2ac 1996
ba3814c1
PC
1997 switch (list->i.op) {
1998 case OP_SPECIAL:
1999 switch (list->r.op) {
2000 case OP_SPECIAL_DIV:
2001 case OP_SPECIAL_DIVU:
2002 /* If we are dividing by a non-zero constant, don't
2003 * emit the div-by-zero check. */
2004 if (lightrec_always_skip_div_check() ||
9259d748 2005 (v[list->r.rt].known & v[list->r.rt].value)) {
ba3814c1 2006 list->flags |= LIGHTREC_NO_DIV_CHECK;
9259d748 2007 }
ba3814c1
PC
2008 fallthrough;
2009 case OP_SPECIAL_MULT:
2010 case OP_SPECIAL_MULTU:
2011 break;
2012 default:
2013 continue;
2014 }
d8b04acd 2015 fallthrough;
ba3814c1
PC
2016 case OP_META_MULT2:
2017 case OP_META_MULTU2:
d16005f8
PC
2018 break;
2019 default:
2020 continue;
2021 }
2022
98fa08a5 2023 /* Don't support opcodes in delay slots */
9259d748 2024 if (is_delay_slot(block->opcode_list, i) ||
03535202 2025 op_flag_no_ds(list->flags)) {
d16005f8 2026 continue;
fd58fa32 2027 }
d16005f8 2028
98fa08a5
PC
2029 reg_lo = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, true, false);
2030 if (reg_lo == 0) {
2031 pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
2032 " not writing LO\n", i << 2);
2033 list->flags |= LIGHTREC_NO_LO;
2034 }
2035
2036 reg_hi = get_mfhi_mflo_reg(block, i + 1, NULL, 0, false, false, false);
2037 if (reg_hi == 0) {
2038 pr_debug("Mark MULT(U)/DIV(U) opcode at offset 0x%x as"
2039 " not writing HI\n", i << 2);
2040 list->flags |= LIGHTREC_NO_HI;
2041 }
2042
2043 if (!reg_lo && !reg_hi) {
2044 pr_debug("Both LO/HI unused in this block, they will "
2045 "probably be used in parent block - removing "
2046 "flags.\n");
2047 list->flags &= ~(LIGHTREC_NO_LO | LIGHTREC_NO_HI);
2048 }
2049
2050 if (reg_lo > 0 && reg_lo != REG_LO) {
2051 pr_debug("Found register %s to hold LO (rs = %u, rt = %u)\n",
2052 lightrec_reg_name(reg_lo), list->r.rs, list->r.rt);
2053
2054 lightrec_replace_lo_hi(block, i + 1, block->nb_ops, true);
2055 list->r.rd = reg_lo;
2056 } else {
2057 list->r.rd = 0;
2058 }
2059
2060 if (reg_hi > 0 && reg_hi != REG_HI) {
2061 pr_debug("Found register %s to hold HI (rs = %u, rt = %u)\n",
2062 lightrec_reg_name(reg_hi), list->r.rs, list->r.rt);
2063
2064 lightrec_replace_lo_hi(block, i + 1, block->nb_ops, false);
2065 list->r.imm = reg_hi;
2066 } else {
2067 list->r.imm = 0;
2068 }
2069 }
2070
2071 return 0;
2072}
2073
2074static bool remove_div_sequence(struct block *block, unsigned int offset)
2075{
2076 struct opcode *op;
2077 unsigned int i, found = 0;
2078
2079 /*
2080 * Scan for the zero-checking sequence that GCC automatically introduced
2081 * after most DIV/DIVU opcodes. This sequence checks the value of the
2082 * divisor, and if zero, executes a BREAK opcode, causing the BIOS
2083 * handler to crash the PS1.
2084 *
2085 * For DIV opcodes, this sequence additionally checks that the signed
2086 * operation does not overflow.
2087 *
2088 * With the assumption that the games never crashed the PS1, we can
2089 * therefore assume that the games never divided by zero or overflowed,
2090 * and these sequences can be removed.
2091 */
2092
2093 for (i = offset; i < block->nb_ops; i++) {
2094 op = &block->opcode_list[i];
2095
2096 if (!found) {
2097 if (op->i.op == OP_SPECIAL &&
2098 (op->r.op == OP_SPECIAL_DIV || op->r.op == OP_SPECIAL_DIVU))
2099 break;
2100
2101 if ((op->opcode & 0xfc1fffff) == 0x14000002) {
2102 /* BNE ???, zero, +8 */
2103 found++;
2104 } else {
2105 offset++;
2106 }
2107 } else if (found == 1 && !op->opcode) {
2108 /* NOP */
2109 found++;
2110 } else if (found == 2 && op->opcode == 0x0007000d) {
2111 /* BREAK 0x1c00 */
2112 found++;
2113 } else if (found == 3 && op->opcode == 0x2401ffff) {
2114 /* LI at, -1 */
2115 found++;
2116 } else if (found == 4 && (op->opcode & 0xfc1fffff) == 0x14010004) {
2117 /* BNE ???, at, +16 */
2118 found++;
2119 } else if (found == 5 && op->opcode == 0x3c018000) {
2120 /* LUI at, 0x8000 */
2121 found++;
2122 } else if (found == 6 && (op->opcode & 0x141fffff) == 0x14010002) {
2123 /* BNE ???, at, +16 */
2124 found++;
2125 } else if (found == 7 && !op->opcode) {
2126 /* NOP */
2127 found++;
2128 } else if (found == 8 && op->opcode == 0x0006000d) {
2129 /* BREAK 0x1800 */
2130 found++;
2131 break;
2132 } else {
2133 break;
2134 }
2135 }
2136
2137 if (found >= 3) {
2138 if (found != 9)
2139 found = 3;
2140
2141 pr_debug("Removing DIV%s sequence at offset 0x%x\n",
2142 found == 9 ? "" : "U", offset << 2);
2143
2144 for (i = 0; i < found; i++)
2145 block->opcode_list[offset + i].opcode = 0;
2146
2147 return true;
2148 }
2149
2150 return false;
2151}
2152
2153static int lightrec_remove_div_by_zero_check_sequence(struct lightrec_state *state,
2154 struct block *block)
2155{
2156 struct opcode *op;
2157 unsigned int i;
2158
2159 for (i = 0; i < block->nb_ops; i++) {
2160 op = &block->opcode_list[i];
2161
2162 if (op->i.op == OP_SPECIAL &&
2163 (op->r.op == OP_SPECIAL_DIVU || op->r.op == OP_SPECIAL_DIV) &&
2164 remove_div_sequence(block, i + 1))
2165 op->flags |= LIGHTREC_NO_DIV_CHECK;
2166 }
2167
2168 return 0;
2169}
2170
2171static const u32 memset_code[] = {
2172 0x10a00006, // beqz a1, 2f
2173 0x24a2ffff, // addiu v0,a1,-1
2174 0x2403ffff, // li v1,-1
2175 0xac800000, // 1: sw zero,0(a0)
2176 0x2442ffff, // addiu v0,v0,-1
2177 0x1443fffd, // bne v0,v1, 1b
2178 0x24840004, // addiu a0,a0,4
2179 0x03e00008, // 2: jr ra
2180 0x00000000, // nop
2181};
2182
2183static int lightrec_replace_memset(struct lightrec_state *state, struct block *block)
2184{
2185 unsigned int i;
2186 union code c;
2187
2188 for (i = 0; i < block->nb_ops; i++) {
2189 c = block->opcode_list[i].c;
2190
2191 if (c.opcode != memset_code[i])
2192 return 0;
2193
2194 if (i == ARRAY_SIZE(memset_code) - 1) {
2195 /* success! */
2196 pr_debug("Block at PC 0x%x is a memset\n", block->pc);
ba3814c1
PC
2197 block_set_flags(block,
2198 BLOCK_IS_MEMSET | BLOCK_NEVER_COMPILE);
98fa08a5
PC
2199
2200 /* Return non-zero to skip other optimizers. */
2201 return 1;
d16005f8
PC
2202 }
2203 }
2204
2205 return 0;
2206}
2207
684432ad
PC
2208static int lightrec_test_preload_pc(struct lightrec_state *state, struct block *block)
2209{
2210 unsigned int i;
2211 union code c;
2212 u32 flags;
2213
2214 for (i = 0; i < block->nb_ops; i++) {
2215 c = block->opcode_list[i].c;
2216 flags = block->opcode_list[i].flags;
2217
2218 if (op_flag_sync(flags))
2219 break;
2220
2221 switch (c.i.op) {
2222 case OP_J:
2223 case OP_JAL:
2224 block->flags |= BLOCK_PRELOAD_PC;
2225 return 0;
2226
2227 case OP_REGIMM:
2228 switch (c.r.rt) {
2229 case OP_REGIMM_BLTZAL:
2230 case OP_REGIMM_BGEZAL:
2231 block->flags |= BLOCK_PRELOAD_PC;
2232 return 0;
2233 default:
2234 break;
2235 }
2236 fallthrough;
2237 case OP_BEQ:
2238 case OP_BNE:
2239 case OP_BLEZ:
2240 case OP_BGTZ:
2241 if (!op_flag_local_branch(flags)) {
2242 block->flags |= BLOCK_PRELOAD_PC;
2243 return 0;
2244 }
2245
2246 case OP_SPECIAL:
2247 switch (c.r.op) {
2248 case OP_SPECIAL_JALR:
2249 if (c.r.rd) {
2250 block->flags |= BLOCK_PRELOAD_PC;
2251 return 0;
2252 }
2253 break;
2254 case OP_SPECIAL_SYSCALL:
2255 case OP_SPECIAL_BREAK:
2256 block->flags |= BLOCK_PRELOAD_PC;
2257 return 0;
2258 default:
2259 break;
2260 }
2261 break;
2262 }
2263 }
2264
2265 return 0;
2266}
2267
98fa08a5
PC
2268static int (*lightrec_optimizers[])(struct lightrec_state *state, struct block *) = {
2269 IF_OPT(OPT_REMOVE_DIV_BY_ZERO_SEQ, &lightrec_remove_div_by_zero_check_sequence),
2270 IF_OPT(OPT_REPLACE_MEMSET, &lightrec_replace_memset),
2271 IF_OPT(OPT_DETECT_IMPOSSIBLE_BRANCHES, &lightrec_detect_impossible_branches),
cb72ea13
PC
2272 IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_handle_load_delays),
2273 IF_OPT(OPT_HANDLE_LOAD_DELAYS, &lightrec_swap_load_delays),
03535202 2274 IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_branches),
98fa08a5
PC
2275 IF_OPT(OPT_LOCAL_BRANCHES, &lightrec_local_branches),
2276 IF_OPT(OPT_TRANSFORM_OPS, &lightrec_transform_ops),
2277 IF_OPT(OPT_SWITCH_DELAY_SLOTS, &lightrec_switch_delay_slots),
cb72ea13 2278 IF_OPT(OPT_FLAG_IO, &lightrec_flag_io),
98fa08a5
PC
2279 IF_OPT(OPT_FLAG_MULT_DIV, &lightrec_flag_mults_divs),
2280 IF_OPT(OPT_EARLY_UNLOAD, &lightrec_early_unload),
684432ad 2281 IF_OPT(OPT_PRELOAD_PC, &lightrec_test_preload_pc),
d16005f8
PC
2282};
2283
98fa08a5 2284int lightrec_optimize(struct lightrec_state *state, struct block *block)
d16005f8
PC
2285{
2286 unsigned int i;
98fa08a5 2287 int ret;
d16005f8
PC
2288
2289 for (i = 0; i < ARRAY_SIZE(lightrec_optimizers); i++) {
98fa08a5
PC
2290 if (lightrec_optimizers[i]) {
2291 ret = (*lightrec_optimizers[i])(state, block);
2292 if (ret)
2293 return ret;
2294 }
d16005f8
PC
2295 }
2296
2297 return 0;
2298}