abstract-machine/tools/rvmini/inst-replace.h
2024-05-02 12:26:37 +08:00

730 lines
14 KiB
C

#if 0
Base instructions:
* add, addi, lui, lw, lbu, sw, sb, jalr
Assumption:
* the compiler does not use tp or gp
* stack below sp is not used yet
Calling convention:
* tp - temporary register
* gp - saved register, but temporary for the outermost functions
Limitation:
* We can not break sw into sb, since this process depends on shift operations.
But shift operations also depend on sw.
* It is difficult to break lw into lbu, since it requires at least two temporary
registers. This makes it difficult to implement other instructions with lw,
due to the small number of available registers.
* We can not use lw/sw to implement lbu/sb, since this process requires detecting
misaligned memory accesses.
* We can not break lui into addi + slli, since the linker can not provide
symbol[31:20] and symbol[19:12]. The linker can only provide %hi(symbol)
and %lo(symbol), which are symbol[31:12] and symbol[11:0] respectively.
#endif
# Two-level token pasting, so that arguments are macro-expanded before being joined.
#define _concat(x, y) x ## y
#define concat(x, y) _concat(x, y)
# Emits a label name_<n>, where <n> is the assembler macro invocation counter.
#define SET_DEBUG_LABEL(name) concat(name, _\@):
# Scratch memory below sp: word slot n starts at sp - 4*n, boffset selects a byte in it.
#define SP_VAR_BYTE(n, boffset) (-(n) * 4 + (boffset))(sp)
#define SP_VAR(n) SP_VAR_BYTE(n, 0)
# Slot numbers of the four scratch words shared by the macros in this file.
#define VAR_A 4
#define VAR_B 5
#define VAR_C 6
#define VAR_D 7
# Store / load register r to / from slot n (the macros below use slots 1..3 for spills).
#define PUSH(r, n) sw r, SP_VAR(n)
#define POP(r, n) lw r, SP_VAR(n)
.section .rodata
.align 8
# Array of 256 bytes indexed by a byte value; the seq macro marks entries in it
# through check_8bit_same.
# NOTE(review): this array is written with sb at run time although it is placed
# in .rodata -- confirm that the target keeps this section writable.
.weak _check_same_array
_check_same_array: .fill 256, 1, 0
# Maps the accumulated count used by seq and seqz to a boolean:
# entry 0 is 1, every other entry is 0.
.weak _same_result_table
_same_result_table: .byte 1, 0, 0, 0, 0, 0, 0, 0
# Entry 0 is 0 and entries 1..255 are 1, i.e. table[b] tells whether byte b is nonzero.
.weak _check_8bit_0_table
_check_8bit_0_table:
.byte 0
.fill 255, 1, 1
# slt = OF ^ SF = ((sign(A) ^ sign(B)) & (sign(A) ^ sign(R))) ^ sign(R)
# sltu = slt ^ sign(A) ^ sign(B)
# A B R OF slt sltu (sltu = slt ^ A ^ B)
# 0 0 0 0 0 0
# 0 0 1 0 1 1
# 0 1 0 0 0 1
# 0 1 1 1 0 1
# 1 0 0 1 1 0
# 1 0 1 0 1 0
# 1 1 0 0 0 0
# 1 1 1 0 1 1
# Both tables are indexed by (sign(A) << 2) | (sign(B) << 1) | sign(R), with
# R = A - B, as built by slt_template.
.weak _slt1_table
_slt1_table: .byte 0, 1, 0, 0, 1, 1, 0, 1
.weak _sltu1_table
_sltu1_table: .byte 0, 1, 1, 1, 0, 0, 0, 1
# Indexed by the sum of two single-bit values: entry 0 is 0, the others are 1.
.weak _or1_table
_or1_table: .byte 0, 1, 1, 1
# Lookup tables for the bitwise and shift macros, taken from an external binary file.
.weak _logic_shift_table
.align 8
_logic_shift_table:
.incbin _LUT_BIN_PATH # defined in command line flags
# Offsets inside the blob: three byte tables of 256 * 256 entries (and, or, xor,
# indexed by (a << 8) | b), then three tables of 32 * 256 word entries
# (sll, srl, sra, indexed by (shamt << 8) | byte).
#define _and8_table _logic_shift_table
#define _or8_table (_and8_table + 256 * 256)
#define _xor8_table (_or8_table + 256 * 256)
#define _sll8_table (_xor8_table + 256 * 256)
#define _srl8_table (_sll8_table + 32 * 256 * 4)
#define _sra8_table (_srl8_table + 32 * 256 * 4)
# Generator for immediate-form macros: the produced macro <name> rd, rs1, imm
# loads imm into tp with li and then runs <rtype_name> rd, rs1, tp.
# The produced macro therefore clobbers tp.
#define def_itype(name, rtype_name) \
.macro name rd, rs1, imm ;\
SET_DEBUG_LABEL(name); \
li tp, \imm; \
rtype_name \rd, \rs1, tp; \
.endm
# call_template r, addr
# Materialise the address of addr in r, then jump there with jalr using the
# same register r as the link register.
.macro call_template r, addr
    la      \r, \addr
    jalr    \r, \r
.endm
# jal rd, addr
# With the zero register as rd this is a plain jump (the j macro); any other rd
# goes through call_template, which links in rd.
.macro jal rd, addr
    SET_DEBUG_LABEL(jal)
    .if \rd == zero || \rd == x0
        j       \addr
    .else
        call_template \rd, \addr
    .endif
.endm
# sub rd, rs1, rs2
# rd = rs1 - rs2, computed as rs1 + (bitwise not of rs2) + 1.
# tp receives the inverted rs2, so rs1 must not be tp.
.macro sub rd, rs1, rs2
SET_DEBUG_LABEL(sub)
not tp, \rs2
add \rd, \rs1, tp
addi \rd, \rd, 1
.endm
# getbyte rd, rs1, byte
# Spills rs1 to scratch slot C and reads the byte at the given byte offset back
# into rd, zero-extended by lbu. Overwrites scratch slot C.
.macro getbyte rd, rs1, byte
sw \rs1, SP_VAR(VAR_C)
lbu \rd, SP_VAR_BYTE(VAR_C, \byte)
.endm
# msb rd, rs1
# rd = top bit of the byte at offset 3 of rs1 (0 or 1), i.e. the sign bit on a
# little-endian target. That byte is doubled so its bit 7 moves to bit 8, which
# is then read back as bit 0 of byte 1. Overwrites scratch slot C.
.macro msb rd, rs1
getbyte \rd, \rs1, 3
slli \rd, \rd, 1
getbyte \rd, \rd, 1
.endm
# slt_template lut, rd, rs1, rs2
# Shared body of slt and sltu. Builds the index
# (msb(rs1) << 2) | (msb(rs2) << 1) | msb(rs1 - rs2) in gp and loads rd from lut.
# s0 and s1 are spilled to slots 1 and 2 and restored before rd is written;
# gp, tp and the scratch slots used by msb and sub are clobbered.
.macro slt_template lut, rd, rs1, rs2
PUSH(s0, 1)
PUSH(s1, 2)
sw \rs2, SP_VAR(VAR_A) # rs2 goes through memory, so rs1/rs2 may themselves be s0/s1
mv s0, \rs1
lw s1, SP_VAR(VAR_A)
msb gp, s0
slli gp, gp, 1
msb tp, s1
add gp, gp, tp
slli gp, gp, 1
sub tp, s0, s1 # sub uses tp
msb tp, tp
add gp, gp, tp # gp = 3-bit table index
la tp, \lut
add tp, tp, gp
POP(s1, 2)
POP(s0, 1)
lbu \rd, (tp) # written last, so rd may be s0 or s1
.endm
# slt rd, rs1, rs2
# Signed set-less-than: rd is read from _slt1_table by slt_template.
.macro slt rd, rs1, rs2
    SET_DEBUG_LABEL(slt)
    slt_template _slt1_table, \rd, \rs1, \rs2
.endm
# sltu rd, rs1, rs2
# Unsigned set-less-than: rd is read from _sltu1_table by slt_template.
# The debug label now carries the name of this macro (it used to be emitted
# with the slt prefix, which made sltu expansions look like slt in listings).
.macro sltu rd, rs1, rs2
SET_DEBUG_LABEL(sltu)
slt_template _sltu1_table, \rd, \rs1, \rs2
.endm
# logic8_byte_internal lut_reg, boffset
# Handles one byte lane of a bitwise operation. The bytes at boffset of scratch
# slots A and B are packed into the low half of slot D as (a << 8) | b; that
# value indexes the byte table whose base address is in lut_reg, and the result
# byte is stored to slot C at boffset. The caller zeroes slot D beforehand so
# that the loaded index has no stale upper bytes. Clobbers tp.
.macro logic8_byte_internal lut_reg, boffset
lbu tp, SP_VAR_BYTE(VAR_A, \boffset)
sb tp, SP_VAR_BYTE(VAR_D, 1)
lbu tp, SP_VAR_BYTE(VAR_B, \boffset)
sb tp, SP_VAR_BYTE(VAR_D, 0)
lw tp, SP_VAR(VAR_D)
add tp, \lut_reg, tp
lbu tp, (tp)
sb tp, SP_VAR_BYTE(VAR_C, \boffset)
.endm
# logic lut, rd, rs1, rs2
# Byte-wise table lookup implementing a 32-bit bitwise operation.
# Nothing is emitted when rd is x0. Assembly is aborted when any operand is sp
# or gp, or when rd or rs1 is tp (rs2 may be tp).
# rd doubles as the table base pointer until the final load; tp and the
# scratch slots A, B, C and D are overwritten.
.macro logic lut, rd, rs1, rs2
    .if \rd == x0
        .exitm
    .endif
    .if \rd == sp || \rs1 == sp || \rs2 == sp || \rd == gp || \rs1 == gp || \rs2 == gp || \rd == tp || \rs1 == tp
        .abort
    .endif
    sw      \rs1, SP_VAR(VAR_A)
    sw      \rs2, SP_VAR(VAR_B)
    la      \rd, \lut
    sw      x0, SP_VAR(VAR_D)
    logic8_byte_internal \rd, 3
    logic8_byte_internal \rd, 2
    logic8_byte_internal \rd, 1
    logic8_byte_internal \rd, 0
    lw      \rd, SP_VAR(VAR_C)
.endm
# Generator for the bitwise macros: <name> rd, rs1, rs2 expands to the logic
# macro with the table _<name>8_table. Instantiated for and, or and xor.
#define def_logic(name) \
.macro name rd, rs1, rs2 ;\
SET_DEBUG_LABEL(name); \
logic concat(_, concat(name, 8_table)), \rd, \rs1, \rs2; \
.endm
def_logic(and)
def_logic(or)
def_logic(xor)
# check_8bit_same table, rd, boffset
# Compares the bytes at boffset of scratch slots A and B without a branch:
# the table entry selected by the A byte is set to 1, the entry selected by the
# B byte is cleared, and the A entry is read back. rd is 0 when both bytes are
# equal and 1 otherwise. table is a register holding the base of a 256-byte
# writable array. Clobbers tp.
.macro check_8bit_same table, rd, boffset # rd == 0 if same
lbu \rd, SP_VAR_BYTE(VAR_A, \boffset)
add tp, \table, \rd # tp = address of the entry for the A byte
addi \rd, x0, 1
sb \rd, (tp) # mark the A entry
lbu \rd, SP_VAR_BYTE(VAR_B, \boffset)
add \rd, \table, \rd
sb x0, (\rd) # clear the B entry (same entry when the bytes match)
lbu \rd, (tp)
.endm
# seq rd, rs1, rs2
# rd = 1 if rs1 == rs2, else 0. The four byte lanes are compared with
# check_8bit_same, the mismatch flags are summed in gp, and the sum is mapped
# through _same_result_table (only a sum of 0 yields 1).
# s0 and s1 are spilled to slots 1 and 2; gp, tp and scratch slots A and B are
# clobbered.
# s0 and s1 are restored BEFORE rd is written, so the result survives even when
# rd is s0 or s1 (the previous order loaded rd first and then overwrote it with
# the restored value).
.macro seq rd, rs1, rs2 # set equal
sw \rs1, SP_VAR(VAR_A)
sw \rs2, SP_VAR(VAR_B)
PUSH(s0, 1)
PUSH(s1, 2)
la s0, _check_same_array
check_8bit_same s0, gp, 0
check_8bit_same s0, s1, 1
add gp, gp, s1
check_8bit_same s0, s1, 2
add gp, gp, s1
check_8bit_same s0, s1, 3
add gp, gp, s1 # gp = number of differing byte lanes
la tp, _same_result_table
add tp, tp, gp
POP(s1, 2)
POP(s0, 1)
lbu \rd, (tp) # tp is not touched by the restores above
.endm
# check_8bit_0 table, rd, boffset
# Looks up the byte at boffset of scratch slot A in the table whose base address
# is in the register table, leaving the table entry in rd. With
# _check_8bit_0_table this gives 0 for a zero byte and 1 otherwise.
.macro check_8bit_0 table, rd, boffset # rd = (source is 0 ? 0 : 1)
lbu \rd, SP_VAR_BYTE(VAR_A, \boffset)
add \rd, \table, \rd
lbu \rd, (\rd)
.endm
# seqz rd, rs1
# rd = 1 if rs1 == 0, else 0. Each byte lane is tested with check_8bit_0, the
# nonzero flags are summed in gp, and the sum is mapped through
# _same_result_table (only a sum of 0 yields 1).
# s0 is spilled to slot 1; gp, tp and scratch slot A are clobbered.
# s0 is restored BEFORE rd is written, so that seqz with rd == s0 keeps its
# result (the previous order loaded rd first and then overwrote it with the
# restored s0).
.macro seqz rd, rs1 # set if 0
sw \rs1, SP_VAR(VAR_A)
PUSH(s0, 1)
la s0, _check_8bit_0_table
check_8bit_0 s0, gp, 0
check_8bit_0 s0, tp, 1
add gp, gp, tp
check_8bit_0 s0, tp, 2
add gp, gp, tp
check_8bit_0 s0, tp, 3
add gp, gp, tp # gp = number of nonzero byte lanes
la tp, _same_result_table
add tp, tp, gp
POP(s0, 1)
lbu \rd, (tp) # tp is not touched by the restore above
.endm
# Emits a label made of the branch name, the macro invocation counter and the
# branch target.
#define SET_BRANCH_LABEL(name) concat(name, _\@\target):
# branch_resolve target, is_tp_high_sel_target, label
# Two-way dispatch without a branch instruction. On entry tp holds 0 or 1.
# The address of target and the address of the fall-through label are stored in
# the adjacent scratch slots A and B, and tp picks the word that is loaded and
# jumped to (tp == 1 reads slot A, tp == 0 reads slot B):
#   is_tp_high_sel_target == 1: tp == 1 goes to target, tp == 0 falls through
#   any other value:            tp == 0 goes to target, tp == 1 falls through
# label is a unique suffix for the fall-through label (callers pass their macro
# invocation counter). Clobbers gp and tp.
.macro branch_resolve target, is_tp_high_sel_target, label
#define LABEL_NAME non_taken_\label\target
slli tp, tp, 2 # tp = tp * 4
add tp, sp, tp
.if \is_tp_high_sel_target == 1
la gp, \target
sw gp, SP_VAR(VAR_A) # higher address
la gp, LABEL_NAME
sw gp, SP_VAR(VAR_B) # lower address
.else
la gp, LABEL_NAME
sw gp, SP_VAR(VAR_A) # higher address
la gp, \target
sw gp, SP_VAR(VAR_B) # lower address
.endif
lw tp, (-VAR_B * 4)(tp)
jr tp
LABEL_NAME:
.endm
# beq rs1, rs2, target
# Jumps to target when rs1 == rs2. If rs2 is the zero register the test is done
# with seqz; if only rs1 is the zero register the macro re-invokes itself with
# the operands swapped; otherwise seq is used. tp receives 1 on equality and
# branch_resolve goes to target on tp == 1. Clobbers gp and tp.
.macro beq rs1, rs2, target
SET_BRANCH_LABEL(beq)
.if \rs2 == zero || \rs2 == x0
seqz tp, \rs1
.else
.if \rs1 == zero || \rs1 == x0
beq \rs2, \rs1, \target
.exitm
.else
seq tp, \rs1, \rs2
.endif
.endif
branch_resolve \target, 1, \@
.endm
# bne rs1, rs2, target
# Jumps to target when rs1 != rs2. Same structure as beq: seqz when rs2 is the
# zero register, operand swap when only rs1 is, seq otherwise. tp receives 1 on
# equality and branch_resolve goes to target on tp == 0. Clobbers gp and tp.
.macro bne rs1, rs2, target
SET_BRANCH_LABEL(bne)
.if \rs2 == zero || \rs2 == x0
seqz tp, \rs1
.else
.if \rs1 == zero || \rs1 == x0
bne \rs2, \rs1, \target
.exitm
.else
seq tp, \rs1, \rs2
.endif
.endif
branch_resolve \target, 0, \@
.endm
# blt rs1, rs2, target
# Jumps to target when rs1 < rs2 (signed). Three cases:
#   rs2 is the zero register: taken when the sign bit of rs1 is set
#   rs1 is the zero register: taken when rs2 > 0, tested as neither rs2 nor
#                             rs2 - 1 having its sign bit set
#   general case:             slt followed by branch_resolve
# Clobbers gp and tp.
.macro blt rs1, rs2, target
SET_BRANCH_LABEL(blt)
.if \rs2 == zero || \rs2 == x0
msb tp, \rs1
branch_resolve \target, 1, \@
.else
.if \rs1 == zero || \rs1 == x0
# check rs2 > 0 ?
# msb(rs2) == 0 && msb(rs2 - 1) == 0
msb gp, \rs2
addi tp, \rs2, -1
msb tp, tp
add gp, gp, tp
la tp, _or1_table
add tp, gp, tp
lbu tp, (tp) # tp == 1 -> not taken
branch_resolve \target, 0, \@
.else
slt tp, \rs1, \rs2
branch_resolve \target, 1, \@
.endif
.endif
.endm
# bltu rs1, rs2, target
# Unsigned less-than branch: sltu leaves the condition in tp and
# branch_resolve goes to target on tp == 1. Clobbers gp and tp.
.macro bltu rs1, rs2, target
    SET_BRANCH_LABEL(bltu)
    sltu    tp, \rs1, \rs2
    branch_resolve \target, 1, \@
.endm
# bge rs1, rs2, target
# Jumps to target when rs1 >= rs2 (signed). Three cases:
#   rs2 is the zero register: taken when the sign bit of rs1 is clear. This
#                             mirrors the zero-operand fast path of blt and
#                             avoids the full slt sequence for bgez.
#   rs1 is the zero register: taken when rs2 <= 0, tested as rs2 or rs2 - 1
#                             having its sign bit set
#   general case:             slt, then taken when the result is 0
# Clobbers gp and tp.
.macro bge rs1, rs2, target
SET_BRANCH_LABEL(bge)
.if \rs2 == zero || \rs2 == x0
# rs1 >= 0 exactly when msb(rs1) == 0
msb tp, \rs1
branch_resolve \target, 0, \@
.else
.if \rs1 == zero || \rs1 == x0
# check rs2 <= 0 ?
# msb(rs2) == 1 || msb(rs2 - 1) == 1
msb gp, \rs2
addi tp, \rs2, -1
msb tp, tp
add gp, gp, tp
la tp, _or1_table
add tp, gp, tp
lbu tp, (tp) # tp == 1 -> taken
branch_resolve \target, 1, \@
.else
slt tp, \rs1, \rs2
branch_resolve \target, 0, \@
.endif
.endif
.endm
# bgeu rs1, rs2, target
# Unsigned greater-or-equal branch: sltu leaves rs1 < rs2 in tp and
# branch_resolve goes to target on tp == 0. Clobbers gp and tp.
.macro bgeu rs1, rs2, target
    SET_BRANCH_LABEL(bgeu)
    sltu    tp, \rs1, \rs2
    branch_resolve \target, 0, \@
.endm
# getshamt rd, rs1
# rd = (low byte of rs1) AND 0x1f, i.e. the 5-bit shift amount. The low byte is
# isolated through byte 0 of scratch slot B and then looked up in the row of the
# and table that belongs to operand 0x1f. Clobbers tp.
.macro getshamt rd, rs1
sb \rs1, SP_VAR_BYTE(VAR_B, 0)
lbu \rd, SP_VAR_BYTE(VAR_B, 0)
la tp, (_and8_table + (0x1f << 8))
add \rd, \rd, tp
lbu \rd, (\rd)
.endm
# get_shift_byte_in_tp base, boffset
# tp = word at base + 4 * b, where b is the byte at boffset of scratch slot A.
# base is a register pointing at the 256-word table row of one shift amount.
.macro get_shift_byte_in_tp base, boffset
lbu tp, SP_VAR_BYTE(VAR_A, \boffset)
slli tp, tp, 2 # word per entry
add tp, \base, tp
lw tp, (tp)
.endm
# sll rd, rs1, rs2
# rd = rs1 << (rs2 & 0x1f). Each source byte is looked up in the row of the sll
# table selected by the shift amount; the four word results are moved to their
# byte position (by 24, 16, 8 and 0 bits) and added together in gp.
# s0 is spilled to slot 1 and restored before rd is written; gp, tp and scratch
# slots A, B and C are overwritten.
.macro sll rd, rs1, rs2
SET_DEBUG_LABEL(sll)
PUSH(s0, 1)
sw \rs1, SP_VAR(VAR_A)
getshamt s0, \rs2
sw x0, SP_VAR(VAR_B)
sb s0, SP_VAR_BYTE(VAR_B, 1)
lw s0, SP_VAR(VAR_B) # s0 = s0 << 8
slli s0, s0, 2 # word per entry
la tp, _sll8_table
add s0, s0, tp # s0 = base of the table row for this shift amount
sw x0, SP_VAR(VAR_B)
get_shift_byte_in_tp s0, 3
sb tp, SP_VAR_BYTE(VAR_B, 3)
lw gp, SP_VAR(VAR_B) # gp = tp << 24
get_shift_byte_in_tp s0, 2
sh tp, SP_VAR_BYTE(VAR_B, 2) # overwrites bytes 2 and 3 of slot B
lw tp, SP_VAR(VAR_B) # tp = tp << 16
add gp, gp, tp
get_shift_byte_in_tp s0, 1
slli tp, tp, 8
add gp, gp, tp
get_shift_byte_in_tp s0, 0
POP(s0, 1)
add \rd, gp, tp
.endm
# shift_right_template rd, rs1, rs2, is_arith, is_rs2_imm
# Shared body of srl, sra, srli and srai.
#   rs2         shift amount: a register, or a constant when is_rs2_imm == 1
#   is_arith    1 for an arithmetic shift, 0 for a logical shift
#   is_rs2_imm  1 when rs2 is a constant
# A constant shift by 24 takes a short path that only looks at the top byte.
# Otherwise the three low source bytes are looked up in the srl table and
# recombined byte by byte in scratch slot D, and the top byte is looked up last,
# in the sra table for arithmetic shifts.
# s0 is spilled to slot 1 on the general path; gp, tp and the scratch slots
# A, B, C and D are overwritten.
.macro shift_right_template rd, rs1, rs2, is_arith, is_rs2_imm
.if \is_rs2_imm == 1 && \rs2 == 24 # fast path
.if \is_arith == 1
sw \rs1, SP_VAR(VAR_A)
la tp, (_sra8_table + (\rs2 << 10))
lbu \rd, SP_VAR_BYTE(VAR_A, 3)
slli \rd, \rd, 2 # word per entry
add tp, tp, \rd
lw \rd, (tp)
.else
getbyte \rd, \rs1, 3
.endif
.exitm
.endif
PUSH(s0, 1)
sw \rs1, SP_VAR(VAR_A)
.if \is_rs2_imm == 0
getshamt s0, \rs2
sw x0, SP_VAR(VAR_B)
sb s0, SP_VAR_BYTE(VAR_B, 1)
lw s0, SP_VAR(VAR_B) # s0 = s0 << 8
slli s0, s0, 2 # word per entry
la tp, _srl8_table
add s0, s0, tp
.else
la s0, (_srl8_table + (\rs2 << 10))
.endif
# result = ((x3 << 24) | (x2 << 16) | (x1 << 8) | x0) >> a
# let y be the values fetched from the table, then
# y3 = (x3 << 24 >> a)
# y2 = (x2 << 24 >> a)
# y1 = (x1 << 24 >> a)
# y0 = (x0 << 24 >> a)
# result = y3 | (y2 >> 8) | (y1 >> 16) | (y0 >> 24)
# = { y33, y32, y31, y30 } |
# { y23, y22, y21 } |
# { y13, y12 } |
# { y03 }
get_shift_byte_in_tp s0, 2
sw tp, SP_VAR(VAR_B)
get_shift_byte_in_tp s0, 1
sw tp, SP_VAR(VAR_C)
get_shift_byte_in_tp s0, 0
sw tp, SP_VAR(VAR_D)
# y3 is added in the end
# compute byte 0
lbu gp, SP_VAR_BYTE(VAR_B, 1)
lbu tp, SP_VAR_BYTE(VAR_C, 2)
add gp, gp, tp
lbu tp, SP_VAR_BYTE(VAR_D, 3)
add gp, gp, tp
sw gp, SP_VAR(VAR_D) # no carry during addition
# compute byte 1
lbu gp, SP_VAR_BYTE(VAR_B, 2)
lbu tp, SP_VAR_BYTE(VAR_C, 3)
add gp, gp, tp
sb gp, SP_VAR_BYTE(VAR_D, 1) # no carry during addition
# compute byte 2
lbu gp, SP_VAR_BYTE(VAR_B, 3)
sb gp, SP_VAR_BYTE(VAR_D, 2)
# combine byte 2~0
lw gp, SP_VAR(VAR_D)
.if \is_arith == 1
# switch s0 from the srl row to the matching sra row for the top byte
la tp, (_sra8_table - _srl8_table)
add s0, s0, tp
.endif
get_shift_byte_in_tp s0, 3
POP(s0, 1)
add \rd, gp, tp
.endm
# srl rd, rs1, rs2
# Logical right shift by a register amount (shift_right_template, not
# arithmetic, rs2 is a register).
.macro srl rd, rs1, rs2
    SET_DEBUG_LABEL(srl)
    shift_right_template \rd, \rs1, \rs2, 0, 0
.endm
# sra rd, rs1, rs2
# Arithmetic right shift by a register amount (shift_right_template,
# arithmetic, rs2 is a register).
.macro sra rd, rs1, rs2
    SET_DEBUG_LABEL(sra)
    shift_right_template \rd, \rs1, \rs2, 1, 0
.endm
# slli rd, rs1, imm
# rd = rs1 << imm for a constant imm.
#   imm == 0   plain move (nothing at all when rd and rs1 are the same)
#   imm >= 24  the low byte of rs1 is stored at byte 3 of a zeroed scratch word,
#              then the remaining imm - 24 bits are shifted recursively
#   imm >= 16  same idea with the low halfword stored at byte 2 (the sh macro,
#              which clobbers tp and scratch slot C)
#   otherwise  imm doublings with add
# Overwrites scratch slot A on the two store-based paths.
.macro slli rd, rs1, imm
.if \imm == 0
.if \rd != \rs1
mv \rd, \rs1
.endif
.exitm
.endif
.if \imm >= 24
sw x0, SP_VAR(VAR_A)
sb \rs1, SP_VAR_BYTE(VAR_A, 3)
lw \rd, SP_VAR(VAR_A)
slli \rd, \rd, (\imm - 24)
.exitm
.endif
.if \imm >= 16
sw x0, SP_VAR(VAR_A)
sh \rs1, SP_VAR_BYTE(VAR_A, 2)
lw \rd, SP_VAR(VAR_A)
slli \rd, \rd, (\imm - 16)
.exitm
.endif
SET_DEBUG_LABEL(slli)
add \rd, \rs1, \rs1
.rept (\imm - 1)
add \rd, \rd, \rd
.endr
.endm
# srli rd, rs1, imm
# Logical right shift by a constant (shift_right_template, not arithmetic,
# shift amount is an immediate).
.macro srli rd, rs1, imm
    SET_DEBUG_LABEL(srli)
    shift_right_template \rd, \rs1, \imm, 0, 1
.endm
# srai rd, rs1, imm
# Arithmetic right shift by a constant (shift_right_template, arithmetic,
# shift amount is an immediate).
.macro srai rd, rs1, imm
    SET_DEBUG_LABEL(srai)
    shift_right_template \rd, \rs1, \imm, 1, 1
.endm
# Immediate forms: each one loads the immediate into tp and then runs the
# register-register macro named second.
def_itype(slti, slt)
def_itype(sltiu, sltu)
def_itype(andi, and)
def_itype(ori, or)
def_itype(xori, xor)
# lb rd, addr
# Sign-extending byte load: the byte is fetched with lbu, placed at byte 3 of a
# zeroed scratch word, and the reloaded word is shifted right arithmetically by
# 24. Overwrites scratch slot A; the srai step clobbers tp.
.macro lb rd, addr
sw x0, SP_VAR(VAR_A)
lbu \rd, \addr
sb \rd, SP_VAR_BYTE(VAR_A, 3)
lw \rd, SP_VAR(VAR_A)
srai \rd, \rd, 24
.endm
# lhu rd, addr
# Zero-extending halfword load from two byte loads: the byte at addr + 1 is
# moved to bits 15..8 through byte 1 of a zeroed scratch word and added to the
# byte at addr. Clobbers tp and scratch slot A.
.macro lhu rd, addr
sw x0, SP_VAR(VAR_A)
lbu tp, 1 + \addr
sb tp, SP_VAR_BYTE(VAR_A, 1)
lw tp, SP_VAR(VAR_A) # tp = high byte << 8
lbu \rd, 0 + \addr
add \rd, \rd, tp
.endm
# lh rd, addr
# Sign-extending halfword load from two byte loads: the byte at addr + 1 is
# placed at byte 3 of a zeroed scratch word and shifted right arithmetically by
# 16, which yields the sign-extended high byte in bits 31..8; the byte at addr
# is then added. Clobbers gp, tp and the scratch slots used by srai.
.macro lh rd, addr
sw x0, SP_VAR(VAR_A)
lbu gp, 1 + \addr
sb gp, SP_VAR_BYTE(VAR_A, 3)
lw gp, SP_VAR(VAR_A)
srai gp, gp, 16
lbu \rd, 0 + \addr
add \rd, \rd, gp
.endm
# sh rs, addr
# Halfword store as two byte stores: byte 0 of rs goes to addr and byte 1 of rs
# (extracted with getbyte) goes to addr + 1. Clobbers tp and scratch slot C.
.macro sh rs, addr
sb \rs, 0 + \addr
getbyte tp, \rs, 1
sb tp, 1 + \addr
.endm
# RISC-V pseudo-instruction
# la r, addr
# Loads the address of addr into r from its %hi and %lo parts with lui + addi.
.macro la r, addr
lui \r, %hi(\addr)
addi \r, \r, %lo(\addr)
.endm
# lla r, addr
# Same expansion as the la macro.
.macro lla r, addr
    la      \r, \addr
.endm
# call addr
# Calls addr through call_template with ra as address and link register.
.macro call addr
    call_template ra, \addr
.endm
# tail addr
# Jumps to addr without linking; ra is left untouched. Clobbers tp.
# The call_template expansion that used to follow the jr was unreachable and
# has been dropped.
.macro tail addr
la tp, \addr
jr tp
.endm
# j addr
# Unconditional jump through call_template with tp as both address and link
# register, so tp is clobbered.
.macro j addr
    call_template tp, \addr
.endm
# not rd, rs1
# rd = bitwise complement of rs1. Each byte of rs1 is looked up in the row of
# the xor table that belongs to operand 0xff and written to scratch slot D,
# which is finally loaded into rd.
# gp is spilled to slot 3 and restored before rd is written; tp and the scratch
# slots C and D are overwritten.
.macro not rd, rs1
PUSH(gp, 3)
sw \rs1, SP_VAR(VAR_C)
la tp, (_xor8_table + (0xff << 8))
lbu gp, SP_VAR_BYTE(VAR_C, 3)
add gp, gp, tp
lbu gp, (gp)
sb gp, SP_VAR_BYTE(VAR_D, 3)
lbu gp, SP_VAR_BYTE(VAR_C, 2)
add gp, gp, tp
lbu gp, (gp)
sb gp, SP_VAR_BYTE(VAR_D, 2)
lbu gp, SP_VAR_BYTE(VAR_C, 1)
add gp, gp, tp
lbu gp, (gp)
sb gp, SP_VAR_BYTE(VAR_D, 1)
lbu gp, SP_VAR_BYTE(VAR_C, 0)
add gp, gp, tp
lbu gp, (gp)
sb gp, SP_VAR_BYTE(VAR_D, 0)
POP(gp, 3)
lw \rd, SP_VAR(VAR_D)
.endm
# neg rd, rs: rd = 0 - rs
.macro neg rd, rs
    sub     \rd, x0, \rs
.endm

# snez rd, rs: rd = (0 < rs) unsigned, i.e. 1 when rs is nonzero
.macro snez rd, rs
    sltu    \rd, x0, \rs
.endm

# sltz rd, rs: rd = sign bit of rs
.macro sltz rd, rs
    msb     \rd, \rs
.endm

# sgt rd, rs1, rs2: signed rs1 > rs2, as slt with swapped operands
.macro sgt rd, rs1, rs2
    slt     \rd, \rs2, \rs1
.endm

# sgtu rd, rs1, rs2: unsigned rs1 > rs2, as sltu with swapped operands
.macro sgtu rd, rs1, rs2
    sltu    \rd, \rs2, \rs1
.endm

# sgtz rd, rs: rd = (0 < rs) signed
.macro sgtz rd, rs
    slt     \rd, x0, \rs
.endm
#if 0
# already defined above
.macro seqz rd, rs
sltiu \rd, \rs, 1
.endm
#endif
# Branch pseudo-instructions, each mapped onto one of the branch macros of
# this file: either against x0 or with the two operands swapped.

# beqz rs, offset: branch when rs == 0
.macro beqz rs, offset
    beq     \rs, x0, \offset
.endm

# bnez rs, offset: branch when rs != 0
.macro bnez rs, offset
    bne     \rs, x0, \offset
.endm

# blez rs, offset: branch when 0 >= rs
.macro blez rs, offset
    bge     x0, \rs, \offset
.endm

# bgez rs, offset: branch when rs >= 0
.macro bgez rs, offset
    bge     \rs, x0, \offset
.endm

# bltz rs, offset: branch when rs < 0
.macro bltz rs, offset
    blt     \rs, x0, \offset
.endm

# bgtz rs, offset: branch when 0 < rs
.macro bgtz rs, offset
    blt     x0, \rs, \offset
.endm

# bgt rs, rt, offset: signed rs > rt
.macro bgt rs, rt, offset
    blt     \rt, \rs, \offset
.endm

# ble rs, rt, offset: signed rs <= rt
.macro ble rs, rt, offset
    bge     \rt, \rs, \offset
.endm

# bgtu rs, rt, offset: unsigned rs > rt
.macro bgtu rs, rt, offset
    bltu    \rt, \rs, \offset
.endm

# bleu rs, rt, offset: unsigned rs <= rt
.macro bleu rs, rt, offset
    bgeu    \rt, \rs, \offset
.endm
# Remove the token-pasting and spill helpers again. The other helper names
# defined at the top of this file are left in place.
#undef _concat
#undef concat
#undef PUSH
#undef POP