///////////////////////////////////////////////////////////////////////////////
//
/// \file       arm64.c
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//              Igor Pavlov
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
23 |
|
---|
24 | #include "simple_private.h"
|
---|
25 |
|
---|
26 |
|
---|
27 | static size_t
|
---|
28 | arm64_code(void *simple lzma_attribute((__unused__)),
|
---|
29 | uint32_t now_pos, bool is_encoder,
|
---|
30 | uint8_t *buffer, size_t size)
|
---|
31 | {
|
---|
32 | size_t i;
|
---|
33 |
|
---|
34 | // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
|
---|
35 | // with auto-vectorization that is enabled by default with -O2.
|
---|
36 | // Such vectorization bloat happens with -O2 when targeting ARM64 too
|
---|
37 | // but performance hasn't been tested.
|
---|
38 | #ifdef __clang__
|
---|
39 | # pragma clang loop vectorize(disable)
|
---|
40 | #endif
|
---|
41 | for (i = 0; i + 4 <= size; i += 4) {
|
---|
42 | uint32_t pc = (uint32_t)(now_pos + i);
|
---|
43 | uint32_t instr = read32le(buffer + i);
|
---|
44 |
|
---|
45 | if ((instr >> 26) == 0x25) {
|
---|
46 | // BL instruction:
|
---|
47 | // The full 26-bit immediate is converted.
|
---|
48 | // The range is +/-128 MiB.
|
---|
49 | //
|
---|
50 | // Using the full range is helps quite a lot with
|
---|
51 | // big executables. Smaller range would reduce false
|
---|
52 | // positives in non-code sections of the input though
|
---|
53 | // so this is a compromise that slightly favors big
|
---|
54 | // files. With the full range only six bits of the 32
|
---|
55 | // need to match to trigger a conversion.
|
---|
56 | const uint32_t src = instr;
|
---|
57 | instr = 0x94000000;
|
---|
58 |
|
---|
59 | pc >>= 2;
|
---|
60 | if (!is_encoder)
|
---|
61 | pc = 0U - pc;
|
---|
62 |
|
---|
63 | instr |= (src + pc) & 0x03FFFFFF;
|
---|
64 | write32le(buffer + i, instr);
|
---|
65 |
|
---|
66 | } else if ((instr & 0x9F000000) == 0x90000000) {
|
---|
67 | // ADRP instruction:
|
---|
68 | // Only values in the range +/-512 MiB are converted.
|
---|
69 | //
|
---|
70 | // Using less than the full +/-4 GiB range reduces
|
---|
71 | // false positives on non-code sections of the input
|
---|
72 | // while being excellent for executables up to 512 MiB.
|
---|
73 | // The positive effect of ADRP conversion is smaller
|
---|
74 | // than that of BL but it also doesn't hurt so much in
|
---|
75 | // non-code sections of input because, with +/-512 MiB
|
---|
76 | // range, nine bits of 32 need to match to trigger a
|
---|
77 | // conversion (two 10-bit match choices = 9 bits).
|
---|
78 | const uint32_t src = ((instr >> 29) & 3)
|
---|
79 | | ((instr >> 3) & 0x001FFFFC);
|
---|
80 |
|
---|
81 | // With the addition only one branch is needed to
|
---|
82 | // check the +/- range. This is usually false when
|
---|
83 | // processing ARM64 code so branch prediction will
|
---|
84 | // handle it well in terms of performance.
|
---|
85 | //
|
---|
86 | //if ((src & 0x001E0000) != 0
|
---|
87 | // && (src & 0x001E0000) != 0x001E0000)
|
---|
88 | if ((src + 0x00020000) & 0x001C0000)
|
---|
89 | continue;
|
---|
90 |
|
---|
91 | instr &= 0x9000001F;
|
---|
92 |
|
---|
93 | pc >>= 12;
|
---|
94 | if (!is_encoder)
|
---|
95 | pc = 0U - pc;
|
---|
96 |
|
---|
97 | const uint32_t dest = src + pc;
|
---|
98 | instr |= (dest & 3) << 29;
|
---|
99 | instr |= (dest & 0x0003FFFC) << 3;
|
---|
100 | instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
|
---|
101 | write32le(buffer + i, instr);
|
---|
102 | }
|
---|
103 | }
|
---|
104 |
|
---|
105 | return i;
|
---|
106 | }
|
---|
107 |
|
---|
108 |
|
---|
109 | static lzma_ret
|
---|
110 | arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
|
---|
111 | const lzma_filter_info *filters, bool is_encoder)
|
---|
112 | {
|
---|
113 | return lzma_simple_coder_init(next, allocator, filters,
|
---|
114 | &arm64_code, 0, 4, 4, is_encoder);
|
---|
115 | }
|
---|
116 |
|
---|
117 |
|
---|
#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter in the encoding direction
///
/// Only compiled when HAVE_ENCODER_ARM64 is defined. Forwards all
/// arguments to arm64_coder_init() with the encoder flag set.
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = true;

	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif
|
---|
127 |
|
---|
128 |
|
---|
#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter in the decoding direction
///
/// Only compiled when HAVE_DECODER_ARM64 is defined. Forwards all
/// arguments to arm64_coder_init() with the encoder flag cleared.
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = false;

	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif
|
---|