/* xz_dec_bcj.c - Branch/Call/Jump (BCJ) filter decoders */
/*
 * GRUB  --  GRand Unified Bootloader
 * Copyright (C) 2010  Free Software Foundation, Inc.
 *
 * GRUB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRUB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GRUB.  If not, see <http://www.gnu.org/licenses/>.
 */
/*
 * This file is based on code from XZ embedded project
 * http://tukaani.org/xz/embedded.html
 */

#include "xz_private.h"

struct xz_dec_bcj {
    /* Type of the BCJ filter being used */
    enum {
        BCJ_X86 = 4,        /* x86 or x86-64 */
        BCJ_POWERPC = 5,    /* Big endian only */
        BCJ_IA64 = 6,       /* Big or little endian */
        BCJ_ARM = 7,        /* Little endian only */
        BCJ_ARMTHUMB = 8,   /* Little endian only */
        BCJ_SPARC = 9       /* Big or little endian */
    } type;

    /*
     * Return value of the next filter in the chain. We need to preserve
     * this information across calls, because we must not call the next
     * filter anymore once it has returned XZ_STREAM_END.
     */
    enum xz_ret ret;

    /* True if we are operating in single-call mode. */
    bool single_call;

    /*
     * Absolute position relative to the beginning of the uncompressed
     * data (in a single .xz Block). We care only about the lowest 32
     * bits so this doesn't need to be uint64_t even with big files.
     */
    uint32_t pos;

    /* x86 filter state */
    uint32_t x86_prev_mask;

    /* Temporary space to hold the variables from struct xz_buf */
    uint8_t *out;
    size_t out_pos;
    size_t out_size;

    struct {
        /* Amount of already filtered data in the beginning of buf */
        size_t filtered;

        /* Total amount of data currently stored in buf */
        size_t size;

        /*
         * Buffer to hold a mix of filtered and unfiltered data. This
         * needs to be big enough to hold Alignment + 2 * Look-ahead:
         *
         * Type         Alignment   Look-ahead
         * x86              1           4
         * PowerPC          4           0
         * IA-64           16           0
         * ARM              4           0
         * ARM-Thumb        2           2
         * SPARC            4           0
         */
        uint8_t buf[16];
    } temp;
};
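
/*
 * Per the table above, the largest requirement is IA-64 (16 + 2 * 0 = 16
 * bytes); x86 needs 1 + 2 * 4 = 9 bytes, so the 16-byte temp.buf covers
 * every supported filter type.
 */
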
#ifdef XZ_DEC_X86
/*
 * This is a macro used to test the most significant byte of a memory address
 * in an x86 instruction.
 */
#define bcj_x86_test_msbyte(b) ((b) == 0x00 || (b) == 0xFF)
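
/*
 * Background note: the filter looks for E8 (CALL rel32) and E9 (JMP rel32)
 * opcodes, which is what the (buf[i] & 0xFE) != 0xE8 check below skips past.
 * The encoder replaced the 32-bit relative displacement with an absolute
 * address; the decoder reverses that with dest = src - (s->pos + i + 5),
 * where 5 is the length of the E8/E9 instruction itself.
 */
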
static noinline_for_stack size_t bcj_x86(
        struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    static const bool mask_to_allowed_status[8]
        = { true, true, true, false, true, false, false, false };

    static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

    size_t i;
    size_t prev_pos = (size_t)-1;
    uint32_t prev_mask = s->x86_prev_mask;
    uint32_t src;
    uint32_t dest;
    uint32_t j;
    uint8_t b;

    if (size <= 4)
        return 0;

    size -= 4;
    for (i = 0; i < size; ++i) {
        if ((buf[i] & 0xFE) != 0xE8)
            continue;

        prev_pos = i - prev_pos;
        if (prev_pos > 3) {
            prev_mask = 0;
        } else {
            prev_mask = (prev_mask << (prev_pos - 1)) & 7;
            if (prev_mask != 0) {
                b = buf[i + 4 - mask_to_bit_num[prev_mask]];
                if (!mask_to_allowed_status[prev_mask]
                        || bcj_x86_test_msbyte(b)) {
                    prev_pos = i;
                    prev_mask = (prev_mask << 1) | 1;
                    continue;
                }
            }
        }

        prev_pos = i;

        if (bcj_x86_test_msbyte(buf[i + 4])) {
            src = get_unaligned_le32(buf + i + 1);
            while (true) {
                dest = src - (s->pos + (uint32_t)i + 5);
                if (prev_mask == 0)
                    break;

                j = mask_to_bit_num[prev_mask] * 8;
                b = (uint8_t)(dest >> (24 - j));
                if (!bcj_x86_test_msbyte(b))
                    break;

                src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
            }

            dest &= 0x01FFFFFF;
            dest |= (uint32_t)0 - (dest & 0x01000000);
            put_unaligned_le32(dest, buf + i + 1);
            i += 4;
        } else {
            prev_mask = (prev_mask << 1) | 1;
        }
    }

    prev_pos = i - prev_pos;
    s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
    return i;
}
#endif
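
/*
 * Background note: (instr & 0xFC000003) == 0x48000001 matches the PowerPC
 * "bl" instruction (primary opcode 18 with AA = 0 and LK = 1). Only the
 * branch displacement field is converted; the opcode and the AA/LK bits are
 * masked out before the address arithmetic and put back afterwards.
 */
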
#ifdef XZ_DEC_POWERPC
static noinline_for_stack size_t bcj_powerpc(
        struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t i;
    uint32_t instr;

    for (i = 0; i + 4 <= size; i += 4) {
        instr = get_unaligned_be32(buf + i);
        if ((instr & 0xFC000003) == 0x48000001) {
            instr &= 0x03FFFFFC;
            instr -= s->pos + (uint32_t)i;
            instr &= 0x03FFFFFC;
            instr |= 0x48000001;
            put_unaligned_be32(instr, buf + i);
        }
    }

    return i;
}
#endif
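
/*
 * Background note: IA-64 code is packed into 128-bit bundles consisting of
 * a 5-bit template field followed by three 41-bit instruction slots, which
 * is why the slot loop below starts at bit_pos = 5 and advances by 41 bits
 * per slot, and why the outer loop walks the buffer 16 bytes at a time.
 */
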
#ifdef XZ_DEC_IA64
static noinline_for_stack size_t bcj_ia64(
        struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    static const uint8_t branch_table[32] = {
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        4, 4, 6, 6, 0, 0, 7, 7,
        4, 4, 0, 0, 4, 4, 0, 0
    };

    /*
     * The local variables take a little bit stack space, but it's less
     * than what LZMA2 decoder takes, so it doesn't make sense to reduce
     * stack usage here without doing that for the LZMA2 decoder too.
     */

    /* Loop counters */
    size_t i;
    size_t j;

    /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
    uint32_t slot;

    /* Bitwise offset of the instruction indicated by slot */
    uint32_t bit_pos;

    /* bit_pos split into byte and bit parts */
    uint32_t byte_pos;
    uint32_t bit_res;

    /* Address part of an instruction */
    uint32_t addr;

    /* Mask used to detect which instructions to convert */
    uint32_t mask;

    /* 41-bit instruction stored somewhere in the lowest 48 bits */
    uint64_t instr;

    /* Instruction normalized with bit_res for easier manipulation */
    uint64_t norm;

    for (i = 0; i + 16 <= size; i += 16) {
        mask = branch_table[buf[i] & 0x1F];
        for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
            if (((mask >> slot) & 1) == 0)
                continue;

            byte_pos = bit_pos >> 3;
            bit_res = bit_pos & 7;
            instr = 0;
            for (j = 0; j < 6; ++j)
                instr |= (uint64_t)(buf[i + j + byte_pos])
                        << (8 * j);

            norm = instr >> bit_res;

            if (((norm >> 37) & 0x0F) == 0x05
                    && ((norm >> 9) & 0x07) == 0) {
                addr = (norm >> 13) & 0x0FFFFF;
                addr |= ((uint32_t)(norm >> 36) & 1) << 20;
                addr <<= 4;
                addr -= s->pos + (uint32_t)i;
                addr >>= 4;

                norm &= ~((uint64_t)0x8FFFFF << 13);
                norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
                norm |= (uint64_t)(addr & 0x100000)
                        << (36 - 20);

                instr &= (1 << bit_res) - 1;
                instr |= norm << bit_res;

                for (j = 0; j < 6; j++)
                    buf[i + j + byte_pos]
                        = (uint8_t)(instr >> (8 * j));
            }
        }
    }

    return i;
}
#endif
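
/*
 * Background note: in little-endian ARM code, buf[i + 3] == 0xEB identifies
 * a BL (branch with link) instruction with the "always" condition code. The
 * extra 8 in the address arithmetic accounts for the ARM convention that PC
 * reads as the instruction address plus 8.
 */
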
#ifdef XZ_DEC_ARM
static noinline_for_stack size_t bcj_arm(
        struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t i;
    uint32_t addr;

    for (i = 0; i + 4 <= size; i += 4) {
        if (buf[i + 3] == 0xEB) {
            addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
                    | ((uint32_t)buf[i + 2] << 16);
            addr <<= 2;
            addr -= s->pos + (uint32_t)i + 8;
            addr >>= 2;
            buf[i] = (uint8_t)addr;
            buf[i + 1] = (uint8_t)(addr >> 8);
            buf[i + 2] = (uint8_t)(addr >> 16);
        }
    }

    return i;
}
#endif
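
/*
 * Background note: a Thumb BL is encoded as a pair of 16-bit halfwords, the
 * first prefixed with 0xF0 (high part of the offset) and the second with
 * 0xF8 (low part), which is what the two masked comparisons below detect.
 * The displacement is halfword-aligned, hence the shift by one around the
 * address arithmetic, and PC reads as the instruction address plus 4.
 */
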
#ifdef XZ_DEC_ARMTHUMB
static noinline_for_stack size_t bcj_armthumb(
        struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t i;
    uint32_t addr;

    for (i = 0; i + 4 <= size; i += 2) {
        if ((buf[i + 1] & 0xF8) == 0xF0
                && (buf[i + 3] & 0xF8) == 0xF8) {
            addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
                    | ((uint32_t)buf[i] << 11)
                    | (((uint32_t)buf[i + 3] & 0x07) << 8)
                    | (uint32_t)buf[i + 2];
            addr <<= 1;
            addr -= s->pos + (uint32_t)i + 4;
            addr >>= 1;
            buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
            buf[i] = (uint8_t)(addr >> 11);
            buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
            buf[i + 2] = (uint8_t)addr;
            i += 2;
        }
    }

    return i;
}
#endif
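
/*
 * Background note: (instr >> 22) == 0x100 or 0x1FF matches a SPARC CALL
 * instruction (top two bits 01) whose sign-extended 30-bit word displacement
 * has all of its high bits clear or set, i.e. a call with a reasonably small
 * forward or backward distance.
 */
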
#ifdef XZ_DEC_SPARC
static noinline_for_stack size_t bcj_sparc(
        struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t i;
    uint32_t instr;

    for (i = 0; i + 4 <= size; i += 4) {
        instr = get_unaligned_be32(buf + i);
        if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
            instr <<= 2;
            instr -= s->pos + (uint32_t)i;
            instr >>= 2;
            instr = ((uint32_t)0x40000000 - (instr & 0x400000))
                    | 0x40000000 | (instr & 0x3FFFFF);
            put_unaligned_be32(instr, buf + i);
        }
    }

    return i;
}
#endif

/*
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 *
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, which must
 * avoid pointers to static data (at least on x86).
 */
static void bcj_apply(struct xz_dec_bcj *s,
        uint8_t *buf, size_t *pos, size_t size)
{
    size_t filtered;

    buf += *pos;
    size -= *pos;

    switch (s->type) {
#ifdef XZ_DEC_X86
    case BCJ_X86:
        filtered = bcj_x86(s, buf, size);
        break;
#endif
#ifdef XZ_DEC_POWERPC
    case BCJ_POWERPC:
        filtered = bcj_powerpc(s, buf, size);
        break;
#endif
#ifdef XZ_DEC_IA64
    case BCJ_IA64:
        filtered = bcj_ia64(s, buf, size);
        break;
#endif
#ifdef XZ_DEC_ARM
    case BCJ_ARM:
        filtered = bcj_arm(s, buf, size);
        break;
#endif
#ifdef XZ_DEC_ARMTHUMB
    case BCJ_ARMTHUMB:
        filtered = bcj_armthumb(s, buf, size);
        break;
#endif
#ifdef XZ_DEC_SPARC
    case BCJ_SPARC:
        filtered = bcj_sparc(s, buf, size);
        break;
#endif
    default:
        /* Never reached but silence compiler warnings. */
        filtered = 0;
        break;
    }

    *pos += filtered;
    s->pos += filtered;
}

/*
 * Flush pending filtered data from temp to the output buffer.
 * Move the remaining mixture of possibly filtered and unfiltered
 * data to the beginning of temp.
 */
static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
{
    size_t copy_size;

    copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
    memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
    b->out_pos += copy_size;

    s->temp.filtered -= copy_size;
    s->temp.size -= copy_size;
    memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
}

/*
 * The BCJ filter functions are primitive in the sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 */
enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s,
        struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
{
    size_t out_start;

    /*
     * Flush pending already filtered data to the output buffer. Return
     * immediately if we couldn't flush everything, or if the next
     * filter in the chain had already returned XZ_STREAM_END.
     */
    if (s->temp.filtered > 0) {
        bcj_flush(s, b);
        if (s->temp.filtered > 0)
            return XZ_OK;

        if (s->ret == XZ_STREAM_END)
            return XZ_STREAM_END;
    }

    /*
     * If we have more output space than what is currently pending in
     * temp, copy the unfiltered data from temp to the output buffer
     * and try to fill the output buffer by decoding more data from the
     * next filter in the chain. Apply the BCJ filter on the new data
     * in the output buffer. If everything cannot be filtered, copy it
     * to temp and rewind the output buffer position accordingly.
     */
    if (s->temp.size < b->out_size - b->out_pos) {
        out_start = b->out_pos;
        memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
        b->out_pos += s->temp.size;

        s->ret = xz_dec_lzma2_run(lzma2, b);
        if (s->ret != XZ_STREAM_END
                && (s->ret != XZ_OK || s->single_call))
            return s->ret;

        bcj_apply(s, b->out, &out_start, b->out_pos);

        /*
         * As an exception, if the next filter returned XZ_STREAM_END,
         * we can do that too, since the last few bytes that remain
         * unfiltered are meant to remain unfiltered.
         */
        if (s->ret == XZ_STREAM_END)
            return XZ_STREAM_END;

        s->temp.size = b->out_pos - out_start;
        b->out_pos -= s->temp.size;
        memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
    }

    /*
     * If we have unfiltered data in temp, try to fill by decoding more
     * data from the next filter. Apply the BCJ filter on temp. Then we
     * hopefully can fill the actual output buffer by copying filtered
     * data from temp. A mix of filtered and unfiltered data may be left
     * in temp; it will be taken care of on the next call to this function.
     */
    if (s->temp.size > 0) {
        /* Make b->out{,_pos,_size} temporarily point to s->temp. */
        s->out = b->out;
        s->out_pos = b->out_pos;
        s->out_size = b->out_size;
        b->out = s->temp.buf;
        b->out_pos = s->temp.size;
        b->out_size = sizeof(s->temp.buf);

        s->ret = xz_dec_lzma2_run(lzma2, b);

        s->temp.size = b->out_pos;
        b->out = s->out;
        b->out_pos = s->out_pos;
        b->out_size = s->out_size;

        if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
            return s->ret;

        bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

        /*
         * If the next filter returned XZ_STREAM_END, we mark that
         * everything is filtered, since the last unfiltered bytes
         * of the stream are meant to be left as is.
         */
        if (s->ret == XZ_STREAM_END)
            s->temp.filtered = s->temp.size;

        bcj_flush(s, b);
        if (s->temp.filtered > 0)
            return XZ_OK;
    }

    return s->ret;
}

#ifdef GRUB_EMBED_DECOMPRESSOR
struct xz_dec_bcj bcj;
#endif

struct xz_dec_bcj * xz_dec_bcj_create(bool single_call)
{
    struct xz_dec_bcj *s;
#ifdef GRUB_EMBED_DECOMPRESSOR
    s = &bcj;
#else
    s = kmalloc(sizeof(*s), GFP_KERNEL);
#endif
    if (s != NULL)
        s->single_call = single_call;

    return s;
}

enum xz_ret xz_dec_bcj_reset(
        struct xz_dec_bcj *s, uint8_t id)
{
    switch (id) {
#ifdef XZ_DEC_X86
    case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
    case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
    case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
    case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
    case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
    case BCJ_SPARC:
#endif
        break;

    default:
        /* Unsupported Filter ID */
        return XZ_OPTIONS_ERROR;
    }

    s->type = id;
    s->ret = XZ_OK;
    s->pos = 0;
    s->x86_prev_mask = 0;
    s->temp.filtered = 0;
    s->temp.size = 0;

    return XZ_OK;
}
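
/*
 * Usage sketch: the container decoder (xz_dec_stream.c) allocates the filter
 * state once with xz_dec_bcj_create(), calls xz_dec_bcj_reset() with the
 * Filter ID read from the Block Header, and then calls xz_dec_bcj_run()
 * repeatedly (which in turn pulls data from xz_dec_lzma2_run()) until it
 * returns XZ_STREAM_END or an error code.
 */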