Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

lz4: fix kernel decompression speed

This patch replaces all memcpy() calls with LZ4_memcpy() which calls
__builtin_memcpy() so the compiler can inline it.

LZ4 relies heavily on memcpy() with a constant size being inlined. In x86
and i386 pre-boot environments memcpy() cannot be inlined because memcpy()
doesn't get defined as __builtin_memcpy().

An equivalent patch has been applied upstream so that the next import
won't lose this change [1].

I've measured the kernel decompression speed using QEMU before and after
this patch for the x86_64 and i386 architectures. The speed-up is about
10x as shown below.

Code Arch Kernel Size Time Speed
v5.8 x86_64 11504832 B 148 ms 79 MB/s
patch x86_64 11503872 B 13 ms 885 MB/s
v5.8 i386 9621216 B 91 ms 106 MB/s
patch i386 9620224 B 10 ms 962 MB/s

I also measured the time to decompress the initramfs on x86_64, i386, and
arm. All three show the same decompression speed before and after, as
expected.

[1] https://github.com/lz4/lz4/pull/890

Signed-off-by: Nick Terrell <terrelln@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Yann Collet <yann.collet.73@gmail.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Arvind Sankar <nivedita@alum.mit.edu>
Link: http://lkml.kernel.org/r/20200803194022.2966806-1-nickrterrell@gmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Nick Terrell; committed by Linus Torvalds.
b1a3e75e a8a4b7ae

+22 -12
+2 -2
lib/lz4/lz4_compress.c
··· 446 446 *op++ = (BYTE)(lastRun << ML_BITS); 447 447 } 448 448 449 - memcpy(op, anchor, lastRun); 449 + LZ4_memcpy(op, anchor, lastRun); 450 450 451 451 op += lastRun; 452 452 } ··· 708 708 } else { 709 709 *op++ = (BYTE)(lastRunSize<<ML_BITS); 710 710 } 711 - memcpy(op, anchor, lastRunSize); 711 + LZ4_memcpy(op, anchor, lastRunSize); 712 712 op += lastRunSize; 713 713 } 714 714
+9 -9
lib/lz4/lz4_decompress.c
··· 153 153 && likely((endOnInput ? ip < shortiend : 1) & 154 154 (op <= shortoend))) { 155 155 /* Copy the literals */ 156 - memcpy(op, ip, endOnInput ? 16 : 8); 156 + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); 157 157 op += length; ip += length; 158 158 159 159 /* ··· 172 172 (offset >= 8) && 173 173 (dict == withPrefix64k || match >= lowPrefix)) { 174 174 /* Copy the match. */ 175 - memcpy(op + 0, match + 0, 8); 176 - memcpy(op + 8, match + 8, 8); 177 - memcpy(op + 16, match + 16, 2); 175 + LZ4_memcpy(op + 0, match + 0, 8); 176 + LZ4_memcpy(op + 8, match + 8, 8); 177 + LZ4_memcpy(op + 16, match + 16, 2); 178 178 op += length + MINMATCH; 179 179 /* Both stages worked, load the next token. */ 180 180 continue; ··· 263 263 } 264 264 } 265 265 266 - memcpy(op, ip, length); 266 + LZ4_memcpy(op, ip, length); 267 267 ip += length; 268 268 op += length; 269 269 ··· 350 350 size_t const copySize = (size_t)(lowPrefix - match); 351 351 size_t const restSize = length - copySize; 352 352 353 - memcpy(op, dictEnd - copySize, copySize); 353 + LZ4_memcpy(op, dictEnd - copySize, copySize); 354 354 op += copySize; 355 355 if (restSize > (size_t)(op - lowPrefix)) { 356 356 /* overlap copy */ ··· 360 360 while (op < endOfMatch) 361 361 *op++ = *copyFrom++; 362 362 } else { 363 - memcpy(op, lowPrefix, restSize); 363 + LZ4_memcpy(op, lowPrefix, restSize); 364 364 op += restSize; 365 365 } 366 366 } ··· 386 386 while (op < copyEnd) 387 387 *op++ = *match++; 388 388 } else { 389 - memcpy(op, match, mlen); 389 + LZ4_memcpy(op, match, mlen); 390 390 } 391 391 op = copyEnd; 392 392 if (op == oend) ··· 400 400 op[2] = match[2]; 401 401 op[3] = match[3]; 402 402 match += inc32table[offset]; 403 - memcpy(op + 4, match, 4); 403 + LZ4_memcpy(op + 4, match, 4); 404 404 match -= dec64table[offset]; 405 405 } else { 406 406 LZ4_copy8(op, match);
+10
lib/lz4/lz4defs.h
··· 137 137 return put_unaligned_le16(value, memPtr); 138 138 } 139 139 140 + /* 141 + * LZ4 relies on memcpy with a constant size being inlined. In freestanding 142 + * environments, the compiler can't assume the implementation of memcpy() is 143 + * standard compliant, so apply its specialized memcpy() inlining logic. When 144 + * possible, use __builtin_memcpy() to tell the compiler to analyze memcpy() 145 + * as-if it were standard compliant, so it can inline it in freestanding 146 + * environments. This is needed when decompressing the Linux Kernel, for example. 147 + */ 148 + #define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) 149 + 140 150 static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) 141 151 { 142 152 #if LZ4_ARCH64
+1 -1
lib/lz4/lz4hc_compress.c
··· 570 570 *op++ = (BYTE) lastRun; 571 571 } else 572 572 *op++ = (BYTE)(lastRun<<ML_BITS); 573 - memcpy(op, anchor, iend - anchor); 573 + LZ4_memcpy(op, anchor, iend - anchor); 574 574 op += iend - anchor; 575 575 } 576 576