File ipxe-zbin-Use-LZMA-compression.patch of Package qemu

From 67fad70610d5e06dcdc7f38db99221491545b91b Mon Sep 17 00:00:00 2001
From: Michael Brown <mcb30@ipxe.org>
Date: Sat, 21 Feb 2015 14:29:53 +0000
Subject: [PATCH 2/3] [zbin] Use LZMA compression

LZMA provides significantly better compression (by ~15%) than the
current NRV2B algorithm.

We use a raw LZMA stream (aka LZMA1) to avoid the need for code to
parse the LZMA2 block headers.  We use parameters {lc=2,lp=0,pb=0} to
reduce the stack space required by the decompressor to acceptable
levels (around 8kB).  Using lc=3 or pb=2 would give marginally better
compression, but at the cost of substantially increasing the required
stack space.

The build process now requires the liblzma headers to be present on
the build system, since we do not include a copy of an LZMA compressor
within the iPXE source tree.  The decompressor is written from scratch
(based on XZ Embedded) and is entirely self-contained within the
iPXE source.

The branch-call-jump (BCJ) filter used to improve the compressibility
is specific to iPXE.  We choose not to use liblzma's built-in BCJ
filter since the algorithm is complex and undocumented.  Our BCJ
filter achieves approximately the same results (on typical iPXE
binaries) with a substantially simpler algorithm.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
(cherry picked from commit 5350b65a3ce727f20e9d9fa7b9c2c0af52cfb7bd)
Signed-off-by: Bruce Rogers <brogers@suse.com>
---
 src/Makefile.housekeeping       |  11 +-
 src/arch/i386/prefix/unlzma.S   | 905 ++++++++++++++++++++++++++++++++++++++++
 src/arch/i386/prefix/unlzma16.S |   9 +
 src/util/zbin.c                 |  96 ++++-
 4 files changed, 1004 insertions(+), 17 deletions(-)
 create mode 100644 src/arch/i386/prefix/unlzma.S
 create mode 100644 src/arch/i386/prefix/unlzma16.S

diff --git a/src/Makefile.housekeeping b/src/Makefile.housekeeping
index 1a75d39..4dc2202 100644
--- a/src/Makefile.housekeeping
+++ b/src/Makefile.housekeeping
@@ -1231,15 +1231,12 @@ endif # defined(BIN)
 #
 # The compression utilities
 #
-$(NRV2B) : util/nrv2b.c $(MAKEDEPS)
-	$(QM)$(ECHO) "  [HOSTCC] $@"
-	$(Q)$(HOST_CC) $(HOST_CFLAGS) -DENCODE -DDECODE -DMAIN -DVERBOSE \
-		       -DNDEBUG -DBITSIZE=32 -DENDIAN=0 -o $@ $<
-CLEANUP	+= $(NRV2B)
 
-$(ZBIN) : util/zbin.c util/nrv2b.c $(MAKEDEPS)
+ZBIN_LDFLAGS := -llzma
+
+$(ZBIN) : util/zbin.c $(MAKEDEPS)
 	$(QM)$(ECHO) "  [HOSTCC] $@"
-	$(Q)$(HOST_CC) $(HOST_CFLAGS) -o $@ $<
+	$(Q)$(HOST_CC) $(HOST_CFLAGS) $< $(ZBIN_LDFLAGS) -o $@
 CLEANUP += $(ZBIN)
 
 ###############################################################################
diff --git a/src/arch/i386/prefix/unlzma.S b/src/arch/i386/prefix/unlzma.S
new file mode 100644
index 0000000..f6c7428
--- /dev/null
+++ b/src/arch/i386/prefix/unlzma.S
@@ -0,0 +1,905 @@
+/*
+ * Copyright (C) 2015 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER );
+
+/****************************************************************************
+ *
+ * This file provides the decompress() and decompress16() functions
+ * which can be called in order to decompress an LZMA-compressed
+ * image.  The code is modelled on the public-domain "XZ Embedded"
+ * implementation as used by the Linux kernel.  Symbol names are
+ * chosen to match the XZ Embedded implementation where possible, for
+ * ease of reference.
+ *
+ * This code is optimised for size rather than speed, since the amount
+ * of data to be decompressed is trivially small by modern standards.
+ *
+ * The same basic assembly code is used to compile both decompress()
+ * and decompress16().
+ *
+ * Note that these functions require large amounts of stack space.
+ *
+ ****************************************************************************
+ */
+
+	.text
+	.arch i586
+	.section ".prefix.lib", "ax", @progbits
+
+#ifdef CODE16
+#define ADDR16
+#define ADDR32 addr32
+#define decompress decompress16
+	.code16
+#else /* CODE16 */
+#define ADDR16 addr16
+#define ADDR32
+	.code32
+#endif /* CODE16 */
+
+/****************************************************************************
+ * Debugging (via 0xe9 debug port)
+ ****************************************************************************
+ */
+
+#define DEBUG 0
+
+#if DEBUG
+	.macro	print_character, char
+	pushw	%ax
+	movb 	$\char, %al
+	outb	%al, $0xe9
+	popw	%ax
+	.endm
+
+	.macro	print_hex_nibble
+	cmpb	$10, %al
+	sbb	$0x69, %al
+	das
+	outb	%al, $0xe9
+	.endm
+
+	.macro	print_hex_byte, reg
+	pushfl
+	pushw	%ax
+	movb	\reg, %al
+	pushw	%ax
+	shrb	$4, %al
+	print_hex_nibble
+	popw	%ax
+	andb	$0x0f, %al
+	print_hex_nibble
+	popw	%ax
+	popfl
+	.endm
+
+	.macro	print_hex_word, reg
+	pushw	%ax
+	movw	\reg, %ax
+	print_hex_byte %ah
+	print_hex_byte %al
+	popw	%ax
+	.endm
+
+	.macro	print_hex_dword, reg
+	pushl	%eax
+	movl	\reg, %eax
+	rorl	$16, %eax
+	print_hex_word %ax
+	rorl	$16, %eax
+	print_hex_word %ax
+	popl	%eax
+	.endm
+#else
+	.macro	print_character, char
+	.endm
+	.macro	print_hex_byte, reg
+	.endm
+	.macro	print_hex_word, reg
+	.endm
+	.macro	print_hex_dword, reg
+	.endm
+#endif
+
+/****************************************************************************
+ * LZMA parameters and data structures
+ ****************************************************************************
+ */
+
+/* LZMA decompressor states (as used in XZ Embedded) */
+#define STATE_LIT_LIT 0x00
+#define STATE_MATCH_LIT_LIT 0x01
+#define STATE_REP_LIT_LIT 0x02
+#define STATE_SHORTREP_LIT_LIT 0x03
+#define STATE_MATCH_LIT 0x04
+#define STATE_REP_LIT 0x05
+#define STATE_SHORTREP_LIT 0x06
+#define STATE_LIT_MATCH 0x07
+#define STATE_LIT_LONGREP 0x08
+#define STATE_LIT_SHORTREP 0x09
+#define STATE_NONLIT_MATCH 0x0a
+#define STATE_NONLIT_REP 0x0b
+
+/* LZMA maximum decompressor state in which most recent symbol was a literal */
+#define STATE_LIT_MAX 0x06
+
+/* LZMA number of literal context bits ("lc=" parameter) */
+#define LZMA_LC 2
+
+	.struct	0
+lzma_len_dec:
+choice:		.word	0
+choice2:	.word	0
+low:		.rept	( 1 << 3 )
+		.word	0
+		.endr
+mid:		.rept	( 1 << 3 )
+		.word	0
+		.endr
+high:		.rept	( 1 << 8 )
+		.word	0
+		.endr
+	.equ	sizeof__lzma_len_dec, . - lzma_len_dec
+	.previous
+
+	.struct	0
+lzma_dec:
+in_start:	.long	0
+out_start:	.long	0
+rc_code:	.long	0
+rc_range:	.long	0
+len:		.word	0
+reps:
+rep0:		.long	0
+rep1:		.long	0
+rep2:		.long	0
+rep3:		.long	0
+probs:
+is_match:	.word	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+is_rep:		.word	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+is_rep0:	.word	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+is_rep1:	.word	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+is_rep2:	.word	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+is_rep0_long:	.word	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+dist_slot:	.rept	( 4 * ( 1 << 6 ) )
+		.word	0
+		.endr
+dist_special:	.rept	( ( 1 << ( 14 / 2 ) ) - 14 )
+		.word	0
+		.endr
+dist_align:	.rept	( 1 << 4 )
+		.word	0
+		.endr
+match_len_dec:	.space	sizeof__lzma_len_dec
+rep_len_dec:	.space	sizeof__lzma_len_dec
+literal:	.rept	( ( 1 << LZMA_LC ) * 0x300 )
+		.word	0
+		.endr
+	.align	4
+	.equ	sizeof__lzma_dec, . - lzma_dec
+	.previous
+
+/*****************************************************************************
+ * Normalise range encoder
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %eax : current range
+ * Corrupts:
+ *   %eax
+ *****************************************************************************
+ */
+rc_normalise:
+	/* Check if rc_range is less than 1<<24 */
+	testb	$0xff, (rc_range+3)(%ebp)
+	jnz	1f
+	/* If it is, shift in a new byte from the compressed input data */
+	shll	$8, rc_range(%ebp)
+	shll	$8, rc_code(%ebp)
+	ADDR32 lodsb
+	movb	%al, (rc_code+0)(%ebp)
+1:	/* Return current range */
+	movl	rc_range(%ebp), %eax
+	ret
+	.size	rc_normalise, . - rc_normalise
+
+/*****************************************************************************
+ * Decode single range-encoded bit using a probability estimate
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %ebx : probability estimate pointer (offset from %ebp)
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   CF : decoded bit
+ *   ZF : inverse of decoded bit
+ * Corrupts:
+ *   none
+ *****************************************************************************
+ */
+rc_bit:
+	/* Preserve registers */
+	pushl	%eax
+	pushl	%edx
+	/* Perform normalisation */
+	call	rc_normalise
+	/* Calculate bound in %eax and probability estimate in %dx */
+	shrl	$11, %eax
+	movzwl	(%ebp,%ebx), %edx
+	mul	%edx /* will zero %edx */
+	movw	(%ebp,%ebx), %dx
+	/* Compare code against bound */
+	cmpl	%eax, rc_code(%ebp)
+	jae	2f
+1:	/* Code is less than bound */
+	movl	%eax, rc_range(%ebp)
+	negw	%dx
+	addw	$(1<<11), %dx
+	shrw	$5, %dx
+	addw	%dx, (%ebp,%ebx)
+	xorw	%ax, %ax	/* Clear CF, set ZF */
+	jmp	99f
+2:	/* Code is greater than or equal to bound */
+	subl	%eax, rc_range(%ebp)
+	subl	%eax, rc_code(%ebp)
+	shrw	$5, %dx
+	subw	%dx, (%ebp,%ebx)
+	incw	%dx		/* Clear ZF (%dx is 11-bit; can never wrap) */
+	stc			/* Set CF */
+99:	/* Restore registers and return */
+	popl	%edx
+	popl	%eax
+	ret
+	.size	rc_bit, . - rc_bit
+
+/*****************************************************************************
+ * Decode MSB-first bittree
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %ebx : probability estimate set pointer (offset from %ebp)
+ *   %cx : number of bits to decode
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %eax : decoded bittree
+ * Corrupts:
+ *   none
+ *****************************************************************************
+ */
+rc_bittree:
+	/* Preserve registers */
+	pushl	%edi
+	pushw	%cx
+	movl	%ebx, %edi
+	/* Initialise registers */
+	movl	$1, %eax
+1:	/* Decode bit */
+	leaw	(%edi,%eax,2), %bx	/* high word always zero anyway */
+	call	rc_bit
+	rclw	%ax
+	ADDR16 loop 1b
+	/* Restore registers, clear unwanted high bit of result, and return */
+	movl	%edi, %ebx
+	popw	%cx
+	popl	%edi
+	btrw	%cx, %ax
+	ret
+	.size	rc_bittree, . - rc_bittree
+
+/*****************************************************************************
+ * Decode LSB-first bittree
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %ebx : probability estimate set pointer (offset from %ebp)
+ *   %cx : number of bits to decode
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %eax : decoded bittree
+ * Corrupts:
+ *   none
+ *****************************************************************************
+ */
+rc_bittree_reverse:
+	/* Preserve registers */
+	pushw	%cx
+	/* Decode bittree */
+	call	rc_bittree
+1:	/* Reverse result */
+	rcrb	%al
+	rclb	%ah
+	ADDR16 loop 1b
+	shrw	$8, %ax
+	/* Restore registers and return */
+	popw	%cx
+	ret
+	.size	rc_bittree_reverse, . - rc_bittree_reverse
+
+/*****************************************************************************
+ * Decode MSB-first bittree with optional match byte
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %ebx : probability estimate set pointer (offset from %ebp)
+ *   %cl : match byte
+ *   %ch : 1 to use match byte, 0 to ignore match byte
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %eax : decoded bittree
+ * Corrupts:
+ *   none
+ *****************************************************************************
+ */
+rc_bittree_match:
+	/* Preserve registers */
+	pushl	%edi
+	pushw	%cx
+	pushw	%dx
+	movl	%ebx, %edi
+	/* Initialise registers */
+	movl	$1, %eax
+1:	/* Decode bit */
+	rolb	$1, %cl
+	movw	%cx, %dx
+	andb	%dh, %dl		/* match_bit in %dl */
+	movw	%dx, %bx
+	addb	%bl, %bh
+	xorb	%bl, %bl
+	addw	%ax, %bx		/* offset + match_bit + symbol */
+	leaw	(%edi,%ebx,2), %bx	/* high word always zero anyway */
+	call	rc_bit
+	rclw	%ax
+	movb	%al, %dh
+	notb	%dh
+	xorb	%dh, %dl
+	andb	%dl, %ch		/* offset &= ( match_bit ^ bit ) */
+	testb	%ah, %ah
+	jz	1b
+	/* Restore registers, clear unwanted high bit of result, and return */
+	movl	%edi, %ebx
+	popw	%dx
+	popw	%cx
+	popl	%edi
+	xorb	%ah, %ah
+	ret
+	.size	rc_bittree_match, . - rc_bittree_match
+
+/*****************************************************************************
+ * Decode direct bits (no probability estimates)
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %cx : number of bits to decode
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %eax : decoded bits
+ * Corrupts:
+ *   none
+ *****************************************************************************
+ */
+rc_direct:
+	/* Preserve registers */
+	pushl	%ebx
+	pushw	%cx
+	pushl	%edx
+	/* Initialise registers */
+	xorl	%edx, %edx
+1:	/* Perform normalisation */
+	call	rc_normalise
+	/* Decode bit */
+	shrl	$1, %eax
+	movl	%eax, rc_range(%ebp)
+	movl	rc_code(%ebp), %ebx
+	subl	%eax, %ebx
+	js	2f
+	movl	%ebx, rc_code(%ebp)
+2:	rcll	%ebx
+	rcll	%edx
+	xorb	$1, %dl
+	ADDR16 loop 1b
+	/* Restore registers and return */
+	movl	%edx, %eax
+	popl	%edx
+	popw	%cx
+	popl	%ebx
+	ret
+	.size	rc_direct, . - rc_direct
+
+/*****************************************************************************
+ * Decode an LZMA literal
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %es:%edi : uncompressed output data pointer
+ *   %edx : LZMA state
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %es:%edi : uncompressed output data pointer (updated)
+ *   %edx : LZMA state
+ *   CF : end of payload marker found (always zero)
+ * Corrupts:
+ *   %eax
+ *   %ebx
+ *   %ecx
+ *****************************************************************************
+ *
+ * Literals are coded as an eight-bit tree, using a match byte if the
+ * previous symbol was not a literal.
+ *
+ */
+lzma_literal:
+	/* Get most recent output byte, if available */
+	xorl	%ebx, %ebx
+	cmpl	%esi, in_start(%ebp)
+	je	1f
+	movb	%es:-1(%edi), %bh
+1:	/* Locate probability estimate set */
+	shrb	$( 8 - LZMA_LC ), %bh
+	shlb	$1, %bh
+	leaw	literal(%ebx,%ebx,2), %bx
+	/* Get match byte, if applicable */
+	xorw	%cx, %cx
+	cmpb	$STATE_LIT_MAX, %dl
+	jbe	1f
+	movl	rep0(%ebp), %eax
+	notl	%eax
+	movb	%es:(%edi,%eax), %cl
+	movb	$1, %ch
+1:	/* Decode bittree */
+	call	rc_bittree_match
+	/* Store output byte */
+	ADDR32 stosb
+	print_hex_byte %al
+	print_character ' '
+	/* Update LZMA state */
+	subb	$3, %dl
+	jns	1f
+	xorb	%dl, %dl
+1:	cmpb	$7, %dl
+	jb	1f
+	subb	$3, %dl
+1:	/* Clear CF and return */
+	clc
+	ret
+	.size	lzma_literal, . - lzma_literal
+
+/*****************************************************************************
+ * Decode an LZMA length
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %ebx : length parameter pointer (offset from %ebp)
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ * Corrupts:
+ *   %ebx
+ *****************************************************************************
+ *
+ * Lengths are encoded as:
+ *
+ *   "0" + 3 bits    : lengths 2-9 ("low")
+ *   "10" + 3 bits   : lengths 10-17 ("mid")
+ *   "11" + 8 bits   : lengths 18-273 ("high")
+ */
+lzma_len:
+	/* Preserve registers */
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edi
+	movl	%ebx, %edi
+	/* Start by assuming three bits and a base length of 2 */
+	movw	$3, %cx
+	movw	$2, len(%ebp)
+	/* Check low-length choice bit */
+	leal	choice(%edi), %ebx
+	call	rc_bit
+	leal	low(%edi), %ebx
+	jz	1f
+	/* Check high-length choice bit */
+	leal	choice2(%edi), %ebx
+	call	rc_bit
+	leal	mid(%edi), %ebx
+	movb	$10, len(%ebp)
+	jz	1f
+	leal	high(%edi), %ebx
+	movb	$8, %cl
+	movb	$18, len(%ebp)
+1:	/* Get encoded length */
+	call	rc_bittree
+	addw	%ax, len(%ebp)
+	/* Restore registers and return */
+	movl	%edi, %ebx
+	popl	%edi
+	popl	%ecx
+	popl	%eax
+	ret
+	.size	lzma_len, . - lzma_len
+
+/*****************************************************************************
+ * Copy (possibly repeated) matched data
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %es:%edi : uncompressed output data pointer
+ *   %cl : repeated match distance index (for repeated matches)
+ *   %eax : match distance (for non-repeated matches)
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %es:%edi : uncompressed output data pointer
+ *   CF : match distance is out of range
+ * Corrupts:
+ *   %eax
+ *   %ebx
+ *   %ecx
+ *****************************************************************************
+ */
+match:	/* Update repeated match list */
+	print_character '['
+	movl	$3, %ecx
+	jmp	1f
+match_rep:
+	print_character '['
+	print_character 'R'
+	print_hex_byte %cl
+	print_character '='
+	movzbl	%cl, %ecx
+	movl	reps(%ebp,%ecx,4), %eax
+	jcxz	2f
+1:	movl	(reps-4)(%ebp,%ecx,4), %ebx
+	movl	%ebx, reps(%ebp,%ecx,4)
+	loop	1b
+	movl	%eax, rep0(%ebp)
+2:	/* Preserve registers */
+	pushl	%esi
+	/* Get stored match length */
+	movzwl	len(%ebp), %ecx
+	print_hex_dword	%eax
+	print_character '+'
+	print_hex_word %cx
+	print_character ']'
+	print_character ' '
+	/* Abort with CF set if match distance is out of range */
+	movl	out_start(%ebp), %esi
+	negl	%esi
+	leal	-1(%edi,%esi), %esi
+	cmpl	%eax, %esi
+	jc	99f
+	/* Perform copy */
+	notl	%eax
+	leal	(%edi,%eax), %esi
+	ADDR32 es rep movsb
+99:	/* Restore registers and return */
+	popl	%esi
+	ret
+	.size	match, . - match
+
+/*****************************************************************************
+ * Decode an LZMA match
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %es:%edi : uncompressed output data pointer
+ *   %edx : LZMA state
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %es:%edi : uncompressed output data pointer
+ *   %edx : LZMA state
+ *   CF : end of payload marker found
+ * Corrupts:
+ *   %eax
+ *   %ebx
+ *   %ecx
+ *****************************************************************************
+ *
+ * Matches are encoded as an LZMA length followed by a 6-bit "distance
+ * slot" code, 0-26 fixed-probability bits, and 0-5 context encoded
+ * bits.
+ */
+lzma_match:
+	/* Preserve registers */
+	pushl	%edi
+	/* Update LZMA state */
+	cmpb	$STATE_LIT_MAX, %dl
+	movb	$STATE_LIT_MATCH, %dl
+	jbe	1f
+	movb	$STATE_NONLIT_MATCH, %dl
+1:	/* Decode length */
+	movl	$match_len_dec, %ebx
+	call	lzma_len
+	/* Decode distance slot */
+	movw	len(%ebp), %bx
+	subw	$2, %bx
+	cmpw	$4, %bx
+	jb	1f
+	movw	$3, %bx
+1:	shlw	$7, %bx
+	addw	$dist_slot, %bx
+	movw	$6, %cx
+	call	rc_bittree
+	/* Distance slots 0-3 are literal distances */
+	cmpb	$4, %al
+	jb	99f
+	/* Determine initial bits: 10/11 for even/odd distance codes */
+	movl	%eax, %edi
+	andw	$1, %di
+	orw	$2, %di
+	/* Determine number of context-encoded bits */
+	movw	%ax, %cx
+	shrb	$1, %cl
+	decb	%cl
+	/* Select context to be used in absence of fixed-probability bits */
+	movl	%edi, %ebx
+	shlw	%cl, %bx
+	subw	%ax, %bx
+	leaw	(dist_special-2)(%ebx,%ebx), %bx
+	/* Decode fixed-probability bits, if any */
+	cmpb	$6, %cl
+	jb	1f
+	subb	$4, %cl
+	shll	%cl, %edi
+	call	rc_direct
+	orl	%eax, %edi
+	/* Select context to be used in presence of fixed-probability bits */
+	movb	$4, %cl
+	movl	$dist_align, %ebx
+1:	/* Decode context-encoded bits */
+	shll	%cl, %edi
+	call	rc_bittree_reverse
+	orl	%edi, %eax
+99:	/* Restore registers and tail-call */
+	popl	%edi
+	jmp	match
+	.size	lzma_match, . - lzma_match
+
+/*****************************************************************************
+ * Decode an LZMA repeated match
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %es:%edi : uncompressed output data pointer
+ *   %edx : LZMA state
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %es:%edi : uncompressed output data pointer
+ *   %edx : LZMA state
+ *   CF : end of payload marker found
+ * Corrupts:
+ *   %eax
+ *   %ebx
+ *   %ecx
+ *****************************************************************************
+ *
+ * Repeated matches are encoded as:
+ *
+ *   "00"	 : shortrep0 (implicit length 1)
+ *   "01" + len  : longrep0
+ *   "10" + len  : longrep1
+ *   "110" + len : longrep2
+ *   "111" + len : longrep3
+ */
+lzma_rep_match:
+	/* Initially assume longrep0 */
+	movw	$(STATE_LIT_LONGREP << 8), %cx
+	/* Get is_rep0 bit */
+	leal	is_rep0(,%edx,2), %ebx
+	call	rc_bit
+	jnz	1f
+	/* Get is_rep0_long bit */
+	leal	is_rep0_long(,%edx,2), %ebx
+	call	rc_bit
+	jnz	98f
+	movw	$1, len(%ebp)
+	movb	$STATE_LIT_SHORTREP, %ch
+	jmp	99f
+1:	/* Get is_rep1 bit */
+	incb	%cl
+	leal	is_rep1(,%edx,2), %ebx
+	call	rc_bit
+	jz	98f
+	/* Get is_rep2 bit */
+	incb	%cl
+	leal	is_rep2(,%edx,2), %ebx
+	call	rc_bit
+	adcb	$0, %cl
+98:	/* Decode length */
+	movl	$rep_len_dec, %ebx
+	call	lzma_len
+99:	/* Update LZMA state */
+	cmpb	$STATE_LIT_MAX, %dl
+	movb	%ch, %dl
+	jbe	1f
+	movb	$STATE_NONLIT_REP, %dl
+1:	/* Tail call */
+	jmp	match_rep
+	.size	lzma_match, . - lzma_match
+
+/*****************************************************************************
+ * Decode one LZMA symbol
+ *
+ * Parameters:
+ *   %ss:%ebp : LZMA parameter block
+ *   %ds:%esi : compressed input data pointer
+ *   %es:%edi : uncompressed output data pointer
+ *   %edx : LZMA state
+ * Returns:
+ *   %ds:%esi : compressed input data pointer (possibly updated)
+ *   %es:%edi : uncompressed output data pointer (updated)
+ *   %edx : LZMA state
+ *   CF : end of payload marker found
+ * Corrupts:
+ *   %eax
+ *   %ebx
+ *   %ecx
+ *****************************************************************************
+ */
+lzma_decode:
+	/* Get is_match bit */
+	leal	is_match(,%edx,2), %ebx
+	call	rc_bit
+	jz	lzma_literal
+	/* Get is_rep bit */
+	leal	is_rep(,%edx,2), %ebx
+	call	rc_bit
+	jz	lzma_match
+	jmp	lzma_rep_match
+	.size	lzma_decode, . - lzma_decode
+
+/****************************************************************************
+ * Undo effect of branch-call-jump (BCJ) filter
+ *
+ * Parameters:
+ *   %es:%esi : start of uncompressed output data (note %es)
+ *   %es:%edi : end of uncompressed output data
+ * Returns:
+ * Corrupts:
+ *   %eax
+ *   %ebx
+ *   %ecx
+ *   %edx
+ *   %esi
+ *****************************************************************************
+ */
+bcj_filter:
+	/* Store (negative) start of data in %edx */
+	movl	%esi, %edx
+	negl	%edx
+	/* Calculate limit in %ecx */
+	leal	-5(%edi,%edx), %ecx
+1:	/* Calculate offset in %ebx */
+	leal	(%esi,%edx), %ebx
+	/* Check for end of data */
+	cmpl	%ecx, %ebx
+	ja	99f
+	/* Check for an opcode which would be followed by a rel32 address */
+	ADDR32 es lodsb
+	andb	$0xfe, %al
+	cmpb	$0xe8, %al
+	jne	1b
+	/* Get current jump target value in %eax */
+	ADDR32 es lodsl
+	/* Convert absolute addresses in the range [0,limit) back to
+	 * relative addresses in the range [-offset,limit-offset).
+	 */
+	cmpl	%ecx, %eax
+	jae	2f
+	subl	%ebx,%es:-4(%esi)
+2:	/* Convert negative numbers in the range [-offset,0) back to
+	 * positive numbers in the range [limit-offset,limit).
+	 */
+	notl	%eax	/* Range is now [0,offset) */
+	cmpl	%ebx, %eax
+	jae	1b
+	addl	%ecx,%es:-4(%esi)
+	jmp	1b
+99:	/* Return */
+	ret
+	.size	bcj_filter, . - bcj_filter
+
+/****************************************************************************
+ * decompress (real-mode or 16/32-bit protected-mode near call)
+ *
+ * Decompress data
+ *
+ * Parameters (passed via registers):
+ *   %ds:%esi : Start of compressed input data
+ *   %es:%edi : Start of output buffer
+ * Returns:
+ *   %ds:%esi - End of compressed input data
+ *   %es:%edi - End of decompressed output data
+ *   All other registers are preserved
+ *
+ * NOTE: It would be possible to build a smaller version of the
+ * decompression code for -DKEEP_IT_REAL by using 16-bit registers
+ * where possible.
+ ****************************************************************************
+ */
+	.globl	decompress
+decompress:
+	/* Preserve registers */
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+	pushl	%ebp
+	/* Allocate parameter block */
+	subl	$sizeof__lzma_dec, %esp
+	movl	%esp, %ebp
+	/* Zero parameter block and set all probabilities to 0.5 */
+	pushl	%edi
+	pushw	%es
+	pushw	%ss
+	popw	%es
+	movl	%ebp, %edi
+	xorl	%eax, %eax
+	movl	$( sizeof__lzma_dec / 4 ), %ecx
+	ADDR32 rep stosl
+	leal	probs(%ebp), %edi
+	movw	$( ( 1 << 11 ) / 2 ), %ax
+	movl	$( ( sizeof__lzma_dec - probs ) / 2 ), %ecx
+	ADDR32 rep stosw
+	popw	%es
+	popl	%edi
+	/* Initialise remaining parameters */
+	movl	%esi, in_start(%ebp)
+	movl	%edi, out_start(%ebp)
+	print_character '\n'
+	ADDR32 lodsb	/* discard initial byte */
+	print_hex_byte %al
+	ADDR32 lodsl
+	bswapl	%eax
+	print_hex_dword %eax
+	print_character '\n'
+	movl	%eax, rc_code(%ebp)
+	decl	rc_range(%ebp)
+	movl	$STATE_LIT_LIT, %edx
+1:	/* Decompress until we reach end of buffer */
+	call	lzma_decode
+	jnc	1b
+	print_character '\n'
+	/* Undo BCJ filter */
+	pushl	%esi
+	movl	out_start(%ebp), %esi
+	call	bcj_filter
+	popl	%esi
+	/* Restore registers and return */
+	addl	$sizeof__lzma_dec, %esp
+	popl	%ebp
+	popl	%edx
+	popl	%ecx
+	popl	%ebx
+	popl	%eax
+	ret
+
+	/* Specify minimum amount of stack space required */
+	.globl	_min_decompress_stack
+	.equ	_min_decompress_stack, ( sizeof__lzma_dec + 512 /* margin */ )
diff --git a/src/arch/i386/prefix/unlzma16.S b/src/arch/i386/prefix/unlzma16.S
new file mode 100644
index 0000000..f58b0f7
--- /dev/null
+++ b/src/arch/i386/prefix/unlzma16.S
@@ -0,0 +1,9 @@
+/*
+ * 16-bit version of the decompressor
+ *
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER )
+
+#define CODE16
+#include "unlzma.S"
diff --git a/src/util/zbin.c b/src/util/zbin.c
index 3b7cf95..1862a38 100644
--- a/src/util/zbin.c
+++ b/src/util/zbin.c
@@ -1,13 +1,21 @@
+#include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
 #include <sys/stat.h>
-
-#define ENCODE
-#define VERBOSE
-#include "nrv2b.c"
-FILE *infile, *outfile;
+#include <lzma.h>
 
 #define DEBUG 0
 
+/* LZMA filter choices.  Must match those used by unlzma.S */
+#define LZMA_LC 2
+#define LZMA_LP 0
+#define LZMA_PB 0
+
+/* LZMA preset choice.  This is a policy decision */
+#define LZMA_PRESET ( LZMA_PRESET_DEFAULT | LZMA_PRESET_EXTREME )
+
 struct input_file {
 	void *buf;
 	size_t len;
@@ -177,13 +185,75 @@ static int process_zinfo_copy ( struct input_file *input,
 	return 0;
 }
 
+#define OPCODE_CALL 0xe8
+#define OPCODE_JMP 0xe9
+
+static void bcj_filter ( void *data, size_t len ) {
+	struct {
+		uint8_t opcode;
+		int32_t target;
+	} __attribute__ (( packed )) *jump;
+	ssize_t limit = ( len - sizeof ( *jump ) );
+	ssize_t offset;
+
+	/* liblzma does include an x86 BCJ filter, but it's hideously
+	 * convoluted and undocumented.  This BCJ filter is
+	 * substantially simpler and achieves the same compression (at
+	 * the cost of requiring the decompressor to know the size of
+	 * the decompressed data, which we already have in iPXE).
+	 */
+	for ( offset = 0 ; offset <= limit ; offset++ ) {
+		jump = ( data + offset );
+
+		/* Skip instructions that are not followed by a rel32 address */
+		if ( ( jump->opcode != OPCODE_CALL ) &&
+		     ( jump->opcode != OPCODE_JMP ) )
+			continue;
+
+		/* Convert rel32 address to an absolute address.  To
+		 * avoid false positives (which damage the compression
+		 * ratio), we should check that the jump target is
+		 * within the range [0,limit).
+		 *
+		 * Some output values would then end up being mapped
+		 * from two distinct input values, making the
+		 * transformation irreversible.  To solve this, we
+		 * transform such values back into the part of the
+		 * range which would otherwise correspond to no input
+		 * values.
+		 */
+		if ( ( jump->target >= -offset ) &&
+		     ( jump->target < ( limit - offset ) ) ) {
+			/* Convert relative addresses in the range
+			 * [-offset,limit-offset) to absolute
+			 * addresses in the range [0,limit).
+			 */
+			jump->target += offset;
+		} else if ( ( jump->target >= ( limit - offset ) ) &&
+			    ( jump->target < limit ) ) {
+			/* Convert positive numbers in the range
+			 * [limit-offset,limit) to negative numbers in
+			 * the range [-offset,0).
+			 */
+			jump->target -= limit;
+		}
+		offset += sizeof ( jump->target );
+	};
+}
+
 static int process_zinfo_pack ( struct input_file *input,
 				struct output_file *output,
 				union zinfo_record *zinfo ) {
 	struct zinfo_pack *pack = &zinfo->pack;
 	size_t offset = pack->offset;
 	size_t len = pack->len;
-	unsigned long packed_len;
+	size_t packed_len = 0;
+	size_t remaining = ( output->max_len - output->len );
+	lzma_options_lzma options;
+	const lzma_filter filters[] = {
+		{ .id = LZMA_FILTER_LZMA1, .options = &options },
+		{ .id = LZMA_VLI_UNKNOWN }
+	};
 
 	if ( ( offset + len ) > input->len ) {
 		fprintf ( stderr, "Input buffer overrun on pack\n" );
@@ -196,9 +266,15 @@ static int process_zinfo_pack ( struct input_file *input,
 		return -1;
 	}
 
-	if ( ucl_nrv2b_99_compress ( ( input->buf + offset ), len,
-				     ( output->buf + output->len ),
-				     &packed_len, 0 ) != UCL_E_OK ) {
+	bcj_filter ( ( input->buf + offset ), len );
+
+	lzma_lzma_preset ( &options, LZMA_PRESET );
+	options.lc = LZMA_LC;
+	options.lp = LZMA_LP;
+	options.pb = LZMA_PB;
+	if ( lzma_raw_buffer_encode ( filters, NULL, ( input->buf + offset ),
+				      len, ( output->buf + output->len ),
+				      &packed_len, remaining ) != LZMA_OK ) {
 		fprintf ( stderr, "Compression failure\n" );
 		return -1;
 	}
@@ -206,7 +282,7 @@ static int process_zinfo_pack ( struct input_file *input,
 	if ( DEBUG ) {
 		fprintf ( stderr, "PACK [%#zx,%#zx) to [%#zx,%#zx)\n",
 			  offset, ( offset + len ), output->len,
-			  ( size_t )( output->len + packed_len ) );
+			  ( output->len + packed_len ) );
 	}
 
 	output->len += packed_len;
-- 
1.9.0