s2/decode_amd64.s - external/github.com/klauspost/compress - Git at Google

 // Copyright 2016 The Go Authors. All rights reserved.
 // Copyright (c) 2019 Klaus Post. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 // +build !appengine
 // +build gc
 // +build !noasm

 #include "textflag.h"

 #define R_TMP0 AX
 #define R_TMP1 BX
 #define R_LEN CX
 #define R_OFF DX
 #define R_SRC SI
 #define R_DST DI
 #define R_DBASE R8
 #define R_DLEN R9
 #define R_DEND R10
 #define R_SBASE R11
 #define R_SLEN R12
 #define R_SEND R13
 #define R_TMP2 R14
 #define R_TMP3 R15

 // The asm code generally follows the pure Go code in decode_other.go, except
 // where marked with a "!!!".

 // func decode(dst, src []byte) int
 //
 // All local variables fit into registers. The non-zero stack size is only to
 // spill registers and push args when issuing a CALL. The register allocation:
 //	- R_TMP0	scratch
 //	- R_TMP1	scratch
 //	- R_LEN	    length or x (shared)
 //	- R_OFF	    offset
 //	- R_SRC	    &src[s]
 //	- R_DST	    &dst[d]
 //	+ R_DBASE	dst_base
 //	+ R_DLEN	dst_len
 //	+ R_DEND	dst_base + dst_len
 //	+ R_SBASE	src_base
 //	+ R_SLEN	src_len
 //	+ R_SEND	src_base + src_len
 //	- R_TMP2	used by doCopy
 //	- R_TMP3	used by doCopy
 //
 // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
 // function, and after a CALL returns, and are not otherwise modified.
 //
 // The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
 // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
 TEXT ·s2Decode(SB), NOSPLIT, $56-56
 	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
 	MOVQ dst_base+0(FP), R_DBASE
 	MOVQ dst_len+8(FP), R_DLEN
 	MOVQ R_DBASE, R_DST
 	MOVQ R_DBASE, R_DEND
 	ADDQ R_DLEN, R_DEND
 	MOVQ src_base+24(FP), R_SBASE
 	MOVQ src_len+32(FP), R_SLEN
 	MOVQ R_SBASE, R_SRC
 	MOVQ R_SBASE, R_SEND
 	ADDQ R_SLEN, R_SEND
 	XORQ R_OFF, R_OFF

 loop:
 	// for s < len(src)
 	CMPQ R_SRC, R_SEND
 	JEQ  end

 	// R_LEN = uint32(src[s])
 	//
 	// switch src[s] & 0x03
 	MOVBLZX (R_SRC), R_LEN
 	MOVL    R_LEN, R_TMP1
 	ANDL    $3, R_TMP1
 	CMPL    R_TMP1, $1
 	JAE     tagCopy

 	// ----------------------------------------
 	// The code below handles literal tags.

 	// case tagLiteral:
 	// x := uint32(src[s] >> 2)
 	// switch
 	SHRL $2, R_LEN
 	CMPL R_LEN, $60
 	JAE  tagLit60Plus

 	// case x < 60:
 	// s++
 	INCQ R_SRC

 doLit:
 	// This is the end of the inner "switch", when we have a literal tag.
 	//
 	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
 	// used in the pure Go decode_other.go code.

 	// length = int(x) + 1
 	//
 	// Unlike the pure Go code, we don't need to check if length <= 0 because
 	// R_LEN can hold 64 bits, so the increment cannot overflow.
 	INCQ R_LEN

 	// Prepare to check if copying length bytes will run past the end of dst or
 	// src.
 	//
 	// R_TMP0 = len(dst) - d
 	// R_TMP1 = len(src) - s
 	MOVQ R_DEND, R_TMP0
 	SUBQ R_DST, R_TMP0
 	MOVQ R_SEND, R_TMP1
 	SUBQ R_SRC, R_TMP1

 	// !!! Try a faster technique for short (16 or fewer bytes) copies.
 	//
 	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
 	//   goto callMemmove // Fall back on calling runtime·memmove.
 	// }
 	//
 	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
 	// against 21 instead of 16, because it cannot assume that all of its input
 	// is contiguous in memory and so it needs to leave enough source bytes to
 	// read the next tag without refilling buffers, but Go's Decode assumes
 	// contiguousness (the src argument is a []byte).
 	CMPQ R_LEN, $16
 	JGT  callMemmove
 	CMPQ R_TMP0, $16
 	JLT  callMemmove
 	CMPQ R_TMP1, $16
 	JLT  callMemmove

 	// !!! Implement the copy from src to dst as a 16-byte load and store.
 	// (Decode's documentation says that dst and src must not overlap.)
 	//
 	// This always copies 16 bytes, instead of only length bytes, but that's
 	// OK. If the input is a valid Snappy encoding then subsequent iterations
 	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 	// non-nil error), so the overrun will be ignored.
 	//
 	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
 	MOVOU 0(R_SRC), X0
 	MOVOU X0, 0(R_DST)

 	// d += length
 	// s += length
 	ADDQ R_LEN, R_DST
 	ADDQ R_LEN, R_SRC
 	JMP  loop

 callMemmove:
 	// if length > len(dst)-d || length > len(src)-s { etc }
 	CMPQ R_LEN, R_TMP0
 	JGT  errCorrupt
 	CMPQ R_LEN, R_TMP1
 	JGT  errCorrupt

 	// copy(dst[d:], src[s:s+length])
 	//
 	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
 	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
 	// three registers to the stack, to save local variables across the CALL.
 	MOVQ R_DST, 0(SP)
 	MOVQ R_SRC, 8(SP)
 	MOVQ R_LEN, 16(SP)
 	MOVQ R_DST, 24(SP)
 	MOVQ R_SRC, 32(SP)
 	MOVQ R_LEN, 40(SP)
 	MOVQ R_OFF, 48(SP)
 	CALL runtime·memmove(SB)

 	// Restore local variables: unspill registers from the stack and
 	// re-calculate R_DBASE-R_SEND.
 	MOVQ 24(SP), R_DST
 	MOVQ 32(SP), R_SRC
 	MOVQ 40(SP), R_LEN
 	MOVQ 48(SP), R_OFF
 	MOVQ dst_base+0(FP), R_DBASE
 	MOVQ dst_len+8(FP), R_DLEN
 	MOVQ R_DBASE, R_DEND
 	ADDQ R_DLEN, R_DEND
 	MOVQ src_base+24(FP), R_SBASE
 	MOVQ src_len+32(FP), R_SLEN
 	MOVQ R_SBASE, R_SEND
 	ADDQ R_SLEN, R_SEND

 	// d += length
 	// s += length
 	ADDQ R_LEN, R_DST
 	ADDQ R_LEN, R_SRC
 	JMP  loop

 tagLit60Plus:
 	// !!! This fragment does the
 	//
 	// s += x - 58; if uint(s) > uint(len(src)) { etc }
 	//
 	// checks. In the asm version, we code it once instead of once per switch case.
 	ADDQ R_LEN, R_SRC
 	SUBQ $58, R_SRC
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// case x == 60:
 	CMPL R_LEN, $61
 	JEQ  tagLit61
 	JA   tagLit62Plus

 	// x = uint32(src[s-1])
 	MOVBLZX -1(R_SRC), R_LEN
 	JMP     doLit

 tagLit61:
 	// case x == 61:
 	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
 	MOVWLZX -2(R_SRC), R_LEN
 	JMP     doLit

 tagLit62Plus:
 	CMPL R_LEN, $62
 	JA   tagLit63

 	// case x == 62:
 	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
 	// We read one byte, safe to read one back, since we are just reading tag.
 	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
 	MOVL -4(R_SRC), R_LEN
 	SHRL $8, R_LEN
 	JMP  doLit

 tagLit63:
 	// case x == 63:
 	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
 	MOVL -4(R_SRC), R_LEN
 	JMP  doLit

 // The code above handles literal tags.
 // ----------------------------------------
 // The code below handles copy tags.

 tagCopy4:
 	// case tagCopy4:
 	// s += 5
 	ADDQ $5, R_SRC

 	// if uint(s) > uint(len(src)) { etc }
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// length = 1 + int(src[s-5])>>2
 	SHRQ $2, R_LEN
 	INCQ R_LEN

 	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
 	MOVLQZX -4(R_SRC), R_OFF
 	JMP     doCopy

 tagCopy2:
 	// case tagCopy2:
 	// s += 3
 	ADDQ $3, R_SRC

 	// if uint(s) > uint(len(src)) { etc }
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// length = 1 + int(src[s-3])>>2
 	SHRQ $2, R_LEN
 	INCQ R_LEN

 	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 	MOVWQZX -2(R_SRC), R_OFF
 	JMP     doCopy

 tagCopy:
 	// We have a copy tag. We assume that:
 	//	- R_TMP1 == src[s] & 0x03
 	//	- R_LEN == src[s]
 	CMPQ R_TMP1, $2
 	JEQ  tagCopy2
 	JA   tagCopy4

 	// case tagCopy1:
 	// s += 2
 	ADDQ $2, R_SRC

 	// if uint(s) > uint(len(src)) { etc }
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
 	// length = 4 + int(src[s-2])>>2&0x7
 	MOVBQZX -1(R_SRC), R_TMP1
 	MOVQ    R_LEN, R_TMP0
 	SHRQ    $2, R_LEN
 	ANDQ    $0xe0, R_TMP0
 	ANDQ    $7, R_LEN
 	SHLQ    $3, R_TMP0
 	ADDQ    $4, R_LEN
 	ORQ     R_TMP1, R_TMP0

 	// check if repeat code, ZF set by ORQ.
 	JZ repeatCode

 	// This is a regular copy, transfer our temporary value to R_OFF (length)
 	MOVQ R_TMP0, R_OFF
 	JMP  doCopy

 // This is a repeat code.
 repeatCode:
 	// If length < 9, reuse last offset, with the length already calculated.
 	CMPQ R_LEN, $9
 	JL   doCopyRepeat

 	// Read additional bytes for length.
 	JE repeatLen1

 	// Rare, so the extra branch shouldn't hurt too much.
 	CMPQ R_LEN, $10
 	JE   repeatLen2
 	JMP  repeatLen3

 // Read repeat lengths.
 repeatLen1:
 	// s ++
 	ADDQ $1, R_SRC

 	// if uint(s) > uint(len(src)) { etc }
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// length = src[s-1] + 8
 	MOVBQZX -1(R_SRC), R_LEN
 	ADDL    $8, R_LEN
 	JMP     doCopyRepeat

 repeatLen2:
 	// s +=2
 	ADDQ $2, R_SRC

 	// if uint(s) > uint(len(src)) { etc }
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
 	MOVWQZX -2(R_SRC), R_LEN
 	ADDL    $260, R_LEN
 	JMP     doCopyRepeat

 repeatLen3:
 	// s +=3
 	ADDQ $3, R_SRC

 	// if uint(s) > uint(len(src)) { etc }
 	CMPQ R_SRC, R_SEND
 	JA   errCorrupt

 	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
 	// Read one byte further back (just part of the tag, shifted out)
 	MOVL -4(R_SRC), R_LEN
 	SHRL $8, R_LEN
 	ADDL $65540, R_LEN
 	JMP  doCopyRepeat

 doCopy:
 	// This is the end of the outer "switch", when we have a copy tag.
 	//
 	// We assume that:
 	//	- R_LEN == length && R_LEN > 0
 	//	- R_OFF == offset

 	// if d < offset { etc }
 	MOVQ R_DST, R_TMP1
 	SUBQ R_DBASE, R_TMP1
 	CMPQ R_TMP1, R_OFF
 	JLT  errCorrupt

 	// Repeat values can skip the test above, since any offset > 0 will be in dst.
 doCopyRepeat:
 	// if offset <= 0 { etc }
 	CMPQ R_OFF, $0
 	JLE  errCorrupt

 	// if length > len(dst)-d { etc }
 	MOVQ R_DEND, R_TMP1
 	SUBQ R_DST, R_TMP1
 	CMPQ R_LEN, R_TMP1
 	JGT  errCorrupt

 	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 	//
 	// Set:
 	//	- R_TMP2 = len(dst)-d
 	//	- R_TMP3 = &dst[d-offset]
 	MOVQ R_DEND, R_TMP2
 	SUBQ R_DST, R_TMP2
 	MOVQ R_DST, R_TMP3
 	SUBQ R_OFF, R_TMP3

 	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
 	//
 	// First, try using two 8-byte load/stores, similar to the doLit technique
 	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
 	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
 	// and not one 16-byte load/store, and the first store has to be before the
 	// second load, due to the overlap if offset is in the range [8, 16).
 	//
 	// if length > 16 || offset < 8 || len(dst)-d < 16 {
 	//   goto slowForwardCopy
 	// }
 	// copy 16 bytes
 	// d += length
 	CMPQ R_LEN, $16
 	JGT  slowForwardCopy
 	CMPQ R_OFF, $8
 	JLT  slowForwardCopy
 	CMPQ R_TMP2, $16
 	JLT  slowForwardCopy
 	MOVQ 0(R_TMP3), R_TMP0
 	MOVQ R_TMP0, 0(R_DST)
 	MOVQ 8(R_TMP3), R_TMP1
 	MOVQ R_TMP1, 8(R_DST)
 	ADDQ R_LEN, R_DST
 	JMP  loop

 slowForwardCopy:
 	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
 	// can still try 8-byte load stores, provided we can overrun up to 10 extra
 	// bytes. As above, the overrun will be fixed up by subsequent iterations
 	// of the outermost loop.
 	//
 	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
 	// commentary says:
 	//
 	// ----
 	//
 	// The main part of this loop is a simple copy of eight bytes at a time
 	// until we've copied (at least) the requested amount of bytes.  However,
 	// if d and d-offset are less than eight bytes apart (indicating a
 	// repeating pattern of length < 8), we first need to expand the pattern in
 	// order to get the correct results. For instance, if the buffer looks like
 	// this, with the eight-byte <d-offset> and <d> patterns marked as
 	// intervals:
 	//
 	//    abxxxxxxxxxxxx
 	//    [------]           d-offset
 	//      [------]         d
 	//
 	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
 	// once, after which we can move <d> two bytes without moving <d-offset>:
 	//
 	//    ababxxxxxxxxxx
 	//    [------]           d-offset
 	//        [------]       d
 	//
 	// and repeat the exercise until the two no longer overlap.
 	//
 	// This allows us to do very well in the special case of one single byte
 	// repeated many times, without taking a big hit for more general cases.
 	//
 	// The worst case of extra writing past the end of the match occurs when
 	// offset == 1 and length == 1; the last copy will read from byte positions
 	// [0..7] and write to [4..11], whereas it was only supposed to write to
 	// position 1. Thus, ten excess bytes.
 	//
 	// ----
 	//
 	// That "10 byte overrun" worst case is confirmed by Go's
 	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
 	// and finishSlowForwardCopy algorithm.
 	//
 	// if length > len(dst)-d-10 {
 	//   goto verySlowForwardCopy
 	// }
 	SUBQ $10, R_TMP2
 	CMPQ R_LEN, R_TMP2
 	JGT  verySlowForwardCopy

 	// We want to keep the offset, so we use R_TMP2 from here.
 	MOVQ R_OFF, R_TMP2

 makeOffsetAtLeast8:
 	// !!! As above, expand the pattern so that offset >= 8 and we can use
 	// 8-byte load/stores.
 	//
 	// for offset < 8 {
 	//   copy 8 bytes from dst[d-offset:] to dst[d:]
 	//   length -= offset
 	//   d      += offset
 	//   offset += offset
 	//   // The two previous lines together means that d-offset, and therefore
 	//   // R_TMP3, is unchanged.
 	// }
 	CMPQ R_TMP2, $8
 	JGE  fixUpSlowForwardCopy
 	MOVQ (R_TMP3), R_TMP1
 	MOVQ R_TMP1, (R_DST)
 	SUBQ R_TMP2, R_LEN
 	ADDQ R_TMP2, R_DST
 	ADDQ R_TMP2, R_TMP2
 	JMP  makeOffsetAtLeast8

 fixUpSlowForwardCopy:
 	// !!! Add length (which might be negative now) to d (implied by R_DST being
 	// &dst[d]) so that d ends up at the right place when we jump back to the
 	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
 	// length is positive, copying the remaining length bytes will write to the
 	// right place.
 	MOVQ R_DST, R_TMP0
 	ADDQ R_LEN, R_DST

 finishSlowForwardCopy:
 	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
 	// length means that we overrun, but as above, that will be fixed up by
 	// subsequent iterations of the outermost loop.
 	CMPQ R_LEN, $0
 	JLE  loop
 	MOVQ (R_TMP3), R_TMP1
 	MOVQ R_TMP1, (R_TMP0)
 	ADDQ $8, R_TMP3
 	ADDQ $8, R_TMP0
 	SUBQ $8, R_LEN
 	JMP  finishSlowForwardCopy

 verySlowForwardCopy:
 	// verySlowForwardCopy is a simple implementation of forward copy. In C
 	// parlance, this is a do/while loop instead of a while loop, since we know
 	// that length > 0. In Go syntax:
 	//
 	// for {
 	//   dst[d] = dst[d - offset]
 	//   d++
 	//   length--
 	//   if length == 0 {
 	//     break
 	//   }
 	// }
 	MOVB (R_TMP3), R_TMP1
 	MOVB R_TMP1, (R_DST)
 	INCQ R_TMP3
 	INCQ R_DST
 	DECQ R_LEN
 	JNZ  verySlowForwardCopy
 	JMP  loop

 // The code above handles copy tags.
 // ----------------------------------------

 end:
 	// This is the end of the "for s < len(src)".
 	//
 	// if d != len(dst) { etc }
 	CMPQ R_DST, R_DEND
 	JNE  errCorrupt

 	// return 0
 	MOVQ $0, ret+48(FP)
 	RET

 errCorrupt:
 	// return decodeErrCodeCorrupt
 	MOVQ $1, ret+48(FP)
 	RET
	// Copyright 2016 The Go Authors. All rights reserved.
	// Copyright (c) 2019 Klaus Post. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	// +build !appengine
	// +build gc
	// +build !noasm

	#include "textflag.h"

	#define R_TMP0 AX
	#define R_TMP1 BX
	#define R_LEN CX
	#define R_OFF DX
	#define R_SRC SI
	#define R_DST DI
	#define R_DBASE R8
	#define R_DLEN R9
	#define R_DEND R10
	#define R_SBASE R11
	#define R_SLEN R12
	#define R_SEND R13
	#define R_TMP2 R14
	#define R_TMP3 R15

	// The asm code generally follows the pure Go code in decode_other.go, except
	// where marked with a "!!!".

	// func decode(dst, src []byte) int
	//
	// All local variables fit into registers. The non-zero stack size is only to
	// spill registers and push args when issuing a CALL. The register allocation:
	// - R_TMP0 scratch
	// - R_TMP1 scratch
	// - R_LEN length or x (shared)
	// - R_OFF offset
	// - R_SRC &src[s]
	// - R_DST &dst[d]
	// + R_DBASE dst_base
	// + R_DLEN dst_len
	// + R_DEND dst_base + dst_len
	// + R_SBASE src_base
	// + R_SLEN src_len
	// + R_SEND src_base + src_len
	// - R_TMP2 used by doCopy
	// - R_TMP3 used by doCopy
	//
	// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
	// function, and after a CALL returns, and are not otherwise modified.
	//
	// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
	// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
	TEXT ·s2Decode(SB), NOSPLIT, $56-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND
	XORQ R_OFF, R_OFF

	loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL R_LEN, R_TMP1
	ANDL $3, R_TMP1
	CMPL R_TMP1, $1
	JAE tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

	doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 \|\| len(dst)-d < 16 \|\| len(src)-s < 16 {
	// goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT callMemmove
	CMPQ R_TMP0, $16
	JLT callMemmove
	CMPQ R_TMP1, $16
	JLT callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP loop

	callMemmove:
	// if length > len(dst)-d \|\| length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP loop

	tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ tagLit61
	JA tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP doLit

	tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) \| uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP doLit

	tagLit62Plus:
	CMPL R_LEN, $62
	JA tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) \| uint32(src[s-2])<<8 \| uint32(src[s-1])<<16
	// We read one byte, safe to read one back, since we are just reading tag.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP doLit

	tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

	tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) \| uint32(src[s-3])<<8 \| uint32(src[s-2])<<16 \| uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP doCopy

	tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) \| uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP doCopy

	tagCopy:
	// We have a copy tag. We assume that:
	// - R_TMP1 == src[s] & 0x03
	// - R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ tagCopy2
	JA tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 \| uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ R_LEN, R_TMP0
	SHRQ $2, R_LEN
	ANDQ $0xe0, R_TMP0
	ANDQ $7, R_LEN
	SHLQ $3, R_TMP0
	ADDQ $4, R_LEN
	ORQ R_TMP1, R_TMP0

	// check if repeat code, ZF set by ORQ.
	JZ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (length)
	MOVQ R_TMP0, R_OFF
	JMP doCopy

	// This is a repeat code.
	repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE repeatLen2
	JMP repeatLen3

	// Read repeat lengths.
	repeatLen1:
	// s ++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL $8, R_LEN
	JMP doCopyRepeat

	repeatLen2:
	// s +=2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = uint32(src[s-2]) \| (uint32(src[s-1])<<8) + (1 << 8)
	MOVWQZX -2(R_SRC), R_LEN
	ADDL $260, R_LEN
	JMP doCopyRepeat

	repeatLen3:
	// s +=3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = uint32(src[s-3]) \| (uint32(src[s-2])<<8) \| (uint32(src[s-1])<<16) + (1 << 16)
	// Read one byte further back (just part of the tag, shifted out)
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP doCopyRepeat

	doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	// - R_LEN == length && R_LEN > 0
	// - R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
	doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	// - R_TMP2 = len(dst)-d
	// - R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 \|\| offset < 8 \|\| len(dst)-d < 16 {
	// goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT slowForwardCopy
	CMPQ R_OFF, $8
	JLT slowForwardCopy
	CMPQ R_TMP2, $16
	JLT slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP loop

	slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	// abxxxxxxxxxxxx
	// [------] d-offset
	// [------] d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	// ababxxxxxxxxxx
	// [------] d-offset
	// [------] d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	// goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

	makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	// copy 8 bytes from dst[d-offset:] to dst[d:]
	// length -= offset
	// d += offset
	// offset += offset
	// // The two previous lines together means that d-offset, and therefore
	// // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP makeOffsetAtLeast8

	fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

	finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP finishSlowForwardCopy

	verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	// dst[d] = dst[d - offset]
	// d++
	// length--
	// if length == 0 {
	// break
	// }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ verySlowForwardCopy
	JMP loop

	// The code above handles copy tags.
	// ----------------------------------------

	end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

	errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET