Add experimental chaining to the zstd best encoder.

Replace the two-entry prevEntry long table in bestFastEncoder with a plain
int32 offset table plus a chain table (bestChainBits = 22). Inserting a
position links it to the previous head of its long-hash bucket, so the match
search can walk the chain and try up to 8 older candidates at s, and up to 4
each at s+1 and s+2, instead of only the stored offset/prev pair.
improve() now also rejects candidate offsets at or beyond s, which guards
against stale or aliased chain entries. The chain is cleared rather than
shifted on e.cur wraparound, and loading the dictionary long table is
disabled for now (see the TODOs).
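The core idea, as a rough standalone sketch (simplified and illustrative
only: the names, table sizes and hash below are not the encoder's; the real
code uses longTable, e.chain, hashLen and bestChainMask as in the diff):

    // chainsketch: minimal illustration of a chained hash table for match
    // candidates. The head table keeps only the newest position per bucket;
    // a chain array indexed by position&mask links back to older positions
    // that were inserted into the same bucket.
    package main

    import "fmt"

    const (
    	chainBits = 16 // the encoder uses bestChainBits = 22
    	chainSize = 1 << chainBits
    	chainMask = chainSize - 1
    	headBits  = 10 // stand-in for bestLongTableBits
    	headSize  = 1 << headBits
    	headMask  = headSize - 1
    )

    type chained struct {
    	head  [headSize]int32  // newest position per bucket (the int32 longTable)
    	chain [chainSize]int32 // position -> previous head of the same bucket
    }

    // toy hash; the encoder uses hashLen(cv, bestLongTableBits, bestLongLen).
    func hash(v uint64) uint32 { return uint32((v*2654435761)>>32) & headMask }

    // insert makes pos the newest candidate for v's bucket and links the old head.
    func (c *chained) insert(v uint64, pos int32) {
    	h := hash(v)
    	c.chain[pos&chainMask] = c.head[h]
    	c.head[h] = pos
    }

    // walk visits up to max candidates for v, newest first, stopping at
    // positions at or below lowLimit (too old to reference).
    func (c *chained) walk(v uint64, lowLimit int32, max int, fn func(pos int32)) {
    	p := c.head[hash(v)]
    	for p > lowLimit && max > 0 {
    		fn(p)
    		p = c.chain[p&chainMask]
    		max--
    	}
    }

    func main() {
    	var c chained
    	for _, pos := range []int32{100, 2048, 70000} {
    		c.insert(0xdeadbeef, pos)
    	}
    	// Prints candidates newest first: 70000, 2048, 100.
    	c.walk(0xdeadbeef, 0, 8, func(pos int32) { fmt.Println("candidate at", pos) })
    }

Because the chain is indexed by position & mask, far-apart positions can
alias and links can go stale; in the encoder this is handled by the new
offset >= s check in improve() and by stopping the walk at
e.cur - e.maxMatchOff.
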
diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index f861841..5f7e9ae 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -16,6 +16,10 @@
 	bestLongTableSize = 1 << bestLongTableBits // Size of the table
 	bestLongLen       = 8                      // Bytes used for table hash
 
+	bestChainBits = 22
+	bestChainSize = 1 << bestChainBits // Size of the table
+	bestChainMask = bestChainSize - 1
+
 	// Note: Increasing the short table bits or making the hash shorter
 	// can actually lead to compression degradation since it will 'steal' more from the
 	// long match table and match offsets are quite big.
@@ -69,7 +73,8 @@
 type bestFastEncoder struct {
 	fastBase
 	table         [bestShortTableSize]prevEntry
-	longTable     [bestLongTableSize]prevEntry
+	longTable     [bestLongTableSize]int32
+	chain         [bestChainSize]int32
 	dictTable     []prevEntry
 	dictLongTable []prevEntry
 }
@@ -85,9 +90,10 @@
 
 	// Protect against e.cur wraparound.
 	for e.cur >= e.bufferReset-int32(len(e.hist)) {
+		e.chain = [bestChainSize]int32{} // The chain is cleared, not shifted, on wraparound.
 		if len(e.hist) == 0 {
 			e.table = [bestShortTableSize]prevEntry{}
-			e.longTable = [bestLongTableSize]prevEntry{}
+			e.longTable = [bestLongTableSize]int32{}
 			e.cur = e.maxMatchOff
 			break
 		}
@@ -113,23 +119,13 @@
 			}
 		}
 		for i := range e.longTable[:] {
-			v := e.longTable[i].offset
-			v2 := e.longTable[i].prev
+			v := e.longTable[i]
 			if v < minOff {
 				v = 0
-				v2 = 0
 			} else {
 				v = v - e.cur + e.maxMatchOff
-				if v2 < minOff {
-					v2 = 0
-				} else {
-					v2 = v2 - e.cur + e.maxMatchOff
-				}
 			}
-			e.longTable[i] = prevEntry{
-				offset: v,
-				prev:   v2,
-			}
+			e.longTable[i] = v
 		}
 		e.cur = e.maxMatchOff
 		break
@@ -194,10 +190,11 @@
 		nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
 		candidateL := e.longTable[nextHashL]
 		candidateS := e.table[nextHashS]
+		chainEnd := e.cur - e.maxMatchOff // positions at or below this are always beyond maxMatchOff
 
 		// Set m to a match at offset if it looks like that will improve compression.
 		improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
-			if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
+			if s-offset >= e.maxMatchOff || offset >= s || load3232(src, offset) != first {
 				return
 			}
 			if debugAsserts {
@@ -244,11 +241,20 @@
 		}
 
 		best := match{s: s, est: highScore}
-		improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
-		improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
+
+		improve(&best, candidateL-e.cur, s, uint32(cv), -1)
 		improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
 		improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
 
+		// Chain current: try up to 8 older positions that hashed to the same long bucket.
+		candidateLP := e.chain[candidateL&bestChainMask]
+		n := 8
+		for candidateLP > chainEnd && n > 0 {
+			improve(&best, candidateLP-e.cur, s, uint32(cv), -1)
+			candidateLP = e.chain[candidateLP&bestChainMask]
+			n--
+		}
+
 		if canRepeat && best.length < goodEnough {
 			if s == nextEmit {
 				// Check repeats straight after a match.
@@ -276,7 +282,10 @@
 			}
 		}
 		// Load next and check...
-		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
+		sCurr := s + e.cur
+		e.longTable[nextHashL] = sCurr
+		e.chain[sCurr&bestChainMask] = candidateL // keep the previous bucket head reachable via the chain
+
 		e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
 
 		// Look far ahead, unless we have a really long match already...
@@ -290,7 +299,6 @@
 				cv = load6432(src, s)
 				continue
 			}
-
 			s := s + 1
 			candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
 			cv = load6432(src, s)
@@ -301,10 +309,26 @@
 			// Short at s+1
 			improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
 			// Long at s+1, s+2
-			improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
-			improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
-			improve(&best, candidateL2.offset-e.cur, s+1, uint32(cv2), -1)
-			improve(&best, candidateL2.prev-e.cur, s+1, uint32(cv2), -1)
+			improve(&best, candidateL-e.cur, s, uint32(cv), -1)
+
+			candidateLP = e.chain[candidateL&bestChainMask] // chain of the long candidate at s+1
+			n = 4
+			for candidateLP > chainEnd && n > 0 {
+				improve(&best, candidateLP-e.cur, s, uint32(cv), -1)
+				candidateLP = e.chain[candidateLP&bestChainMask]
+				n--
+			}
+
+			improve(&best, candidateL2-e.cur, s+1, uint32(cv2), -1)
+
+			candidateLP = e.chain[candidateL2&bestChainMask] // chain of the long candidate at s+2
+			n = 4
+			for candidateLP > chainEnd && n > 0 {
+				improve(&best, candidateLP-e.cur, s+1, uint32(cv2), -1)
+				candidateLP = e.chain[candidateLP&bestChainMask]
+				n--
+			}
+
 			if false {
 				// Short at s+3.
 				// Too often worse...
@@ -318,11 +342,13 @@
 				// Start check at a fixed offset to allow for a few mismatches.
 				// For this compression level 2 yields the best results.
 				const skipBeginning = 2
-				if pos := candidateEnd.offset - e.cur - best.length + skipBeginning; pos >= 0 {
-					improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-					if pos := candidateEnd.prev - e.cur - best.length + skipBeginning; pos >= 0 {
-						improve(&best, pos, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
-					}
+
+				if pos := candidateEnd - e.cur - best.length + skipBeginning; pos >= 0 {
+					cv := load3232(src, best.s+skipBeginning)
+					improve(&best, pos, best.s+skipBeginning, cv, -1)
+					// TODO: CHAIN?
+					//candidateLP = e.chain[candidateEnd&bestChainMask]
+					//improve(&best, candidateLP-e.cur, best.s+skipBeginning, cv, -1)
 				}
 			}
 		}
@@ -367,7 +393,8 @@
 				cv0 := load6432(src, index0)
 				h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
 				h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
-				e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+				e.chain[index0&bestChainMask] = e.longTable[h0]
+				e.longTable[h0] = off
 				e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
 				off++
 				index0++
@@ -424,7 +451,8 @@
 			h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
 			h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
 			off := index0 + e.cur
-			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.chain[index0&bestChainMask] = e.longTable[h0]
+			e.longTable[h0] = off
 			e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
 			index0++
 		}
@@ -519,7 +547,8 @@
 		e.lastDictID = d.id
 	}
 	// Reset table to initial state
-	copy(e.longTable[:], e.dictLongTable)
+	// TODO: dictLongTable is still []prevEntry and must be converted before it can seed the int32 longTable.
+	//copy(e.longTable[:], e.dictLongTable)
 
 	e.cur = e.maxMatchOff
 	// Reset table to initial state