feat: reduce bounds checks
diff --git a/resize.go b/resize.go
index 97f498a..0c78e47 100644
--- a/resize.go
+++ b/resize.go
@@ -116,23 +116,25 @@
 		for y := range ys {
 			src.scan(0, y, src.w, y+1, scanLine)
 			j0 := y * dst.Stride
-			for x := 0; x < width; x++ {
+			for x := range weights {
 				var r, g, b, a float64
 				for _, w := range weights[x] {
 					i := w.index * 4
-					aw := float64(scanLine[i+3]) * w.weight
-					r += float64(scanLine[i+0]) * aw
-					g += float64(scanLine[i+1]) * aw
-					b += float64(scanLine[i+2]) * aw
+					s := scanLine[i : i+4 : i+4]
+					aw := float64(s[3]) * w.weight
+					r += float64(s[0]) * aw
+					g += float64(s[1]) * aw
+					b += float64(s[2]) * aw
 					a += aw
 				}
 				if a != 0 {
 					aInv := 1 / a
 					j := j0 + x*4
-					dst.Pix[j+0] = clamp(r * aInv)
-					dst.Pix[j+1] = clamp(g * aInv)
-					dst.Pix[j+2] = clamp(b * aInv)
-					dst.Pix[j+3] = clamp(a)
+					d := dst.Pix[j : j+4 : j+4]
+					d[0] = clamp(r * aInv)
+					d[1] = clamp(g * aInv)
+					d[2] = clamp(b * aInv)
+					d[3] = clamp(a)
 				}
 			}
 		}
@@ -148,23 +150,25 @@
 		scanLine := make([]uint8, src.h*4)
 		for x := range xs {
 			src.scan(x, 0, x+1, src.h, scanLine)
-			for y := 0; y < height; y++ {
+			for y := range weights {
 				var r, g, b, a float64
 				for _, w := range weights[y] {
 					i := w.index * 4
-					aw := float64(scanLine[i+3]) * w.weight
-					r += float64(scanLine[i+0]) * aw
-					g += float64(scanLine[i+1]) * aw
-					b += float64(scanLine[i+2]) * aw
+					s := scanLine[i : i+4 : i+4]
+					aw := float64(s[3]) * w.weight
+					r += float64(s[0]) * aw
+					g += float64(s[1]) * aw
+					b += float64(s[2]) * aw
 					a += aw
 				}
 				if a != 0 {
 					aInv := 1 / a
 					j := y*dst.Stride + x*4
-					dst.Pix[j+0] = clamp(r * aInv)
-					dst.Pix[j+1] = clamp(g * aInv)
-					dst.Pix[j+2] = clamp(b * aInv)
-					dst.Pix[j+3] = clamp(a)
+					d := dst.Pix[j : j+4 : j+4]
+					d[0] = clamp(r * aInv)
+					d[1] = clamp(g * aInv)
+					d[2] = clamp(b * aInv)
+					d[3] = clamp(a)
 				}
 			}
 		}
diff --git a/scanner.go b/scanner.go
index c4dbfe1..5e76987 100644
--- a/scanner.go
+++ b/scanner.go
@@ -44,10 +44,12 @@
 		for y := y1; y < y2; y++ {
 			i := y*img.Stride + x1*8
 			for x := x1; x < x2; x++ {
-				dst[j+0] = img.Pix[i+0]
-				dst[j+1] = img.Pix[i+2]
-				dst[j+2] = img.Pix[i+4]
-				dst[j+3] = img.Pix[i+6]
+				s := img.Pix[i : i+8 : i+8]
+				d := dst[j : j+4 : j+4]
+				d[0] = s[0]
+				d[1] = s[2]
+				d[2] = s[4]
+				d[3] = s[6]
 				j += 4
 				i += 8
 			}
@@ -58,26 +60,28 @@
 		for y := y1; y < y2; y++ {
 			i := y*img.Stride + x1*4
 			for x := x1; x < x2; x++ {
-				a := img.Pix[i+3]
+				s := img.Pix[i : i+4 : i+4]
+				d := dst[j : j+4 : j+4]
+				a := s[3]
 				switch a {
 				case 0:
-					dst[j+0] = 0
-					dst[j+1] = 0
-					dst[j+2] = 0
+					d[0] = 0
+					d[1] = 0
+					d[2] = 0
 				case 0xff:
-					dst[j+0] = img.Pix[i+0]
-					dst[j+1] = img.Pix[i+1]
-					dst[j+2] = img.Pix[i+2]
+					d[0] = s[0]
+					d[1] = s[1]
+					d[2] = s[2]
 				default:
-					r16 := uint16(img.Pix[i+0])
-					g16 := uint16(img.Pix[i+1])
-					b16 := uint16(img.Pix[i+2])
+					r16 := uint16(s[0])
+					g16 := uint16(s[1])
+					b16 := uint16(s[2])
 					a16 := uint16(a)
-					dst[j+0] = uint8(r16 * 0xff / a16)
-					dst[j+1] = uint8(g16 * 0xff / a16)
-					dst[j+2] = uint8(b16 * 0xff / a16)
+					d[0] = uint8(r16 * 0xff / a16)
+					d[1] = uint8(g16 * 0xff / a16)
+					d[2] = uint8(b16 * 0xff / a16)
 				}
-				dst[j+3] = a
+				d[3] = a
 				j += 4
 				i += 4
 			}
@@ -88,26 +92,28 @@
 		for y := y1; y < y2; y++ {
 			i := y*img.Stride + x1*8
 			for x := x1; x < x2; x++ {
-				a := img.Pix[i+6]
+				s := img.Pix[i : i+8 : i+8]
+				d := dst[j : j+4 : j+4]
+				a := s[6]
 				switch a {
 				case 0:
-					dst[j+0] = 0
-					dst[j+1] = 0
-					dst[j+2] = 0
+					d[0] = 0
+					d[1] = 0
+					d[2] = 0
 				case 0xff:
-					dst[j+0] = img.Pix[i+0]
-					dst[j+1] = img.Pix[i+2]
-					dst[j+2] = img.Pix[i+4]
+					d[0] = s[0]
+					d[1] = s[2]
+					d[2] = s[4]
 				default:
-					r32 := uint32(img.Pix[i+0])<<8 | uint32(img.Pix[i+1])
-					g32 := uint32(img.Pix[i+2])<<8 | uint32(img.Pix[i+3])
-					b32 := uint32(img.Pix[i+4])<<8 | uint32(img.Pix[i+5])
-					a32 := uint32(img.Pix[i+6])<<8 | uint32(img.Pix[i+7])
-					dst[j+0] = uint8((r32 * 0xffff / a32) >> 8)
-					dst[j+1] = uint8((g32 * 0xffff / a32) >> 8)
-					dst[j+2] = uint8((b32 * 0xffff / a32) >> 8)
+					r32 := uint32(s[0])<<8 | uint32(s[1])
+					g32 := uint32(s[2])<<8 | uint32(s[3])
+					b32 := uint32(s[4])<<8 | uint32(s[5])
+					a32 := uint32(s[6])<<8 | uint32(s[7])
+					d[0] = uint8((r32 * 0xffff / a32) >> 8)
+					d[1] = uint8((g32 * 0xffff / a32) >> 8)
+					d[2] = uint8((b32 * 0xffff / a32) >> 8)
 				}
-				dst[j+3] = a
+				d[3] = a
 				j += 4
 				i += 8
 			}
@@ -119,10 +125,11 @@
 			i := y*img.Stride + x1
 			for x := x1; x < x2; x++ {
 				c := img.Pix[i]
-				dst[j+0] = c
-				dst[j+1] = c
-				dst[j+2] = c
-				dst[j+3] = 0xff
+				d := dst[j : j+4 : j+4]
+				d[0] = c
+				d[1] = c
+				d[2] = c
+				d[3] = 0xff
 				j += 4
 				i++
 			}
@@ -134,10 +141,11 @@
 			i := y*img.Stride + x1*2
 			for x := x1; x < x2; x++ {
 				c := img.Pix[i]
-				dst[j+0] = c
-				dst[j+1] = c
-				dst[j+2] = c
-				dst[j+3] = 0xff
+				d := dst[j : j+4 : j+4]
+				d[0] = c
+				d[1] = c
+				d[2] = c
+				d[3] = 0xff
 				j += 4
 				i += 2
 			}
@@ -191,10 +199,11 @@
 					b = 0
 				}
 
-				dst[j+0] = uint8(r)
-				dst[j+1] = uint8(g)
-				dst[j+2] = uint8(b)
-				dst[j+3] = 0xff
+				d := dst[j : j+4 : j+4]
+				d[0] = uint8(r)
+				d[1] = uint8(g)
+				d[2] = uint8(b)
+				d[3] = 0xff
 
 				iy++
 				j += 4
@@ -207,10 +216,11 @@
 			i := y*img.Stride + x1
 			for x := x1; x < x2; x++ {
 				c := s.palette[img.Pix[i]]
-				dst[j+0] = c.R
-				dst[j+1] = c.G
-				dst[j+2] = c.B
-				dst[j+3] = c.A
+				d := dst[j : j+4 : j+4]
+				d[0] = c.R
+				d[1] = c.G
+				d[2] = c.B
+				d[3] = c.A
 				j += 4
 				i++
 			}
@@ -226,22 +236,23 @@
 		for y := y1; y < y2; y++ {
 			for x := x1; x < x2; x++ {
 				r16, g16, b16, a16 := s.image.At(x, y).RGBA()
+				d := dst[j : j+4 : j+4]
 				switch a16 {
 				case 0xffff:
-					dst[j+0] = uint8(r16 >> 8)
-					dst[j+1] = uint8(g16 >> 8)
-					dst[j+2] = uint8(b16 >> 8)
-					dst[j+3] = 0xff
+					d[0] = uint8(r16 >> 8)
+					d[1] = uint8(g16 >> 8)
+					d[2] = uint8(b16 >> 8)
+					d[3] = 0xff
 				case 0:
-					dst[j+0] = 0
-					dst[j+1] = 0
-					dst[j+2] = 0
-					dst[j+3] = 0
+					d[0] = 0
+					d[1] = 0
+					d[2] = 0
+					d[3] = 0
 				default:
-					dst[j+0] = uint8(((r16 * 0xffff) / a16) >> 8)
-					dst[j+1] = uint8(((g16 * 0xffff) / a16) >> 8)
-					dst[j+2] = uint8(((b16 * 0xffff) / a16) >> 8)
-					dst[j+3] = uint8(a16 >> 8)
+					d[0] = uint8(((r16 * 0xffff) / a16) >> 8)
+					d[1] = uint8(((g16 * 0xffff) / a16) >> 8)
+					d[2] = uint8(((b16 * 0xffff) / a16) >> 8)
+					d[3] = uint8(a16 >> 8)
 				}
 				j += 4
 			}