blob: 7e80d3860af1cafdb3fb715f3bd774bb6ba085a1 [file] [log] [blame]
// Copyright 2024 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"testing"
. "github.com/smartystreets/goconvey/convey"
)
// TestSplitToChunks exercises SplitToChunks on empty, plain ASCII,
// line-break-bearing, whitespace-bearing and multi-byte UTF-8 inputs.
//
// Every case passes 10 as the second argument. The third argument is 10, 1
// or 0; the case descriptions refer to it as the lookback window size.
//
// Control characters mentioned in case descriptions are written escaped
// (e.g. `\\r\\n` in the source) so that the reported test name shows the
// literal text `\r\n` instead of emitting a real carriage return / line feed
// into the test output.
func TestSplitToChunks(t *testing.T) {
	Convey("chunk content", t, func() {
		Convey("empty string, 0 chunk", func() {
			content := []byte("")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			// Only the length is checked, so the case does not depend on
			// whether the result is a nil or an empty slice.
			So(len(chunks), ShouldEqual, 0)
		})

		Convey("1 character, 1 chunk", func() {
			content := []byte("a")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"a"})
		})

		Convey("maxSize character, 1 chunk", func() {
			// Exactly 10 bytes of content is expected back as a single chunk.
			content := []byte("0123456789")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"0123456789"})
		})

		Convey("No delimiter", func() {
			content := []byte("0123456789abcdefghij")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"0123456789", "abcdefghij"})
		})

		Convey("No delimiter 1", func() {
			// One byte past two full chunks: the remainder forms its own chunk.
			content := []byte("0123456789abcdefghij0")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"0123456789", "abcdefghij", "0"})
		})

		Convey("No delimiter 2", func() {
			content := []byte("0123456789abcdefghij0123456789a")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"0123456789", "abcdefghij", "0123456789", "a"})
		})

		Convey("Delimiter \\r\\n", func() {
			// Expected chunks end right after a "\r\n" sequence.
			content := []byte("012\n34\r\n789a bcd\r\n")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"012\n34\r\n", "789a bcd\r\n"})
		})

		Convey("Delimiter \\r\\n lookback window size 1", func() {
			// Same content as above, but with the third argument set to 1 the
			// first expected chunk is a full 10 bytes.
			content := []byte("012\n34\r\n789a bcd\r\n")
			chunks, err := SplitToChunks(content, 10, 1)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"012\n34\r\n78", "9a bcd\r\n"})
		})

		Convey("Delimiter \\n or \\r", func() {
			// Expected chunks end after a lone "\n" or a lone "\r".
			content := []byte("012\r 345\n6 789 ab\n\ncdef\ng\rhij012")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"012\r 345\n", "6 789 ab\n\n", "cdef\ng\r", "hij012"})
		})

		Convey("Delimiter \\n or \\r lookback window size 1", func() {
			content := []byte("012\r 345\n6 789 ab\n\ncdef\ng\rhij012")
			chunks, err := SplitToChunks(content, 10, 1)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"012\r 345\n6", " 789 ab\n\nc", "def\ng\rhij0", "12"})
		})

		Convey("Whitespace", func() {
			// No line breaks here: expected chunks end after a space or tab
			// where one is available, and at 10 bytes otherwise.
			content := []byte("012 345\t6789a\tbc dfe gh ij 01234\t56 789acbdefghij")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"012 345\t", "6789a\tbc ", "dfe gh ij ", "01234\t56 ", "789acbdefg", "hij"})
		})

		Convey("Whitespace lookback window size 1", func() {
			content := []byte("012 345\t6789a\tbc dfe gh ij 01234\t56 789abcdefghij")
			chunks, err := SplitToChunks(content, 10, 1)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"012 345\t67", "89a\tbc dfe", " gh ij 012", "34\t56 789a", "bcdefghij"})
		})

		Convey("Invalid unicode character", func() {
			// 0x88 is a UTF-8 continuation byte; a run of them is not valid
			// UTF-8, and an error is expected.
			content := []byte{0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88}
			_, err := SplitToChunks(content, 10, 0)
			So(err, ShouldNotBeNil)
		})

		Convey("Should not break unicode character", func() {
			// € sign is 3 bytes 0xE2, 0x82, 0xAC in UTF-8, so only three of
			// them (9 bytes) fit in the first expected chunk.
			content := []byte{0xE2, 0x82, 0xAC, 0xE2, 0x82, 0xAC, 0xE2, 0x82, 0xAC, 0xE2, 0x82, 0xAC}
			chunks, err := SplitToChunks(content, 10, 0)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"€€€", "€"})
		})

		Convey("Should not break unicode character 1", func() {
			// € is 3 bytes 0xE2, 0x82, 0xAC in UTF-8.
			// 𐍈 is 4 bytes: 0xF0, 0x90, 0x8D, 0x88.
			// Ç is 2 bytes: 0xC3 0x87
			content := []byte{0xF0, 0x90, 0x8D, 0x88, 0xF0, 0x90, 0x8D, 0x88, 0xE2, 0x82, 0xAC, 0xF0, 0x90, 0x8D, 0x88, 0xC3, 0x87, 0xC3, 0x87}
			chunks, err := SplitToChunks(content, 10, 0)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"𐍈𐍈", "€𐍈Ç", "Ç"})
		})

		Convey("Unicode and whitespace and linebreak", func() {
			// Mixes multi-byte characters with spaces and a trailing "\r\n".
			content := []byte("𐍈 𐍈Ç ÇÇ Ç Ç ÇÇa\r\nbcde")
			chunks, err := SplitToChunks(content, 10, 10)
			So(err, ShouldBeNil)
			So(chunks, ShouldResemble, []string{"𐍈 ", "𐍈Ç ", "ÇÇ Ç ", "Ç ÇÇa\r\n", "bcde"})
		})
	})
}